1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
29 #include "common/common.h"
30 #include "common/cpu.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
/* NOTE(review): this file is an extracted listing — each line keeps its
 * original line number as a prefix and several interior lines (including this
 * struct's opening) are missing.  Code left byte-identical.
 * Per-prediction-list (L0/L1) motion analysis state: one x264_me_t per
 * candidate partition shape, plus the cost of each shape. */
43 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
47 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
48 ALIGNED_4( int16_t mvc[32][5][2] );
52 int i_cost4x4[4]; /* cost per 8x8 partition */
53 x264_me_t me4x4[4][4];
56 int i_cost8x4[4]; /* cost per 8x8 partition */
57 x264_me_t me8x4[4][2];
60 int i_cost4x8[4]; /* cost per 8x8 partition */
61 x264_me_t me4x8[4][2];
71 } x264_mb_analysis_list_t;
/* NOTE(review): extracted listing with interior lines missing (the struct's
 * opening and several members are in the gaps).  Code left byte-identical.
 * Top-level per-macroblock analysis context: lambda/QP, per-mode intra SATD
 * scores, and inter costs for each partition/list combination. */
75 /* conduct the analysis using this lambda and QP */
80 uint16_t *p_cost_ref[2];
85 /* Take some shortcuts in intra search if intra is deemed unlikely */
87 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
/* i_satd_*_dir[] are indexed by intra prediction mode; see the *_mode_available
 * tables below for the mode sets considered. */
92 int i_satd_i16x16_dir[7];
97 int i_satd_i8x8_dir[12][4];
101 int i_predict4x4[16];
106 int i_satd_i8x8chroma;
107 int i_satd_i8x8chroma_dir[7];
108 int i_predict8x8chroma;
110 /* II: Inter part P/B frame */
111 x264_mb_analysis_list_t l0;
112 x264_mb_analysis_list_t l1;
114 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
115 int i_cost16x16direct;
117 int i_cost8x8direct[4];
126 int i_mb_partition16x8[2]; /* mb_partition_e */
127 int i_mb_partition8x16[2];
128 int i_mb_type16x8; /* mb_class_e */
131 int b_direct_available;
133 } x264_mb_analysis_t;
/* Lagrange multiplier for motion/mode cost, one entry per QP (0..51).
 * lambda = pow(2,qp/6-2), rounded; clamped to a minimum of 1 for low QPs.
 * (Reconstructed from the extracted listing: stray embedded line numbers
 * removed and the missing closing brace restored; values unchanged.) */
const uint8_t x264_lambda_tab[52] = {
   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
   1, 1, 1, 1,              /*  8-11 */
   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
  16,18,20,23,25,29,32,36,  /* 36-43 */
  40,45,51,57,64,72,81,91   /* 44-51 */
};
/* Squared lambda for RD cost, in 1/256 units, one entry per QP (0..51).
 * lambda2 = pow(lambda,2) * .9 * 256 (computed from the exact, unrounded
 * lambda, hence the smooth progression).
 * (Reconstructed from the extracted listing: stray embedded line numbers
 * removed and the missing closing brace restored; values unchanged.) */
const int x264_lambda2_tab[52] = {
    14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
    91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
   580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
  3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
 23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
};
/* Fractional-power-of-two lookup: entry i is presumably
 * round(256*(2^(i/64)-1)), i.e. the fractional part of 2^x in 1/256 units
 * sampled at 64 steps — NOTE(review): verify against callers, which are not
 * visible in this chunk.
 * (Reconstructed from the extracted listing: stray embedded line numbers
 * removed and the missing closing brace restored; values unchanged.) */
const uint8_t x264_exp2_lut[64] = {
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
/* log2 lookup: entry i is log2(1 + i/128), i.e. the fractional part of a
 * base-2 logarithm sampled at 128 steps over one octave.
 * (Reconstructed from the extracted listing: stray embedded line numbers
 * removed and the missing closing brace restored; values unchanged.) */
const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
/* Leading-zero-count to float map: entry i is (float)(31-i), stored as float
 * to avoid an int/float conversion at the call site.
 * (Reconstructed from the extracted listing: stray embedded line numbers
 * removed and the missing closing brace restored; values unchanged.) */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
/* Trellis-quantization lambda2, per QP (0..51).  Row 0 is inter, row 1 is
 * intra; the two differ only in the deadzone-derived scale factor.
 * (Reconstructed from the extracted listing: stray embedded line numbers
 * removed and the missing closing brace restored; values unchanged.) */
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      46,      58,      73,      92,     117,     147,
          185,     233,     294,     370,     466,     587,
          740,     932,    1174,    1480,    1864,    2349,
         2959,    3728,    4697,    5918,    7457,    9395,
        11837,   14914,   18790,   23674,   29828,   37581,
        47349,   59656,   75163,   94699,  119313,  150326,
       189399,  238627,  300652,  378798,  477255,  601304,
       757596,  954511, 1202608, 1515192, 1909022, 2405217,
      3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      27,      34,      43,      54,      68,      86,
          108,     136,     172,     216,     273,     343,
          433,     545,     687,     865,    1090,    1374,
         1731,    2180,    2747,    3461,    4361,    5494,
         6922,    8721,   10988,   13844,   17442,   21976,
        27688,   34885,   43953,   55377,   69771,   87906,
       110755,  139543,  175813,  221511,  279087,  351627,
       443023,  558174,  703255,  886046, 1116348, 1406511,
      1772093, 2232697, 2813022, 3544186 }
};
/* NOTE(review): extracted listing with interior lines missing — the tail rows
 * and closing braces of these tables fall in the gaps, so the code is left
 * byte-identical rather than reconstructed.
 * Chroma lambda2 scale in 1/256 units (256 = neutral), indexed by
 * (luma QP - chroma QP + 12); see x264_mb_analyse_init_qp() below. */
213 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
214 16, 20, 25, 32, 40, 50,
215 64, 80, 101, 128, 161, 203,
216 256, 322, 406, 512, 645, 812,
217 1024, 1290, 1625, 2048, 2580, 3250,
218 4096, 5160, 6501, 8192, 10321, 13003,
219 16384, 20642, 26007, 32768, 41285, 52015,
/* Approximate bit costs (multiplied by lambda at use sites) of signalling
 * each mb type / sub-partition type; per the TODO these are CAVLC-oriented. */
223 /* TODO: calculate CABAC costs */
224 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
225 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
227 static const uint8_t i_mb_b16x8_cost_table[17] = {
228 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
230 static const uint8_t i_sub_mb_b_cost_table[13] = {
231 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
233 static const uint8_t i_sub_mb_p_cost_table[4] = {
237 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
/* Reference-index bit costs: [lambda][num_refs bucket 0..2][ref 0..32],
 * filled lazily in x264_analyse_init_costs() under cost_ref_mutex (the table
 * is shared across encoder threads). */
239 static uint16_t x264_cost_ref[92][3][33];
240 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
/* NOTE(review): extracted listing — interior lines (braces, loop variable
 * declarations, error path) are missing; code left byte-identical.
 * Lazily builds the lambda-indexed MV/ref bit-cost tables for the given QP.
 * Returns 0 on success (CHECKED_MALLOC presumably jumps to a fail label on
 * OOM — the label is in a gap; verify). */
242 int x264_analyse_init_costs( x264_t *h, int qp )
245 int lambda = x264_lambda_tab[qp];
/* Already initialized for this lambda: nothing to do. */
246 if( h->cost_mv[lambda] )
248 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
249 CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
/* Bias the pointer to the table's midpoint so it can be indexed by signed
 * MV deltas in [-2*4*2048, +2*4*2048]. */
250 h->cost_mv[lambda] += 2*4*2048;
251 for( i = 0; i <= 2*4*2048; i++ )
253 h->cost_mv[lambda][-i] =
254 h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
/* x264_cost_ref is shared between threads; serialize the (idempotent) fill. */
256 x264_pthread_mutex_lock( &cost_ref_mutex );
257 for( i = 0; i < 3; i++ )
258 for( j = 0; j < 33; j++ )
259 x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
260 x264_pthread_mutex_unlock( &cost_ref_mutex );
/* Full-pel cost tables (one per qpel phase j) are only needed by
 * exhaustive search methods (ESA and up). */
261 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
265 CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
266 h->cost_mv_fpel[lambda][j] += 2*2048;
267 for( i = -2*2048; i < 2*2048; i++ )
268 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
/* NOTE(review): extracted listing — braces and loop declarations missing;
 * code left byte-identical.
 * Frees every per-lambda cost table, undoing the midpoint pointer bias
 * applied in x264_analyse_init_costs() before calling x264_free(). */
276 void x264_analyse_free_costs( x264_t *h )
279 for( i = 0; i < 92; i++ )
282 x264_free( h->cost_mv[i] - 2*4*2048 );
283 if( h->cost_mv_fpel[i][0] )
284 for( j = 0; j < 4; j++ )
285 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
/* NOTE(review): extracted listing — braces and some declarations missing;
 * code left byte-identical.
 * Incrementally applies weighted prediction to list-0 reference planes, up
 * to (roughly) row `end`; progress is tracked in fenc->i_lines_weighted so
 * repeated calls only process new rows. */
289 void x264_analyse_weight_frame( x264_t *h, int end )
292 for( j=0; j<h->i_ref0; j++ )
294 if( h->sh.weight[j][0].weightfn )
296 x264_frame_t *frame = h->fref0[j];
/* Include the horizontal/vertical padding so motion search over the
 * frame border sees weighted pixels too. */
297 int width = frame->i_width[0] + 2*PADH;
298 int i_padv = PADV << h->param.b_interlaced;
300 uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
302 height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
303 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
304 h->fenc->i_lines_weighted += height;
/* NOTE(review): scales the same source rows for every remaining weighted
 * ref k >= j — presumably duplicate refs share a plane; confirm upstream. */
307 for( k = j; k < h->i_ref0; k++ )
308 if( h->sh.weight[k][0].weightfn )
310 uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
311 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
312 src + offset, frame->i_stride[0],
313 width, height, &h->sh.weight[k][0] );
/* initialize an array of lambda*nbits for all possible mvs */
/* Points the analysis context at the precomputed per-lambda MV and
 * reference-index cost tables built in x264_analyse_init_costs(). */
322 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
324 a->p_cost_mv = h->cost_mv[a->i_lambda];
/* Ref cost depends on how many refs are active (te(v) coding), bucketed 0..2. */
325 a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
326 a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
/* NOTE(review): extracted listing — braces missing; code left byte-identical.
 * Sets all QP-derived state for the current macroblock: luma/chroma QP,
 * lambda and lambda2, trellis lambdas, and the psy chroma lambda offset. */
329 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
331 /* conduct the analysis using this lambda and QP */
332 a->i_qp = h->mb.i_qp = i_qp;
333 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
335 a->i_lambda = x264_lambda_tab[i_qp];
336 a->i_lambda2 = x264_lambda2_tab[i_qp];
/* Trellis quant during analysis requires trellis level 2 and RD mode decision. */
338 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
339 if( h->param.analyse.i_trellis )
/* [luma/chroma][inter/intra] trellis lambdas, from the table above. */
341 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
342 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
343 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
344 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
346 h->mb.i_psy_rd_lambda = a->i_lambda;
347 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
/* 256 is the neutral (1.0) scale when psy is disabled. */
348 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
/* NOTE(review): extracted listing — braces, several cost resets and
 * declarations are missing; code left byte-identical.
 * Per-macroblock analysis setup: derives the RD level, initializes QP state,
 * computes the legal MV range (clipped for threads/intra-refresh), resets all
 * candidate costs to COST_MAX, and decides fast-intra / forced-intra flags. */
352 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
354 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
356 /* mbrd == 1 -> RD mode decision */
357 /* mbrd == 2 -> RD refinement */
358 /* mbrd == 3 -> QPRD */
359 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
361 x264_mb_analyse_init_qp( h, a, i_qp );
363 h->mb.i_me_method = h->param.analyse.i_me_method;
364 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
/* B-frames drop one subme level at 6/8 (cheaper search for B). */
365 if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
366 h->mb.i_subpel_refine--;
367 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
368 && h->mb.i_subpel_refine >= 5;
369 h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
370 (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
372 h->mb.b_transform_8x8 = 0;
373 h->mb.b_noise_reduction = 0;
379 a->i_satd_i8x8chroma = COST_MAX;
381 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
382 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
386 h->mb.b_lossless ? 0 :
388 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
390 /* II: Inter part P/B frame */
391 if( h->sh.i_type != SLICE_TYPE_I )
/* MV range is in quarter-pel units throughout. */
394 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
395 // limit motion search to a slightly smaller range than the theoretical limit,
396 // since the search may go a few iterations past its given range
397 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
399 /* Calculate max allowed MV range */
400 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
/* Horizontal limits: the MB's own position +/- 24 pixels of padding. */
401 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
402 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
403 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
404 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
405 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
407 int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
408 int max_mv = max_x - 4*16*h->mb.i_mb_x;
409 /* If we're left of the refresh bar, don't reference right of it. */
410 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
411 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
413 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
414 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
/* Vertical limits only need recomputing once per MB row. */
415 if( h->mb.i_mb_x == 0 )
417 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
418 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
419 int thread_mvy_range = i_fmv_range;
/* With frame threads, vertical MVs must not reference reference-frame
 * rows that another thread hasn't finished reconstructing yet. */
421 if( h->i_thread_frames > 1 )
423 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
424 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
425 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
427 x264_frame_t **fref = i ? h->fref1 : h->fref0;
428 int i_ref = i ? h->i_ref1 : h->i_ref0;
429 for( j=0; j<i_ref; j++ )
431 x264_frame_cond_wait( fref[j]->orig, thresh );
432 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
436 if( h->param.b_deterministic )
437 thread_mvy_range = h->param.analyse.i_mv_range_thread;
438 if( h->mb.b_interlaced )
439 thread_mvy_range >>= 1;
441 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
444 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
445 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
446 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
447 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
448 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
449 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
450 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
/* Reset all candidate mode costs so COPY*_IF_LT comparisons start fresh. */
456 a->l0.i_cost8x8 = COST_MAX;
458 for( i = 0; i < 4; i++ )
462 a->l0.i_cost4x8[i] = COST_MAX;
466 a->l0.i_cost8x16 = COST_MAX;
467 if( h->sh.i_type == SLICE_TYPE_B )
471 a->l1.i_cost8x8 = COST_MAX;
473 for( i = 0; i < 4; i++ )
478 a->i_cost8x8direct[i] = COST_MAX;
489 a->i_cost16x16direct =
492 a->i_cost8x16bi = COST_MAX;
495 /* Fast intra decision */
/* Skip the heuristic for the first few MBs of a slice (not enough context). */
496 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
/* Intra is deemed likely if any neighbor (or the co-located MB in the
 * previous frame, or frame-wide intra statistics) suggests it. */
498 if( IS_INTRA( h->mb.i_mb_type_left )
499 || IS_INTRA( h->mb.i_mb_type_top )
500 || IS_INTRA( h->mb.i_mb_type_topleft )
501 || IS_INTRA( h->mb.i_mb_type_topright )
502 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
503 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
504 { /* intra is likely */ }
/* Periodic Intra Refresh: force intra inside the refresh column band. */
511 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
512 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
514 a->b_force_intra = 1;
518 a->b_force_intra = 0;
/* NOTE(review): extracted listing — the closing braces of these tables fall
 * in the gaps; code left byte-identical.  Each row lists the intra prediction
 * modes usable for a given neighbor availability; rows are selected by the
 * predict_*_mode_available() helpers below. */
522 /* Prediction modes allowed for various combinations of neighbors. */
523 /* Terminated by a -1. */
524 /* In order, no neighbors, left, top, top/left, top/left/topleft */
525 static const int8_t i16x16_mode_available[5][5] =
527 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
528 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
529 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
530 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
531 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
534 static const int8_t i8x8chroma_mode_available[5][5] =
536 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
537 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
538 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
539 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
540 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
543 static const int8_t i4x4_mode_available[5][10] =
545 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
546 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
547 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
548 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
549 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
/* NOTE(review): extracted listing — function braces missing; code left
 * byte-identical.  Each helper maps the neighbor-availability bitmask to the
 * matching row of its mode table: topleft present selects row 4 (all modes);
 * otherwise the masked LEFT/TOP bits index rows 0-3 directly — this
 * presumably relies on MB_LEFT==1 and MB_TOP==2 (defined elsewhere; verify). */
552 static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
554 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
555 return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
558 static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
560 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
561 return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
564 static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
566 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
567 return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
570 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
/* NOTE(review): extracted listing — braces missing; code left byte-identical.
 * Caches the forward DCT of the source MB (vs. an all-zero block) for psy
 * trellis, for the 8x8 and/or 4x4 transform as requested. */
571 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
573 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
575 if( do_both_dct || h->mb.b_transform_8x8 )
576 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
577 if( do_both_dct || !h->mb.b_transform_8x8 )
578 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
581 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
/* NOTE(review): extracted listing — braces and the `fenc` declaration are
 * missing; code left byte-identical.
 * Caches per-4x4 SATD and per-8x8 SA8D of the source MB against zero (the
 * SAD>>1 / SAD>>2 terms subtract the DC contribution), plus their sums. */
582 static inline void x264_mb_cache_fenc_satd( x264_t *h )
584 ALIGNED_16( static uint8_t zero[16] ) = {0};
586 int x, y, satd_sum = 0, sa8d_sum = 0;
587 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
588 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
/* Nothing else to cache unless psy-RD is active. */
589 if( !h->mb.i_psy_rd )
591 for( y = 0; y < 4; y++ )
592 for( x = 0; x < 4; x++ )
594 fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
595 h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
596 - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
597 satd_sum += h->mb.pic.fenc_satd[y][x];
599 for( y = 0; y < 2; y++ )
600 for( x = 0; x < 2; x++ )
602 fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
603 h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
604 - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
605 sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
607 h->mb.pic.fenc_satd_sum = satd_sum;
608 h->mb.pic.fenc_sa8d_sum = sa8d_sum;
/* NOTE(review): extracted listing — braces and the `i_satd` declaration are
 * missing; code left byte-identical.
 * Selects the best intra chroma 8x8 prediction mode by SATD + mode-bit cost.
 * Uses the merged 3-mode SATD primitive (V/H/DC at once) when available,
 * handling planar separately; otherwise predicts and scores each mode. */
611 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
613 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
/* Already computed for this MB — skip re-analysis. */
615 if( a->i_satd_i8x8chroma < COST_MAX )
618 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
620 /* 8x8 prediction selection for chroma */
/* predict_mode[3] >= 0 means all of V/H/DC(/P) are available (see tables). */
621 if( predict_mode[3] >= 0 && b_merged_satd )
623 int satdu[4], satdv[4];
624 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
625 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
/* Planar is not covered by the x3 primitive; score it explicitly. */
626 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
627 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
628 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
629 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
631 for( ; *predict_mode >= 0; predict_mode++ )
633 int i_mode = *predict_mode;
634 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
636 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
637 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
/* Fallback path: predict and score each available mode individually. */
642 for( ; *predict_mode >= 0; predict_mode++ )
645 int i_mode = *predict_mode;
647 /* we do the prediction */
648 if( h->mb.b_lossless )
649 x264_predict_lossless_8x8_chroma( h, i_mode );
652 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
653 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
656 /* we calculate the cost */
657 i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
658 h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
659 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
661 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
662 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
666 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
/* NOTE(review): extracted listing — braces, several declarations (i_satd,
 * i_cost, idx, x, y, satd[]) and early-exit lines are missing; code left
 * byte-identical.
 * Luma intra analysis: scores I16x16 always, then I8x8 and I4x4 if enabled
 * by the analyse flags, storing best mode and SATD cost per size in `a`.
 * i_satd_inter is used to early-terminate when intra is clearly worse. */
669 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
671 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
672 uint8_t *p_src = h->mb.pic.p_fenc[0];
673 uint8_t *p_dst = h->mb.pic.p_fdec[0];
676 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
678 /*---------------- Try all mode and calculate their score ---------------*/
680 /* 16x16 prediction selection */
681 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
/* Merged path: the x3 primitive scores V/H/DC at once; planar is scored
 * separately, then mode-signalling bits are added to each. */
683 if( b_merged_satd && predict_mode[3] >= 0 )
685 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
686 h->predict_16x16[I_PRED_16x16_P]( p_dst );
687 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
688 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
691 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
692 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
/* Fallback path: predict and score each available mode individually. */
697 for( ; *predict_mode >= 0; predict_mode++ )
700 int i_mode = *predict_mode;
702 if( h->mb.b_lossless )
703 x264_predict_lossless_16x16( h, i_mode );
705 h->predict_16x16[i_mode]( p_dst );
707 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
708 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
709 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
710 a->i_satd_i16x16_dir[i_mode] = i_satd;
714 if( h->sh.i_type == SLICE_TYPE_B )
715 /* cavlc mb type prefix */
716 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
/* Fast-intra bail-out: if even I16x16 is twice the inter cost, give up. */
717 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
720 /* 8x8 prediction selection */
721 if( flags & X264_ANALYSE_I8x8 )
723 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
724 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
725 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
727 h->mb.i_cbp_luma = 0;
728 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
730 // FIXME some bias like in i4x4?
731 if( h->sh.i_type == SLICE_TYPE_B )
732 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
734 for( idx = 0;; idx++ )
738 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
739 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
740 int i_best = COST_MAX;
741 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
743 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
744 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
/* predict_mode[8] >= 0 means the full 9-mode set is available. */
746 if( b_merged_satd && predict_mode[8] >= 0 )
749 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
/* Predicted (most-probable) mode is cheaper to signal: 1 bit vs 4. */
750 satd[i_pred_mode] -= 3 * a->i_lambda;
751 for( i=2; i>=0; i-- )
753 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
754 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
759 for( ; *predict_mode >= 0; predict_mode++ )
762 int i_mode = *predict_mode;
764 if( h->mb.b_lossless )
765 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
767 h->predict_8x8[i_mode]( p_dst_by, edge );
769 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
770 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
771 i_satd -= a->i_lambda * 3;
773 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
774 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
/* Stop after the last block, or early if already over threshold. */
778 if( idx == 3 || i_cost > i_satd_thresh )
781 /* we need to encode this block now (for next ones) */
782 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
783 x264_mb_encode_i8x8( h, idx, a->i_qp );
785 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
790 a->i_satd_i8x8 = i_cost;
/* Cache reconstruction/nnz/cbp so a later re-encode of this mode can skip work. */
791 if( h->mb.i_skip_intra )
793 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
794 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
795 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
796 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
797 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
798 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
799 if( h->mb.i_skip_intra == 2 )
800 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
/* Early-terminated: extrapolate the partial cost over 4 blocks
 * (1024/512/341 are 256*4/1, 256*4/2, ~256*4/3 in fixed point). */
805 static const uint16_t cost_div_fix8[3] = {1024,512,341};
806 a->i_satd_i8x8 = COST_MAX;
807 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
809 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
813 /* 4x4 prediction selection */
814 if( flags & X264_ANALYSE_I4x4 )
817 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
818 h->mb.i_cbp_luma = 0;
819 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
821 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
823 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
824 if( h->sh.i_type == SLICE_TYPE_B )
825 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
827 for( idx = 0;; idx++ )
829 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
830 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
831 int i_best = COST_MAX;
832 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
834 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
836 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
837 /* emulate missing topright samples */
838 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
840 if( b_merged_satd && predict_mode[5] >= 0 )
843 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
844 satd[i_pred_mode] -= 3 * a->i_lambda;
845 for( i=2; i>=0; i-- )
846 COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
850 for( ; *predict_mode >= 0; predict_mode++ )
853 int i_mode = *predict_mode;
855 if( h->mb.b_lossless )
856 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
858 h->predict_4x4[i_mode]( p_dst_by );
860 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
861 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
862 i_satd -= a->i_lambda * 3;
864 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
866 i_cost += i_best + 4 * a->i_lambda;
868 if( i_cost > i_satd_thresh || idx == 15 )
871 /* we need to encode this block now (for next ones) */
872 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
873 x264_mb_encode_i4x4( h, idx, a->i_qp );
875 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
879 a->i_satd_i4x4 = i_cost;
/* Same skip-intra caching as I8x8 above, but for the 4x4 buffers. */
880 if( h->mb.i_skip_intra )
882 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
883 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
884 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
885 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
886 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
887 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
888 if( h->mb.i_skip_intra == 2 )
889 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
893 a->i_satd_i4x4 = COST_MAX;
/* NOTE(review): extracted listing — braces and `else` lines are missing;
 * code left byte-identical.
 * Re-scores the SATD-selected intra candidates (I16x16, I4x4, I8x8) with
 * full RD cost, skipping any candidate already above i_satd_thresh; rejected
 * candidates have their cost set to COST_MAX so later comparisons ignore them. */
897 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
899 if( a->i_satd_i16x16 <= i_satd_thresh )
901 h->mb.i_type = I_16x16;
902 x264_analyse_update_cache( h, a );
903 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
906 a->i_satd_i16x16 = COST_MAX;
908 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
910 h->mb.i_type = I_4x4;
911 x264_analyse_update_cache( h, a );
912 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
915 a->i_satd_i4x4 = COST_MAX;
917 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
919 h->mb.i_type = I_8x8;
920 x264_analyse_update_cache( h, a );
921 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
/* Remember the cbp of the RD-encoded I8x8 for a possible later refinement pass. */
922 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
925 a->i_satd_i8x8 = COST_MAX;
928 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
930 uint8_t *p_dst = h->mb.pic.p_fdec[0];
933 int i_mode, i_thresh;
934 uint64_t i_satd, i_best;
935 h->mb.i_skip_intra = 0;
937 if( h->mb.i_type == I_16x16 )
939 int old_pred_mode = a->i_predict16x16;
940 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
941 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
942 i_best = a->i_satd_i16x16;
943 for( ; *predict_mode >= 0; predict_mode++ )
945 int i_mode = *predict_mode;
946 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
948 h->mb.i_intra16x16_pred_mode = i_mode;
949 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
950 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
954 /* RD selection for chroma prediction */
955 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
956 if( predict_mode[1] >= 0 )
958 int8_t predict_mode_sorted[4];
960 i_thresh = a->i_satd_i8x8chroma * 5/4;
962 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
964 i_mode = *predict_mode;
965 if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
966 predict_mode_sorted[i_max++] = i_mode;
971 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
972 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
973 /* the previous thing encoded was x264_intra_rd(), so the pixels and
974 * coefs for the current chroma mode are still around, so we only
975 * have to recount the bits. */
976 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
977 for( i = 0; i < i_max; i++ )
979 i_mode = predict_mode_sorted[i];
980 if( h->mb.b_lossless )
981 x264_predict_lossless_8x8_chroma( h, i_mode );
984 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
985 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
987 /* if we've already found a mode that needs no residual, then
988 * probably any mode with a residual will be worse.
989 * so avoid dct on the remaining modes to improve speed. */
990 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
991 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
993 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
994 h->mb.i_cbp_chroma = i_cbp_chroma_best;
998 if( h->mb.i_type == I_4x4 )
1000 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1002 for( idx = 0; idx < 16; idx++ )
1004 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1005 i_best = COST_MAX64;
1007 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
1009 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1010 /* emulate missing topright samples */
1011 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1013 for( ; *predict_mode >= 0; predict_mode++ )
1015 i_mode = *predict_mode;
1016 if( h->mb.b_lossless )
1017 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1019 h->predict_4x4[i_mode]( p_dst_by );
1020 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1022 if( i_best > i_satd )
1024 a->i_predict4x4[idx] = i_mode;
1026 pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1027 pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1028 pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1029 pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1030 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1034 M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1035 M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1036 M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1037 M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1038 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1040 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1043 else if( h->mb.i_type == I_8x8 )
1045 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1046 for( idx = 0; idx < 4; idx++ )
1048 uint64_t pels_h = 0;
1050 uint16_t i_nnz[2] = {0}; //shut up gcc
1053 int cbp_luma_new = 0;
1054 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1056 i_best = COST_MAX64;
1060 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1061 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1062 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1064 for( ; *predict_mode >= 0; predict_mode++ )
1066 i_mode = *predict_mode;
1067 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1070 if( h->mb.b_lossless )
1071 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1073 h->predict_8x8[i_mode]( p_dst_by, edge );
1074 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1075 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1077 if( i_best > i_satd )
1079 a->i_predict8x8[idx] = i_mode;
1080 cbp_luma_new = h->mb.i_cbp_luma;
1083 pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1085 for( j=0; j<7; j++ )
1086 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1087 i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1088 i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1091 a->i_cbp_i8x8_luma = cbp_luma_new;
1092 M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1094 for( j=0; j<7; j++ )
1095 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1096 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1097 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1099 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* Point an x264_me_t at the encoded-frame (fenc) pixels of a partition at
 * (xoff,yoff): plane 0 is luma, planes 1/2 are chroma with halved offsets.
 * Also wires up the shared MV-cost table from the analysis context. */
1104 #define LOAD_FENC( m, src, xoff, yoff) \
1105 (m)->p_cost_mv = a->p_cost_mv; \
1106 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1107 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1108 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1109 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1110 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
/* Point an x264_me_t at the reference planes of (list,ref) at (xoff,yoff):
 * p_fref[0..3] are the luma planes (p_fref_w aliases plane 0), p_fref[4..5]
 * the chroma planes at halved offsets, plus this ref's integral-image
 * pointer.  Weighting defaults to weight_none (LOAD_WPELS overrides it).
 * NOTE(review): the line ending this macro (after the trailing '\' on the
 * last visible line) appears to have been lost from this dump. */
1112 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1113 (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1114 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1115 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1116 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1117 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1118 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1119 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
1120 (m)->weight = weight_none; \
/* Override the working luma ref pointer with the weighted-prediction plane
 * and install this ref's explicit weights (expects local `i_ref` in scope). */
1123 #define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
1124 (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
1125 (m)->weight = h->sh.weight[i_ref];
/* Table lookup of the cost of coding reference index `ref` in list `list`
 * (per-list tables; see p_cost_ref[2] in the analysis context). */
1127 #define REF_COST(list, ref) \
1128 (a->p_cost_ref[list][ref])
/* P-frame 16x16 motion estimation: searches every list-0 reference, keeps
 * the cheapest x264_me_t in a->l0.me16x16, and records each ref's MV for
 * neighbour prediction.  Can early-terminate the whole macroblock as P_SKIP.
 * NOTE(review): brace/blank lines and some guard lines appear to have been
 * lost from this dump; structure should be restored before compiling. */
1130 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1134 ALIGNED_4( int16_t mvc[8][2] );
1135 int i_halfpel_thresh = INT_MAX;
1136 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1138 /* 16x16 Search on all ref frame */
1139 m.i_pixel = PIXEL_16x16;
1140 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1142 a->l0.me16x16.cost = INT_MAX;
1143 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1145 const int i_ref_cost = REF_COST( 0, i_ref );
/* subtract ref cost so the halfpel threshold compares motion-only costs;
 * added back below once the search for this ref is done */
1146 i_halfpel_thresh -= i_ref_cost;
1147 m.i_ref_cost = i_ref_cost;
1149 /* search with ref */
1150 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1151 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
1153 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1154 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
/* a blind duplicate of ref0 can reuse ref0's MV and only refine qpel */
1156 if( h->mb.ref_blind_dupe == i_ref )
1158 CP32( m.mv, a->l0.mvc[0][0] );
1159 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1162 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1164 /* early termination
1165 * SSD threshold would probably be better than SATD */
/* NOTE(review): the opening of this early-termination condition is missing
 * from the dump; only its trailing clauses are visible below. */
1168 && m.cost-m.cost_mv < 300*a->i_lambda
1169 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1170 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1171 && x264_macroblock_probe_pskip( h ) )
1173 h->mb.i_type = P_SKIP;
1174 x264_analyse_update_cache( h, a );
1175 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1179 m.cost += i_ref_cost;
1180 i_halfpel_thresh += i_ref_cost;
1182 if( m.cost < a->l0.me16x16.cost )
1183 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1185 /* save mv for predicting neighbors */
1186 CP32( a->l0.mvc[i_ref][0], m.mv );
1187 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1190 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1191 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
1193 h->mb.i_type = P_L0;
/* NOTE(review): an enclosing condition (RD-mode guard) appears to be
 * missing before the RD-based skip check below — confirm against upstream. */
1196 x264_mb_cache_fenc_satd( h );
1197 if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
1199 h->mb.i_partition = D_16x16;
1200 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1201 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1202 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1203 h->mb.i_type = P_SKIP;
/* P 8x8 analysis where each 8x8 partition may pick its own reference
 * ("mixed refs").  i_maxref bounds the refs searched per partition; it is
 * pruned to the refs actually used by neighbouring blocks when the 16x16
 * search settled on ref 0 (or its blind duplicate). */
1208 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1212 uint8_t **p_fenc = h->mb.pic.p_fenc;
1213 int i_halfpel_thresh = INT_MAX;
1214 int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1216 int i_maxref = h->mb.pic.i_fref[0]-1;
1218 h->mb.i_partition = D_8x8;
/* raise i_maxref to cover a neighbour's ref (its body line appears lost
 * from this dump; only the guard is visible) */
1220 #define CHECK_NEIGHBOUR(i)\
1222 int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
1223 if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
1227 /* early termination: if 16x16 chose ref 0, then evalute no refs older
1228 * than those used by the neighbors */
1229 if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
1230 h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1233 CHECK_NEIGHBOUR( -8 - 1 );
1234 CHECK_NEIGHBOUR( -8 + 0 );
1235 CHECK_NEIGHBOUR( -8 + 2 );
1236 CHECK_NEIGHBOUR( -8 + 4 );
1237 CHECK_NEIGHBOUR( 0 - 1 );
1238 CHECK_NEIGHBOUR( 2*8 - 1 );
1241 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1242 CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
1244 for( i = 0; i < 4; i++ )
1246 x264_me_t *l0m = &a->l0.me8x8[i];
1250 m.i_pixel = PIXEL_8x8;
1252 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1253 l0m->cost = INT_MAX;
/* iterate refs 0..i_maxref, then jump to the blind duplicate ref if any */
1254 for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
1256 const int i_ref_cost = REF_COST( 0, i_ref );
1257 m.i_ref_cost = i_ref_cost;
1259 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1260 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1262 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1263 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1264 if( h->mb.ref_blind_dupe == i_ref )
1266 CP32( m.mv, a->l0.mvc[0][i+1] );
1267 x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
1270 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1272 m.cost += i_ref_cost;
1273 i_halfpel_thresh += i_ref_cost;
1274 CP32( a->l0.mvc[i_ref][i+1], m.mv );
1276 if( m.cost < l0m->cost )
1277 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1278 if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
1279 i_ref = h->mb.ref_blind_dupe;
1283 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1284 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1286 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1287 are effectively zero. */
1288 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1289 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1292 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1293 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1294 /* P_8x8 ref0 has no ref cost */
1295 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1296 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1297 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1298 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1299 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 8x8 analysis with a single reference — the one the 16x16 search chose
 * (mapped back to ref 0 if it was a blind duplicate).  Cheaper than the
 * mixed-ref variant: one search per partition, seeded by the 16x16 MV and
 * previously found partition MVs. */
1302 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1304 /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
1305 * reference frame flags. Thus, if we're not doing mixedrefs, just
1306 * don't bother analysing the dupes. */
1307 const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
1308 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1309 uint8_t **p_fenc = h->mb.pic.p_fenc;
1311 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1314 /* XXX Needed for x264_mb_predict_mv */
1315 h->mb.i_partition = D_8x8;
/* seed the MV-candidate list with the 16x16 result */
1318 CP32( mvc[0], a->l0.me16x16.mv );
1320 for( i = 0; i < 4; i++ )
1322 x264_me_t *m = &a->l0.me8x8[i];
1326 m->i_pixel = PIXEL_8x8;
1327 m->i_ref_cost = i_ref_cost;
1329 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1330 LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
1331 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
1333 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1334 x264_me_search( h, m, mvc, i_mvc );
1336 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
/* append this partition's MV as a candidate for the following partitions */
1338 CP32( mvc[i_mvc], m->mv );
1342 m->cost += i_ref_cost;
1343 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1344 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1347 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1348 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1349 /* theoretically this should include 4*ref_cost,
1350 * but 3 seems a better approximation of cabac. */
1351 if( h->param.b_cabac )
1352 a->l0.i_cost8x8 -= i_ref_cost;
1353 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1354 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 16x8 analysis: for each half, only the references chosen by the two
 * underlying 8x8 partitions are tried (1 or 2 candidates), seeded with the
 * 16x16 MV and the 8x8 MVs for that ref. */
1357 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1360 uint8_t **p_fenc = h->mb.pic.p_fenc;
1361 ALIGNED_4( int16_t mvc[3][2] );
1364 /* XXX Needed for x264_mb_predict_mv */
1365 h->mb.i_partition = D_16x8;
1367 for( i = 0; i < 2; i++ )
1369 x264_me_t *l0m = &a->l0.me16x8[i];
1370 const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1371 const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
1372 const int ref8[2] = { minref, maxref };
1373 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1375 m.i_pixel = PIXEL_16x8;
1377 LOAD_FENC( &m, p_fenc, 0, 8*i );
1378 l0m->cost = INT_MAX;
1379 for( j = 0; j < i_ref8s; j++ )
1381 const int i_ref = ref8[j];
1382 const int i_ref_cost = REF_COST( 0, i_ref );
1383 m.i_ref_cost = i_ref_cost;
1385 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1386 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1387 CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
1388 CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
1390 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1391 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
1393 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1394 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1395 /* We can only take this shortcut if the first search was performed on ref0. */
1396 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1398 /* We can just leave the MV from the previous ref search. */
1399 x264_me_refine_qpel_refdupe( h, &m, NULL );
1402 x264_me_search( h, &m, mvc, 3 );
1404 m.cost += i_ref_cost;
1406 if( m.cost < l0m->cost )
1407 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1409 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1410 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1413 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P 8x16 analysis — mirror of the 16x8 case: per column, try only the refs
 * chosen by the two vertically-stacked 8x8 partitions. */
1416 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1419 uint8_t **p_fenc = h->mb.pic.p_fenc;
1420 ALIGNED_4( int16_t mvc[3][2] );
1423 /* XXX Needed for x264_mb_predict_mv */
1424 h->mb.i_partition = D_8x16;
1426 for( i = 0; i < 2; i++ )
1428 x264_me_t *l0m = &a->l0.me8x16[i];
1429 const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1430 const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
1431 const int ref8[2] = { minref, maxref };
1432 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1434 m.i_pixel = PIXEL_8x16;
1436 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1437 l0m->cost = INT_MAX;
1438 for( j = 0; j < i_ref8s; j++ )
1440 const int i_ref = ref8[j];
1441 const int i_ref_cost = REF_COST( 0, i_ref );
1442 m.i_ref_cost = i_ref_cost;
/* MV candidates: 16x16 MV plus the two 8x8 MVs of this column */
1444 CP32( mvc[0], a->l0.mvc[i_ref][0] );
1445 CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
1446 CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
1448 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1449 LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
1451 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1452 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1453 /* We can only take this shortcut if the first search was performed on ref0. */
1454 if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
1456 /* We can just leave the MV from the previous ref search. */
1457 x264_me_refine_qpel_refdupe( h, &m, NULL );
1460 x264_me_search( h, &m, mvc, 3 );
1462 m.cost += i_ref_cost;
1464 if( m.cost < l0m->cost )
1465 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1467 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1468 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1471 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1474 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1476 ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1477 uint8_t *pix2 = pix1+8;
1478 const int i_stride = h->mb.pic.i_stride[1];
1479 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1480 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1481 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1482 const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1483 x264_weight_t *weight = h->sh.weight[i_ref];
1485 #define CHROMA4x4MC( width, height, me, x, y ) \
1486 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1487 if( weight[1].weightfn ) \
1488 weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1489 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1490 if( weight[2].weightfn ) \
1491 weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1494 if( pixel == PIXEL_4x4 )
1496 x264_me_t *m = a->l0.me4x4[i8x8];
1497 CHROMA4x4MC( 2,2, m[0], 0,0 );
1498 CHROMA4x4MC( 2,2, m[1], 2,0 );
1499 CHROMA4x4MC( 2,2, m[2], 0,2 );
1500 CHROMA4x4MC( 2,2, m[3], 2,2 );
1502 else if( pixel == PIXEL_8x4 )
1504 x264_me_t *m = a->l0.me8x4[i8x8];
1505 CHROMA4x4MC( 4,2, m[0], 0,0 );
1506 CHROMA4x4MC( 4,2, m[1], 0,2 );
1510 x264_me_t *m = a->l0.me4x8[i8x8];
1511 CHROMA4x4MC( 2,4, m[0], 0,0 );
1512 CHROMA4x4MC( 2,4, m[1], 2,0 );
1515 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1516 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
/* P 4x4 analysis of 8x8 block i8x8: one search per 4x4 sub-block (seeded
 * by the enclosing 8x8 MV for the first), summing costs plus ref cost and
 * sub-partition lambda cost, plus optional chroma-ME cost. */
1519 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1521 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1522 uint8_t **p_fenc = h->mb.pic.p_fenc;
1523 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1526 /* XXX Needed for x264_mb_predict_mv */
1527 h->mb.i_partition = D_8x8;
1529 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1531 const int idx = 4*i8x8 + i4x4;
1532 const int x4 = block_idx_x[idx];
1533 const int y4 = block_idx_y[idx];
/* only the first sub-block takes the 8x8 MV as a candidate */
1534 const int i_mvc = (i4x4 == 0);
1536 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1538 m->i_pixel = PIXEL_4x4;
1540 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1541 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1542 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1544 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1545 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1547 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1549 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1550 a->l0.me4x4[i8x8][1].cost +
1551 a->l0.me4x4[i8x8][2].cost +
1552 a->l0.me4x4[i8x8][3].cost +
1553 REF_COST( 0, i_ref ) +
1554 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1555 if( h->mb.b_chroma_me )
1556 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* P 8x4 analysis of 8x8 block i8x8: two 8x4 searches, the first seeded by
 * the corresponding 4x4 MV; cost includes ref cost, sub-partition lambda
 * cost, and optional chroma-ME cost. */
1559 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1561 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1562 uint8_t **p_fenc = h->mb.pic.p_fenc;
1563 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1566 /* XXX Needed for x264_mb_predict_mv */
1567 h->mb.i_partition = D_8x8;
1569 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1571 const int idx = 4*i8x8 + 2*i8x4;
1572 const int x4 = block_idx_x[idx];
1573 const int y4 = block_idx_y[idx];
1574 const int i_mvc = (i8x4 == 0);
1576 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1578 m->i_pixel = PIXEL_8x4;
1580 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1581 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1582 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1584 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1585 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1587 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1589 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1590 REF_COST( 0, i_ref ) +
1591 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1592 if( h->mb.b_chroma_me )
1593 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* P 4x8 analysis of 8x8 block i8x8 — mirror of the 8x4 case: two 4x8
 * searches, ref cost + sub-partition lambda cost + optional chroma cost. */
1596 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1598 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1599 uint8_t **p_fenc = h->mb.pic.p_fenc;
1600 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1603 /* XXX Needed for x264_mb_predict_mv */
1604 h->mb.i_partition = D_8x8;
1606 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1608 const int idx = 4*i8x8 + i4x8;
1609 const int x4 = block_idx_x[idx];
1610 const int y4 = block_idx_y[idx];
1611 const int i_mvc = (i4x8 == 0);
1613 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1615 m->i_pixel = PIXEL_4x8;
1617 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1618 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1619 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1621 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1622 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1624 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1626 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1627 REF_COST( 0, i_ref ) +
1628 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1629 if( h->mb.b_chroma_me )
1630 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* Cost B_DIRECT by comparing fenc against the direct-mode prediction that
 * is already sitting in fdec (left there by x264_mb_mc), per 8x8 block and
 * summed for the whole MB. */
1633 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1635 /* Assumes that fdec still contains the results of
1636 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1638 uint8_t **p_fenc = h->mb.pic.p_fenc;
1639 uint8_t **p_fdec = h->mb.pic.p_fdec;
1642 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1643 for( i = 0; i < 4; i++ )
1645 const int x = (i&1)*8;
1646 const int y = (i>>1)*8;
1647 a->i_cost16x16direct +=
1648 a->i_cost8x8direct[i] =
1649 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
/* NOTE(review): a guard line before this sub-partition cost appears to be
 * missing from this dump — confirm against upstream. */
1652 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1656 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1658 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1659 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1660 uint8_t *src0, *src1;
1661 int stride0 = 16, stride1 = 16;
1665 ALIGNED_4( int16_t mvc[9][2] );
1666 int i_halfpel_thresh = INT_MAX;
1667 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1669 /* 16x16 Search on all ref frame */
1670 m.i_pixel = PIXEL_16x16;
1671 m.weight = weight_none;
1673 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1676 a->l0.me16x16.cost = INT_MAX;
1677 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1679 const int i_ref_cost = REF_COST( 0, i_ref );
1680 m.i_ref_cost = i_ref_cost;
1681 /* search with ref */
1682 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1683 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1684 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1685 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1688 m.cost += i_ref_cost;
1690 if( m.cost < a->l0.me16x16.cost )
1692 a->l0.i_ref = i_ref;
1693 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1696 /* save mv for predicting neighbors */
1697 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1699 a->l0.me16x16.i_ref = a->l0.i_ref;
1702 i_halfpel_thresh = INT_MAX;
1703 p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1704 a->l1.me16x16.cost = INT_MAX;
1705 for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1707 const int i_ref_cost = REF_COST( 0, i_ref );
1708 m.i_ref_cost = i_ref_cost;
1709 /* search with ref */
1710 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1711 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1712 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1713 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1716 m.cost += i_ref_cost;
1718 if( m.cost < a->l1.me16x16.cost )
1720 a->l1.i_ref = i_ref;
1721 h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1724 /* save mv for predicting neighbors */
1725 CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1727 a->l1.me16x16.i_ref = a->l1.i_ref;
1729 /* get cost of BI mode */
1730 int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
1731 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1732 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1733 src0 = h->mc.get_ref( pix0, &stride0,
1734 h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1735 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1736 src1 = h->mc.get_ref( pix1, &stride1,
1737 h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1738 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1740 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1742 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1744 + a->l0.bi16x16.cost_mv
1745 + a->l1.bi16x16.cost_mv;
1748 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1749 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1751 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1752 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1753 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1754 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1755 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
1756 h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
1757 h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1758 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1759 + ref_costs + l0_mv_cost + l1_mv_cost;
1760 if( cost00 < a->i_cost16x16bi )
1762 M32( a->l0.bi16x16.mv ) = 0;
1763 M32( a->l1.bi16x16.mv ) = 0;
1764 a->l0.bi16x16.cost_mv = l0_mv_cost;
1765 a->l1.bi16x16.cost_mv = l1_mv_cost;
1766 a->i_cost16x16bi = cost00;
1771 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1772 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1773 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* Write the chosen MVs of 8x8 block i into the MB cache, according to its
 * selected sub-partition (8x8, 8x4, 4x8 or 4x4).
 * NOTE(review): the `case D_L0_*:` labels of this switch appear to have
 * been lost from this dump; only the action lines remain below. */
1776 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1778 const int x = 2*(i%2);
1779 const int y = 2*(i/2);
1781 switch( h->mb.i_sub_partition[i] )
1784 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1787 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1788 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1791 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1792 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1795 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1796 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1797 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1798 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1801 x264_log( h, X264_LOG_ERROR, "internal error\n" );
1806 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1808 const int x = 2*(idx&1);
1809 const int y = 2*(idx>>1);
1810 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1811 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1812 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1813 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
/* Cache the refs/MVs of a B partition: for each list used by `part`
 * (per x264_mb_partition_listX_table), store that list's ref and MV;
 * otherwise store ref -1 and a zero MV (and, conditionally, a zero MVD).
 * NOTE(review): the `}` / `else` / `{` lines between the two branches of
 * each list appear to have been lost from this dump. */
1816 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1817 if( x264_mb_partition_listX_table[0][part] ) \
1819 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1820 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1824 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1825 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1827 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1829 if( x264_mb_partition_listX_table[1][part] ) \
1831 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1832 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1836 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1837 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1839 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
/* Cache the refs/MVs of B 8x8 block i: direct blocks load the precomputed
 * direct MVs (zeroing MVD/skip state), others go through CACHE_MV_BI.
 * NOTE(review): local x/y declarations and an `if( b_mvd )` guard appear
 * to have been lost from this dump — confirm against upstream. */
1842 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1846 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1848 x264_mb_load_mv_direct8x8( h, i );
1851 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1852 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1853 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1858 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Cache the refs/MVs of B 16x8 half i using its chosen partition type. */
1861 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1863 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Cache the refs/MVs of B 8x16 column i using its chosen partition type. */
1865 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1867 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B 8x8 analysis: for each 8x8 partition, run L0 and L1 searches (seeded
 * by the 16x16 MVs), compute the BI average cost, and pick the cheapest of
 * L0 / L1 / BI / DIRECT as the sub-partition, caching MVs as it goes. */
1871 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1873 uint8_t **p_fref[2] =
1874 { h->mb.pic.p_fref[0][a->l0.i_ref],
1875 h->mb.pic.p_fref[1][a->l1.i_ref] };
1876 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
1879 /* XXX Needed for x264_mb_predict_mv */
1880 h->mb.i_partition = D_8x8;
1884 for( i = 0; i < 4; i++ )
1889 int i_part_cost_bi = 0;
1890 int stride[2] = {8,8};
1893 for( l = 0; l < 2; l++ )
1895 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1896 const int i_ref_cost = REF_COST( l, lX->i_ref );
1897 x264_me_t *m = &lX->me8x8[i];
1899 m->i_pixel = PIXEL_8x8;
1900 m->i_ref_cost = i_ref_cost;
1902 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1903 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1905 x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
1906 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1907 x264_me_search( h, m, &lX->me16x16.mv, 1 );
1908 m->cost += i_ref_cost;
1910 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
/* fetch this list's prediction for the later BI average */
1913 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1914 m->mv[0], m->mv[1], 8, 8, weight_none );
1915 i_part_cost_bi += m->cost_mv + i_ref_cost;
1917 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1918 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1919 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1920 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1921 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
/* choose the cheapest of L0 / L1 / BI / DIRECT for this partition */
1923 i_part_cost = a->l0.me8x8[i].cost;
1924 h->mb.i_sub_partition[i] = D_L0_8x8;
1925 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1926 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1927 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1928 a->i_cost8x8bi += i_part_cost;
1930 /* XXX Needed for x264_mb_predict_mv */
1931 x264_mb_cache_mv_b8x8( h, a, i, 0 );
1935 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B 16x8 analysis: per half, run L0 and L1 searches seeded by the 8x8 MVs,
 * evaluate the BI average, pick L0/L1/BI per half, and derive the combined
 * MB type and its lambda cost. */
1938 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1940 uint8_t **p_fref[2] =
1941 { h->mb.pic.p_fref[0][a->l0.i_ref],
1942 h->mb.pic.p_fref[1][a->l1.i_ref] };
1943 ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1944 ALIGNED_4( int16_t mvc[2][2] );
1947 h->mb.i_partition = D_16x8;
1948 a->i_cost16x8bi = 0;
1950 for( i = 0; i < 2; i++ )
1953 int i_part_cost_bi = 0;
1954 int stride[2] = {16,16};
1957 /* TODO: check only the list(s) that were used in b8x8? */
1958 for( l = 0; l < 2; l++ )
1960 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1961 const int i_ref_cost = REF_COST( l, lX->i_ref );
1962 x264_me_t *m = &lX->me16x8[i];
1964 m->i_pixel = PIXEL_16x8;
1965 m->i_ref_cost = i_ref_cost;
1967 LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1968 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
/* MV candidates: the two underlying 8x8 MVs of this half */
1970 CP32( mvc[0], lX->me8x8[2*i].mv );
1971 CP32( mvc[1], lX->me8x8[2*i+1].mv );
1973 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
1974 x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
1975 x264_me_search( h, m, mvc, 2 );
1976 m->cost += i_ref_cost;
1979 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1980 m->mv[0], m->mv[1], 16, 8, weight_none );
1981 i_part_cost_bi += m->cost_mv + i_ref_cost;
1983 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1984 i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1986 i_part_cost = a->l0.me16x8[i].cost;
1987 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1988 if( a->l1.me16x8[i].cost < i_part_cost )
1990 i_part_cost = a->l1.me16x8[i].cost;
1991 a->i_mb_partition16x8[i] = D_L1_8x8;
1993 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1995 i_part_cost = i_part_cost_bi;
1996 a->i_mb_partition16x8[i] = D_BI_8x8;
1998 a->i_cost16x8bi += i_part_cost;
2000 x264_mb_cache_mv_b16x8( h, a, i, 0 );
/* map the per-half partition choices to a B MB type */
2004 a->i_mb_type16x8 = B_L0_L0
2005 + (a->i_mb_partition16x8[0]>>2) * 3
2006 + (a->i_mb_partition16x8[1]>>2);
2007 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
/* Analyse the two 8x16 partitions of a B-macroblock (vertical halves).
 * Mirror image of x264_mb_analyse_inter_b16x8: per half, L0 and L1 searches
 * seeded by the vertically co-located 8x8 MVs, then BI prediction; keep the
 * cheapest of L0/L1/BI. Results: a->i_cost8x16bi, a->i_mb_partition8x16[],
 * a->i_mb_type8x16.
 * NOTE(review): extraction dropped braces and local declarations (src[], i,
 * l, i_part_cost) -- compare against upstream x264 before editing. */
2010 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
2012 uint8_t **p_fref[2] =
2013 { h->mb.pic.p_fref[0][a->l0.i_ref],
2014 h->mb.pic.p_fref[1][a->l1.i_ref] };
2015 ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
2016 ALIGNED_4( int16_t mvc[2][2] );
2019 h->mb.i_partition = D_8x16;
2020 a->i_cost8x16bi = 0;
/* i == 0: left 8x16 half; i == 1: right half */
2022 for( i = 0; i < 2; i++ )
2025 int i_part_cost_bi = 0;
2026 int stride[2] = {8,8};
2029 for( l = 0; l < 2; l++ )
2031 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
2032 const int i_ref_cost = REF_COST( l, lX->i_ref );
2033 x264_me_t *m = &lX->me8x16[i];
2035 m->i_pixel = PIXEL_8x16;
2036 m->i_ref_cost = i_ref_cost;
2038 LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
2039 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
/* seed with the upper and lower 8x8 MVs of this column */
2041 CP32( mvc[0], lX->me8x8[i].mv );
2042 CP32( mvc[1], lX->me8x8[i+2].mv );
2044 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
2045 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2046 x264_me_search( h, m, mvc, 2 );
2047 m->cost += i_ref_cost;
/* fetch this list's prediction for the BI combination */
2050 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2051 m->mv[0], m->mv[1], 8, 16, weight_none );
2052 i_part_cost_bi += m->cost_mv + i_ref_cost;
/* weighted average of the two predictions, then distortion cost of BI */
2055 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
2056 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
/* pick the cheapest of L0 / L1 / BI for this partition */
2058 i_part_cost = a->l0.me8x16[i].cost;
2059 a->i_mb_partition8x16[i] = D_L0_8x8;
2060 if( a->l1.me8x16[i].cost < i_part_cost )
2062 i_part_cost = a->l1.me8x16[i].cost;
2063 a->i_mb_partition8x16[i] = D_L1_8x8;
2065 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2067 i_part_cost = i_part_cost_bi;
2068 a->i_mb_partition8x16[i] = D_BI_8x8;
2070 a->i_cost8x16bi += i_part_cost;
2072 x264_mb_cache_mv_b8x16( h, a, i, 0 );
/* derive the macroblock type from the two per-partition list choices;
 * the 16x8 cost table is reused for 8x16 types */
2076 a->i_mb_type8x16 = B_L0_L0
2077 + (a->i_mb_partition8x16[0]>>2) * 3
2078 + (a->i_mb_partition8x16[1]>>2);
2079 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
/* Re-score candidate P-macroblock partitions with true rate-distortion cost.
 * Only partitions whose SATD cost is within a threshold (5/4 of the best
 * SATD, 3/2 for 16x16) are RD-evaluated; the rest are marked COST_MAX so
 * the caller's COPY*_IF_LT selection skips them.
 * NOTE(review): extraction dropped braces, else-branches' keywords appear on
 * their own (e.g. the COST_MAX assignments are the else arms of the
 * preceding if's) and loop-local declarations are missing -- compare against
 * upstream x264 before editing. */
2082 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2084 int thresh = i_satd * 5/4;
2086 h->mb.i_type = P_L0;
/* 16x16 gets a looser 3/2 threshold and is only computed once (cached) */
2087 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2089 h->mb.i_partition = D_16x16;
2090 x264_analyse_update_cache( h, a );
2091 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2094 if( a->l0.i_cost16x8 <= thresh )
2096 h->mb.i_partition = D_16x8;
2097 x264_analyse_update_cache( h, a );
2098 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
/* over threshold: disqualify 16x8 from the RD comparison */
2101 a->l0.i_cost16x8 = COST_MAX;
2103 if( a->l0.i_cost8x16 <= thresh )
2105 h->mb.i_partition = D_8x16;
2106 x264_analyse_update_cache( h, a );
2107 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* over threshold: disqualify 8x16 */
2110 a->l0.i_cost8x16 = COST_MAX;
2112 if( a->l0.i_cost8x8 <= thresh )
2114 h->mb.i_type = P_8x8;
2115 h->mb.i_partition = D_8x8;
/* with sub-8x8 partitions enabled, RD-select the best sub-partition type
 * per 8x8 block before scoring the whole macroblock */
2116 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2119 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2120 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2121 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2122 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2123 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2124 * for future blocks are those left over from previous RDO calls. */
2125 for( i = 0; i < 4; i++ )
2127 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2128 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2129 int subtype, btype = D_L0_8x8;
2130 uint64_t bcost = COST_MAX64;
2131 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
/* skip sub-types over threshold, but always evaluate 8x8 itself if
 * nothing has been evaluated yet (bcost still COST_MAX64) */
2134 if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2136 h->mb.i_sub_partition[i] = subtype;
2137 x264_mb_cache_mv_p8x8( h, a, i );
2138 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2139 COPY2_IF_LT( bcost, cost, btype, subtype );
/* restore the winning sub-partition's MVs in the cache if the last
 * one evaluated wasn't the winner */
2141 if( h->mb.i_sub_partition[i] != btype )
2143 h->mb.i_sub_partition[i] = btype;
2144 x264_mb_cache_mv_p8x8( h, a, i );
2149 x264_analyse_update_cache( h, a );
2150 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
/* over threshold: disqualify 8x8 */
2153 a->l0.i_cost8x8 = COST_MAX;
/* Re-score candidate B-macroblock modes with true rate-distortion cost.
 * Each mode is RD-evaluated only if its SATD cost is within ~17/16 of the
 * best inter SATD (slightly looser with psy-RD) and hasn't been cached yet
 * (== COST_MAX guards against recomputation across the two call sites).
 * NOTE(review): extraction dropped brace lines -- compare against upstream
 * x264 before editing. */
2156 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2158 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2160 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2162 h->mb.i_type = B_DIRECT;
2163 /* Assumes direct/skip MC is still in fdec */
2164 /* Requires b-rdo to be done before intra analysis */
2165 h->mb.b_skip_mc = 1;
2166 x264_analyse_update_cache( h, a );
2167 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2168 h->mb.b_skip_mc = 0;
2171 //FIXME not all the update_cache calls are needed
2172 h->mb.i_partition = D_16x16;
2173 /* 16x16: evaluate L0, L1, then BI */
2174 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2176 h->mb.i_type = B_L0_L0;
2177 x264_analyse_update_cache( h, a );
2178 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2182 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2184 h->mb.i_type = B_L1_L1;
2185 x264_analyse_update_cache( h, a );
2186 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2190 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2192 h->mb.i_type = B_BI_BI;
2193 x264_analyse_update_cache( h, a );
2194 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* sub-partition modes: 8x8, 16x8, 8x16 */
2198 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2200 h->mb.i_type = B_8x8;
2201 h->mb.i_partition = D_8x8;
2202 x264_analyse_update_cache( h, a );
2203 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* clear any skip flags that update_cache may have set for direct blocks */
2204 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2208 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2210 h->mb.i_type = a->i_mb_type16x8;
2211 h->mb.i_partition = D_16x8;
2212 x264_analyse_update_cache( h, a );
2213 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2217 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2219 h->mb.i_type = a->i_mb_type8x16;
2220 h->mb.i_partition = D_8x16;
2221 x264_analyse_update_cache( h, a );
2222 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* SATD-based refinement of bidirectionally-predicted partitions: for
 * whichever partition shape was chosen, refine every part that ended up in
 * BI mode with x264_me_refine_bidir_satd. No-op for intra macroblocks.
 * NOTE(review): extraction dropped braces, the `case D_*:`/`break;` lines of
 * the switch, the early `return;`, and `int i;` -- compare against upstream
 * x264 before editing. */
2226 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2228 const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2231 if( IS_INTRA(h->mb.i_type) )
2234 switch( h->mb.i_partition )
/* 16x16: only B_BI_BI uses a bidirectional prediction */
2237 if( h->mb.i_type == B_BI_BI )
2238 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
/* 16x8: refine each BI half */
2241 for( i=0; i<2; i++ )
2242 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2243 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
/* 8x16: refine each BI half */
2246 for( i=0; i<2; i++ )
2247 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2248 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
/* 8x8: refine each BI sub-block */
2251 for( i=0; i<4; i++ )
2252 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2253 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
/* Decide between 4x4 and 8x8 transform for the current inter macroblock by
 * comparing SATD (4x4 proxy) vs SA8D (8x8 proxy) of the luma residual.
 * Sets h->mb.b_transform_8x8 and marks MC as done (b_skip_mc = 1).
 * NOTE(review): extraction appears to have dropped the motion-compensation
 * call referenced by the comment below (and the surrounding braces) --
 * compare against upstream x264 before editing. */
2258 static inline void x264_mb_analyse_transform( x264_t *h )
2260 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2262 int i_cost4, i_cost8;
2263 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2266 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2267 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2268 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2269 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2271 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
/* MC already performed above; macroblock_encode can skip it */
2272 h->mb.b_skip_mc = 1;
/* RD version of the transform-size decision: flip b_transform_8x8, compute
 * the full macroblock RD cost, and keep the flip only if it is not worse
 * (*i_rd >= i_rd8). On acceptance, *i_rd and *i_satd are updated (*i_satd is
 * rescaled proportionally to the RD improvement).
 * NOTE(review): extraction dropped braces, `int i_rd8;` and the
 * `*i_rd = i_rd8;` assignment in the acceptance branch -- compare against
 * upstream x264 before editing. */
2276 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2278 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2281 x264_analyse_update_cache( h, a );
2282 h->mb.b_transform_8x8 ^= 1;
2283 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2284 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2286 if( *i_rd >= i_rd8 )
/* keep the flipped transform; rescale the SATD estimate to match */
2289 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
/* otherwise revert to the original transform size */
2293 h->mb.b_transform_8x8 ^= 1;
2297 /* Rate-distortion optimal QP selection.
2298 * FIXME: More than half of the benefit of this function seems to be
2299 * in the way it improves the coding of chroma DC (by decimating or
2300 * finding a better way to code a single DC coefficient.)
2301 * There must be a more efficient way to get that portion of the benefit
2302 * without doing full QP-RD, but RD-decimation doesn't seem to do the trick. */
/* QP-RD: search for the per-macroblock QP that minimizes RD cost.
 * Walks the QP upwards and downwards from the ratecontrol-chosen QP,
 * tolerating a bounded number of cost increases ("failures") per direction,
 * always also trying the previous macroblock's QP; finally re-checks the
 * transform-size decision at the winning QP.
 * NOTE(review): extraction dropped braces and several statements (the
 * `failures = 0;` init, `prevcost = cost;` / failure bookkeeping in the
 * loop, the `h->mb.i_qp = bqp;` commit before line 2362, and the
 * COPY2_IF_LT on the transform recheck) -- compare against upstream x264
 * before editing. */
2304 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2306 int bcost, cost, direction, failures, prevcost, origcost;
2307 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2308 int last_qp_tried = 0;
2309 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2311 /* If CBP is already zero, don't raise the quantizer any higher. */
2312 for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2314 /* Without psy-RD, require monotonicity when moving quant away from previous
2315 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2316 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2317 * allow 2 failures when moving quant towards previous quant.
2318 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2319 int threshold = (!!h->mb.i_psy_rd);
2320 /* Raise the threshold for failures if we're moving towards the last QP. */
2321 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2322 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2324 h->mb.i_qp = orig_qp;
2326 prevcost = origcost;
2327 h->mb.i_qp += direction;
/* step QP in this direction until we leave the allowed range or exceed
 * the failure budget */
2328 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2330 if( h->mb.i_last_qp == h->mb.i_qp )
2332 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2333 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2334 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2336 /* We can't assume that the costs are monotonic over QPs.
2337 * Tie case-as-failure seems to give better results. */
2338 if( cost < prevcost )
2344 if( failures > threshold )
/* raising QP already reached zero CBP: no point going higher */
2346 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2348 h->mb.i_qp += direction;
2352 /* Always try the last block's QP. */
2353 if( !last_qp_tried )
2355 h->mb.i_qp = h->mb.i_last_qp;
2356 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2357 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2358 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
/* commit the winning QP's chroma QP (the luma commit line was lost
 * in extraction -- presumably `h->mb.i_qp = bqp;`; verify upstream) */
2362 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2364 /* Check transform again; decision from before may no longer be optimal. */
2365 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2366 x264_mb_transform_8x8_allowed( h ) )
2368 h->mb.b_transform_8x8 ^= 1;
2369 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2371 h->mb.b_transform_8x8 ^= 1;
2375 /*****************************************************************************
2376 * x264_macroblock_analyse:
2377 *****************************************************************************/
/* Top-level per-macroblock mode decision. Chooses the macroblock type,
 * partitioning, references, MVs, transform size and (at subme 10) QP for the
 * current macroblock, dispatching on the slice type (I/P/B). Sets the
 * h->mb.* fields that macroblock_encode consumes.
 * NOTE(review): this chunk lost many lines in extraction (braces, local
 * declarations such as i/i_type/i_partition/b_skip, `break;`/`continue;`
 * statements, and the `intra_analysis:` label that the gotos below target)
 * -- compare against upstream x264 before editing. */
2378 void x264_macroblock_analyse( x264_t *h )
2380 x264_mb_analysis_t analysis;
2381 int i_cost = COST_MAX;
/* ---- QP selection (ratecontrol + adaptive quant) ---- */
2384 h->mb.i_qp = x264_ratecontrol_qp( h );
2385 if( h->param.rc.i_aq_mode )
2387 x264_adaptive_quant( h );
2388 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2389 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2390 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2391 h->mb.i_qp = h->mb.i_last_qp;
2394 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2396 /*--------------------------- Do the analysis ---------------------------*/
/* ---- I slice: intra-only decision ---- */
2397 if( h->sh.i_type == SLICE_TYPE_I )
2400 if( analysis.i_mbrd )
2401 x264_mb_cache_fenc_satd( h );
2402 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2403 if( analysis.i_mbrd )
2404 x264_intra_rd( h, &analysis, COST_MAX );
2406 i_cost = analysis.i_satd_i16x16;
2407 h->mb.i_type = I_16x16;
2408 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2409 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2410 if( analysis.i_satd_pcm < i_cost )
2411 h->mb.i_type = I_PCM;
2413 else if( analysis.i_mbrd >= 2 )
2414 x264_intra_rd_refine( h, &analysis );
/* ---- P slice ---- */
2416 else if( h->sh.i_type == SLICE_TYPE_P )
2420 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2422 analysis.b_try_pskip = 0;
/* Periodic Intra Refresh: force this MB to be coded intra */
2423 if( analysis.b_force_intra )
2425 if( !h->param.analyse.b_psy )
2427 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2428 goto intra_analysis;
2433 /* Fast P_SKIP detection */
2434 if( h->param.analyse.b_fast_pskip )
2436 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2437 // FIXME don't need to check this if the reference frame is done
2439 else if( h->param.analyse.i_subpel_refine >= 3 )
2440 analysis.b_try_pskip = 1;
2441 else if( h->mb.i_mb_type_left == P_SKIP ||
2442 h->mb.i_mb_type_top == P_SKIP ||
2443 h->mb.i_mb_type_topleft == P_SKIP ||
2444 h->mb.i_mb_type_topright == P_SKIP )
2445 b_skip = x264_macroblock_probe_pskip( h );
2449 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
/* skip confirmed: commit P_SKIP and bypass the full analysis */
2453 h->mb.i_type = P_SKIP;
2454 h->mb.i_partition = D_16x16;
2455 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
/* full P analysis path */
2459 const unsigned int flags = h->param.analyse.inter;
2463 int i_satd_inter, i_satd_intra;
2465 x264_mb_analyse_load_costs( h, &analysis );
2467 x264_mb_analyse_inter_p16x16( h, &analysis );
/* p16x16 analysis may itself promote the MB to P_SKIP */
2469 if( h->mb.i_type == P_SKIP )
2472 if( flags & X264_ANALYSE_PSUB16x16 )
2474 if( h->param.analyse.b_mixed_references )
2475 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2477 x264_mb_analyse_inter_p8x8( h, &analysis );
2480 /* Select best inter mode */
2482 i_partition = D_16x16;
2483 i_cost = analysis.l0.me16x16.cost;
2485 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2486 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2489 i_partition = D_8x8;
2490 i_cost = analysis.l0.i_cost8x8;
2491 /* Do sub 8x8 analysis only when 8x8 wins over 16x16 */
2493 if( flags & X264_ANALYSE_PSUB8x8 )
2495 for( i = 0; i < 4; i++ )
2497 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2498 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2500 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2501 h->mb.i_sub_partition[i] = D_L0_4x4;
2503 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2504 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2505 h->mb.i_sub_partition[i], D_L0_8x4 );
2507 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2508 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2509 h->mb.i_sub_partition[i], D_L0_4x8 );
2511 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2513 x264_mb_cache_mv_p8x8( h, &analysis, i );
2515 analysis.l0.i_cost8x8 = i_cost;
2519 /* Now do 16x8/8x16 */
2520 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2521 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2522 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2524 x264_mb_analyse_inter_p16x8( h, &analysis );
2525 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2527 x264_mb_analyse_inter_p8x16( h, &analysis );
2528 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2531 h->mb.i_partition = i_partition;
/* ---- sub-pixel (qpel) refinement of the winning partitioning ---- */
2534 //FIXME mb_type costs?
2535 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2539 else if( i_partition == D_16x16 )
2541 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2542 i_cost = analysis.l0.me16x16.cost;
2544 else if( i_partition == D_16x8 )
2546 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2547 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2548 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2550 else if( i_partition == D_8x16 )
2552 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2553 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2554 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2556 else if( i_partition == D_8x8 )
2560 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2562 switch( h->mb.i_sub_partition[i8x8] )
2565 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2566 i_cost += analysis.l0.me8x8[i8x8].cost;
2569 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2570 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2571 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2572 analysis.l0.me8x4[i8x8][1].cost;
2575 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2576 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2577 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2578 analysis.l0.me4x8[i8x8][1].cost;
2582 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2583 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2584 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2585 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2586 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2587 analysis.l0.me4x4[i8x8][1].cost +
2588 analysis.l0.me4x4[i8x8][2].cost +
2589 analysis.l0.me4x4[i8x8][3].cost;
2592 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
/* ---- intra analysis (cost-compared against the best inter mode) ---- */
2598 if( h->mb.b_chroma_me )
2600 x264_mb_analyse_intra_chroma( h, &analysis );
2601 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
/* fold chroma cost into the intra luma costs for a fair comparison */
2602 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2603 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2604 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2607 x264_mb_analyse_intra( h, &analysis, i_cost );
2609 i_satd_inter = i_cost;
2610 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2611 analysis.i_satd_i8x8,
2612 analysis.i_satd_i4x4 );
/* ---- RD-based re-ranking of P partitions (subme >= 6) ---- */
2614 if( analysis.i_mbrd )
2616 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2618 i_partition = D_16x16;
2619 i_cost = analysis.l0.i_rd16x16;
2620 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2621 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2622 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2623 h->mb.i_type = i_type;
2624 h->mb.i_partition = i_partition;
2625 if( i_cost < COST_MAX )
2626 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2627 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
/* final inter-vs-intra decision */
2630 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2631 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2632 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2633 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2635 h->mb.i_type = i_type;
2637 if( analysis.b_force_intra && !IS_INTRA(i_type) )
2639 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2640 * it was an inter block. */
2641 x264_analyse_update_cache( h, &analysis );
2642 x264_macroblock_encode( h );
2643 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2644 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2645 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2646 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2647 goto intra_analysis;
/* ---- qpel-RD refinement of the final P decision (subme >= 7) ---- */
2650 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2652 if( IS_INTRA( h->mb.i_type ) )
2654 x264_intra_rd_refine( h, &analysis );
2656 else if( i_partition == D_16x16 )
2658 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2659 analysis.l0.me16x16.cost = i_cost;
2660 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2662 else if( i_partition == D_16x8 )
2664 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2665 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2666 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2667 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2668 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2669 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2671 else if( i_partition == D_8x16 )
2673 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2674 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2675 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2676 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2677 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2678 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2680 else if( i_partition == D_8x8 )
2683 x264_analyse_update_cache( h, &analysis );
2684 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2686 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2688 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2690 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2692 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2693 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2695 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2697 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2698 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2700 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2702 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2703 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2704 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2705 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
/* ---- B slice ---- */
2712 else if( h->sh.i_type == SLICE_TYPE_B )
2714 int i_bskip_cost = COST_MAX;
2717 if( analysis.i_mbrd )
2718 x264_mb_cache_fenc_satd( h );
2720 h->mb.i_type = B_SKIP;
2721 if( h->mb.b_direct_auto_write )
2723 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2724 for( i = 0; i < 2; i++ )
2727 h->sh.b_direct_spatial_mv_pred ^= 1;
2728 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2729 if( analysis.b_direct_available )
2734 b_skip = x264_macroblock_probe_bskip( h );
2736 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2743 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
/* ---- fast B_SKIP detection ---- */
2745 if( analysis.b_direct_available )
2747 if( !h->mb.b_direct_auto_write )
2749 if( analysis.i_mbrd )
2751 i_bskip_cost = ssd_mb( h );
2752 /* 6 = minimum cavlc cost of a non-skipped MB */
2753 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2755 else if( !h->mb.b_direct_auto_write )
2757 /* Conditioning the probe on neighboring block types
2758 * doesn't seem to help speed or quality. */
2759 b_skip = x264_macroblock_probe_bskip( h );
/* full B analysis path */
2765 const unsigned int flags = h->param.analyse.inter;
2769 h->mb.b_skip_mc = 0;
2771 x264_mb_analyse_load_costs( h, &analysis );
2773 /* select best inter mode */
2774 /* direct must be first */
2775 if( analysis.b_direct_available )
2776 x264_mb_analyse_inter_direct( h, &analysis );
2778 x264_mb_analyse_inter_b16x16( h, &analysis );
2781 i_partition = D_16x16;
2782 i_cost = analysis.l0.me16x16.cost;
2783 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2784 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2785 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
/* early B_SKIP termination when direct RD cost is close to the best */
2787 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2789 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2790 if( i_bskip_cost < analysis.i_rd16x16direct &&
2791 i_bskip_cost < analysis.i_rd16x16bi &&
2792 i_bskip_cost < analysis.l0.i_rd16x16 &&
2793 i_bskip_cost < analysis.l1.i_rd16x16 )
2795 h->mb.i_type = B_SKIP;
2796 x264_analyse_update_cache( h, &analysis );
2801 if( flags & X264_ANALYSE_BSUB16x16 )
2803 x264_mb_analyse_inter_b8x8( h, &analysis );
2804 if( analysis.i_cost8x8bi < i_cost )
2807 i_partition = D_8x8;
2808 i_cost = analysis.i_cost8x8bi;
/* Try 16x8/8x16 only when adjacent 8x8 sub-partitions agree,
 * suggesting a larger partition could fit */
2810 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2811 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2813 x264_mb_analyse_inter_b16x8( h, &analysis );
2814 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2815 i_type, analysis.i_mb_type16x8,
2816 i_partition, D_16x8 );
2818 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2819 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2821 x264_mb_analyse_inter_b8x16( h, &analysis );
2822 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2823 i_type, analysis.i_mb_type8x16,
2824 i_partition, D_8x16 );
/* ---- sub-pixel refinement of the winning B partitioning ---- */
2829 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2834 else if( i_partition == D_16x16 )
/* strip the mb_type lambda cost before refining, re-add after */
2836 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2837 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2838 if( i_type == B_L0_L0 )
2840 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2841 i_cost = analysis.l0.me16x16.cost
2842 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2844 else if( i_type == B_L1_L1 )
2846 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2847 i_cost = analysis.l1.me16x16.cost
2848 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2850 else if( i_type == B_BI_BI )
2852 x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
2853 x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
2856 else if( i_partition == D_16x8 )
2858 for( i=0; i<2; i++ )
2860 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2861 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2862 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2863 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2866 else if( i_partition == D_8x16 )
2868 for( i=0; i<2; i++ )
2870 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2871 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2872 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2873 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2876 else if( i_partition == D_8x8 )
2878 for( i=0; i<4; i++ )
2881 int i_part_cost_old;
2883 int i_part_type = h->mb.i_sub_partition[i];
2884 int b_bidir = (i_part_type == D_BI_8x8);
2886 if( i_part_type == D_DIRECT_8x8 )
2888 if( x264_mb_partition_listX_table[0][i_part_type] )
2890 m = &analysis.l0.me8x8[i];
2891 i_part_cost_old = m->cost;
2892 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2893 m->cost -= i_type_cost;
2894 x264_me_refine_qpel( h, m );
2896 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2898 if( x264_mb_partition_listX_table[1][i_part_type] )
2900 m = &analysis.l1.me8x8[i];
2901 i_part_cost_old = m->cost;
2902 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2903 m->cost -= i_type_cost;
2904 x264_me_refine_qpel( h, m );
2906 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2908 /* TODO: update mvp? */
2912 i_satd_inter = i_cost;
/* ---- RD-based re-ranking of B modes (subme >= 6) ---- */
2914 if( analysis.i_mbrd )
2916 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2918 i_cost = i_bskip_cost;
2919 i_partition = D_16x16;
2920 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2921 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2922 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2923 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2924 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2925 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2926 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2928 h->mb.i_type = i_type;
2929 h->mb.i_partition = i_partition;
/* ---- intra analysis and final inter-vs-intra decision ---- */
2932 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2934 if( analysis.i_mbrd )
2936 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2937 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2940 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2941 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2942 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2943 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2945 h->mb.i_type = i_type;
2946 h->mb.i_partition = i_partition;
2948 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2949 x264_intra_rd_refine( h, &analysis );
2950 if( h->mb.i_subpel_refine >= 5 )
2951 x264_refine_bidir( h, &analysis );
/* ---- qpel-RD refinement of the final B decision (subme >= 7) ---- */
2953 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2955 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2956 x264_analyse_update_cache( h, &analysis );
2958 if( i_partition == D_16x16 )
2960 if( i_type == B_L0_L0 )
2962 analysis.l0.me16x16.cost = i_cost;
2963 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2965 else if( i_type == B_L1_L1 )
2967 analysis.l1.me16x16.cost = i_cost;
2968 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2970 else if( i_type == B_BI_BI )
2971 x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
2973 else if( i_partition == D_16x8 )
2975 for( i = 0; i < 2; i++ )
2977 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2978 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2979 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2980 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2981 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2982 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2983 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2986 else if( i_partition == D_8x16 )
2988 for( i = 0; i < 2; i++ )
2990 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2991 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2992 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2993 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2994 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2995 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2996 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2999 else if( i_partition == D_8x8 )
3001 for( i = 0; i < 4; i++ )
3003 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3004 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3005 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3006 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3007 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3008 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
/* ---- common tail: commit decision to the cache, final fixups ---- */
3015 x264_analyse_update_cache( h, &analysis );
3017 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3018 * without realizing it. Check for this and account for it if necessary. */
3019 if( analysis.i_mbrd >= 2 )
3021 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3022 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3023 int list = check_mv_lists[h->mb.i_type] - 1;
3024 if( list >= 0 && h->mb.i_partition != D_16x16 &&
3025 M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3026 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3027 h->mb.i_partition = D_16x16;
3030 if( !analysis.i_mbrd )
3031 x264_mb_analyse_transform( h );
/* subme 10: per-MB QP-RD */
3033 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3034 x264_mb_analyse_qp_rd( h, &analysis );
3036 h->mb.b_trellis = h->param.analyse.i_trellis;
3037 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3038 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3039 x264_psy_trellis_init( h, 0 );
3040 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3041 h->mb.i_skip_intra = 0;
3044 /*-------------------- Update MB from the analysis ----------------------*/
/* Commit the analysis decisions held in 'a' into the macroblock cache
 * (h->mb.cache / h->mb fields) so that encoding can proceed:
 *   - intra MBs: per-block prediction modes + chroma intra analysis
 *   - inter MBs: reference indices and motion vectors per partition
 * NOTE(review): this excerpt has been stripped of braces and case labels
 * (and carries fused original line numbers), so the switch arms below are
 * annotated by what their visible statements do; the exact case labels
 * (I_4x4/I_8x8/I_16x16, P_L0/P_8x8/P_SKIP, B_SKIP/B_DIRECT/B_8x8, ...)
 * are inferred from context — confirm against the unstripped file. */
3045 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
/* Dispatch on the macroblock type chosen by analysis. */
3049 switch( h->mb.i_type )
/* Intra 4x4 arm: store all 16 per-block luma prediction modes. */
3052 for( i = 0; i < 16; i++ )
3053 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
/* Chroma intra mode is (re)analysed for every intra arm below as well. */
3055 x264_mb_analyse_intra_chroma( h, a );
/* Intra 8x8 arm: store the 4 per-8x8-block prediction modes. */
3058 for( i = 0; i < 4; i++ )
3059 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3061 x264_mb_analyse_intra_chroma( h, a );
/* Intra 16x16 arm: single whole-MB prediction mode. */
3064 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3065 x264_mb_analyse_intra_chroma( h, a );
/* P-type (L0) arm: write ref index + MV for the chosen partition shape. */
3072 switch( h->mb.i_partition )
/* D_16x16: one ref/MV pair covers the whole 4x4-block grid. */
3075 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3076 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
/* D_16x8: top and bottom halves get independent ref/MV. */
3080 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3081 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3082 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3083 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
/* D_8x16: left and right halves get independent ref/MV. */
3087 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3088 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3089 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3090 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
/* Any other partition value with a P_L0 MB type is a coding-logic bug. */
3094 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
/* P 8x8 arm: four independent refs; MVs handled per sub-partition below. */
3100 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3101 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3102 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3103 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
/* Sub-partition MVs (8x8/8x4/4x8/4x4) are cached by the helper. */
3104 for( i = 0; i < 4; i++ )
3105 x264_mb_cache_mv_p8x8( h, a, i );
/* P skip arm: force 16x16, ref 0, and the precomputed skip MV. */
3110 h->mb.i_partition = D_16x16;
3111 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3112 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
/* B skip/direct arm: load direct-predicted MVs for each 8x8 block. */
3118 x264_mb_load_mv_direct8x8( h, 0 );
3119 x264_mb_load_mv_direct8x8( h, 1 );
3120 x264_mb_load_mv_direct8x8( h, 2 );
3121 x264_mb_load_mv_direct8x8( h, 3 );
/* B 8x8 arm: write each sub-block's MVs/refs (both lists) via helper. */
3125 /* optimize: cache might not need to be rewritten */
3126 for( i = 0; i < 4; i++ )
3127 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3130 default: /* the rest of the B types */
3131 switch( h->mb.i_partition )
/* D_16x16: sub-dispatch on list usage (L0-only / L1-only / bidir). */
3134 switch( h->mb.i_type )
/* L0-only: valid L0 ref/MV; L1 side marked unused (ref -1, zero mv/mvd). */
3137 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3138 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3140 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3141 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3142 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
/* L1-only: mirror image — L0 side cleared, L1 ref/MV written. */
3145 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3146 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3147 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3149 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3150 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
/* Bidir: both lists valid, using the bi16x16 MVs (may differ from the
 * unidirectional me16x16 MVs — see the bi16x16 field comment in HEAD). */
3153 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3154 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3156 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3157 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
/* D_16x8 / D_8x16: helpers cache both halves' refs and MVs. */
3162 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3163 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3166 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3167 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
/* Unknown MB type reaching here indicates an encoder-internal error. */
3170 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
/* Sliced-threads sanity check: with frame-parallel encoding, an inter MV
 * must not point below the region of the reference frame that the other
 * thread has finished.  Presumably checked only on the bottom-most MV row
 * (scan8[15]) since vertical MVs are what cross the thread boundary —
 * TODO confirm against full file. */
3176 if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3179 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3182 int ref = h->mb.cache.ref[l][x264_scan8[0]];
/* i_lines_completed = rows of the reference already encoded by its thread;
 * '>> h->mb.b_interlaced' halves the ref index for field coding. */
3185 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
/* MV is in quarter-pel units: '>> (2 - b_interlaced)' converts to lines. */
3186 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
/* Out-of-range MV: dump diagnostics, then recover by re-encoding this MB
 * as intra 16x16 so no reference pixels are needed at all. */
3188 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3189 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3190 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3191 h->mb.cache.mv[l][x264_scan8[15]][0],
3192 h->mb.cache.mv[l][x264_scan8[15]][1] );
3193 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3194 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3195 fprintf(stderr, "completed: %d \n", completed );
3196 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3197 x264_mb_analyse_intra( h, a, COST_MAX );
3198 h->mb.i_type = I_16x16;
3199 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3200 x264_mb_analyse_intra_chroma( h, a );
3207 #include "slicetype.c"