1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
29 #include "common/common.h"
30 #include "common/cpu.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
/* NOTE(review): this listing is a partial extraction -- the typedef opener and
 * several members between the surviving lines are missing, so the code below is
 * left byte-identical. Per-list (L0/L1) motion analysis state for one MB. */
43 x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
47 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
48 ALIGNED_4( int16_t mvc[32][5][2] );
/* Sub-8x8 partition search results: one x264_me_t per sub-block. */
52 int i_cost4x4[4]; /* cost per 8x8 partition */
53 x264_me_t me4x4[4][4];
56 int i_cost8x4[4]; /* cost per 8x8 partition */
57 x264_me_t me8x4[4][2];
60 int i_cost4x8[4]; /* cost per 8x8 partition */
61 x264_me_t me4x8[4][2];
71 } x264_mb_analysis_list_t;
/* NOTE(review): partial extraction -- the typedef opener and many members are
 * missing between the surviving lines; code left byte-identical.
 * Scratch state for analysing one macroblock (intra + inter candidates). */
75 /* conduct the analysis using this lamda and QP */
80 uint16_t *p_cost_ref[2];
85 /* Take some shortcuts in intra search if intra is deemed unlikely */
87 int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
/* I: per-mode intra SATD scores (index = prediction mode). */
92 int i_satd_i16x16_dir[7];
97 int i_satd_i8x8_dir[12][4];
101 int i_predict4x4[16];
106 int i_satd_i8x8chroma;
107 int i_satd_i8x8chroma_dir[7];
108 int i_predict8x8chroma;
110 /* II: Inter part P/B frame */
111 x264_mb_analysis_list_t l0;
112 x264_mb_analysis_list_t l1;
114 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
115 int i_cost16x16direct;
117 int i_cost8x8direct[4];
126 int i_mb_partition16x8[2]; /* mb_partition_e */
127 int i_mb_partition8x16[2];
128 int i_mb_type16x8; /* mb_class_e */
131 int b_direct_available;
133 } x264_mb_analysis_t;
/* Lagrange multiplier per QP, used to weight bit cost against distortion.
 * lambda = pow(2,qp/6-2) */
const uint8_t x264_lambda_tab[52] = {
    1, 1, 1, 1, 1, 1, 1, 1,  /*  0- 7 */
    1, 1, 1, 1,              /*  8-11 */
    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
    3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
    6, 7, 8, 9,10,11,13,14,  /* 28-35 */
   16,18,20,23,25,29,32,36,  /* 36-43 */
   40,45,51,57,64,72,81,91   /* 44-51 */
};
/* Squared lambda (fixed-point, x256) for RD cost: lambda2 = pow(lambda,2) * .9 * 256 */
const int x264_lambda2_tab[52] = {
       14,      18,      22,      28,      36,      45,      57,      72, /*  0 -  7 */
       91,     115,     145,     182,     230,     290,     365,     460, /*  8 - 15 */
      580,     731,     921,    1161,    1462,    1843,    2322,    2925, /* 16 - 23 */
     3686,    4644,    5851,    7372,    9289,   11703,   14745,   18578, /* 24 - 31 */
    23407,   29491,   37156,   46814,   58982,   74313,   93628,  117964, /* 32 - 39 */
   148626,  187257,  235929,  297252,  374514,  471859,  594505,  749029, /* 40 - 47 */
   943718, 1189010, 1498059, 1887436                                      /* 48 - 51 */
};
/* Fractional part of 2^x in Q8, indexed by the low 6 bits of a Q6 exponent:
 * x264_exp2_lut[i] = round( (2^(i/64) - 1) * 256 ). */
const uint8_t x264_exp2_lut[64] = {
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
/* Fractional part of log2(1 + i/128) for i in [0,127]; used to compute log2
 * of a float from its mantissa without a full libm call. */
const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
/* Integer part of log2 as a function of leading-zero count (31 - clz).
 * Stored as float to avoid an int/float conversion at the call site. */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      46,      58,      73,      92,     117,     147,
          185,     233,     294,     370,     466,     587,
          740,     932,    1174,    1480,    1864,    2349,
         2959,    3728,    4697,    5918,    7457,    9395,
        11837,   14914,   18790,   23674,   29828,   37581,
        47349,   59656,   75163,   94699,  119313,  150326,
       189399,  238627,  300652,  378798,  477255,  601304,
       757596,  954511, 1202608, 1515192, 1909022, 2405217,
      3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      27,      34,      43,      54,      68,      86,
          108,     136,     172,     216,     273,     343,
          433,     545,     687,     865,    1090,    1374,
         1731,    2180,    2747,    3461,    4361,    5494,
         6922,    8721,   10988,   13844,   17442,   21976,
        27688,   34885,   43953,   55377,   69771,   87906,
       110755,  139543,  175813,  221511,  279087,  351627,
       443023,  558174,  703255,  886046, 1116348, 1406511,
      1772093, 2232697, 2813022, 3544186 }
};
/* Multiplier (Q8) applied to chroma lambda2 as a function of the luma/chroma
 * QP difference; indexed by i_qp - i_chroma_qp + 12. The final entry is a
 * saturating sentinel. */
static const uint16_t x264_chroma_lambda2_offset_tab[] = {
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
223 /* TODO: calculate CABAC costs */
224 static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
225 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
/* CAVLC bit cost of B 16x8/8x16 partition types, indexed by mb_partition_e. */
static const uint8_t i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
/* CAVLC bit cost of sub_mb_type in B slices. */
static const uint8_t i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
233 static const uint8_t i_sub_mb_p_cost_table[4] = {
/* Forward declaration: writes the chosen analysis back into the MB cache. */
237 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
/* Bit cost of a reference index, per lambda (92 possible lambdas) and per
 * active-ref-count bucket (3); shared across threads, guarded by the mutex below. */
239 static uint16_t x264_cost_ref[92][3][33];
240 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
/* Build the per-lambda MV and reference-index bit-cost tables for a given QP.
 * Returns 0 on success, nonzero on allocation failure (via CHECKED_MALLOC).
 * NOTE(review): this listing is missing lines (braces, the early return when
 * the table already exists, and the fail path); code left byte-identical. */
242 int x264_analyse_init_costs( x264_t *h, int qp )
245 int lambda = x264_lambda_tab[qp];
246 if( h->cost_mv[lambda] )
248 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
249 CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
/* Center the pointer so it can be indexed by signed MV deltas. */
250 h->cost_mv[lambda] += 2*4*2048;
251 for( i = 0; i <= 2*4*2048; i++ )
253 h->cost_mv[lambda][-i] =
254 h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
/* x264_cost_ref is shared between encoder instances; serialize the fill. */
256 x264_pthread_mutex_lock( &cost_ref_mutex );
257 for( i = 0; i < 3; i++ )
258 for( j = 0; j < 33; j++ )
259 x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
260 x264_pthread_mutex_unlock( &cost_ref_mutex );
/* ESA/TESA also need full-pel cost tables, one per qpel phase. */
261 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
265 CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
266 h->cost_mv_fpel[lambda][j] += 2*2048;
267 for( i = -2*2048; i < 2*2048; i++ )
268 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
/* Free every per-lambda cost table; pointers were offset at allocation time,
 * so they are re-biased back before x264_free.
 * NOTE(review): braces and the null-check on h->cost_mv[i] are missing from
 * this listing; code left byte-identical. */
276 void x264_analyse_free_costs( x264_t *h )
279 for( i = 0; i < 92; i++ )
282 x264_free( h->cost_mv[i] - 2*4*2048 );
283 if( h->cost_mv_fpel[i][0] )
284 for( j = 0; j < 4; j++ )
285 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
/* Incrementally apply explicit weighted prediction to L0 reference planes,
 * up to (but tracking) row `end`, caching the result in h->fenc->weighted[].
 * NOTE(review): braces and the declarations of height/offset/k are missing
 * from this listing; code left byte-identical. */
289 void x264_analyse_weight_frame( x264_t *h, int end )
292 for( j=0; j<h->i_ref0; j++ )
294 if( h->sh.weight[j][0].weightfn )
296 x264_frame_t *frame = h->fref0[j];
297 int width = frame->i_width[0] + 2*PADH;
298 int i_padv = PADV << h->param.b_interlaced;
/* src points at the top-left of the padded plane. */
300 uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
/* Only the not-yet-weighted rows are processed on this call. */
302 height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
303 offset = h->fenc->i_lines_weighted*frame->i_stride[0];
304 h->fenc->i_lines_weighted += height;
/* All later refs sharing this source plane get weighted in the same pass. */
307 for( k = j; k < h->i_ref0; k++ )
308 if( h->sh.weight[k][0].weightfn )
310 uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
311 x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
312 src + offset, frame->i_stride[0],
313 width, height, &h->sh.weight[k][0] );
321 /* initialize an array of lambda*nbits for all possible mvs */
322 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
324 a->p_cost_mv = h->cost_mv[a->i_lambda];
325 a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
326 a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
329 static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
331 /* conduct the analysis using this lamda and QP */
332 a->i_qp = h->mb.i_qp = i_qp;
333 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
335 a->i_lambda = x264_lambda_tab[i_qp];
336 a->i_lambda2 = x264_lambda2_tab[i_qp];
338 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
339 if( h->param.analyse.i_trellis )
341 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
342 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
343 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
344 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
346 h->mb.i_psy_rd_lambda = a->i_lambda;
347 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
348 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
/* Per-macroblock analysis setup: derive mbrd level, QP state, MV search
 * ranges (clipped for frame edges, threads and intra refresh), reset all
 * candidate costs to COST_MAX, and decide fast-intra / forced-intra flags.
 * NOTE(review): this listing is missing many lines (braces, else branches,
 * several cost resets); code left byte-identical. */
352 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
354 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
356 /* mbrd == 1 -> RD mode decision */
357 /* mbrd == 2 -> RD refinement */
358 /* mbrd == 3 -> QPRD */
359 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
361 x264_mb_analyse_init_qp( h, a, i_qp );
363 h->mb.i_me_method = h->param.analyse.i_me_method;
364 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
365 if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
366 h->mb.i_subpel_refine--;
367 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
368 && h->mb.i_subpel_refine >= 5;
369 h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
370 (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
372 h->mb.b_transform_8x8 = 0;
373 h->mb.b_noise_reduction = 0;
/* I: reset intra candidate costs. */
379 a->i_satd_i8x8chroma = COST_MAX;
381 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
382 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
386 h->mb.b_lossless ? 0 :
388 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
390 /* II: Inter part P/B frame */
391 if( h->sh.i_type != SLICE_TYPE_I )
394 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
395 // limit motion search to a slightly smaller range than the theoretical limit,
396 // since the search may go a few iterations past its given range
397 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
399 /* Calculate max allowed MV range */
400 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
401 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
402 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
403 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
404 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
405 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
407 int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
408 int max_mv = max_x - 4*16*h->mb.i_mb_x;
409 /* If we're left of the refresh bar, don't reference right of it. */
410 if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
411 h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
413 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
414 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
/* Vertical range only changes once per MB row. */
415 if( h->mb.i_mb_x == 0 )
417 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
418 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
419 int thread_mvy_range = i_fmv_range;
421 if( h->i_thread_frames > 1 )
/* With frame threads, MVs may not point past the rows the ref thread
 * has finished reconstructing; wait for them if necessary. */
423 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
424 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
425 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
427 x264_frame_t **fref = i ? h->fref1 : h->fref0;
428 int i_ref = i ? h->i_ref1 : h->i_ref0;
429 for( j=0; j<i_ref; j++ )
431 x264_frame_cond_wait( fref[j]->orig, thresh );
432 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
436 if( h->param.b_deterministic )
437 thread_mvy_range = h->param.analyse.i_mv_range_thread;
438 if( h->mb.b_interlaced )
439 thread_mvy_range >>= 1;
441 x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
444 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
445 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
446 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
447 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
448 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
449 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
450 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
/* Reset inter candidate costs. */
456 a->l0.i_cost8x8 = COST_MAX;
458 for( i = 0; i < 4; i++ )
462 a->l0.i_cost4x8[i] = COST_MAX;
466 a->l0.i_cost8x16 = COST_MAX;
467 if( h->sh.i_type == SLICE_TYPE_B )
471 a->l1.i_cost8x8 = COST_MAX;
473 for( i = 0; i < 4; i++ )
478 a->i_cost8x8direct[i] = COST_MAX;
489 a->i_cost16x16direct =
492 a->i_cost8x16bi = COST_MAX;
495 /* Fast intra decision */
496 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
498 if( IS_INTRA( h->mb.i_mb_type_left )
499 || IS_INTRA( h->mb.i_mb_type_top )
500 || IS_INTRA( h->mb.i_mb_type_topleft )
501 || IS_INTRA( h->mb.i_mb_type_topright )
502 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
503 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
504 { /* intra is likely */ }
/* Periodic Intra Refresh: force intra inside the refresh column band. */
511 if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
512 h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
514 a->b_force_intra = 1;
518 a->b_force_intra = 0;
522 /* Prediction modes allowed for various combinations of neighbors. */
523 /* Terminated by a -1. */
524 /* In order, no neighbors, left, top, top/left, top/left/topleft */
525 static const int8_t i16x16_mode_available[5][5] =
527 {I_PRED_16x16_DC_128, -1, -1, -1, -1},
528 {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
529 {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
530 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
531 {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
534 static const int8_t i8x8chroma_mode_available[5][5] =
536 {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
537 {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
538 {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
539 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
540 {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
543 static const int8_t i4x4_mode_available[5][10] =
545 {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
546 {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
547 {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
548 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
549 {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
552 static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
554 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
555 return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];
558 static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
560 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
561 return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];
564 static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
566 int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
567 return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];
570 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
571 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
573 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
575 if( do_both_dct || h->mb.b_transform_8x8 )
576 h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
577 if( do_both_dct || !h->mb.b_transform_8x8 )
578 h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
581 /* Reset fenc satd scores cache for psy RD */
582 static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
584 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
585 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
586 if( !h->mb.i_psy_rd )
588 /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
589 h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
591 h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
/* Choose the best chroma 8x8 intra prediction mode by SATD + mode-bit cost,
 * storing the winner in a->i_predict8x8chroma and the MB cache.
 * NOTE(review): this listing is missing lines (braces, the early return when
 * the cost was already computed, the else between the merged-SATD and
 * per-mode paths); code left byte-identical. */
594 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
596 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
/* Already computed for this MB -- skip. */
598 if( a->i_satd_i8x8chroma < COST_MAX )
601 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
603 /* 8x8 prediction selection for chroma */
/* Fast path: one SIMD call scores DC/H/V for both planes; planar is scored
 * separately since the x3 primitive doesn't cover it. */
604 if( predict_mode[3] >= 0 && b_merged_satd )
606 int satdu[4], satdv[4];
607 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
608 h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
609 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
610 h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
611 satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
612 satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
614 for( ; *predict_mode >= 0; predict_mode++ )
616 int i_mode = *predict_mode;
617 int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
619 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
620 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
/* Slow path: predict and score each available mode individually. */
625 for( ; *predict_mode >= 0; predict_mode++ )
628 int i_mode = *predict_mode;
630 /* we do the prediction */
631 if( h->mb.b_lossless )
632 x264_predict_lossless_8x8_chroma( h, i_mode );
635 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
636 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
639 /* we calculate the cost */
640 i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
641 h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
642 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
644 a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
645 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
649 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
/* Full luma intra analysis: try 16x16, 8x8 and 4x4 prediction (as allowed by
 * the analyse flags), recording the best SATD cost and mode of each. Uses
 * i_satd_inter as an early-termination threshold.
 * NOTE(review): this listing is missing many lines (braces, declarations of
 * i_satd/i_cost/idx/x/y, early-out gotos, else branches); code is left
 * byte-identical throughout. */
652 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
654 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
655 uint8_t *p_src = h->mb.pic.p_fenc[0];
656 uint8_t *p_dst = h->mb.pic.p_fdec[0];
659 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
661 /*---------------- Try all mode and calculate their score ---------------*/
663 /* 16x16 prediction selection */
664 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
/* Fast path: merged V/H/DC SATD, planar scored separately. */
666 if( b_merged_satd && predict_mode[3] >= 0 )
668 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
669 h->predict_16x16[I_PRED_16x16_P]( p_dst );
670 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
671 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
674 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
675 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
/* Slow path: predict and score each available 16x16 mode. */
680 for( ; *predict_mode >= 0; predict_mode++ )
683 int i_mode = *predict_mode;
685 if( h->mb.b_lossless )
686 x264_predict_lossless_16x16( h, i_mode );
688 h->predict_16x16[i_mode]( p_dst );
690 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
691 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
692 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
693 a->i_satd_i16x16_dir[i_mode] = i_satd;
697 if( h->sh.i_type == SLICE_TYPE_B )
698 /* cavlc mb type prefix */
699 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
/* Fast-intra early out: intra is hopeless if far worse than inter. */
700 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
703 /* 8x8 prediction selection */
704 if( flags & X264_ANALYSE_I8x8 )
706 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
707 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
708 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
710 h->mb.i_cbp_luma = 0;
711 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
713 // FIXME some bias like in i4x4?
714 if( h->sh.i_type == SLICE_TYPE_B )
715 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
/* Score each of the four 8x8 blocks; each block must be reconstructed
 * before the next, since later blocks predict from its pixels. */
717 for( idx = 0;; idx++ )
721 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
722 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
723 int i_best = COST_MAX;
724 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
726 predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
727 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
729 if( b_merged_satd && predict_mode[8] >= 0 )
732 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
/* The predicted (most-probable) mode is cheaper to signal: 1 bit vs 4. */
733 satd[i_pred_mode] -= 3 * a->i_lambda;
734 for( i=2; i>=0; i-- )
736 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
737 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
742 for( ; *predict_mode >= 0; predict_mode++ )
745 int i_mode = *predict_mode;
747 if( h->mb.b_lossless )
748 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
750 h->predict_8x8[i_mode]( p_dst_by, edge );
752 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
753 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
754 i_satd -= a->i_lambda * 3;
756 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
757 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
761 if( idx == 3 || i_cost > i_satd_thresh )
764 /* we need to encode this block now (for next ones) */
765 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
766 x264_mb_encode_i8x8( h, idx, a->i_qp );
768 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
773 a->i_satd_i8x8 = i_cost;
/* Snapshot the reconstruction so a later winner can be restored cheaply. */
774 if( h->mb.i_skip_intra )
776 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
777 h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
778 h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
779 h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
780 h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
781 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
782 if( h->mb.i_skip_intra == 2 )
783 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
/* Aborted early: extrapolate the partial cost over the full MB. */
788 static const uint16_t cost_div_fix8[3] = {1024,512,341};
789 a->i_satd_i8x8 = COST_MAX;
790 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
792 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
796 /* 4x4 prediction selection */
797 if( flags & X264_ANALYSE_I4x4 )
800 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
801 h->mb.i_cbp_luma = 0;
802 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
804 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
806 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
807 if( h->sh.i_type == SLICE_TYPE_B )
808 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
/* Score each of the sixteen 4x4 blocks, reconstructing as we go. */
810 for( idx = 0;; idx++ )
812 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
813 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
814 int i_best = COST_MAX;
815 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
817 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
819 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
820 /* emulate missing topright samples */
821 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
823 if( b_merged_satd && predict_mode[5] >= 0 )
826 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
827 satd[i_pred_mode] -= 3 * a->i_lambda;
828 for( i=2; i>=0; i-- )
829 COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
833 for( ; *predict_mode >= 0; predict_mode++ )
836 int i_mode = *predict_mode;
838 if( h->mb.b_lossless )
839 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
841 h->predict_4x4[i_mode]( p_dst_by );
843 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
844 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
845 i_satd -= a->i_lambda * 3;
847 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
849 i_cost += i_best + 4 * a->i_lambda;
851 if( i_cost > i_satd_thresh || idx == 15 )
854 /* we need to encode this block now (for next ones) */
855 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
856 x264_mb_encode_i4x4( h, idx, a->i_qp );
858 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
862 a->i_satd_i4x4 = i_cost;
/* Snapshot reconstruction/NNZ/DCT so I_4x4 can be restored without re-encode. */
863 if( h->mb.i_skip_intra )
865 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
866 h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
867 h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
868 h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
869 h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
870 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
871 if( h->mb.i_skip_intra == 2 )
872 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
876 a->i_satd_i4x4 = COST_MAX;
880 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
882 if( a->i_satd_i16x16 <= i_satd_thresh )
884 h->mb.i_type = I_16x16;
885 x264_analyse_update_cache( h, a );
886 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
889 a->i_satd_i16x16 = COST_MAX;
891 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
893 h->mb.i_type = I_4x4;
894 x264_analyse_update_cache( h, a );
895 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
898 a->i_satd_i4x4 = COST_MAX;
900 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
902 h->mb.i_type = I_8x8;
903 x264_analyse_update_cache( h, a );
904 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
905 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
908 a->i_satd_i8x8 = COST_MAX;
911 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
913 uint8_t *p_dst = h->mb.pic.p_fdec[0];
916 int i_mode, i_thresh;
917 uint64_t i_satd, i_best;
918 h->mb.i_skip_intra = 0;
920 if( h->mb.i_type == I_16x16 )
922 int old_pred_mode = a->i_predict16x16;
923 const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
924 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
925 i_best = a->i_satd_i16x16;
926 for( ; *predict_mode >= 0; predict_mode++ )
928 int i_mode = *predict_mode;
929 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
931 h->mb.i_intra16x16_pred_mode = i_mode;
932 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
933 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
937 /* RD selection for chroma prediction */
938 const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
939 if( predict_mode[1] >= 0 )
941 int8_t predict_mode_sorted[4];
943 i_thresh = a->i_satd_i8x8chroma * 5/4;
945 for( i_max = 0; *predict_mode >= 0; predict_mode++ )
947 i_mode = *predict_mode;
948 if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
949 predict_mode_sorted[i_max++] = i_mode;
954 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
955 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
956 /* the previous thing encoded was x264_intra_rd(), so the pixels and
957 * coefs for the current chroma mode are still around, so we only
958 * have to recount the bits. */
959 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
960 for( i = 0; i < i_max; i++ )
962 i_mode = predict_mode_sorted[i];
963 if( h->mb.b_lossless )
964 x264_predict_lossless_8x8_chroma( h, i_mode );
967 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
968 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
970 /* if we've already found a mode that needs no residual, then
971 * probably any mode with a residual will be worse.
972 * so avoid dct on the remaining modes to improve speed. */
973 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
974 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
976 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
977 h->mb.i_cbp_chroma = i_cbp_chroma_best;
981 if( h->mb.i_type == I_4x4 )
983 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
985 for( idx = 0; idx < 16; idx++ )
987 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
990 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
992 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
993 /* emulate missing topright samples */
994 M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
996 for( ; *predict_mode >= 0; predict_mode++ )
998 i_mode = *predict_mode;
999 if( h->mb.b_lossless )
1000 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1002 h->predict_4x4[i_mode]( p_dst_by );
1003 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1005 if( i_best > i_satd )
1007 a->i_predict4x4[idx] = i_mode;
1009 pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
1010 pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
1011 pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
1012 pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
1013 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1017 M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
1018 M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
1019 M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
1020 M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
1021 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1023 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1026 else if( h->mb.i_type == I_8x8 )
1028 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1029 for( idx = 0; idx < 4; idx++ )
1031 uint64_t pels_h = 0;
1033 uint16_t i_nnz[2] = {0}; //shut up gcc
1036 int cbp_luma_new = 0;
1037 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1039 i_best = COST_MAX64;
1043 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1044 const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
1045 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1047 for( ; *predict_mode >= 0; predict_mode++ )
1049 i_mode = *predict_mode;
1050 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1053 if( h->mb.b_lossless )
1054 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1056 h->predict_8x8[i_mode]( p_dst_by, edge );
1057 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1058 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1060 if( i_best > i_satd )
1062 a->i_predict8x8[idx] = i_mode;
1063 cbp_luma_new = h->mb.i_cbp_luma;
1066 pels_h = M64( p_dst_by+7*FDEC_STRIDE );
1068 for( j=0; j<7; j++ )
1069 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1070 i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
1071 i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
1074 a->i_cbp_i8x8_luma = cbp_luma_new;
1075 M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
1077 for( j=0; j<7; j++ )
1078 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1079 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
1080 M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
1082 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* Initialize an x264_me_t's source-frame (fenc) pointers and strides for the
 * partition at (xoff,yoff) inside the macroblock.  Chroma planes p_fenc[1..2]
 * are 2x subsampled, hence the >>1 on the offsets.  Reads `a` from the
 * enclosing scope for the MV cost table. */
#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
/* Point an x264_me_t at a reference frame's planes for the partition at
 * (xoff,yoff): plane [0] is fullpel luma (also used as p_fref_w by default),
 * [1..3] are the half-pel interpolated luma planes, [4..5] are chroma U/V
 * (2x subsampled offsets).  Also sets the integral-image pointer for this
 * (list,ref) and resets the weight to the identity (weight_none). */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = weight_none; \
/* Switch the weighted-prediction source plane and weight after LOAD_HPELS.
 * NOTE(review): this macro ignores its `ref` parameter and reads `i_ref`
 * from the enclosing scope instead — callers must have a local `i_ref`
 * equal to the ref being loaded. */
#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[i_ref];
/* Lambda-scaled bit cost of coding reference index `ref` in list `list`;
 * reads the per-list cost tables from `a` in the enclosing scope. */
#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])
/* P 16x16 analysis: motion-search every list-0 reference frame, keep the
 * cheapest result in a->l0.me16x16, and attempt an early P_SKIP exit when
 * the winning MV is within one qpel unit of the predicted skip MV with a
 * near-zero residual cost. */
static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
    /* MV candidate list, refilled per reference by predict_mv_ref16x16 */
    ALIGNED_4( int16_t mvc[8][2] );
    int i_halfpel_thresh = INT_MAX;
    /* the half-pel early-termination threshold only pays off with >1 ref */
    int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
    /* 16x16 Search on all ref frame */
    m.i_pixel = PIXEL_16x16;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
    a->l0.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
        const int i_ref_cost = REF_COST( 0, i_ref );
        /* bias the threshold by this ref's cost so comparisons stay fair;
         * undone after the search below */
        i_halfpel_thresh -= i_ref_cost;
        m.i_ref_cost = i_ref_cost;
        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );
        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
        /* blind duplicate of an already-searched ref: reuse that ref's MV
         * and only do a cheap qpel refinement instead of a full search */
        if( h->mb.ref_blind_dupe == i_ref )
            CP32( m.mv, a->l0.mvc[0][0] );
            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
        /* early termination
         * SSD threshold would probably be better than SATD */
        && m.cost-m.cost_mv < 300*a->i_lambda
        && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
         + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
        && x264_macroblock_probe_pskip( h ) )
            h->mb.i_type = P_SKIP;
            x264_analyse_update_cache( h, a );
            /* with frame threading the skip MV must respect the restricted
             * vertical MV range */
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
        m.cost += i_ref_cost;
        i_halfpel_thresh += i_ref_cost;
        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
        /* save mv for predicting neighbors */
        CP32( a->l0.mvc[i_ref][0], m.mv );
        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
    h->mb.i_type = P_L0;
    x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
    /* if ref0's 16x16 MV landed exactly on the skip predictor, check whether
     * the whole MB can be coded as P_SKIP (zero CBP after RD) */
    if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
        h->mb.i_partition = D_16x16;
        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
        if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
            h->mb.i_type = P_SKIP;
/* P 8x8 analysis with per-partition reference selection (mixed refs).
 * The candidate ref range [0..i_maxref] is first shrunk using the refs
 * actually chosen by the neighboring blocks, then each 8x8 partition is
 * searched over all remaining refs (plus the blind duplicate, if any). */
static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int i_halfpel_thresh = INT_MAX;
    /* half-pel thresholding deliberately disabled here (kept commented out) */
    int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
    int i_maxref = h->mb.pic.i_fref[0]-1;
    /* needed so x264_mb_predict_mv() predicts at 8x8 granularity */
    h->mb.i_partition = D_8x8;
    /* raise i_maxref to include any ref used by the given cached neighbor */
#define CHECK_NEIGHBOUR(i)\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
    /* early termination: if 16x16 chose ref 0, then evalute no refs older
     * than those used by the neighbors */
    if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top && h->mb.i_mb_type_left )
        CHECK_NEIGHBOUR(  -8 - 1 );
        CHECK_NEIGHBOUR(  -8 + 0 );
        CHECK_NEIGHBOUR(  -8 + 2 );
        CHECK_NEIGHBOUR(  -8 + 4 );
        CHECK_NEIGHBOUR(   0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
    /* seed each ref's candidate list with its MB-level MV from 16x16 analysis */
    for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
    for( i = 0; i < 4; i++ )
        x264_me_t *l0m = &a->l0.me8x8[i];
        m.i_pixel = PIXEL_8x8;
        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        /* loop over candidate refs; the blind duplicate (if beyond i_maxref)
         * is visited last via the jump at the bottom of the loop */
        for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
            const int i_ref_cost = REF_COST( 0, i_ref );
            m.i_ref_cost = i_ref_cost;
            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* duplicate ref: reuse ref0's MV for this partition and only
             * qpel-refine instead of searching */
            if( h->mb.ref_blind_dupe == i_ref )
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
            x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
            m.cost += i_ref_cost;
            i_halfpel_thresh += i_ref_cost;
            /* record this MV as a candidate for later partitions on this ref */
            CP32( a->l0.mvc[i_ref][i+1], m.mv );
            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            /* after the last normal ref, jump to the blind duplicate */
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;
        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 8x8 analysis with one shared reference (the 16x16 winner) for all four
 * partitions; used when mixed refs are disabled. */
static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags.  Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
    /* CAVLC charges no ref bits for ref 0 in P_8x8 */
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    /* seed the candidate list with the 16x16 MV */
    CP32( mvc[0], a->l0.me16x16.mv );
    for( i = 0; i < 4; i++ )
        x264_me_t *m = &a->l0.me8x8[i];
        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;
        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );
        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );
        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
        /* each partition's result becomes a candidate for the next ones */
        CP32( mvc[i_mvc], m->mv );
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 16x8 analysis: for each horizontal half, only the refs chosen by the
 * two underlying 8x8 partitions are tried (one search if they agree). */
static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;
    for( i = 0; i < 2; i++ )
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        /* one candidate ref if both 8x8 halves agreed, otherwise two */
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
        m.i_pixel = PIXEL_16x8;
        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
            const int i_ref = ref8[j];
            const int i_ref_cost = REF_COST( 0, i_ref );
            m.i_ref_cost = i_ref_cost;
            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            x264_me_search( h, &m, mvc, 3 );
            m.cost += i_ref_cost;
            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P 8x16 analysis: mirror of p16x8 for vertical halves; candidate refs come
 * from the vertically-stacked 8x8 partitions (i and i+2). */
static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;
    for( i = 0; i < 2; i++ )
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int ref8[2] = { minref, maxref };
        /* one candidate ref if both 8x8 halves agreed, otherwise two */
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
        m.i_pixel = PIXEL_8x16;
        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
            const int i_ref = ref8[j];
            const int i_ref_cost = REF_COST( 0, i_ref );
            m.i_ref_cost = i_ref_cost;
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            x264_me_search( h, &m, mvc, 3 );
            m.cost += i_ref_cost;
            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
1457 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1459 ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
1460 uint8_t *pix2 = pix1+8;
1461 const int i_stride = h->mb.pic.i_stride[1];
1462 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1463 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1464 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1465 const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
1466 x264_weight_t *weight = h->sh.weight[i_ref];
1468 #define CHROMA4x4MC( width, height, me, x, y ) \
1469 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1470 if( weight[1].weightfn ) \
1471 weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
1472 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
1473 if( weight[2].weightfn ) \
1474 weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
1477 if( pixel == PIXEL_4x4 )
1479 x264_me_t *m = a->l0.me4x4[i8x8];
1480 CHROMA4x4MC( 2,2, m[0], 0,0 );
1481 CHROMA4x4MC( 2,2, m[1], 2,0 );
1482 CHROMA4x4MC( 2,2, m[2], 0,2 );
1483 CHROMA4x4MC( 2,2, m[3], 2,2 );
1485 else if( pixel == PIXEL_8x4 )
1487 x264_me_t *m = a->l0.me8x4[i8x8];
1488 CHROMA4x4MC( 4,2, m[0], 0,0 );
1489 CHROMA4x4MC( 4,2, m[1], 0,2 );
1493 x264_me_t *m = a->l0.me4x8[i8x8];
1494 CHROMA4x4MC( 2,4, m[0], 0,0 );
1495 CHROMA4x4MC( 2,4, m[1], 2,0 );
1498 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1499 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
/* P 4x4 sub-partition analysis for one 8x8 block.  The reference is
 * inherited from the 8x8 search; the total cost includes ref bits, the
 * sub-partition type bits, and optionally a chroma ME term. */
static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    for( i4x4 = 0; i4x4 < 4; i4x4++ )
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        /* only the first sub-block uses the parent 8x8 MV as a candidate */
        const int i_mvc = (i4x4 == 0);
        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
        m->i_pixel = PIXEL_4x4;
        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
        /* cache immediately so the next sub-block's MV prediction sees it */
        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* P 8x4 sub-partition analysis for one 8x8 block (two horizontal halves);
 * ref inherited from the 8x8 search, MV candidates from the 4x4 results. */
static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    for( i8x4 = 0; i8x4 < 2; i8x4++ )
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        /* only the first half seeds from the 4x4 search's MV */
        const int i_mvc = (i8x4 == 0);
        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
        m->i_pixel = PIXEL_8x4;
        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* P 4x8 sub-partition analysis for one 8x8 block (two vertical halves);
 * mirror of x264_mb_analyse_inter_p8x4. */
static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    for( i4x8 = 0; i4x8 < 2; i4x8++ )
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        /* only the first half seeds from the 4x4 search's MV */
        const int i_mvc = (i4x8 == 0);
        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
        m->i_pixel = PIXEL_4x8;
        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* Cost of B_DIRECT: compare the already-motion-compensated reconstruction
 * (left in fdec by the direct-MV prediction + MC) against the source, both
 * for the whole 16x16 and per 8x8 sub-block. */
static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    uint8_t **p_fdec = h->mb.pic.p_fdec;
    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    for( i = 0; i < 4; i++ )
        const int x = (i&1)*8;
        const int y = (i>>1)*8;
        a->i_cost16x16direct +=
        a->i_cost8x8direct[i] =
            h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
        /* sub-partition type bits are added to the per-block cost only */
        a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1639 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1641 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1642 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1643 uint8_t *src0, *src1;
1644 int stride0 = 16, stride1 = 16;
1648 ALIGNED_4( int16_t mvc[9][2] );
1649 int i_halfpel_thresh = INT_MAX;
1650 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1652 /* 16x16 Search on all ref frame */
1653 m.i_pixel = PIXEL_16x16;
1654 m.weight = weight_none;
1656 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1659 a->l0.me16x16.cost = INT_MAX;
1660 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1662 const int i_ref_cost = REF_COST( 0, i_ref );
1663 m.i_ref_cost = i_ref_cost;
1664 /* search with ref */
1665 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1666 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1667 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1668 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1671 m.cost += i_ref_cost;
1673 if( m.cost < a->l0.me16x16.cost )
1675 a->l0.i_ref = i_ref;
1676 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1679 /* save mv for predicting neighbors */
1680 CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
1682 a->l0.me16x16.i_ref = a->l0.i_ref;
1685 i_halfpel_thresh = INT_MAX;
1686 p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1687 a->l1.me16x16.cost = INT_MAX;
1688 for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1690 const int i_ref_cost = REF_COST( 0, i_ref );
1691 m.i_ref_cost = i_ref_cost;
1692 /* search with ref */
1693 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1694 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1695 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1696 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1699 m.cost += i_ref_cost;
1701 if( m.cost < a->l1.me16x16.cost )
1703 a->l1.i_ref = i_ref;
1704 h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1707 /* save mv for predicting neighbors */
1708 CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
1710 a->l1.me16x16.i_ref = a->l1.i_ref;
1712 /* get cost of BI mode */
1713 int ref_costs = REF_COST( 0, a->l0.i_ref ) + REF_COST( 1, a->l1.i_ref );
1714 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1715 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1716 src0 = h->mc.get_ref( pix0, &stride0,
1717 h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1718 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1719 src1 = h->mc.get_ref( pix1, &stride1,
1720 h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1721 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1723 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1725 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1727 + a->l0.bi16x16.cost_mv
1728 + a->l1.bi16x16.cost_mv;
1731 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1732 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1734 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1735 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1736 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1737 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1738 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.i_ref][0], h->mb.pic.i_stride[0],
1739 h->mb.pic.p_fref[1][a->l1.i_ref][0], h->mb.pic.i_stride[0],
1740 h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1741 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1742 + ref_costs + l0_mv_cost + l1_mv_cost;
1743 if( cost00 < a->i_cost16x16bi )
1745 M32( a->l0.bi16x16.mv ) = 0;
1746 M32( a->l1.bi16x16.mv ) = 0;
1747 a->l0.bi16x16.cost_mv = l0_mv_cost;
1748 a->l1.bi16x16.cost_mv = l1_mv_cost;
1749 a->i_cost16x16bi = cost00;
1754 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1755 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1756 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* Write the chosen sub-partition MVs of P 8x8 block i into the MV cache.
 * The switch dispatches on the sub-partition shape, in order:
 * D_L0_8x8, D_L0_8x4, D_L0_4x8, D_L0_4x4, with a default error case. */
static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
    const int x = 2*(i%2);
    const int y = 2*(i/2);
    switch( h->mb.i_sub_partition[i] )
        x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
        x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
        x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
        x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
        x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
        x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
        x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
        x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
        x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
        x264_log( h, X264_LOG_ERROR, "internal error\n" );
/* Copy the precomputed direct-mode refs and MVs for 8x8 block idx from
 * h->mb.cache.direct_* into the live prediction cache (both lists). */
static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
    const int x = 2*(idx&1);
    const int y = 2*(idx>>1);
    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
/* Cache the refs/MVs of a B partition: for each list that the partition
 * type actually uses (per x264_mb_partition_listX_table), store the ME
 * result; otherwise mark the list unused (ref -1, zero MV) and, when b_mvd
 * is set, clear the cached MV deltas too. */
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    if( x264_mb_partition_listX_table[0][part] ) \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
    if( x264_mb_partition_listX_table[1][part] ) \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
/* Cache refs/MVs for B 8x8 block i.  DIRECT blocks load the precomputed
 * direct MVs (and, with b_mvd, clear MVDs and set the skip cache); other
 * sub-partitions go through CACHE_MV_BI. */
static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
        x264_mb_load_mv_direct8x8( h, i );
        x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
        x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
        x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
    CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Cache refs/MVs for B 16x8 half i (rows 2*i..2*i+1, full MB width). */
static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Cache refs/MVs for B 8x16 half i (columns 2*i..2*i+1, full MB height). */
static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B 8x8 analysis: for each 8x8 block, search L0 and L1 (one ref each: the
 * 16x16 winners), form the BI predictor by weighted averaging, then pick
 * the cheapest of L0 / L1 / BI / DIRECT per block. */
static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    for( i = 0; i < 4; i++ )
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        for( l = 0; l < 2; l++ )
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            const int i_ref_cost = REF_COST( l, lX->i_ref );
            x264_me_t *m = &lX->me8x8[i];
            m->i_pixel = PIXEL_8x8;
            m->i_ref_cost = i_ref_cost;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
            x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->i_ref );
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            /* the only extra MV candidate is the same list's 16x16 MV */
            x264_me_search( h, m, &lX->me16x16.mv, 1 );
            m->cost += i_ref_cost;
            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
            /* fetch the interpolated block for the BI average below */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8, weight_none );
            i_part_cost_bi += m->cost_mv + i_ref_cost;
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
        /* pick the cheapest prediction mode for this block */
        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;
        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B 16x8 analysis: for each horizontal half, search L0 and L1 (seeded with
 * the two underlying 8x8 MVs), build the BI average, and pick the cheapest
 * of L0 / L1 / BI; then derive the composite mb type and its bit cost. */
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[2][2] );
    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;
    for( i = 0; i < 2; i++ )
        int i_part_cost_bi = 0;
        int stride[2] = {16,16};
        /* TODO: check only the list(s) that were used in b8x8? */
        for( l = 0; l < 2; l++ )
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            const int i_ref_cost = REF_COST( l, lX->i_ref );
            x264_me_t *m = &lX->me16x8[i];
            m->i_pixel = PIXEL_16x8;
            m->i_ref_cost = i_ref_cost;
            LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
            /* candidates: the MVs of the two 8x8 blocks in this half */
            CP32( mvc[0], lX->me8x8[2*i].mv );
            CP32( mvc[1], lX->me8x8[2*i+1].mv );
            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, lX->i_ref );
            x264_mb_predict_mv( h, l, 8*i, 4, m->mvp );
            x264_me_search( h, m, mvc, 2 );
            m->cost += i_ref_cost;
            /* fetch the interpolated half for the BI average below */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 16, 8, weight_none );
            i_part_cost_bi += m->cost_mv + i_ref_cost;
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
        if( a->l1.me16x8[i].cost < i_part_cost )
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        a->i_cost16x8bi += i_part_cost;
        x264_mb_cache_mv_b16x8( h, a, i, 0 );
    /* derive the composite mb type from the two halves' list usage */
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
/* B 8x16 analysis: mirror of b16x8 for vertical halves; MV candidates come
 * from the vertically-stacked 8x8 blocks (i and i+2). */
static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
    ALIGNED_4( int16_t mvc[2][2] );
    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;
    for( i = 0; i < 2; i++ )
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        for( l = 0; l < 2; l++ )
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            const int i_ref_cost = REF_COST( l, lX->i_ref );
            x264_me_t *m = &lX->me8x16[i];
            m->i_pixel = PIXEL_8x16;
            m->i_ref_cost = i_ref_cost;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
            /* candidates: the MVs of the two stacked 8x8 blocks in this half */
            CP32( mvc[0], lX->me8x8[i].mv );
            CP32( mvc[1], lX->me8x8[i+2].mv );
            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, lX->i_ref );
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );
            m->cost += i_ref_cost;
            /* fetch the interpolated half for the BI average below */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 16, weight_none );
            i_part_cost_bi += m->cost_mv + i_ref_cost;
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;
        if( a->l1.me8x16[i].cost < i_part_cost )
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        a->i_cost8x16bi += i_part_cost;
        x264_mb_cache_mv_b8x16( h, a, i, 0 );
    /* derive the composite mb type from the two halves' list usage */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
/* Re-score the P-macroblock partition candidates with true rate-distortion
 * cost instead of SATD. Only partitions whose SATD cost is within a
 * threshold (5/4 of the best SATD) are worth the expensive RD evaluation;
 * the others are marked COST_MAX so later comparisons skip them.
 * NOTE(review): braces and `else` lines are elided in this excerpt —
 * the bare `= COST_MAX` assignments are the else-branches of the
 * preceding threshold checks. */
2065 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2067 int thresh = i_satd * 5/4;
2069 h->mb.i_type = P_L0;
/* 16x16: RD-score it unless already done, with a looser 3/2 threshold. */
2070 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2072 h->mb.i_partition = D_16x16;
2073 x264_analyse_update_cache( h, a );
2074 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* 16x8: RD-score if promising, otherwise disqualify. */
2077 if( a->l0.i_cost16x8 <= thresh )
2079 h->mb.i_partition = D_16x8;
2080 x264_analyse_update_cache( h, a );
2081 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2084 a->l0.i_cost16x8 = COST_MAX;
/* 8x16: same treatment. */
2086 if( a->l0.i_cost8x16 <= thresh )
2088 h->mb.i_partition = D_8x16;
2089 x264_analyse_update_cache( h, a );
2090 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2093 a->l0.i_cost8x16 = COST_MAX;
/* 8x8: optionally re-decide each 8x8 block's sub-partition by per-part RD. */
2095 if( a->l0.i_cost8x8 <= thresh )
2097 h->mb.i_type = P_8x8;
2098 h->mb.i_partition = D_8x8;
2099 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2102 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2103 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2104 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2105 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2106 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2107 * for future blocks are those left over from previous RDO calls. */
2108 for( i = 0; i < 4; i++ )
/* SATD costs of the 4 sub-partition shapes, indexed to match
 * D_L0_4x4 .. D_L0_8x8; only shapes within 5/4 of the best are RD-tried. */
2110 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2111 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2112 int subtype, btype = D_L0_8x8;
2113 uint64_t bcost = COST_MAX64;
2114 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
/* Always evaluate D_L0_8x8 if nothing else was evaluated, so bcost is valid. */
2117 if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2119 h->mb.i_sub_partition[i] = subtype;
2120 x264_mb_cache_mv_p8x8( h, a, i );
2121 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2122 COPY2_IF_LT( bcost, cost, btype, subtype );
/* Restore the winning sub-partition's MVs into the cache if the last
 * one tried wasn't the winner. */
2124 if( h->mb.i_sub_partition[i] != btype )
2126 h->mb.i_sub_partition[i] = btype;
2127 x264_mb_cache_mv_p8x8( h, a, i );
2132 x264_analyse_update_cache( h, a );
2133 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2136 a->l0.i_cost8x8 = COST_MAX;
/* Re-score B-macroblock mode candidates with true RD cost. Each candidate
 * is evaluated only if its SATD cost is within `thresh` of the best SATD
 * and it hasn't been RD-scored already (== COST_MAX). Results land in the
 * a->*_rd* fields for the final mode comparison in x264_macroblock_analyse.
 * NOTE(review): braces are elided in this excerpt. */
2139 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
/* Slightly looser threshold when psy-RD is on (more score noise). */
2141 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2143 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2145 h->mb.i_type = B_DIRECT;
2146 /* Assumes direct/skip MC is still in fdec */
2147 /* Requires b-rdo to be done before intra analysis */
2148 h->mb.b_skip_mc = 1;
2149 x264_analyse_update_cache( h, a );
2150 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2151 h->mb.b_skip_mc = 0;
2154 //FIXME not all the update_cache calls are needed
2155 h->mb.i_partition = D_16x16;
/* L0-only 16x16. */
2157 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2159 h->mb.i_type = B_L0_L0;
2160 x264_analyse_update_cache( h, a );
2161 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* L1-only 16x16. */
2165 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2167 h->mb.i_type = B_L1_L1;
2168 x264_analyse_update_cache( h, a );
2169 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* Bidirectional 16x16. */
2173 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2175 h->mb.i_type = B_BI_BI;
2176 x264_analyse_update_cache( h, a );
2177 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* 8x8 partitions; the skip-cache call clears skip flags set during RD. */
2181 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2183 h->mb.i_type = B_8x8;
2184 h->mb.i_partition = D_8x8;
2185 x264_analyse_update_cache( h, a );
2186 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2187 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
/* 16x8 with the per-half L0/L1/BI decision from the SATD pass. */
2191 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2193 h->mb.i_type = a->i_mb_type16x8;
2194 h->mb.i_partition = D_16x8;
2195 x264_analyse_update_cache( h, a );
2196 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* 8x16 likewise. */
2200 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2202 h->mb.i_type = a->i_mb_type8x16;
2203 h->mb.i_partition = D_8x16;
2204 x264_analyse_update_cache( h, a );
2205 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* SATD-based joint refinement of the L0+L1 motion vectors for every
 * partition that was decided as bidirectional. No-op for intra MBs.
 * NOTE(review): the switch's `case D_16x16/D_16x8/D_8x16/D_8x8:` labels
 * and `break;` lines are elided in this excerpt; each if/loop below
 * belongs to the partition matching the me_t fields it touches. */
2209 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2211 const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2214 if( IS_INTRA(h->mb.i_type) )
2217 switch( h->mb.i_partition )
/* 16x16: refine only if the whole MB is B_BI_BI. */
2220 if( h->mb.i_type == B_BI_BI )
2221 x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
/* 16x8: refine each horizontal half that chose BI. */
2224 for( i=0; i<2; i++ )
2225 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2226 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
/* 8x16: refine each vertical half that chose BI. */
2229 for( i=0; i<2; i++ )
2230 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2231 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
/* 8x8: refine each 8x8 block whose sub-partition chose BI. */
2234 for( i=0; i<4; i++ )
2235 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2236 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
/* Decide between the 4x4 and 8x8 transform for the current inter MB by
 * comparing SATD (4x4 Hadamard) against SA8D (8x8 Hadamard) of the
 * luma residual, when 8x8 transform is allowed and enabled.
 * Sets h->mb.b_transform_8x8 and marks MC as done (b_skip_mc = 1).
 * NOTE(review): the statement performing the motion compensation that the
 * comment at 2246 refers to is elided between lines 2246 and 2249 in this
 * excerpt — confirm against the full source. */
2241 static inline void x264_mb_analyse_transform( x264_t *h )
2243 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2245 int i_cost4, i_cost8;
2246 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
/* SA8D approximates the 8x8-transform cost; SATD the 4x4-transform cost. */
2249 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2250 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2251 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2252 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2254 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2255 h->mb.b_skip_mc = 1;
/* RD-based transform-size re-check: flip b_transform_8x8, measure the RD
 * cost, and keep the flip only if it's no worse than the current *i_rd.
 * On success *i_rd is updated and *i_satd is rescaled proportionally so
 * later SATD-vs-intra comparisons stay consistent; on failure the flip
 * is undone. NOTE(review): braces and the lines declaring i_rd8 /
 * storing the accepted cost are partially elided in this excerpt. */
2259 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2261 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2264 x264_analyse_update_cache( h, a );
2265 h->mb.b_transform_8x8 ^= 1;
2266 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2267 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2269 if( *i_rd >= i_rd8 )
/* Scale the SATD estimate by the observed RD improvement ratio
 * (64-bit intermediate to avoid overflow). */
2272 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
/* else: revert the transform flip. */
2276 h->mb.b_transform_8x8 ^= 1;
2280 /* Rate-distortion optimal QP selection.
2281 * FIXME: More than half of the benefit of this function seems to be
2282 * in the way it improves the coding of chroma DC (by decimating or
2283 * finding a better way to code a single DC coefficient.)
2284 * There must be a more efficient way to get that portion of the benefit
2285 * without doing full QP-RD, but RD-decimation doesn't seem to do the trick. */
/* Rate-distortion optimal per-MB QP selection (subme >= 10 / MBRD 3).
 * Walks QP away from the current value in both directions, RD-scoring
 * each QP and stopping after `threshold`+ consecutive failures; also
 * always tries the previous MB's QP (cheap delta). The best QP found
 * (bqp) is committed at the end, and the transform-size decision is
 * re-checked since it may no longer be optimal at the new QP.
 * NOTE(review): braces, `else` lines and a few statements (including the
 * final `h->mb.i_qp = bqp;` commit before line 2379 and the
 * failure-counter updates in the main loop) are elided in this excerpt. */
2289 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2290 int bcost, cost, direction, failures, prevcost, origcost;
2291 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2292 int last_qp_tried = 0;
2293 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2294 int origcbp = h->mb.cbp[h->mb.i_mb_xy];
2296 /* If CBP is already zero, don't raise the quantizer any higher. */
2297 for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
2299 /* Without psy-RD, require monotonicity when moving quant away from previous
2300 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2301 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2302 * allow 2 failures when moving quant towards previous quant.
2303 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2304 int threshold = (!!h->mb.i_psy_rd);
2305 /* Raise the threshold for failures if we're moving towards the last QP. */
2306 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2307 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2309 h->mb.i_qp = orig_qp;
2311 prevcost = origcost;
2313 /* If the current QP results in an empty CBP, it's highly likely that lower QPs
2314 * (up to a point) will too. So, jump down to where the threshold will kick in
2315 * and check the QP there. If the CBP is still empty, skip the main loop.
2316 * If it isn't empty, we would have ended up having to check this QP anyways,
2317 * so as long as we store it for later lookup, we lose nothing. */
2318 int already_checked_qp = -1;
2319 int already_checked_cost = COST_MAX;
2320 if( direction == -1 )
/* Jump-ahead probe for the downward direction (see comment above). */
2324 h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
2325 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2326 already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
2327 if( !h->mb.cbp[h->mb.i_mb_xy] )
2329 /* If our empty-CBP block is lower QP than the last QP,
2330 * the last QP almost surely doesn't have a CBP either. */
2331 if( h->mb.i_last_qp > h->mb.i_qp )
/* Remember this probe so the main loop can reuse its cost. */
2335 already_checked_qp = h->mb.i_qp;
2336 h->mb.i_qp = orig_qp;
2340 h->mb.i_qp += direction;
/* Main QP walk, bounded by the configured QP range. */
2341 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2343 if( h->mb.i_last_qp == h->mb.i_qp )
2345 if( h->mb.i_qp == already_checked_qp )
2346 cost = already_checked_cost;
2349 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2350 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2351 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2354 /* We can't assume that the costs are monotonic over QPs.
2355 * Tie case-as-failure seems to give better results. */
2356 if( cost < prevcost )
2362 if( failures > threshold )
/* Raising QP past the point of an empty CBP can't help further. */
2364 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2366 h->mb.i_qp += direction;
2370 /* Always try the last block's QP. */
2371 if( !last_qp_tried )
2373 h->mb.i_qp = h->mb.i_last_qp;
2374 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2375 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2376 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
/* Commit the winning QP's chroma mapping. */
2380 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2382 /* Check transform again; decision from before may no longer be optimal. */
2383 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2384 x264_mb_transform_8x8_allowed( h ) )
2386 h->mb.b_transform_8x8 ^= 1;
2387 cost = x264_rd_cost_mb( h, a->i_lambda2 );
/* Revert the flip if it didn't beat bcost (condition elided here). */
2389 h->mb.b_transform_8x8 ^= 1;
2392 /*****************************************************************************
2393 * x264_macroblock_analyse:
2394 *****************************************************************************/
/* Top-level mode decision for one macroblock. Chooses QP, runs
 * slice-type-specific analysis (I/P/B), selects the best mode/partition,
 * optionally refines it with qpel-RD and QP-RD, and leaves the decision
 * in h->mb.i_type / h->mb.i_partition plus the MV/ref caches.
 * NOTE(review): braces, `else` lines, the `intra_analysis:` label and a
 * number of local declarations/statements are elided in this excerpt;
 * structure comments below are best-effort and should be confirmed
 * against the full source. */
2395 void x264_macroblock_analyse( x264_t *h )
2397 x264_mb_analysis_t analysis;
2398 int i_cost = COST_MAX;
/* --- QP selection --- */
2401 h->mb.i_qp = x264_ratecontrol_qp( h );
2402 if( h->param.rc.i_aq_mode )
2404 x264_adaptive_quant( h );
2405 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2406 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2407 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2408 h->mb.i_qp = h->mb.i_last_qp;
2411 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2413 /*--------------------------- Do the analysis ---------------------------*/
/* === I slice: intra only === */
2414 if( h->sh.i_type == SLICE_TYPE_I )
2417 if( analysis.i_mbrd )
2418 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2419 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2420 if( analysis.i_mbrd )
2421 x264_intra_rd( h, &analysis, COST_MAX );
/* Pick the cheapest intra mode; PCM as a last resort. */
2423 i_cost = analysis.i_satd_i16x16;
2424 h->mb.i_type = I_16x16;
2425 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2426 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2427 if( analysis.i_satd_pcm < i_cost )
2428 h->mb.i_type = I_PCM;
2430 else if( analysis.i_mbrd >= 2 )
2431 x264_intra_rd_refine( h, &analysis );
/* === P slice === */
2433 else if( h->sh.i_type == SLICE_TYPE_P )
2437 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2439 analysis.b_try_pskip = 0;
/* Periodic Intra Refresh: force this MB to be coded intra. */
2440 if( analysis.b_force_intra )
2442 if( !h->param.analyse.b_psy )
2444 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2445 goto intra_analysis;
2450 /* Fast P_SKIP detection */
2451 if( h->param.analyse.b_fast_pskip )
/* With sliced threads, a skip MV pointing below the finished region of
 * the reference can't be taken. */
2453 if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2454 // FIXME don't need to check this if the reference frame is done
2456 else if( h->param.analyse.i_subpel_refine >= 3 )
2457 analysis.b_try_pskip = 1;
/* Probe skip early only when a neighbor already skipped. */
2458 else if( h->mb.i_mb_type_left == P_SKIP ||
2459 h->mb.i_mb_type_top == P_SKIP ||
2460 h->mb.i_mb_type_topleft == P_SKIP ||
2461 h->mb.i_mb_type_topright == P_SKIP )
2462 b_skip = x264_macroblock_probe_pskip( h );
2466 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
/* Skip confirmed: done, no further analysis. */
2470 h->mb.i_type = P_SKIP;
2471 h->mb.i_partition = D_16x16;
2472 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
/* Not skipped: full inter analysis. */
2476 const unsigned int flags = h->param.analyse.inter;
2480 int i_satd_inter, i_satd_intra;
2482 x264_mb_analyse_load_costs( h, &analysis );
2484 x264_mb_analyse_inter_p16x16( h, &analysis );
/* 16x16 search may itself have detected a skip. */
2486 if( h->mb.i_type == P_SKIP )
2489 if( flags & X264_ANALYSE_PSUB16x16 )
2491 if( h->param.analyse.b_mixed_references )
2492 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2494 x264_mb_analyse_inter_p8x8( h, &analysis );
2497 /* Select best inter mode */
2499 i_partition = D_16x16;
2500 i_cost = analysis.l0.me16x16.cost;
2502 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2503 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2506 i_partition = D_8x8;
2507 i_cost = analysis.l0.i_cost8x8;
/* 4x4/8x4/4x8 sub-partitions per 8x8 block, if enabled. */
2510 if( flags & X264_ANALYSE_PSUB8x8 )
2512 for( i = 0; i < 4; i++ )
2514 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2515 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2517 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2518 h->mb.i_sub_partition[i] = D_L0_4x4;
2520 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2521 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2522 h->mb.i_sub_partition[i], D_L0_8x4 );
2524 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2525 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2526 h->mb.i_sub_partition[i], D_L0_4x8 );
2528 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2530 x264_mb_cache_mv_p8x8( h, &analysis, i );
2532 analysis.l0.i_cost8x8 = i_cost;
2536 /* Now do 16x8/8x16 */
/* Only worth trying when the 8x8 split beat 16x16 by less than the MV
 * cost of the extra partitions. */
2537 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2538 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2539 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2541 x264_mb_analyse_inter_p16x8( h, &analysis );
2542 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2544 x264_mb_analyse_inter_p8x16( h, &analysis );
2545 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2548 h->mb.i_partition = i_partition;
2551 //FIXME mb_type costs?
/* Qpel ME refinement of the winning partition (skipped under MBRD,
 * which refines later with RD instead). */
2552 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2556 else if( i_partition == D_16x16 )
2558 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2559 i_cost = analysis.l0.me16x16.cost;
2561 else if( i_partition == D_16x8 )
2563 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2564 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2565 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2567 else if( i_partition == D_8x16 )
2569 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2570 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2571 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2573 else if( i_partition == D_8x8 )
2577 for( i8x8 = 0; i8x8 < 4; i8x8++ )
/* Refine whichever sub-partition shape each 8x8 block chose. */
2579 switch( h->mb.i_sub_partition[i8x8] )
2582 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2583 i_cost += analysis.l0.me8x8[i8x8].cost;
2586 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2587 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2588 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2589 analysis.l0.me8x4[i8x8][1].cost;
2592 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2593 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2594 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2595 analysis.l0.me4x8[i8x8][1].cost;
2599 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2600 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2601 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2602 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2603 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2604 analysis.l0.me4x4[i8x8][1].cost +
2605 analysis.l0.me4x4[i8x8][2].cost +
2606 analysis.l0.me4x4[i8x8][3].cost;
2609 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
/* Intra candidates for the P MB; with chroma-ME the chroma SATD is
 * folded into each intra cost for a fair comparison. */
2615 if( h->mb.b_chroma_me )
2617 x264_mb_analyse_intra_chroma( h, &analysis );
2618 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2619 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2620 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2621 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2624 x264_mb_analyse_intra( h, &analysis, i_cost );
2626 i_satd_inter = i_cost;
2627 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2628 analysis.i_satd_i8x8,
2629 analysis.i_satd_i4x4 );
/* MBRD: replace SATD costs with RD costs before the final decision. */
2631 if( analysis.i_mbrd )
2633 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2635 i_partition = D_16x16;
2636 i_cost = analysis.l0.i_rd16x16;
2637 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2638 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2639 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2640 h->mb.i_type = i_type;
2641 h->mb.i_partition = i_partition;
2642 if( i_cost < COST_MAX )
2643 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2644 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
/* Final inter-vs-intra decision. */
2647 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2648 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2649 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2650 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2652 h->mb.i_type = i_type;
2654 if( analysis.b_force_intra && !IS_INTRA(i_type) )
2656 /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
2657 * it was an inter block. */
2658 x264_analyse_update_cache( h, &analysis );
2659 x264_macroblock_encode( h );
2660 h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
2661 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
2662 h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
2663 x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
2664 goto intra_analysis;
/* MBRD >= 2: qpel-RD refinement of the final P decision. */
2667 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2669 if( IS_INTRA( h->mb.i_type ) )
2671 x264_intra_rd_refine( h, &analysis );
2673 else if( i_partition == D_16x16 )
2675 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2676 analysis.l0.me16x16.cost = i_cost;
2677 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2679 else if( i_partition == D_16x8 )
2681 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2682 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2683 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2684 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2685 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2686 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2688 else if( i_partition == D_8x16 )
2690 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2691 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2692 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2693 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2694 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2695 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2697 else if( i_partition == D_8x8 )
2700 x264_analyse_update_cache( h, &analysis );
2701 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2703 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2705 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2707 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2709 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2710 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2712 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2714 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2715 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2717 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2719 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2720 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2721 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2722 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
/* === B slice === */
2729 else if( h->sh.i_type == SLICE_TYPE_B )
2731 int i_bskip_cost = COST_MAX;
2734 if( analysis.i_mbrd )
2735 x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
2737 h->mb.i_type = B_SKIP;
2738 if( h->mb.b_direct_auto_write )
2740 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2741 for( i = 0; i < 2; i++ )
2744 h->sh.b_direct_spatial_mv_pred ^= 1;
2745 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2746 if( analysis.b_direct_available )
2751 b_skip = x264_macroblock_probe_bskip( h );
2753 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2760 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
/* B_SKIP detection. */
2762 if( analysis.b_direct_available )
2764 if( !h->mb.b_direct_auto_write )
2766 if( analysis.i_mbrd )
2768 i_bskip_cost = ssd_mb( h );
2769 /* 6 = minimum cavlc cost of a non-skipped MB */
2770 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2772 else if( !h->mb.b_direct_auto_write )
2774 /* Conditioning the probe on neighboring block types
2775 * doesn't seem to help speed or quality. */
2776 b_skip = x264_macroblock_probe_bskip( h );
/* Not skipped: full B inter analysis. */
2782 const unsigned int flags = h->param.analyse.inter;
2786 h->mb.b_skip_mc = 0;
2788 x264_mb_analyse_load_costs( h, &analysis );
2790 /* select best inter mode */
2791 /* direct must be first */
2792 if( analysis.b_direct_available )
2793 x264_mb_analyse_inter_direct( h, &analysis );
2795 x264_mb_analyse_inter_b16x16( h, &analysis );
2798 i_partition = D_16x16;
2799 i_cost = analysis.l0.me16x16.cost;
2800 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2801 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2802 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
/* Early termination: if direct is competitive, check whether skip wins
 * outright under RD and stop analysing if it does. */
2804 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2806 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2807 if( i_bskip_cost < analysis.i_rd16x16direct &&
2808 i_bskip_cost < analysis.i_rd16x16bi &&
2809 i_bskip_cost < analysis.l0.i_rd16x16 &&
2810 i_bskip_cost < analysis.l1.i_rd16x16 )
2812 h->mb.i_type = B_SKIP;
2813 x264_analyse_update_cache( h, &analysis );
/* Sub-16x16 B partitions. */
2818 if( flags & X264_ANALYSE_BSUB16x16 )
2820 x264_mb_analyse_inter_b8x8( h, &analysis );
2821 if( analysis.i_cost8x8bi < i_cost )
2824 i_partition = D_8x8;
2825 i_cost = analysis.i_cost8x8bi;
/* Try 16x8 only if the 8x8 decision suggests horizontal coherence. */
2827 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2828 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2830 x264_mb_analyse_inter_b16x8( h, &analysis );
2831 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2832 i_type, analysis.i_mb_type16x8,
2833 i_partition, D_16x8 );
/* Try 8x16 only if the 8x8 decision suggests vertical coherence. */
2835 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2836 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2838 x264_mb_analyse_inter_b8x16( h, &analysis );
2839 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2840 i_type, analysis.i_mb_type8x16,
2841 i_partition, D_8x16 );
/* Qpel refinement of the winning B partition (skipped under MBRD). */
2846 if( analysis.i_mbrd || !h->mb.i_subpel_refine )
2851 else if( i_partition == D_16x16 )
/* Strip the mode-cost bias before refining, re-add after. */
2853 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2854 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2855 if( i_type == B_L0_L0 )
2857 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2858 i_cost = analysis.l0.me16x16.cost
2859 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2861 else if( i_type == B_L1_L1 )
2863 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2864 i_cost = analysis.l1.me16x16.cost
2865 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2867 else if( i_type == B_BI_BI )
2869 x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
2870 x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
2873 else if( i_partition == D_16x8 )
2875 for( i=0; i<2; i++ )
2877 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2878 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2879 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2880 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2883 else if( i_partition == D_8x16 )
2885 for( i=0; i<2; i++ )
2887 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2888 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2889 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2890 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2893 else if( i_partition == D_8x8 )
2895 for( i=0; i<4; i++ )
2898 int i_part_cost_old;
2900 int i_part_type = h->mb.i_sub_partition[i];
2901 int b_bidir = (i_part_type == D_BI_8x8);
2903 if( i_part_type == D_DIRECT_8x8 )
/* Refine each list the sub-partition actually uses, keeping the
 * running 8x8 cost total consistent. */
2905 if( x264_mb_partition_listX_table[0][i_part_type] )
2907 m = &analysis.l0.me8x8[i];
2908 i_part_cost_old = m->cost;
2909 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2910 m->cost -= i_type_cost;
2911 x264_me_refine_qpel( h, m );
2913 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2915 if( x264_mb_partition_listX_table[1][i_part_type] )
2917 m = &analysis.l1.me8x8[i];
2918 i_part_cost_old = m->cost;
2919 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2920 m->cost -= i_type_cost;
2921 x264_me_refine_qpel( h, m );
2923 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2925 /* TODO: update mvp? */
2929 i_satd_inter = i_cost;
/* MBRD: RD re-scoring of the B candidates, skip included. */
2931 if( analysis.i_mbrd )
2933 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2935 i_cost = i_bskip_cost;
2936 i_partition = D_16x16;
2937 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2938 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2939 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2940 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2941 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2942 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2943 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2945 h->mb.i_type = i_type;
2946 h->mb.i_partition = i_partition;
/* Intra candidates for the B MB, then final decision. */
2949 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2951 if( analysis.i_mbrd )
2953 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2954 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2957 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2958 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2959 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2960 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2962 h->mb.i_type = i_type;
2963 h->mb.i_partition = i_partition;
2965 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2966 x264_intra_rd_refine( h, &analysis );
2967 if( h->mb.i_subpel_refine >= 5 )
2968 x264_refine_bidir( h, &analysis );
/* MBRD >= 2: qpel/bidir RD refinement of the final B decision. */
2970 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2972 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2973 x264_analyse_update_cache( h, &analysis );
2975 if( i_partition == D_16x16 )
2977 if( i_type == B_L0_L0 )
2979 analysis.l0.me16x16.cost = i_cost;
2980 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2982 else if( i_type == B_L1_L1 )
2984 analysis.l1.me16x16.cost = i_cost;
2985 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2987 else if( i_type == B_BI_BI )
2988 x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
2990 else if( i_partition == D_16x8 )
2992 for( i = 0; i < 2; i++ )
2994 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2995 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2996 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2997 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2998 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2999 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
3000 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
3003 else if( i_partition == D_8x16 )
3005 for( i = 0; i < 2; i++ )
3007 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
3008 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
3009 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
3010 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
3011 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
3012 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
3013 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
3016 else if( i_partition == D_8x8 )
3018 for( i = 0; i < 4; i++ )
3020 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
3021 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
3022 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
3023 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
3024 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
3025 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
/* --- Common tail: commit the decision and set encoding options --- */
3032 x264_analyse_update_cache( h, &analysis );
3034 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
3035 * without realizing it. Check for this and account for it if necessary. */
3036 if( analysis.i_mbrd >= 2 )
3038 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
3039 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
3040 int list = check_mv_lists[h->mb.i_type] - 1;
/* All four corner MVs/refs equal -> collapse back to a 16x16 partition. */
3041 if( list >= 0 && h->mb.i_partition != D_16x16 &&
3042 M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
3043 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
3044 h->mb.i_partition = D_16x16;
3047 if( !analysis.i_mbrd )
3048 x264_mb_analyse_transform( h );
3050 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
3051 x264_mb_analyse_qp_rd( h, &analysis );
3053 h->mb.b_trellis = h->param.analyse.i_trellis;
3054 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
3055 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
3056 x264_psy_trellis_init( h, 0 );
3057 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
3058 h->mb.i_skip_intra = 0;
3061 /*-------------------- Update MB from the analysis ----------------------*/
/* Write the decisions made during analysis — intra prediction modes, or
 * per-partition reference indices and motion vectors — into the macroblock
 * cache, so the encode/CABAC stages can read a single canonical state.
 * Also performs a frame-parallel MV-range sanity check at the end. */
3062 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
3066 switch( h->mb.i_type )
/* Intra 4x4: sixteen per-block luma prediction modes, plus chroma mode. */
3069 for( i = 0; i < 16; i++ )
3070 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
3072 x264_mb_analyse_intra_chroma( h, a );
/* Intra 8x8: four luma modes, one per 8x8 block at (2*(i&1), 2*(i>>1)). */
3075 for( i = 0; i < 4; i++ )
3076 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
3078 x264_mb_analyse_intra_chroma( h, a );
/* Intra 16x16: a single luma mode for the whole macroblock. */
3081 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3082 x264_mb_analyse_intra_chroma( h, a );
/* P_L0: fill list-0 refs and MVs according to the partition shape. */
3089 switch( h->mb.i_partition )
3092 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3093 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3097 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3098 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3099 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3100 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3104 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3105 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3106 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3107 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3111 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
/* P_8x8: four refs here; the (possibly sub-partitioned) MVs go through the
 * x264_mb_cache_mv_p8x8 helper. */
3117 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3118 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3119 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3120 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3121 for( i = 0; i < 4; i++ )
3122 x264_mb_cache_mv_p8x8( h, a, i );
/* P_SKIP: forced 16x16 with ref 0 and the predicted skip MV. */
3127 h->mb.i_partition = D_16x16;
3128 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3129 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
/* B direct/skip: take the partition from the direct-prediction cache and
 * load the direct MVs for all four 8x8 blocks in both lists. */
3135 h->mb.i_partition = h->mb.cache.direct_partition;
3136 x264_mb_load_mv_direct8x8( h, 0 );
3137 x264_mb_load_mv_direct8x8( h, 1 );
3138 x264_mb_load_mv_direct8x8( h, 2 );
3139 x264_mb_load_mv_direct8x8( h, 3 );
3143 /* optimize: cache might not need to be rewritten */
3144 for( i = 0; i < 4; i++ )
3145 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3148 default: /* the rest of the B types */
3149 switch( h->mb.i_partition )
3152 switch( h->mb.i_type )
/* B 16x16, L0-only: real data for list 0; list 1 is marked unused with
 * ref -1 and zero mv/mvd (and symmetrically for the L1-only case). */
3155 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3156 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3158 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3159 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3160 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3163 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3164 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3165 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3167 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3168 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
/* Bipred 16x16: both lists active, using the bi16x16 MVs (which may
 * differ from the unidirectional me16x16 results). */
3171 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3172 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
3174 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3175 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
/* B 16x8 / 8x16: per-half caching delegated to the shared helpers. */
3180 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3181 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3184 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3185 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3188 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
/* Frame-parallel safety check: an inter MV must not reference rows of a
 * reference frame that another thread has not yet finished encoding. */
3194 if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
3197 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3200 int ref = h->mb.cache.ref[l][x264_scan8[0]];
/* ref >> b_interlaced: with interlacing, two field refs map onto one frame. */
3203 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
/* NOTE(review): only the scan8[15] (last 4x4) MV is tested — presumably
 * earlier clamping bounds the others; confirm. */
3204 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3206 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3207 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3208 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3209 h->mb.cache.mv[l][x264_scan8[15]][0],
3210 h->mb.cache.mv[l][x264_scan8[15]][1] );
3211 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3212 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3213 fprintf(stderr, "completed: %d \n", completed );
/* Recovery: redo intra analysis and force I_16x16 so the MB no longer
 * depends on the unavailable reference rows. */
3214 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3215 x264_mb_analyse_intra( h, a, COST_MAX );
3216 h->mb.i_type = I_16x16;
3217 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3218 x264_mb_analyse_intra_chroma( h, a );
3225 #include "slicetype.c"