/*****************************************************************************
 * analyse.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
#define _ISOC99_SOURCE

#include "common/common.h"
#include "common/cpu.h"
#include "macroblock.h"
#include "ratecontrol.h"

    x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */

    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );

    int i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    int i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    int i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

} x264_mb_analysis_list_t;
    /* conduct the analysis using this lambda and QP */
    uint16_t *p_cost_ref[2];

    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */

    int i_satd_i16x16_dir[7];
    int i_satd_i8x8_dir[12][4];
    int i_predict4x4[16];

    int i_satd_i8x8chroma;
    int i_satd_i8x8chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8direct[4];

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */

    int b_direct_available;

} x264_mb_analysis_t;

/* lambda = pow(2,qp/6-2) */
const uint8_t x264_lambda_tab[52] = {
     1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7  */
     1, 1, 1, 1,              /*  8-11 */
     1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
     3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
     6, 7, 8, 9,10,11,13,14,  /* 28-35 */
    16,18,20,23,25,29,32,36,  /* 36-43 */
    40,45,51,57,64,72,81,91   /* 44-51 */
};
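/* Worked example: qp=30 gives pow(2, 30/6.0 - 2) = 2^3 = 8, matching
 * x264_lambda_tab[30] above. */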
/* lambda2 = pow(lambda,2) * .9 * 256 */
const int x264_lambda2_tab[52] = {
    14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
    91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
    580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
    3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
    23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
    148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
    943718, 1189010, 1498059, 1887436 /* 48 - 51 */
};
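/* Worked example: qp=30 gives 8*8 * .9 * 256 = 14745.6, stored truncated as
 * x264_lambda2_tab[30] == 14745. */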
const uint8_t x264_exp2_lut[64] = {
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
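/* Fractional powers of two: entry i approximates (2^(i/64) - 1) * 256,
 * e.g. i=32 -> (sqrt(2) - 1) * 256 = 106.04 -> 106; presumably the lut
 * behind the fixed-point exp2 helper in common/. */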
const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};

/* Avoid an int/float conversion. */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
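/* x264_log2_lut[i] holds the mantissa term log2(1 + i/128) (e.g. entry 64 is
 * log2(1.5) = 0.58496); x264_log2_lz_lut[lz] supplies the exponent term
 * 31 - lz from a count-leading-zeros, so a fast float log2(x) can be formed
 * as x264_log2_lz_lut[clz(x)] + x264_log2_lut[mantissa bits]. */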
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    { 46, 58, 73, 92, 117, 147,
      185, 233, 294, 370, 466, 587,
      740, 932, 1174, 1480, 1864, 2349,
      2959, 3728, 4697, 5918, 7457, 9395,
      11837, 14914, 18790, 23674, 29828, 37581,
      47349, 59656, 75163, 94699, 119313, 150326,
      189399, 238627, 300652, 378798, 477255, 601304,
      757596, 954511, 1202608, 1515192, 1909022, 2405217,
      3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    { 27, 34, 43, 54, 68, 86,
      108, 136, 172, 216, 273, 343,
      433, 545, 687, 865, 1090, 1374,
      1731, 2180, 2747, 3461, 4361, 5494,
      6922, 8721, 10988, 13844, 17442, 21976,
      27688, 34885, 43953, 55377, 69771, 87906,
      110755, 139543, 175813, 221511, 279087, 351627,
      443023, 558174, 703255, 886046, 1116348, 1406511,
      1772093, 2232697, 2813022, 3544186 }
};
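/* Worked example (consistent with LAMBDA_BITS == 4): inter, qp=0 gives
 * .85*.85 * 2^(0/3 + 10 - 4) = .7225 * 64 = 46.2 -> 46, the first entry. */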
static const uint16_t x264_chroma_lambda2_offset_tab[] = {
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
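    /* Each entry is roughly 256 * 2^((i-12)/3); indexed below with
     * i_qp - i_chroma_qp + 12 (see x264_mb_analyse_init_qp), so equal luma
     * and chroma QPs land on the unity scale factor 256 at index 12. */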
/* TODO: calculate CABAC costs */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const uint8_t i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
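/* These are CAVLC ue(v) codeword lengths (2*floor(log2(v+1))+1 bits):
 * e.g. B_DIRECT codes as ue(0) = 1 bit and B_L0_L0 as ue(1) = 3 bits,
 * matching i_mb_b_cost_table above. */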
static const uint8_t i_sub_mb_p_cost_table[4] = {

static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

static uint16_t x264_cost_ref[92][3][33];
static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;

int x264_analyse_init_costs( x264_t *h, int qp )
    int lambda = x264_lambda_tab[qp];
    if( h->cost_mv[lambda] )
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
    h->cost_mv[lambda] += 2*4*2048;
    for( i = 0; i <= 2*4*2048; i++ )
        h->cost_mv[lambda][-i] =
        h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
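    /* The formula above is roughly lambda times the bit cost of the mvd
     * component: ~2*log2(i+1) Exp-Golomb magnitude bits, one sign bit for
     * nonzero diffs, 0.718 as an empirical correction, +.5 for rounding. */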
    x264_pthread_mutex_lock( &cost_ref_mutex );
    for( i = 0; i < 3; i++ )
        for( j = 0; j < 33; j++ )
            x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
    x264_pthread_mutex_unlock( &cost_ref_mutex );

    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
            CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
            h->cost_mv_fpel[lambda][j] += 2*2048;
            for( i = -2*2048; i < 2*2048; i++ )
                h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
void x264_analyse_free_costs( x264_t *h )
    for( i = 0; i < 92; i++ )
            x264_free( h->cost_mv[i] - 2*4*2048 );
        if( h->cost_mv_fpel[i][0] )
            for( j = 0; j < 4; j++ )
                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );

void x264_analyse_weight_frame( x264_t *h, int end )
    for( j=0; j<h->i_ref0; j++ )
        if( h->sh.weight[j][0].weightfn )
            x264_frame_t *frame = h->fref0[j];
            int width = frame->i_width[0] + 2*PADH;
            int i_padv = PADV << h->param.b_interlaced;
            uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;

            height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
            h->fenc->i_lines_weighted += height;

            for( k = j; k < h->i_ref0; k++ )
                if( h->sh.weight[k][0].weightfn )
                    uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
                    x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                             src + offset, frame->i_stride[0],
                                             width, height, &h->sh.weight[k][0] );

/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
    a->p_cost_mv = h->cost_mv[a->i_lambda];
    a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];

static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp )
    /* conduct the analysis using this lambda and QP */
    a->i_qp = h->mb.i_qp = i_qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];

    a->i_lambda = x264_lambda_tab[i_qp];
    a->i_lambda2 = x264_lambda2_tab[i_qp];

    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;

static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
    int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
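    /* i.e. subme 6-7 -> mbrd 1, subme 8-9 -> mbrd 2, subme 10 -> mbrd 3;
     * in B-slices the first two tiers each require one extra level of subme. */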
    x264_mb_analyse_init_qp( h, a, i_qp );

    h->mb.i_me_method = h->param.analyse.i_me_method;
    h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
    if( h->sh.i_type == SLICE_TYPE_B && (h->mb.i_subpel_refine == 6 || h->mb.i_subpel_refine == 8) )
        h->mb.i_subpel_refine--;
    h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
                        && h->mb.i_subpel_refine >= 5;
    h->mb.b_dct_decimate = h->sh.i_type == SLICE_TYPE_B ||
                           (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);

    h->mb.b_transform_8x8 = 0;
    h->mb.b_noise_reduction = 0;

    a->i_satd_i8x8chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
    a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;

        h->mb.b_lossless ? 0 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
            int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
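        /* e.g. in the leftmost MB column this permits mvs down to 4*(-24) =
         * -96 qpel units (24 pixels into the left padding); CLIP_FMV bounds
         * everything to +/-i_fmv_range, and the fpel limits shrink by
         * i_fpel_border to absorb search-pattern overshoot. */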
        if( h->mb.i_mb_x == 0 )
            int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
            int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
            int thread_mvy_range = i_fmv_range;

            if( h->i_thread_frames > 1 )
                int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                    x264_frame_t **fref = i ? h->fref1 : h->fref0;
                    int i_ref = i ? h->i_ref1 : h->i_ref0;
                    for( j=0; j<i_ref; j++ )
                        x264_frame_cond_wait( fref[j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );

                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( h->mb.b_interlaced )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );

            h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
            h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
            h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
            h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
            h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
            h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;

        a->l0.i_cost8x8 = COST_MAX;
        for( i = 0; i < 4; i++ )
            a->l0.i_cost4x8[i] = COST_MAX;
        a->l0.i_cost8x16 = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
            a->l1.i_cost8x8 = COST_MAX;
            for( i = 0; i < 4; i++ )
                a->i_cost8x8direct[i] = COST_MAX;
            a->i_cost16x16direct =
            a->i_cost8x16bi = COST_MAX;

        /* Fast intra decision */
        if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
            if( IS_INTRA( h->mb.i_mb_type_left )
                || IS_INTRA( h->mb.i_mb_type_top )
                || IS_INTRA( h->mb.i_mb_type_topleft )
                || IS_INTRA( h->mb.i_mb_type_topright )
                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
            { /* intra is likely */ }
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
            a->b_force_intra = 1;
            a->b_force_intra = 0;

/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};

static const int8_t i8x8chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};

static const int8_t i4x4_mode_available[5][10] =
{
    {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
    {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
};
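/* The accessors below index these tables directly by the neighbour bitmask,
 * relying on MB_LEFT == 0x01 and MB_TOP == 0x02 so (top|left) selects rows
 * 0-3, with any topleft bit promoting to the full row 4. */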
static inline const int8_t *predict_16x16_mode_available( int i_neighbour )
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx];

static inline const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx];

static inline const int8_t *predict_4x4_mode_available( int i_neighbour )
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx];

/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
    ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};

    if( do_both_dct || h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );

/* Reset fenc satd scores cache for psy RD */
static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );

static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;

    if( a->i_satd_i8x8chroma < COST_MAX )

    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );

    /* 8x8 prediction selection for chroma */
    if( predict_mode[3] >= 0 && b_merged_satd )
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );

        for( ; *predict_mode >= 0; predict_mode++ )
            int i_mode = *predict_mode;
            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );

            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );

        for( ; *predict_mode >= 0; predict_mode++ )
            int i_mode = *predict_mode;

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_8x8_chroma( h, i_mode );
                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
                     h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );

            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;

static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    uint8_t *p_src = h->mb.pic.p_fenc[0];
    uint8_t *p_dst = h->mb.pic.p_fdec[0];

    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
    /*---------------- Try all modes and calculate their score ---------------*/
    /* 16x16 prediction selection */
    const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );

    if( b_merged_satd && predict_mode[3] >= 0 )
        h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
        h->predict_16x16[I_PRED_16x16_P]( p_dst );
        a->i_satd_i16x16_dir[I_PRED_16x16_P] =
            h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
            int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
            COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );

        for( ; *predict_mode >= 0; predict_mode++ )
            int i_mode = *predict_mode;

            if( h->mb.b_lossless )
                x264_predict_lossless_16x16( h, i_mode );
                h->predict_16x16[i_mode]( p_dst );

            i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
            COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
            a->i_satd_i16x16_dir[i_mode] = i_satd;

    if( h->sh.i_type == SLICE_TYPE_B )
        /* cavlc mb type prefix */
        a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
    if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )

    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );

        h->mb.i_cbp_luma = 0;
        b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;

        // FIXME some bias like in i4x4?
        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];

        for( idx = 0;; idx++ )
            uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );

            predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            if( b_merged_satd && predict_mode[8] >= 0 )
                h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                satd[i_pred_mode] -= 3 * a->i_lambda;
                for( i=2; i>=0; i-- )
                    int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
                    COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );

            for( ; *predict_mode >= 0; predict_mode++ )
                int i_mode = *predict_mode;

                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                    h->predict_8x8[i_mode]( p_dst_by, edge );

                i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
                if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                    i_satd -= a->i_lambda * 3;

                COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                a->i_satd_i8x8_dir[i_mode][idx] = i_satd;

            if( idx == 3 || i_cost > i_satd_thresh )

            /* we need to encode this block now (for next ones) */
            h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
            x264_mb_encode_i8x8( h, idx, a->i_qp );

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );

        a->i_satd_i8x8 = i_cost;
        if( h->mb.i_skip_intra )
            h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
            h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
            h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
            h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
            h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
            h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );

            static const uint16_t cost_div_fix8[3] = {1024,512,341};
            a->i_satd_i8x8 = COST_MAX;
            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
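            /* cost_div_fix8[idx] ~= 256*4/(idx+1): having scored only idx+1
             * of the four 8x8 blocks before bailing out, extrapolate i_cost
             * to a whole-MB estimate. */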
        if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
        int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
        h->mb.i_cbp_luma = 0;
        b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
            i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;

        i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];

        for( idx = 0;; idx++ )
            uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );

            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
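                /* multiplying the byte by 0x01010101 broadcasts the last
                 * available top-row pixel into all four missing samples with
                 * a single 32-bit store */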
            if( b_merged_satd && predict_mode[5] >= 0 )
                h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                satd[i_pred_mode] -= 3 * a->i_lambda;
                for( i=2; i>=0; i-- )
                    COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );

            for( ; *predict_mode >= 0; predict_mode++ )
                int i_mode = *predict_mode;

                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                    h->predict_4x4[i_mode]( p_dst_by );

                i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                    i_satd -= a->i_lambda * 3;

                COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );

            i_cost += i_best + 4 * a->i_lambda;
            if( i_cost > i_satd_thresh || idx == 15 )

            /* we need to encode this block now (for next ones) */
            h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
            x264_mb_encode_i4x4( h, idx, a->i_qp );

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];

        a->i_satd_i4x4 = i_cost;
        if( h->mb.i_skip_intra )
            h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
            h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
            h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
            h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
            h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
            h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
            if( h->mb.i_skip_intra == 2 )
                h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
        a->i_satd_i4x4 = COST_MAX;

static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
    if( a->i_satd_i16x16 <= i_satd_thresh )
        h->mb.i_type = I_16x16;
        x264_analyse_update_cache( h, a );
        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_satd_i16x16 = COST_MAX;

    if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
        h->mb.i_type = I_4x4;
        x264_analyse_update_cache( h, a );
        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_satd_i4x4 = COST_MAX;

    if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
        h->mb.i_type = I_8x8;
        x264_analyse_update_cache( h, a );
        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
        a->i_satd_i8x8 = COST_MAX;

static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
    uint8_t *p_dst = h->mb.pic.p_fdec[0];

    int i_mode, i_thresh;
    uint64_t i_satd, i_best;
    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
        i_best = a->i_satd_i16x16;
        for( ; *predict_mode >= 0; predict_mode++ )
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );

    /* RD selection for chroma prediction */
    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
    if( predict_mode[1] >= 0 )
        int8_t predict_mode_sorted[4];

        i_thresh = a->i_satd_i8x8chroma * 5/4;

        for( i_max = 0; *predict_mode >= 0; predict_mode++ )
            i_mode = *predict_mode;
            if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                predict_mode_sorted[i_max++] = i_mode;

            int i_cbp_chroma_best = h->mb.i_cbp_chroma;
            int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
            /* the previous thing encoded was x264_intra_rd(), so the pixels and
             * coefs for the current chroma mode are still around, so we only
             * have to recount the bits. */
            i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
            for( i = 0; i < i_max; i++ )
                i_mode = predict_mode_sorted[i];
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8_chroma( h, i_mode );
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
                /* if we've already found a mode that needs no residual, then
                 * probably any mode with a residual will be worse.
                 * so avoid dct on the remaining modes to improve speed. */
                i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
            h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
            h->mb.i_cbp_chroma = i_cbp_chroma_best;

    if( h->mb.i_type == I_4x4 )
        uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
        for( idx = 0; idx < 16; idx++ )
            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];

            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;

            for( ; *predict_mode >= 0; predict_mode++ )
                i_mode = *predict_mode;
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                    h->predict_4x4[i_mode]( p_dst_by );
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                    a->i_predict4x4[idx] = i_mode;
                    pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
                    pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
                    pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
                    pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
                    i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];

            M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
            M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
            M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
            M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
            h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
    else if( h->mb.i_type == I_8x8 )
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        for( idx = 0; idx < 4; idx++ )
            uint64_t pels_h = 0;
            uint16_t i_nnz[2] = {0}; //shut up gcc
            int cbp_luma_new = 0;
            i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;

            i_best = COST_MAX64;

            p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            const int8_t *predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
                i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )

                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                    h->predict_8x8[i_mode]( p_dst_by, edge );
                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;

                    pels_h = M64( p_dst_by+7*FDEC_STRIDE );
                        for( j=0; j<7; j++ )
                            pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );

            a->i_cbp_i8x8_luma = cbp_luma_new;
            M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
                for( j=0; j<7; j++ )
                    p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );

#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];

#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = weight_none; \

#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[i_ref];

#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])

static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
    ALIGNED_4( int16_t mvc[8][2] );
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;

    /* 16x16 Search on all ref frame */
    m.i_pixel = PIXEL_16x16;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
        m.i_ref_cost = REF_COST( 0, i_ref );
        i_halfpel_thresh -= m.i_ref_cost;

        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );

        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );

        if( h->mb.ref_blind_dupe == i_ref )
            CP32( m.mv, a->l0.mvc[0][0] );
            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );

        /* early termination
         * SSD threshold would probably be better than SATD */
            && m.cost-m.cost_mv < 300*a->i_lambda
            && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
             + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
            && x264_macroblock_probe_pskip( h ) )
            h->mb.i_type = P_SKIP;
            x264_analyse_update_cache( h, a );
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

        m.cost += m.i_ref_cost;
        i_halfpel_thresh += m.i_ref_cost;

        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );

        /* save mv for predicting neighbors */
        CP32( a->l0.mvc[i_ref][0], m.mv );
        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

    h->mb.i_type = P_L0;
        x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
            if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
                h->mb.i_type = P_SKIP;

static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

#define CHECK_NEIGHBOUR(i)\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
        CHECK_NEIGHBOUR( -8 - 1 );
        CHECK_NEIGHBOUR( -8 + 0 );
        CHECK_NEIGHBOUR( -8 + 2 );
        CHECK_NEIGHBOUR( -8 + 4 );
        CHECK_NEIGHBOUR(  0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
#undef CHECK_NEIGHBOUR

    for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );

    for( i = 0; i < 4; i++ )
        x264_me_t *l0m = &a->l0.me8x8[i];

        m.i_pixel = PIXEL_8x8;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
            m.i_ref_cost = REF_COST( 0, i_ref );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            if( h->mb.ref_blind_dupe == i_ref )
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, NULL );
                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );

            m.cost += m.i_ref_cost;

            CP32( a->l0.mvc[i_ref][i+1], m.mv );

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;

static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags.  Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    CP32( mvc[0], a->l0.me16x16.mv );

    for( i = 0; i < 4; i++ )
        x264_me_t *m = &a->l0.me8x8[i];

        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        CP32( mvc[i_mvc], m->mv );

        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;

static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( i = 0; i < 2; i++ )
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );

            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );

        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;

static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;

    for( i = 0; i < 2; i++ )
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_8x16;

        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );

            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );

        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );

    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;

static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
    ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
    uint8_t *pix2 = pix1+8;
    const int i_stride = h->mb.pic.i_stride[1];
    const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
    const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    x264_weight_t *weight = h->sh.weight[i_ref];

#define CHROMA4x4MC( width, height, me, x, y ) \
    h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
    if( weight[1].weightfn ) \
        weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
    h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
    if( weight[2].weightfn ) \
        weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
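/* Both chroma planes are fetched at half resolution with the shared luma mv;
 * mvy_offset above presumably compensates field parity when an interlaced
 * macroblock references the opposite field of a reference frame. */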
1470 if( pixel == PIXEL_4x4 )
1472 x264_me_t *m = a->l0.me4x4[i8x8];
1473 CHROMA4x4MC( 2,2, m[0], 0,0 );
1474 CHROMA4x4MC( 2,2, m[1], 2,0 );
1475 CHROMA4x4MC( 2,2, m[2], 0,2 );
1476 CHROMA4x4MC( 2,2, m[3], 2,2 );
1478 else if( pixel == PIXEL_8x4 )
1480 x264_me_t *m = a->l0.me8x4[i8x8];
1481 CHROMA4x4MC( 4,2, m[0], 0,0 );
1482 CHROMA4x4MC( 4,2, m[1], 0,2 );
1486 x264_me_t *m = a->l0.me4x8[i8x8];
1487 CHROMA4x4MC( 2,4, m[0], 0,0 );
1488 CHROMA4x4MC( 2,4, m[1], 2,0 );
1491 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1492 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
1495 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1497 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1498 uint8_t **p_fenc = h->mb.pic.p_fenc;
1499 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1502 /* XXX Needed for x264_mb_predict_mv */
1503 h->mb.i_partition = D_8x8;
1505 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1507 const int idx = 4*i8x8 + i4x4;
1508 const int x4 = block_idx_x[idx];
1509 const int y4 = block_idx_y[idx];
1510 const int i_mvc = (i4x4 == 0);
1512 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1514 m->i_pixel = PIXEL_4x4;
1516 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1517 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1518 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1520 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1521 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1523 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1525 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1526 a->l0.me4x4[i8x8][1].cost +
1527 a->l0.me4x4[i8x8][2].cost +
1528 a->l0.me4x4[i8x8][3].cost +
1529 REF_COST( 0, i_ref ) +
1530 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1531 if( h->mb.b_chroma_me )
1532 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
1535 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1537 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1538 uint8_t **p_fenc = h->mb.pic.p_fenc;
1539 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1542 /* XXX Needed for x264_mb_predict_mv */
1543 h->mb.i_partition = D_8x8;
1545 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1547 const int idx = 4*i8x8 + 2*i8x4;
1548 const int x4 = block_idx_x[idx];
1549 const int y4 = block_idx_y[idx];
1550 const int i_mvc = (i8x4 == 0);
1552 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1554 m->i_pixel = PIXEL_8x4;
1556 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1557 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1558 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1560 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1561 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1563 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1565 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1566 REF_COST( 0, i_ref ) +
1567 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1568 if( h->mb.b_chroma_me )
1569 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
1572 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1574 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1575 uint8_t **p_fenc = h->mb.pic.p_fenc;
1576 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1579 /* XXX Needed for x264_mb_predict_mv */
1580 h->mb.i_partition = D_8x8;
1582 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1584 const int idx = 4*i8x8 + i4x8;
1585 const int x4 = block_idx_x[idx];
1586 const int y4 = block_idx_y[idx];
1587 const int i_mvc = (i4x8 == 0);
1589 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1591 m->i_pixel = PIXEL_4x8;
1593 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1594 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1595 LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );
1597 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1598 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1600 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1602 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1603 REF_COST( 0, i_ref ) +
1604 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1605 if( h->mb.b_chroma_me )
1606 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
1609 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1611 /* Assumes that fdec still contains the results of
1612 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1614 uint8_t **p_fenc = h->mb.pic.p_fenc;
1615 uint8_t **p_fdec = h->mb.pic.p_fdec;
1618 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1619 for( i = 0; i < 4; i++ )
1621 const int x = (i&1)*8;
1622 const int y = (i>>1)*8;
1623 a->i_cost16x16direct +=
1624 a->i_cost8x8direct[i] =
1625 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1628 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
1632 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1634 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1635 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1636 uint8_t *src0, *src1;
1637 int stride0 = 16, stride1 = 16;
1638 int i_ref, i_mvc, l;
1639 ALIGNED_4( int16_t mvc[9][2] );
1642 m.i_pixel = PIXEL_16x16;
1644 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1646 /* 16x16 Search on list 0 and list 1 */
1647 for( l = 0; l < 2; l++ )
1649 int i_halfpel_thresh = INT_MAX;
1650 int *p_halfpel_thresh = h->mb.pic.i_fref[l]>1 ? &i_halfpel_thresh : NULL;
1651 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1653 lX->me16x16.cost = INT_MAX;
1654 for( i_ref = 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
1656 m.i_ref_cost = REF_COST( l, i_ref );
1658 /* search with ref */
1659 LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
1660 x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
1661 x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
1662 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1665 m.cost += m.i_ref_cost;
1667 if( m.cost < lX->me16x16.cost )
1668 h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );
1670 /* save mv for predicting neighbors */
1671 CP32( lX->mvc[i_ref][0], m.mv );
1672 CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );
1676 /* get cost of BI mode */
1677 h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
1678 h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
1679 int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
1680 src0 = h->mc.get_ref( pix0, &stride0,
1681 h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
1682 a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
1683 src1 = h->mc.get_ref( pix1, &stride1,
1684 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
1685 a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
1687 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1689 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1691 + a->l0.bi16x16.cost_mv
1692 + a->l1.bi16x16.cost_mv;
1695 /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
1696 if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
1698 int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
1699 + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
1700 int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
1701 + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
1702 h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1703 h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
1704 h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
1705 int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1706 + ref_costs + l0_mv_cost + l1_mv_cost;
1707 if( cost00 < a->i_cost16x16bi )
1709 M32( a->l0.bi16x16.mv ) = 0;
1710 M32( a->l1.bi16x16.mv ) = 0;
1711 a->l0.bi16x16.cost_mv = l0_mv_cost;
1712 a->l1.bi16x16.cost_mv = l1_mv_cost;
1713 a->i_cost16x16bi = cost00;
1718 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1719 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1720 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
1723 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1725 const int x = 2*(i%2);
1726 const int y = 2*(i/2);
1728 switch( h->mb.i_sub_partition[i] )
1731 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1734 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1735 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1738 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1739 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1742 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1743 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1744 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1745 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1748 x264_log( h, X264_LOG_ERROR, "internal error\n" );
1753 static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
1755 const int x = 2*(idx&1);
1756 const int y = 2*(idx>>1);
1757 x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
1758 x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
1759 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
1760 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    if( x264_mb_partition_listX_table[0][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
    } \
    if( x264_mb_partition_listX_table[1][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
    }
static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    int x = 2*(i%2);
    int y = 2*(i/2);

    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
    {
        x264_mb_load_mv_direct8x8( h, i );
        if( b_mvd )
        {
            x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
            x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
            x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
        }
    }
    else
    {
        CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
    }
}

static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
}

static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
}
static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
    int i_ref, i, l;
    int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    #define CHECK_NEIGHBOUR(i)\
    {\
        int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
        if( ref > i_maxref[l] )\
            i_maxref[l] = ref;\
    }

    for( l = 0; l < 2; l++ )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
        if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
            h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
        {
            i_maxref[l] = 0;
            CHECK_NEIGHBOUR(  -8 - 1 );
            CHECK_NEIGHBOUR(  -8 + 0 );
            CHECK_NEIGHBOUR(  -8 + 2 );
            CHECK_NEIGHBOUR(  -8 + 4 );
            CHECK_NEIGHBOUR(   0 - 1 );
            CHECK_NEIGHBOUR( 2*8 - 1 );
        }
    }

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( i = 0; i < 4; i++ )
    {
        const int x8 = i%2;
        const int y8 = i/2;
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

            lX->me8x8[i].cost = INT_MAX;
            for( i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
            {
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );

                x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x8[i].cost )
                    h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );

                /* save mv for predicting other partitions within this MB */
                CP32( lX->mvc[i_ref][i+1], m.mv );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
                                a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
                                a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
                              h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                       + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv + a->l0.me8x8[i].i_ref_cost
                       + a->l1.me8x8[i].i_ref_cost + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];

        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}
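
/* Illustrative sketch, not part of the original x264 source: COPY2_IF_LT(x,y,a,b)
 * (from x264's common headers) expands to "if((y)<(x)){(x)=(y);(a)=(b);}", so the
 * chain above is a running argmin over the candidate sub-partition modes.
 * Unrolled, with placeholder mode codes, the selection is equivalent to: */
#if 0
static int pick_sub_partition( int cost_l0, int cost_l1, int cost_bi, int cost_direct,
                               int *out_cost )
{
    int best_mode = 0, best_cost = cost_l0;                                   /* L0 baseline */
    if( cost_l1     < best_cost ) { best_cost = cost_l1;     best_mode = 1; } /* L1 */
    if( cost_bi     < best_cost ) { best_cost = cost_bi;     best_mode = 2; } /* BI */
    if( cost_direct < best_cost ) { best_cost = cost_direct; best_mode = 3; } /* DIRECT */
    *out_cost = best_cost;
    return best_mode;   /* later candidates win only on strict improvement */
}
#endif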
static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
          h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
    int i, l;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( i = 0; i < 4; i++ )
    {
        const int x8 = i%2;
        const int y8 = i/2;
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x8[i];
            m->i_pixel = PIXEL_8x8;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

            m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
            m->i_ref = lX->me16x16.i_ref;

            LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, &lX->me16x16.mv, 1 );
            m->cost += m->i_ref_cost;

            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );

            /* save mv for predicting other partitions within this MB */
            CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8, weight_none );
            i_part_cost_bi += m->cost_mv + m->i_ref_cost;
        }
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[3][2] );
    int i, j, l;

    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;

    for( i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {16,16};
        uint8_t *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_16x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me16x8[i].cost = INT_MAX;
            for( j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
                CP32( mvc[2], lX->mvc[i_ref][2*i+2] );

                x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me16x8[i].cost )
                    h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
                                a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
                                a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
                               h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
                       + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
                       + a->l1.me16x8[i].i_ref_cost;

        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
        if( a->l1.me16x8[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        }
        a->i_cost16x8bi += i_part_cost;

        x264_mb_cache_mv_b16x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
}
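
/* Illustrative note, not part of the original x264 source: the closed form above
 * relies on D_L0_8x8, D_L1_8x8 and D_BI_8x8 being spaced 4 apart in mb_partition_e
 * (so ">>2" maps them to 0, 1, 2) and on the nine two-list macroblock types
 * following B_L0_L0 contiguously in base-3 order, first half most significant: */
#if 0
/* part0/part1 in {0,1,2} = {L0,L1,BI}; e.g. (2,1) -> B_L0_L0 + 7 == B_BI_L1 */
static int b16x8_type_offset( int part0, int part1 )
{
    return part0 * 3 + part1;
}
#endif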
static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
    ALIGNED_4( int16_t mvc[3][2] );
    int i, j, l;

    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;

    for( i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x16;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
            lX->me8x16[i].cost = INT_MAX;
            for( j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][i+1] );
                CP32( mvc[2], lX->mvc[i_ref][i+3] );

                x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x16[i].cost )
                    h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
                                a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
                                a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none );
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                       + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
                       + a->l1.me8x16[i].i_ref_cost;

        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;
        if( a->l1.me8x16[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        }
        a->i_cost8x16bi += i_part_cost;

        x264_mb_cache_mv_b8x16( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
}
static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
{
    int thresh = i_satd * 5/4;

    h->mb.i_type = P_L0;
    if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
    {
        h->mb.i_partition = D_16x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->l0.i_cost16x8 <= thresh )
    {
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost16x8 = COST_MAX;

    if( a->l0.i_cost8x16 <= thresh )
    {
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x16 = COST_MAX;

    if( a->l0.i_cost8x8 <= thresh )
    {
        h->mb.i_type = P_8x8;
        h->mb.i_partition = D_8x8;
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
        {
            int i;
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
             * for future blocks are those left over from previous RDO calls. */
            for( i = 0; i < 4; i++ )
            {
                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
                int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
                int subtype, btype = D_L0_8x8;
                uint64_t bcost = COST_MAX64;
                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
                {
                    uint64_t cost;
                    if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
                        continue;
                    h->mb.i_sub_partition[i] = subtype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                    COPY2_IF_LT( bcost, cost, btype, subtype );
                }
                if( h->mb.i_sub_partition[i] != btype )
                {
                    h->mb.i_sub_partition[i] = btype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                }
            }
        }
        else
            x264_analyse_update_cache( h, a );
        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x8 = COST_MAX;
}
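
/* Illustrative sketch, not part of the original x264 source: the function above
 * only pays for a full RD measurement (x264_rd_cost_mb) when a partition's SATD
 * score is within 5/4 of the best SATD; everything else is pinned to COST_MAX
 * so it can never win the later comparisons.  The gating pattern in isolation,
 * with a hypothetical rd_cost() standing in for the expensive measurement: */
#if 0
#define SKETCH_COST_MAX (1<<28)

static int rd_if_promising( int satd_cost, int best_satd, int (*rd_cost)( void ) )
{
    if( satd_cost <= best_satd * 5 / 4 )
        return rd_cost();       /* close enough to be worth measuring exactly */
    return SKETCH_COST_MAX;     /* pruned: cannot plausibly win */
}
#endif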
static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;

    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
    {
        h->mb.i_type = B_DIRECT;
        /* Assumes direct/skip MC is still in fdec */
        /* Requires b-rdo to be done before intra analysis */
        h->mb.b_skip_mc = 1;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
        h->mb.b_skip_mc = 0;
    }

    //FIXME not all the update_cache calls are needed
    h->mb.i_partition = D_16x16;
    /* L0 */
    if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L0_L0;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* L1 */
    if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L1_L1;
        x264_analyse_update_cache( h, a );
        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* BI */
    if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
    {
        h->mb.i_type = B_BI_BI;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x8 */
    if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
    {
        h->mb.i_type = B_8x8;
        h->mb.i_partition = D_8x8;
        x264_analyse_update_cache( h, a );
        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
    }

    /* 16x8 */
    if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type16x8;
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x16 */
    if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type8x16;
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }
}
static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
{
    int i_biweight;
    int i;

    if( IS_INTRA(h->mb.i_type) )
        return;

    switch( h->mb.i_partition )
    {
        case D_16x16:
            if( h->mb.i_type == B_BI_BI )
            {
                i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
                x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
            }
            break;
        case D_16x8:
            for( i=0; i<2; i++ )
                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
                }
            break;
        case D_8x16:
            for( i=0; i<2; i++ )
                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
                }
            break;
        case D_8x8:
            for( i=0; i<4; i++ )
                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
                }
            break;
    }
}
static inline void x264_mb_analyse_transform( x264_t *h )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
    {
        int i_cost4, i_cost8;
        /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
        x264_mb_mc( h );

        i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );
        i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );

        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
        h->mb.b_skip_mc = 1;
    }
}
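
/* Illustrative note, not part of the original x264 source: SA8D is a SATD-like
 * metric built on an 8x8 Hadamard transform, so it tracks residual coding cost
 * under the 8x8 transform roughly the way SATD (4x4 Hadamard) tracks it under
 * the 4x4 transform.  Comparing the two scores therefore picks the transform
 * size without actually coding the block either way: */
#if 0
/* sa8d_score/satd_score stand in for the h->pixf.sa8d/satd results above. */
static int use_8x8_transform( int sa8d_score, int satd_score )
{
    return sa8d_score < satd_score;   /* 1 = 8x8 transform, 0 = 4x4 */
}
#endif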
static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
    {
        int i_rd8;
        x264_analyse_update_cache( h, a );
        h->mb.b_transform_8x8 ^= 1;
        /* FIXME only luma is needed, but the score for comparison already includes chroma */
        i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );

        if( *i_rd >= i_rd8 )
        {
            if( *i_rd > 0 )
                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
            *i_rd = i_rd8;
        }
        else
            h->mb.b_transform_8x8 ^= 1;
    }
}
/* Rate-distortion optimal QP selection.
 * FIXME: More than half of the benefit of this function seems to be
 * in the way it improves the coding of chroma DC (by decimating or
 * finding a better way to code a single DC coefficient.)
 * There must be a more efficient way to get that portion of the benefit
 * without doing full QP-RD, but RD-decimation doesn't seem to do the
 * trick. */
static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
{
    int bcost, cost, direction, failures, prevcost, origcost;
    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
    int last_qp_tried = 0;
    origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
    int origcbp = h->mb.cbp[h->mb.i_mb_xy];

    /* If CBP is already zero, don't raise the quantizer any higher. */
    for( direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
    {
        /* Without psy-RD, require monotonicity when moving quant away from the previous
         * macroblock's quant; allow 1 failure when moving quant towards the previous quant.
         * With psy-RD, allow 1 failure when moving quant away from the previous quant,
         * allow 2 failures when moving quant towards the previous quant.
         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
        int threshold = (!!h->mb.i_psy_rd);
        /* Raise the threshold for failures if we're moving towards the last QP. */
        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
            ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
            threshold++;
        h->mb.i_qp = orig_qp;
        failures = 0;
        prevcost = origcost;

        /* If the current QP results in an empty CBP, it's highly likely that lower QPs
         * (up to a point) will too.  So, jump down to where the threshold will kick in
         * and check the QP there.  If the CBP is still empty, skip the main loop.
         * If it isn't empty, we would have ended up having to check this QP anyway,
         * so as long as we store it for later lookup, we lose nothing. */
        int already_checked_qp = -1;
        int already_checked_cost = COST_MAX;
        if( direction == -1 )
        {
            if( !origcbp )
            {
                h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min );
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 );
                if( !h->mb.cbp[h->mb.i_mb_xy] )
                {
                    /* If our empty-CBP block is lower QP than the last QP,
                     * the last QP almost surely doesn't have a CBP either. */
                    if( h->mb.i_last_qp > h->mb.i_qp )
                        last_qp_tried = 1;
                    break;
                }
                already_checked_qp = h->mb.i_qp;
                h->mb.i_qp = orig_qp;
            }
        }

        h->mb.i_qp += direction;
        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
        {
            if( h->mb.i_last_qp == h->mb.i_qp )
                last_qp_tried = 1;
            if( h->mb.i_qp == already_checked_qp )
                cost = already_checked_cost;
            else
            {
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                cost = x264_rd_cost_mb( h, a->i_lambda2 );
                COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
            }

            /* We can't assume that the costs are monotonic over QPs.
             * Treating a tie as a failure seems to give better results. */
            if( cost < prevcost )
                failures = 0;
            else
                failures++;
            prevcost = cost;

            if( failures > threshold )
                break;
            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
                break;
            h->mb.i_qp += direction;
        }
    }

    /* Always try the last block's QP. */
    if( !last_qp_tried )
    {
        h->mb.i_qp = h->mb.i_last_qp;
        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
    }

    h->mb.i_qp = bqp;
    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];

    /* Check the transform again; the decision from before may no longer be optimal. */
    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
        x264_mb_transform_8x8_allowed( h ) )
    {
        h->mb.b_transform_8x8 ^= 1;
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        if( cost > bcost )
            h->mb.b_transform_8x8 ^= 1;
    }
}
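
/* Illustrative sketch, not part of the original x264 source: the loop above is
 * a bidirectional hill climb over QP: walk away from the starting QP one step
 * at a time, re-running full RD at each step, and stop after "threshold"+1
 * consecutive non-improvements in each direction.  Stripped of the CBP
 * shortcuts and last-QP handling, with a hypothetical rd_at_qp() standing in
 * for setting h->mb.i_qp and calling x264_rd_cost_mb(), the core search is: */
#if 0
static int qp_hill_climb( int orig_qp, int qp_min, int qp_max,
                          int threshold, int (*rd_at_qp)( int qp ) )
{
    int bqp = orig_qp, bcost = rd_at_qp( orig_qp );
    for( int direction = 1; direction >= -1; direction -= 2 )
    {
        int prevcost = bcost, failures = 0;
        for( int qp = orig_qp + direction; qp >= qp_min && qp <= qp_max; qp += direction )
        {
            int cost = rd_at_qp( qp );
            if( cost < bcost )
            {
                bcost = cost;
                bqp = qp;
            }
            /* ties count as failures: costs aren't monotonic in QP */
            failures = cost < prevcost ? 0 : failures + 1;
            prevcost = cost;
            if( failures > threshold )
                break;
        }
    }
    return bqp;
}
#endif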
/*****************************************************************************
 * x264_macroblock_analyse:
 *****************************************************************************/
void x264_macroblock_analyse( x264_t *h )
{
    x264_mb_analysis_t analysis;
    int i_cost = COST_MAX;
    int i;

    h->mb.i_qp = x264_ratecontrol_qp( h );
    if( h->param.rc.i_aq_mode )
    {
        x264_adaptive_quant( h );
        /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
         * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
        if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
            h->mb.i_qp = h->mb.i_last_qp;
    }

    x264_mb_analyse_init( h, &analysis, h->mb.i_qp );

    /*--------------------------- Do the analysis ---------------------------*/
    if( h->sh.i_type == SLICE_TYPE_I )
    {
intra_analysis:
        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
        x264_mb_analyse_intra( h, &analysis, COST_MAX );
        if( analysis.i_mbrd )
            x264_intra_rd( h, &analysis, COST_MAX );

        i_cost = analysis.i_satd_i16x16;
        h->mb.i_type = I_16x16;
        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
        if( analysis.i_satd_pcm < i_cost )
            h->mb.i_type = I_PCM;

        else if( analysis.i_mbrd >= 2 )
            x264_intra_rd_refine( h, &analysis );
    }
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        int b_skip = 0;

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );

        analysis.b_try_pskip = 0;
        if( analysis.b_force_intra )
        {
            if( !h->param.analyse.b_psy )
            {
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }
        }
        else
        {
            /* Fast P_SKIP detection */
            if( h->param.analyse.b_fast_pskip )
            {
                if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
                    // FIXME don't need to check this if the reference frame is done
                    {}
                else if( h->param.analyse.i_subpel_refine >= 3 )
                    analysis.b_try_pskip = 1;
                else if( h->mb.i_mb_type_left == P_SKIP ||
                         h->mb.i_mb_type_top == P_SKIP ||
                         h->mb.i_mb_type_topleft == P_SKIP ||
                         h->mb.i_mb_type_topright == P_SKIP )
                    b_skip = x264_macroblock_probe_pskip( h );
            }
        }

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );

        if( b_skip )
        {
            h->mb.i_type = P_SKIP;
            h->mb.i_partition = D_16x16;
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
        }
        else
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_thresh16x8;
            int i_satd_inter, i_satd_intra;

            x264_mb_analyse_load_costs( h, &analysis );

            x264_mb_analyse_inter_p16x16( h, &analysis );

            if( h->mb.i_type == P_SKIP )
                return;

            if( flags & X264_ANALYSE_PSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_p8x8( h, &analysis );
            }

            /* Select best inter mode */
            i_type = P_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;

            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
            {
                i_type = P_8x8;
                i_partition = D_8x8;
                i_cost = analysis.l0.i_cost8x8;

                /* Do sub 8x8 */
                if( flags & X264_ANALYSE_PSUB8x8 )
                {
                    for( i = 0; i < 4; i++ )
                    {
                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
                        if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
                        {
                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                            h->mb.i_sub_partition[i] = D_L0_4x4;

                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
                                         h->mb.i_sub_partition[i], D_L0_8x4 );

                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
                                         h->mb.i_sub_partition[i], D_L0_4x8 );

                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
                        }
                        x264_mb_cache_mv_p8x8( h, &analysis, i );
                    }
                    analysis.l0.i_cost8x8 = i_cost;
                }
            }

            /* Now do 16x8/8x16 */
            i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
            {
                x264_mb_analyse_inter_p16x8( h, &analysis );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );

                x264_mb_analyse_inter_p8x16( h, &analysis );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
            }

            h->mb.i_partition = i_partition;

            /* refine qpel */
            //FIXME mb_type costs?
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                i_cost = analysis.l0.me16x16.cost;
            }
            else if( i_partition == D_16x8 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
            }
            else if( i_partition == D_8x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
            }
            else if( i_partition == D_8x8 )
            {
                int i8x8;
                i_cost = 0;
                for( i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    switch( h->mb.i_sub_partition[i8x8] )
                    {
                        case D_L0_8x8:
                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
                            i_cost += analysis.l0.me8x8[i8x8].cost;
                            break;
                        case D_L0_8x4:
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
                                      analysis.l0.me8x4[i8x8][1].cost;
                            break;
                        case D_L0_4x8:
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
                                      analysis.l0.me4x8[i8x8][1].cost;
                            break;
                        case D_L0_4x4:
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
                                      analysis.l0.me4x4[i8x8][1].cost +
                                      analysis.l0.me4x4[i8x8][2].cost +
                                      analysis.l0.me4x4[i8x8][3].cost;
                            break;
                        default:
                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
                            break;
                    }
                }
            }
            if( h->mb.b_chroma_me )
            {
                x264_mb_analyse_intra_chroma( h, &analysis );
                x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
                analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
                analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
                analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_cost );

            i_satd_inter = i_cost;
            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
                                      analysis.i_satd_i8x8,
                                      analysis.i_satd_i4x4 );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
                i_type = P_L0;
                i_partition = D_16x16;
                i_cost = analysis.l0.i_rd16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
                if( i_cost < COST_MAX )
                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;

            if( analysis.b_force_intra && !IS_INTRA(i_type) )
            {
                /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
                 * it were an inter block. */
                x264_analyse_update_cache( h, &analysis );
                x264_macroblock_encode( h );
                h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
                h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
                h->mc.copy[PIXEL_8x8]  ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
                x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }
            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    x264_intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    int i8x8;
                    x264_analyse_update_cache( h, &analysis );
                    for( i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                b_skip = x264_macroblock_probe_bskip( h );
            }
        }

        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter;
            h->mb.b_skip_mc = 0;

            x264_mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return;
                }
            }
            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_b8x8( h, &analysis );

                if( analysis.i_cost8x8bi < i_cost )
                {
                    i_type = B_8x8;
                    i_partition = D_8x8;
                    i_cost = analysis.i_cost8x8bi;

                    if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
                        h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
                    {
                        x264_mb_analyse_inter_b16x8( h, &analysis );
                        COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
                                     i_type, analysis.i_mb_type16x8,
                                     i_partition, D_16x8 );
                    }
                    if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
                        h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
                    {
                        x264_mb_analyse_inter_b8x16( h, &analysis );
                        COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
                                     i_type, analysis.i_mb_type8x16,
                                     i_partition, D_8x16 );
                    }
                }
            }
            /* refine qpel */
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( i=0; i<2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( i=0; i<2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( i=0; i<4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }
            i_satd_inter = i_cost;

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );

            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }
    x264_analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it. Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
            h->mb.i_partition = D_16x16;
    }

    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}
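
/* Illustrative sketch, not part of the original x264 source: with direct=auto
 * (h->mb.b_direct_auto_write), the B-slice path above credits
 * h->stat.frame.i_direct_score[pred] every time a prediction mode
 * (0 = temporal, 1 = spatial) would have allowed a skip; higher-level code can
 * then keep whichever mode accumulated more skips.  The tally amounts to: */
#if 0
static int pick_direct_pred( const int direct_score[2] )
{
    return direct_score[1] > direct_score[0];   /* 1 = prefer spatial */
}
#endif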
/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    int i;

    switch( h->mb.i_type )
    {
        case I_4x4:
            for( i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;

        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;

        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }
#ifndef NDEBUG
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        int l;
        for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                          h->mb.cache.mv[l][x264_scan8[15]][0],
                          h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}
#include "slicetype.c"