1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
29 #include "common/common.h"
30 #include "common/cpu.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
/* NOTE(review): fragment of the per-reference-list analysis state; the struct
 * header and several member lines are missing from this listing. Visible are
 * the MV-candidate array and the sub-8x8 partition cost / ME slots. */
46 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
47 ALIGNED_4( int16_t mvc[32][5][2] );
51 int i_cost4x4[4]; /* cost per 8x8 partition */
52 x264_me_t me4x4[4][4];
55 int i_cost8x4[4]; /* cost per 8x8 partition */
56 x264_me_t me8x4[4][2];
59 int i_cost4x8[4]; /* cost per 8x8 partition */
60 x264_me_t me4x8[4][2];
70 } x264_mb_analysis_list_t;
/* NOTE(review): fragment of the per-macroblock analysis context; the struct
 * header and many members are missing from this listing. */
74 /* conduct the analysis using this lambda and QP */
79 uint16_t *p_cost_ref0; /* bit cost of coding each l0 reference index */
80 uint16_t *p_cost_ref1; /* bit cost of coding each l1 reference index */
85 /* Take some shortcuts in intra search if intra is deemed unlikely */
91 int i_satd_i16x16_dir[7]; /* per-mode SATD for 16x16 intra (indexed by pred mode) */
96 int i_satd_i8x8_dir[12][4]; /* per-mode, per-block SATD for 8x8 intra */
100 int i_predict4x4[16]; /* chosen 4x4 intra mode per block */
105 int i_satd_i8x8chroma;
106 int i_satd_i8x8chroma_dir[4];
107 int i_predict8x8chroma;
109 /* II: Inter part P/B frame */
110 x264_mb_analysis_list_t l0;
111 x264_mb_analysis_list_t l1;
113 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
114 int i_cost16x16direct;
116 int i_cost8x8direct[4];
125 int i_mb_partition16x8[2]; /* mb_partition_e */
126 int i_mb_partition8x16[2];
127 int i_mb_type16x8; /* mb_class_e */
130 int b_direct_available;
132 } x264_mb_analysis_t;
/* Lagrange multiplier per QP (0..51); closing brace not visible in this listing. */
134 /* lambda = pow(2,qp/6-2) */
135 const int x264_lambda_tab[52] = {
136 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
137 1, 1, 1, 1, /* 8-11 */
138 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
139 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
140 6, 7, 8, 9,10,11,13,14, /* 28-35 */
141 16,18,20,23,25,29,32,36, /* 36-43 */
142 40,45,51,57,64,72,81,91 /* 44-51 */
/* Squared lambda (fixed-point, scaled by .9*256) used for RD cost; closing brace not visible. */
145 /* lambda2 = pow(lambda,2) * .9 * 256 */
146 const int x264_lambda2_tab[52] = {
147 14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
148 91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
149 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
150 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
151 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
152 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
153 943718, 1189010, 1498059, 1887436 /* 48 - 51 */
/* 64-entry fractional exp2 lookup table; closing brace not visible in this listing. */
156 const uint8_t x264_exp2_lut[64] = {
157 0, 3, 6, 8, 11, 14, 17, 20, 23, 26, 29, 32, 36, 39, 42, 45,
158 48, 52, 55, 58, 62, 65, 69, 72, 76, 80, 83, 87, 91, 94, 98, 102,
159 106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
160 175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
/* 128-entry fractional log2 lookup table; closing brace not visible in this listing. */
163 const float x264_log2_lut[128] = {
164 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
165 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
166 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
167 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
168 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
169 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
170 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
171 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
172 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
173 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
174 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
175 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
176 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
177 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
178 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
179 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
/* Maps a leading-zero count to the integer part of log2; stored as float so
 * callers need no int-to-float conversion. Closing brace not visible. */
182 /* Avoid an int/float conversion. */
183 const float x264_log2_lz_lut[32] = {
184 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
/* Per-QP squared lambdas for trellis quantization; [0] = inter, [1] = intra. */
187 // should the intra and inter lambdas be different?
188 // I'm just matching the behaviour of deadzone quant.
189 static const int x264_trellis_lambda2_tab[2][52] = {
190 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
191 { 46, 58, 73, 92, 117, 147,
192 185, 233, 294, 370, 466, 587,
193 740, 932, 1174, 1480, 1864, 2349,
194 2959, 3728, 4697, 5918, 7457, 9395,
195 11837, 14914, 18790, 23674, 29828, 37581,
196 47349, 59656, 75163, 94699, 119313, 150326,
197 189399, 238627, 300652, 378798, 477255, 601304,
198 757596, 954511, 1202608, 1515192, 1909022, 2405217,
199 3030384, 3818045, 4810435, 6060769 },
200 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
201 { 27, 34, 43, 54, 68, 86,
202 108, 136, 172, 216, 273, 343,
203 433, 545, 687, 865, 1090, 1374,
204 1731, 2180, 2747, 3461, 4361, 5494,
205 6922, 8721, 10988, 13844, 17442, 21976,
206 27688, 34885, 43953, 55377, 69771, 87906,
207 110755, 139543, 175813, 221511, 279087, 351627,
208 443023, 558174, 703255, 886046, 1116348, 1406511,
209 1772093, 2232697, 2813022, 3544186 }
/* Chroma lambda2 scale factors, indexed by (luma QP - chroma QP + 12) — see
 * the lookup in x264_mb_analyse_init. Tail of the table not visible here. */
212 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
213 16, 20, 25, 32, 40, 50,
214 64, 80, 101, 128, 161, 203,
215 256, 322, 406, 512, 645, 812,
216 1024, 1290, 1625, 2048, 2580, 3250,
217 4096, 5160, 6501, 8192, 10321, 13003,
218 16384, 20642, 26007, 32768, 41285, 52015,
/* CAVLC bit-cost prefixes for macroblock / sub-macroblock types in B and P
 * slices. NOTE(review): the initializer of i_sub_mb_p_cost_table is missing
 * from this listing. */
222 /* TODO: calculate CABAC costs */
223 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
224 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
226 static const int i_mb_b16x8_cost_table[17] = {
227 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
229 static const int i_sub_mb_b_cost_table[13] = {
230 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
232 static const int i_sub_mb_p_cost_table[4] = {
/* Reference-index bit costs: [lambda][clip(num_active_refs-1,0,2)][ref_idx].
 * Written under cost_ref_mutex in x264_analyse_init_costs so multiple encoder
 * threads can initialize it safely. */
236 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
238 static uint16_t x264_cost_ref[92][3][33];
239 static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
/* Lazily builds the lambda-indexed MV and reference-index cost tables for the
 * given QP. NOTE(review): several lines (early-return body, braces, fullpel
 * loop header, error path) are missing from this listing. */
241 int x264_analyse_init_costs( x264_t *h, int qp )
244 int lambda = x264_lambda_tab[qp];
245 if( h->cost_mv[lambda] )
247 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
248 CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
/* Center the pointer so the table can be indexed with negative MV deltas. */
249 h->cost_mv[lambda] += 2*4*2048;
250 for( i = 0; i <= 2*4*2048; i++ )
252 h->cost_mv[lambda][-i] =
253 h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
255 x264_pthread_mutex_lock( &cost_ref_mutex );
256 for( i = 0; i < 3; i++ )
257 for( j = 0; j < 33; j++ )
258 x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
259 x264_pthread_mutex_unlock( &cost_ref_mutex );
/* Fullpel cost tables are only needed by exhaustive (ESA/TESA) search. */
260 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
264 CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
265 h->cost_mv_fpel[lambda][j] += 2*2048;
266 for( i = -2*2048; i < 2*2048; i++ )
267 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
/* Frees all lazily-allocated MV cost tables; pointers were advanced at
 * allocation time, so the original base addresses are recovered by
 * subtracting the same offsets. Some lines missing from this listing. */
275 void x264_analyse_free_costs( x264_t *h )
278 for( i = 0; i < 92; i++ )
281 x264_free( h->cost_mv[i] - 2*4*2048 );
282 if( h->cost_mv_fpel[i][0] )
283 for( j = 0; j < 4; j++ )
284 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
288 /* initialize an array of lambda*nbits for all possible mvs */
289 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
291 a->p_cost_mv = h->cost_mv[a->i_lambda];
/* Ref costs depend on how many references are active in each list (clipped to the table's 3 columns). */
292 a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
293 a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
/* Per-macroblock analysis setup: derives lambdas/QPs, trellis parameters,
 * legal MV ranges (clamped for thread parallelism), resets all partition
 * costs to COST_MAX, and runs the fast-intra heuristic.
 * NOTE(review): numerous lines (braces, some initializations, the else
 * branch of the fast-intra test) are missing from this listing. */
296 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
298 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
300 /* mbrd == 1 -> RD mode decision */
301 /* mbrd == 2 -> RD refinement */
302 /* mbrd == 3 -> QPRD */
303 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
305 /* conduct the analysis using this lambda and QP */
306 a->i_qp = h->mb.i_qp = i_qp;
307 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
309 a->i_lambda = x264_lambda_tab[i_qp];
310 a->i_lambda2 = x264_lambda2_tab[i_qp];
312 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
313 if( h->param.analyse.i_trellis )
315 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
316 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
317 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
318 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
320 h->mb.i_psy_rd_lambda = a->i_lambda;
321 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
322 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
324 h->mb.i_me_method = h->param.analyse.i_me_method;
325 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
326 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
327 && h->mb.i_subpel_refine >= 5;
329 h->mb.b_transform_8x8 = 0;
330 h->mb.b_noise_reduction = 0;
336 a->i_satd_i8x8chroma = COST_MAX;
338 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
339 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
343 h->mb.b_lossless ? 0 :
345 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
347 /* II: Inter part P/B frame */
348 if( h->sh.i_type != SLICE_TYPE_I )
351 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
352 // limit motion search to a slightly smaller range than the theoretical limit,
353 // since the search may go a few iterations past its given range
354 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
356 /* Calculate max allowed MV range */
357 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
358 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
359 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
360 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
361 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
362 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
363 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
/* Vertical MV limits only need recomputing once per row. */
364 if( h->mb.i_mb_x == 0)
366 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
367 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
368 int thread_mvy_range = i_fmv_range;
370 if( h->param.i_threads > 1 )
372 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
373 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
374 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
376 x264_frame_t **fref = i ? h->fref1 : h->fref0;
377 int i_ref = i ? h->i_ref1 : h->i_ref0;
378 for( j=0; j<i_ref; j++ )
/* Wait until each reference frame's encode has progressed far enough,
 * then restrict the vertical search range to the completed region. */
380 x264_frame_cond_wait( fref[j], thresh );
381 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
384 if( h->param.b_deterministic )
385 thread_mvy_range = h->param.analyse.i_mv_range_thread;
386 if( h->mb.b_interlaced )
387 thread_mvy_range >>= 1;
390 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
391 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
392 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
393 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
394 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
395 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
396 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
402 a->l0.i_cost8x8 = COST_MAX;
404 for( i = 0; i < 4; i++ )
408 a->l0.i_cost4x8[i] = COST_MAX;
412 a->l0.i_cost8x16 = COST_MAX;
413 if( h->sh.i_type == SLICE_TYPE_B )
417 a->l1.i_cost8x8 = COST_MAX;
419 for( i = 0; i < 4; i++ )
424 a->i_cost8x8direct[i] = COST_MAX;
435 a->i_cost16x16direct =
438 a->i_cost8x16bi = COST_MAX;
441 /* Fast intra decision */
442 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
/* Skip the shortcut near the slice start so statistics have time to accumulate. */
444 if( IS_INTRA( h->mb.i_mb_type_left )
445 || IS_INTRA( h->mb.i_mb_type_top )
446 || IS_INTRA( h->mb.i_mb_type_topleft )
447 || IS_INTRA( h->mb.i_mb_type_topright )
448 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
449 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
450 { /* intra is likely */ }
/* Fills *mode with the 16x16 intra prediction modes usable given neighbour
 * availability flags. NOTE(review): the else-branch structure and the
 * *pi_count assignments are missing from this listing. */
466 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
468 int b_top = i_neighbour & MB_TOP;
469 int b_left = i_neighbour & MB_LEFT;
470 if( b_top && b_left )
472 /* top and left available */
473 *mode++ = I_PRED_16x16_V;
474 *mode++ = I_PRED_16x16_H;
475 *mode++ = I_PRED_16x16_DC;
477 if( i_neighbour & MB_TOPLEFT )
479 /* top left available: plane prediction also possible */
480 *mode++ = I_PRED_16x16_P;
487 *mode++ = I_PRED_16x16_DC_LEFT;
488 *mode++ = I_PRED_16x16_H;
494 *mode++ = I_PRED_16x16_DC_TOP;
495 *mode++ = I_PRED_16x16_V;
/* no neighbours: only the fixed-128 DC predictor remains */
501 *mode = I_PRED_16x16_DC_128;
/* Same neighbour-driven mode selection as predict_16x16_mode_available, but
 * for the chroma 8x8 predictors. NOTE(review): else-branch structure and
 * *pi_count assignments are missing from this listing. */
507 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
509 int b_top = i_neighbour & MB_TOP;
510 int b_left = i_neighbour & MB_LEFT;
511 if( b_top && b_left )
513 /* top and left available */
514 *mode++ = I_PRED_CHROMA_V;
515 *mode++ = I_PRED_CHROMA_H;
516 *mode++ = I_PRED_CHROMA_DC;
518 if( i_neighbour & MB_TOPLEFT )
520 /* top left available: plane prediction also possible */
521 *mode++ = I_PRED_CHROMA_P;
528 *mode++ = I_PRED_CHROMA_DC_LEFT;
529 *mode++ = I_PRED_CHROMA_H;
535 *mode++ = I_PRED_CHROMA_DC_TOP;
536 *mode++ = I_PRED_CHROMA_V;
/* no neighbours: only the fixed-128 DC predictor remains */
542 *mode = I_PRED_CHROMA_DC_128;
/* Fills *mode with the 4x4 (and, via i_neighbour8, 8x8) intra prediction
 * modes usable for a sub-block given neighbour availability. NOTE(review):
 * else-branch structure and *pi_count assignments are missing from this
 * listing. */
548 static void predict_4x4_mode_available( unsigned int i_neighbour,
549 int *mode, int *pi_count )
551 int b_top = i_neighbour & MB_TOP;
552 int b_left = i_neighbour & MB_LEFT;
553 if( b_top && b_left )
556 *mode++ = I_PRED_4x4_DC;
557 *mode++ = I_PRED_4x4_H;
558 *mode++ = I_PRED_4x4_V;
559 *mode++ = I_PRED_4x4_DDL;
/* Diagonal-down-right family needs the top-left sample as well. */
560 if( i_neighbour & MB_TOPLEFT )
562 *mode++ = I_PRED_4x4_DDR;
563 *mode++ = I_PRED_4x4_VR;
564 *mode++ = I_PRED_4x4_HD;
567 *mode++ = I_PRED_4x4_VL;
568 *mode++ = I_PRED_4x4_HU;
572 *mode++ = I_PRED_4x4_DC_LEFT;
573 *mode++ = I_PRED_4x4_H;
574 *mode++ = I_PRED_4x4_HU;
579 *mode++ = I_PRED_4x4_DC_TOP;
580 *mode++ = I_PRED_4x4_V;
581 *mode++ = I_PRED_4x4_DDL;
582 *mode++ = I_PRED_4x4_VL;
587 *mode++ = I_PRED_4x4_DC_128;
592 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
/* Precomputes the zigzag-scanned DCT of the source block (against a zero
 * reference) so psy-trellis can compare candidate coefficients to it. */
593 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
595 ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[64] );
596 ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
597 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
600 if( do_both_dct || h->mb.b_transform_8x8 )
602 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
603 for( i = 0; i < 4; i++ )
604 h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
606 if( do_both_dct || !h->mb.b_transform_8x8 )
608 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
609 for( i = 0; i < 16; i++ )
610 h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
614 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
615 static inline void x264_mb_cache_fenc_satd( x264_t *h )
617 ALIGNED_16( static uint8_t zero[16] ) = {0};
619 int x, y, satd_sum = 0, sa8d_sum = 0;
620 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
621 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
/* Nothing to cache when psy-RD is disabled; early exit expected here
 * (NOTE(review): the return statement itself is missing from this listing). */
622 if( !h->mb.i_psy_rd )
624 for( y = 0; y < 4; y++ )
625 for( x = 0; x < 4; x++ )
627 fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
/* SATD vs zero gives total AC+DC energy; subtracting SAD>>1 removes the DC term. */
628 h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
629 - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
630 satd_sum += h->mb.pic.fenc_satd[y][x];
632 for( y = 0; y < 2; y++ )
633 for( x = 0; x < 2; x++ )
635 fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
636 h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
637 - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
638 sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
640 h->mb.pic.fenc_satd_sum = satd_sum;
641 h->mb.pic.fenc_sa8d_sum = sa8d_sum;
/* Chooses the best chroma 8x8 intra prediction mode by SATD + mode-bit cost.
 * Uses the merged x3 SATD kernel when all four basic modes are available,
 * otherwise evaluates each mode individually. Result is cached in
 * a->i_satd_i8x8chroma so repeated calls are free. NOTE(review): some braces
 * and an early return are missing from this listing. */
644 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
650 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
652 uint8_t *p_dstc[2], *p_srcc[2];
/* Already analysed for this macroblock — reuse the cached result. */
654 if( a->i_satd_i8x8chroma < COST_MAX )
657 /* 8x8 prediction selection for chroma */
658 p_dstc[0] = h->mb.pic.p_fdec[1];
659 p_dstc[1] = h->mb.pic.p_fdec[2];
660 p_srcc[0] = h->mb.pic.p_fenc[1];
661 p_srcc[1] = h->mb.pic.p_fenc[2];
663 predict_8x8chroma_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
664 a->i_satd_i8x8chroma = COST_MAX;
665 if( i_max == 4 && b_merged_satd )
667 int satdu[4], satdv[4];
/* x3 kernel computes V/H/DC at once; plane (P) is done separately. */
668 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
669 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
670 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
671 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
672 satdu[I_PRED_CHROMA_P] =
673 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
674 satdv[I_PRED_CHROMA_P] =
675 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
677 for( i=0; i<i_max; i++ )
679 int i_mode = predict_mode[i];
680 int i_satd = satdu[i_mode] + satdv[i_mode]
681 + a->i_lambda * bs_size_ue(i_mode);
683 a->i_satd_i8x8chroma_dir[i] = i_satd;
684 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
/* Fallback: predict and score each available mode one at a time. */
689 for( i=0; i<i_max; i++ )
692 int i_mode = predict_mode[i];
694 /* we do the prediction */
695 if( h->mb.b_lossless )
696 x264_predict_lossless_8x8_chroma( h, i_mode );
699 h->predict_8x8c[i_mode]( p_dstc[0] );
700 h->predict_8x8c[i_mode]( p_dstc[1] );
703 /* we calculate the cost */
704 i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
705 p_srcc[0], FENC_STRIDE ) +
706 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
707 p_srcc[1], FENC_STRIDE ) +
708 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
710 a->i_satd_i8x8chroma_dir[i] = i_satd;
711 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
715 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
/* Full luma intra analysis: tries I16x16, then (if enabled and promising)
 * I8x8 and I4x4, recording the best mode and SATD cost for each partition
 * size in *a. i_satd_inter is the best inter cost so far and is used for
 * early termination. NOTE(review): many brace/else/loop-increment lines are
 * missing from this listing. */
718 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
720 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
721 uint8_t *p_src = h->mb.pic.p_fenc[0];
722 uint8_t *p_dst = h->mb.pic.p_fdec[0];
727 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
729 /*---------------- Try all mode and calculate their score ---------------*/
731 /* 16x16 prediction selection */
732 predict_16x16_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
734 if( b_merged_satd && i_max == 4 )
736 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
737 h->predict_16x16[I_PRED_16x16_P]( p_dst );
738 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
739 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
742 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
743 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
748 for( i = 0; i < i_max; i++ )
751 int i_mode = predict_mode[i];
753 if( h->mb.b_lossless )
754 x264_predict_lossless_16x16( h, i_mode );
756 h->predict_16x16[i_mode]( p_dst );
758 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
759 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
760 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
761 a->i_satd_i16x16_dir[i_mode] = i_satd;
765 if( h->sh.i_type == SLICE_TYPE_B )
766 /* cavlc mb type prefix */
767 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
/* Bail out early if intra looks hopeless relative to the inter cost. */
768 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
771 /* 8x8 prediction selection */
772 if( flags & X264_ANALYSE_I8x8 )
774 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
775 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
776 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
778 h->mb.i_cbp_luma = 0;
779 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
781 // FIXME some bias like in i4x4?
782 if( h->sh.i_type == SLICE_TYPE_B )
783 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
785 for( idx = 0;; idx++ )
789 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
790 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
791 int i_best = COST_MAX;
792 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
794 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
795 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
797 if( b_merged_satd && i_max == 9 )
800 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
/* Favor the mode predicted from neighbours: it is cheaper to signal. */
801 satd[i_pred_mode] -= 3 * a->i_lambda;
802 for( i=2; i>=0; i-- )
804 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
805 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
812 for( ; i<i_max; i++ )
815 int i_mode = predict_mode[i];
817 if( h->mb.b_lossless )
818 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
820 h->predict_8x8[i_mode]( p_dst_by, edge );
822 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ) + a->i_lambda * 4;
823 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
824 i_satd -= a->i_lambda * 3;
826 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
827 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
831 if( idx == 3 || i_cost > i_satd_thresh )
834 /* we need to encode this block now (for next ones) */
835 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
836 x264_mb_encode_i8x8( h, idx, a->i_qp );
838 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
843 a->i_satd_i8x8 = i_cost;
844 if( h->mb.i_skip_intra )
/* Save decoded pixels / nnz / cbp so a later re-selection of I8x8 can
 * restore them without re-encoding. */
846 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
847 h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
848 h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
849 h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
850 h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
851 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
852 if( h->mb.i_skip_intra == 2 )
853 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
/* Aborted early: extrapolate the partial cost over the remaining blocks. */
858 static const uint16_t cost_div_fix8[3] = {1024,512,341};
859 a->i_satd_i8x8 = COST_MAX;
860 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
862 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
866 /* 4x4 prediction selection */
867 if( flags & X264_ANALYSE_I4x4 )
870 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
871 h->mb.i_cbp_luma = 0;
872 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
874 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
876 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
877 if( h->sh.i_type == SLICE_TYPE_B )
878 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
880 for( idx = 0;; idx++ )
882 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
883 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
884 int i_best = COST_MAX;
885 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
887 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
889 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
890 /* emulate missing topright samples */
891 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
893 if( b_merged_satd && i_max >= 6 )
896 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
897 satd[i_pred_mode] -= 3 * a->i_lambda;
898 for( i=2; i>=0; i-- )
899 COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
905 for( ; i<i_max; i++ )
908 int i_mode = predict_mode[i];
909 if( h->mb.b_lossless )
910 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
912 h->predict_4x4[i_mode]( p_dst_by );
914 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
915 if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
916 i_satd -= a->i_lambda * 3;
918 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
920 i_cost += i_best + 4 * a->i_lambda;
922 if( i_cost > i_satd_thresh || idx == 15 )
925 /* we need to encode this block now (for next ones) */
926 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
927 x264_mb_encode_i4x4( h, idx, a->i_qp );
929 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
933 a->i_satd_i4x4 = i_cost;
934 if( h->mb.i_skip_intra )
936 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
937 h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
938 h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
939 h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
940 h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
941 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
942 if( h->mb.i_skip_intra == 2 )
943 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
947 a->i_satd_i4x4 = COST_MAX;
/* Re-scores each intra partition size with full RD cost, but only those whose
 * SATD estimate was within i_satd_thresh; others are marked COST_MAX.
 * NOTE(review): braces and else lines are missing from this listing. */
951 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
953 if( a->i_satd_i16x16 <= i_satd_thresh )
955 h->mb.i_type = I_16x16;
956 x264_analyse_update_cache( h, a );
957 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
960 a->i_satd_i16x16 = COST_MAX;
962 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
964 h->mb.i_type = I_4x4;
965 x264_analyse_update_cache( h, a );
966 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
969 a->i_satd_i4x4 = COST_MAX;
971 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
973 h->mb.i_type = I_8x8;
974 x264_analyse_update_cache( h, a );
975 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
976 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
979 a->i_satd_i8x8 = COST_MAX;
/* After the macroblock type is fixed, re-selects the individual prediction
 * modes (16x16, chroma 8x8, per-block 4x4/8x8) by true RD cost, restricted to
 * candidates whose SATD was close to the winner. Best decoded pixels and nnz
 * state are saved/restored around each trial. NOTE(review): many brace,
 * else, and loop-variable lines are missing from this listing. */
982 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
984 uint8_t *p_dst = h->mb.pic.p_fdec[0];
987 int i_max, i_mode, i_thresh;
988 uint64_t i_satd, i_best;
990 h->mb.i_skip_intra = 0;
992 if( h->mb.i_type == I_16x16 )
994 int old_pred_mode = a->i_predict16x16;
/* Only consider modes whose SATD was within 9/8 of the current choice. */
995 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
996 i_best = a->i_satd_i16x16;
997 predict_16x16_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
998 for( i = 0; i < i_max; i++ )
1000 int i_mode = predict_mode[i];
1001 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1003 h->mb.i_intra16x16_pred_mode = i_mode;
1004 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1005 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1009 /* RD selection for chroma prediction */
1010 predict_8x8chroma_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
1013 i_thresh = a->i_satd_i8x8chroma * 5/4;
/* Compact the candidate list to near-best modes other than the current one. */
1015 for( i = j = 0; i < i_max; i++ )
1016 if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
1017 predict_mode[i] != a->i_predict8x8chroma )
1019 predict_mode[j++] = predict_mode[i];
1025 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1026 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1027 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1028 * coefs for the current chroma mode are still around, so we only
1029 * have to recount the bits. */
1030 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1031 for( i = 0; i < i_max; i++ )
1033 i_mode = predict_mode[i];
1034 if( h->mb.b_lossless )
1035 x264_predict_lossless_8x8_chroma( h, i_mode );
1038 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
1039 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
1041 /* if we've already found a mode that needs no residual, then
1042 * probably any mode with a residual will be worse.
1043 * so avoid dct on the remaining modes to improve speed. */
1044 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1045 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1047 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1048 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1052 if( h->mb.i_type == I_4x4 )
1054 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1056 for( idx = 0; idx < 16; idx++ )
1058 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1059 i_best = COST_MAX64;
1061 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
1063 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1064 /* emulate missing topright samples */
1065 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1067 for( i = 0; i < i_max; i++ )
1069 i_mode = predict_mode[i];
1070 if( h->mb.b_lossless )
1071 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1073 h->predict_4x4[i_mode]( p_dst_by );
1074 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1076 if( i_best > i_satd )
1078 a->i_predict4x4[idx] = i_mode;
/* Snapshot the winning block's decoded pixels and nnz for restoration. */
1080 pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
1081 pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
1082 pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
1083 pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
1084 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1088 *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
1089 *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
1090 *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
1091 *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
1092 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1094 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1097 else if( h->mb.i_type == I_8x8 )
1099 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1100 for( idx = 0; idx < 4; idx++ )
1102 uint64_t pels_h = 0;
1107 int cbp_luma_new = 0;
1108 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1110 i_best = COST_MAX64;
1114 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1115 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
1116 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1118 for( i = 0; i < i_max; i++ )
1120 i_mode = predict_mode[i];
1121 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1123 if( h->mb.b_lossless )
1124 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1126 h->predict_8x8[i_mode]( p_dst_by, edge );
1127 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1128 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1130 if( i_best > i_satd )
1132 a->i_predict8x8[idx] = i_mode;
1133 cbp_luma_new = h->mb.i_cbp_luma;
/* Only the bottom row and right column are needed by later blocks. */
1136 pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
1138 for( j=0; j<7; j++ )
1139 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1140 i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
1141 i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
1144 a->i_cbp_i8x8_luma = cbp_luma_new;
1145 *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
1147 for( j=0; j<7; j++ )
1148 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1149 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
1150 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
1152 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* Fill an x264_me_t with encode-side (fenc) plane pointers and strides for
 * the partition at luma offset (xoff,yoff). Chroma offsets are halved,
 * matching 4:2:0 subsampling. */
1157 #define LOAD_FENC( m, src, xoff, yoff) \
1158 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1159 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1160 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1161 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1162 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
/* Fill an x264_me_t with reference-frame plane pointers for motion search:
 * planes [0..3] are luma (full-pel + half-pel planes, all sharing the luma
 * stride), planes [4..5] are chroma at halved offsets/stride, plus the
 * integral-image pointer for this list/ref used by SAD-based search. */
1164 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1165 (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1166 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1167 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1168 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1169 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1170 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1171 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];
/* Lambda-scaled bit cost of coding reference index `ref` in list `list`;
 * reads the per-list cost tables from the enclosing x264_mb_analysis_t *a. */
1173 #define REF_COST(list, ref) \
1174 (a->p_cost_ref##list[ref])
/* P-frame 16x16 analysis: motion-search every L0 reference frame, keep the
 * cheapest in a->l0.me16x16, and attempt early P_SKIP termination when the
 * ref-0 motion vector lands next to the predicted skip MV. */
1176 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1180 ALIGNED_4( int16_t mvc[8][2] );
1181 int i_halfpel_thresh = INT_MAX;
1182 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1184 /* 16x16 Search on all ref frame */
1185 m.i_pixel = PIXEL_16x16;
1186 m.p_cost_mv = a->p_cost_mv;
1187 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1189 a->l0.me16x16.cost = INT_MAX;
1190 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1192 const int i_ref_cost = REF_COST( 0, i_ref );
/* temporarily bias the half-pel threshold by the ref cost so refs with
 * cheaper indices are compared fairly; restored after the search below */
1193 i_halfpel_thresh -= i_ref_cost;
1194 m.i_ref_cost = i_ref_cost;
1197 /* search with ref */
1198 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1199 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1200 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1201 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1203 /* early termination
1204 * SSD threshold would probably be better than SATD */
1207 && m.cost-m.cost_mv < 300*a->i_lambda
1208 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1209 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1210 && x264_macroblock_probe_pskip( h ) )
/* skip confirmed: commit P_SKIP and bail out of the whole analysis */
1212 h->mb.i_type = P_SKIP;
1213 x264_analyse_update_cache( h, a );
1214 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1218 m.cost += i_ref_cost;
1219 i_halfpel_thresh += i_ref_cost;
1221 if( m.cost < a->l0.me16x16.cost )
1222 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1224 /* save mv for predicting neighbors */
1225 *(uint32_t*)a->l0.mvc[i_ref][0] =
1226 *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1229 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1230 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1232 h->mb.i_type = P_L0;
/* RD path: if the winner is ref 0 with exactly the predicted skip MV,
 * compute the 16x16 RD cost now and downgrade to P_SKIP when no residual
 * (cbp luma/chroma both zero) survives quantization */
1235 x264_mb_cache_fenc_satd( h );
1236 if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
1238 h->mb.i_partition = D_16x16;
1239 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1240 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1241 if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
1242 h->mb.i_type = P_SKIP;
/* P-frame 8x8 analysis allowing a different reference per 8x8 partition:
 * for each of the 4 partitions, search every candidate ref and keep the
 * best x264_me_t in a->l0.me8x8[i]; total cost goes to a->l0.i_cost8x8. */
1247 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1251 uint8_t **p_fenc = h->mb.pic.p_fenc;
1252 int i_halfpel_thresh = INT_MAX;
1253 int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1255 int i_maxref = h->mb.pic.i_fref[0]-1;
1257 h->mb.i_partition = D_8x8;
1259 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1260 * than those used by the neighbors */
1261 if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
1262 h->mb.i_mb_type_top && h->mb.i_mb_type_left )
/* scan the cached refs of the top and left neighbor 8x8 blocks
 * (scan8-relative offsets) to bound the ref search range */
1265 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
1266 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
1267 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
1268 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
1269 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
1270 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
/* seed mvc[ref][0] with this MB's previously-saved 16x16 MV per ref */
1273 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1274 *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
1276 for( i = 0; i < 4; i++ )
1278 x264_me_t *l0m = &a->l0.me8x8[i];
1282 m.i_pixel = PIXEL_8x8;
1283 m.p_cost_mv = a->p_cost_mv;
1285 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1286 l0m->cost = INT_MAX;
1287 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1289 const int i_ref_cost = REF_COST( 0, i_ref );
1290 i_halfpel_thresh -= i_ref_cost;
1291 m.i_ref_cost = i_ref_cost;
1294 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
/* ref must be cached before MV prediction so the predictor sees it */
1295 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1296 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1297 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1299 m.cost += i_ref_cost;
1300 i_halfpel_thresh += i_ref_cost;
/* record this partition's MV as a candidate for later partitions */
1301 *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
1303 if( m.cost < l0m->cost )
1304 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1306 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1307 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1309 /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
1310 are effectively zero. */
1311 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1312 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1315 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1316 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1317 /* P_8x8 ref0 has no ref cost */
1318 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1319 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1320 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1321 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1322 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P-frame 8x8 analysis with a single shared reference: all four partitions
 * reuse the ref chosen by the 16x16 search. Cheaper than the mixed-ref
 * variant above; results land in a->l0.me8x8[] / a->l0.i_cost8x8. */
1325 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1327 const int i_ref = a->l0.me16x16.i_ref;
/* ref 0 costs nothing under CAVLC for P_8x8 (see cost8x8 note below) */
1328 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1329 uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
1330 uint8_t **p_fenc = h->mb.pic.p_fenc;
1332 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1335 /* XXX Needed for x264_mb_predict_mv */
1336 h->mb.i_partition = D_8x8;
/* seed the candidate list with the 16x16 winner's MV */
1339 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
1341 for( i = 0; i < 4; i++ )
1343 x264_me_t *m = &a->l0.me8x8[i];
1347 m->i_pixel = PIXEL_8x8;
1348 m->p_cost_mv = a->p_cost_mv;
1349 m->i_ref_cost = i_ref_cost;
1352 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1353 LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
1354 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1355 x264_me_search( h, m, mvc, i_mvc );
1357 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
/* append this partition's MV as a candidate for the next partitions */
1359 *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
1363 m->cost += i_ref_cost;
1364 if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
1365 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1368 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1369 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1370 /* theoretically this should include 4*ref_cost,
1371 * but 3 seems a better approximation of cabac. */
1372 if( h->param.b_cabac )
1373 a->l0.i_cost8x8 -= i_ref_cost;
1374 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1375 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P-frame 16x8 analysis: two horizontal halves. Each half only tries the
 * ref(s) its two underlying 8x8 partitions chose (1 or 2 candidates), so
 * the 8x8 analysis must have run first. Total cost -> a->l0.i_cost16x8. */
1378 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1381 uint8_t **p_fenc = h->mb.pic.p_fenc;
1382 ALIGNED_4( int16_t mvc[3][2] );
1385 /* XXX Needed for x264_mb_predict_mv */
1386 h->mb.i_partition = D_16x8;
1388 for( i = 0; i < 2; i++ )
1390 x264_me_t *l0m = &a->l0.me16x8[i];
1391 const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
1392 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1394 m.i_pixel = PIXEL_16x8;
1395 m.p_cost_mv = a->p_cost_mv;
1397 LOAD_FENC( &m, p_fenc, 0, 8*i );
1398 l0m->cost = INT_MAX;
1399 for( j = 0; j < i_ref8s; j++ )
1401 const int i_ref = ref8[j];
1402 const int i_ref_cost = REF_COST( 0, i_ref );
1403 m.i_ref_cost = i_ref_cost;
1406 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
/* MV candidates: this ref's 16x16 MV plus the two covered 8x8 MVs */
1407 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1408 *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
1409 *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
1411 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1412 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1413 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1414 x264_me_search( h, &m, mvc, 3 );
1416 m.cost += i_ref_cost;
1418 if( m.cost < l0m->cost )
1419 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1421 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1422 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1425 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P-frame 8x16 analysis: two vertical halves, mirroring p16x8 above.
 * Candidate refs come from the two vertically-stacked 8x8 partitions
 * (indices i and i+2). Total cost -> a->l0.i_cost8x16. */
1428 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1431 uint8_t **p_fenc = h->mb.pic.p_fenc;
1432 ALIGNED_4( int16_t mvc[3][2] );
1435 /* XXX Needed for x264_mb_predict_mv */
1436 h->mb.i_partition = D_8x16;
1438 for( i = 0; i < 2; i++ )
1440 x264_me_t *l0m = &a->l0.me8x16[i];
1441 const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
1442 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1444 m.i_pixel = PIXEL_8x16;
1445 m.p_cost_mv = a->p_cost_mv;
1447 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1448 l0m->cost = INT_MAX;
1449 for( j = 0; j < i_ref8s; j++ )
1451 const int i_ref = ref8[j];
1452 const int i_ref_cost = REF_COST( 0, i_ref );
1453 m.i_ref_cost = i_ref_cost;
/* MV candidates: this ref's 16x16 MV plus the two covered 8x8 MVs */
1456 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1457 *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
1458 *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
1460 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1461 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1462 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1463 x264_me_search( h, &m, mvc, 3 );
1465 m.cost += i_ref_cost;
1467 if( m.cost < l0m->cost )
1468 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1470 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1471 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1474 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
/* Chroma cost for sub-8x8 partitions: motion-compensate both chroma planes
 * of the 8x8 region i8x8 using the already-searched luma MVs (4x4, 8x4 or
 * 4x8 layout per `pixel`), then return the summed mbcmp cost vs. fenc.
 * pix1 holds U, pix2 (pix1+8) holds V, interleaved in one 16-wide buffer. */
1477 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1479 ALIGNED_8( uint8_t pix1[16*8] );
1480 uint8_t *pix2 = pix1+8;
1481 const int i_stride = h->mb.pic.i_stride[1];
/* chroma offsets of this 8x8 block within ref (or) and fenc (oe) */
1482 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1483 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
/* MC one chroma sub-block (width x height in chroma pels) at (x,y) using
 * the MV from motion-estimation result `me`; planes 4/5 are U/V */
1485 #define CHROMA4x4MC( width, height, me, x, y ) \
1486 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
1487 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );
1489 if( pixel == PIXEL_4x4 )
1491 x264_me_t *m = a->l0.me4x4[i8x8];
1492 CHROMA4x4MC( 2,2, m[0], 0,0 );
1493 CHROMA4x4MC( 2,2, m[1], 2,0 );
1494 CHROMA4x4MC( 2,2, m[2], 0,2 );
1495 CHROMA4x4MC( 2,2, m[3], 2,2 );
1497 else if( pixel == PIXEL_8x4 )
1499 x264_me_t *m = a->l0.me8x4[i8x8];
1500 CHROMA4x4MC( 4,2, m[0], 0,0 );
1501 CHROMA4x4MC( 4,2, m[1], 0,2 );
1505 x264_me_t *m = a->l0.me4x8[i8x8];
1506 CHROMA4x4MC( 2,4, m[0], 0,0 );
1507 CHROMA4x4MC( 2,4, m[1], 2,0 );
/* compare the 4x4-chroma region of each plane against the source */
1510 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1511 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
/* Sub-partition 4x4 analysis for 8x8 block i8x8: four 4x4 searches seeded
 * by the parent 8x8 MV, using the parent's reference. Cost (including ref
 * and sub-partition signalling, plus optional chroma ME) is written to
 * a->l0.i_cost4x4[i8x8]. */
1514 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1516 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1517 uint8_t **p_fenc = h->mb.pic.p_fenc;
1518 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1521 /* XXX Needed for x264_mb_predict_mv */
1522 h->mb.i_partition = D_8x8;
1524 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1526 const int idx = 4*i8x8 + i4x4;
1527 const int x4 = block_idx_x[idx];
1528 const int y4 = block_idx_y[idx];
/* only the first sub-block uses the parent 8x8 MV as a candidate */
1529 const int i_mvc = (i4x4 == 0);
1531 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1533 m->i_pixel = PIXEL_4x4;
1534 m->p_cost_mv = a->p_cost_mv;
1536 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1537 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1539 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1540 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
/* cache immediately so the next sub-block's MV prediction sees it */
1542 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1544 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1545 a->l0.me4x4[i8x8][1].cost +
1546 a->l0.me4x4[i8x8][2].cost +
1547 a->l0.me4x4[i8x8][3].cost +
1548 REF_COST( 0, i_ref ) +
1549 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1550 if( h->mb.b_chroma_me )
1551 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* Sub-partition 8x4 analysis for 8x8 block i8x8: two 8x4 searches, seeded
 * by the first 4x4 MV (so p4x4 analysis runs first). Cost is written to
 * a->l0.i_cost8x4[i8x8]. */
1554 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1556 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1557 uint8_t **p_fenc = h->mb.pic.p_fenc;
1558 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1561 /* XXX Needed for x264_mb_predict_mv */
1562 h->mb.i_partition = D_8x8;
1564 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1566 const int idx = 4*i8x8 + 2*i8x4;
1567 const int x4 = block_idx_x[idx];
1568 const int y4 = block_idx_y[idx];
/* only the first half gets the 4x4 MV as an extra candidate */
1569 const int i_mvc = (i8x4 == 0);
1571 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1573 m->i_pixel = PIXEL_8x4;
1574 m->p_cost_mv = a->p_cost_mv;
1576 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1577 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1579 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1580 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1582 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1584 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1585 REF_COST( 0, i_ref ) +
1586 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1587 if( h->mb.b_chroma_me )
1588 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* Sub-partition 4x8 analysis for 8x8 block i8x8: two 4x8 searches, the
 * vertical counterpart of p8x4 above. Cost -> a->l0.i_cost4x8[i8x8]. */
1591 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1593 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1594 uint8_t **p_fenc = h->mb.pic.p_fenc;
1595 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1598 /* XXX Needed for x264_mb_predict_mv */
1599 h->mb.i_partition = D_8x8;
1601 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1603 const int idx = 4*i8x8 + i4x8;
1604 const int x4 = block_idx_x[idx];
1605 const int y4 = block_idx_y[idx];
/* only the first half gets the 4x4 MV as an extra candidate */
1606 const int i_mvc = (i4x8 == 0);
1608 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1610 m->i_pixel = PIXEL_4x8;
1611 m->p_cost_mv = a->p_cost_mv;
1613 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1614 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1616 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1617 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1619 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1621 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1622 REF_COST( 0, i_ref ) +
1623 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1624 if( h->mb.b_chroma_me )
1625 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* B-frame DIRECT mode cost: compares fenc against the direct-mode
 * prediction that a prior x264_mb_mc left in fdec (no new MC is done here).
 * Fills a->i_cost16x16direct and the four a->i_cost8x8direct[] entries. */
1628 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1630 /* Assumes that fdec still contains the results of
1631 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1633 uint8_t **p_fenc = h->mb.pic.p_fenc;
1634 uint8_t **p_fdec = h->mb.pic.p_fdec;
1637 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1638 for( i = 0; i < 4; i++ )
1640 const int x = (i&1)*8;
1641 const int y = (i>>1)*8;
/* per-8x8 distortion accumulates into the 16x16 direct cost */
1642 a->i_cost16x16direct +=
1643 a->i_cost8x8direct[i] =
1644 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1647 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
/* B-frame 16x16 analysis: independent L0 and L1 searches over all refs in
 * each list (best per list kept in a->l0/l1.me16x16 with the ref cost
 * subtracted afterwards), then a bidirectional (BI) cost formed by
 * averaging the two best predictions with the bipred weight. */
1651 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1653 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1654 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1655 uint8_t *src0, *src1;
1656 int stride0 = 16, stride1 = 16;
1660 ALIGNED_4( int16_t mvc[9][2] );
1661 int i_halfpel_thresh = INT_MAX;
1662 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1664 /* 16x16 Search on all ref frame */
1665 m.i_pixel = PIXEL_16x16;
1666 m.p_cost_mv = a->p_cost_mv;
1667 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
/* --- list 0 search --- */
1670 a->l0.me16x16.cost = INT_MAX;
1671 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1673 /* search with ref */
1674 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1675 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1676 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1677 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1680 m.cost += REF_COST( 0, i_ref );
1682 if( m.cost < a->l0.me16x16.cost )
1684 a->l0.i_ref = i_ref;
1685 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1688 /* save mv for predicting neighbors */
1689 *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1691 /* subtract ref cost, so we don't have to add it for the other MB types */
1692 a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
/* --- list 1 search (same pattern, fresh half-pel threshold) --- */
1695 i_halfpel_thresh = INT_MAX;
1696 p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1697 a->l1.me16x16.cost = INT_MAX;
1698 for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1700 /* search with ref */
1701 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1702 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1703 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1704 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1707 m.cost += REF_COST( 1, i_ref );
1709 if( m.cost < a->l1.me16x16.cost )
1711 a->l1.i_ref = i_ref;
1712 h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1715 /* save mv for predicting neighbors */
1716 *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1718 /* subtract ref cost, so we don't have to add it for the other MB types */
1719 a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
1721 /* Set global ref, needed for other modes? */
1722 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
1723 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
1725 /* get cost of BI mode */
1726 src0 = h->mc.get_ref( pix0, &stride0,
1727 h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1728 a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
1729 src1 = h->mc.get_ref( pix1, &stride1,
1730 h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1731 a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
/* weighted average of the two predictions into pix0, then distortion +
 * both ref costs + both MV costs gives the BI candidate's cost */
1733 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1735 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1736 + REF_COST( 0, a->l0.i_ref )
1737 + REF_COST( 1, a->l1.i_ref )
1738 + a->l0.me16x16.cost_mv
1739 + a->l1.me16x16.cost_mv;
/* add macroblock-type signalling cost to each candidate */
1742 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1743 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1744 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* Write the MVs of 8x8 partition i into the MB cache according to its
 * chosen sub-partition shape (8x8 / 8x4 / 4x8 / 4x4), so later MV
 * prediction and encoding see the selected motion field. */
1747 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1749 const int x = 2*(i%2);
1750 const int y = 2*(i/2);
1752 switch( h->mb.i_sub_partition[i] )
1755 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1758 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1759 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1762 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1763 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1766 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1767 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1768 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1769 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
/* unreachable for valid sub-partition values */
1772 x264_log( h, X264_LOG_ERROR, "internal error\n" );
/* Cache refs/MVs for a B partition at cache position (x,y) of size (dx,dy):
 * for each list used by `part` store its ref and MV; for an unused list
 * store ref -1 and a zero MV (and, conditionally, a zero MVD — the guard
 * around the cache_mvd calls lies on elided lines; presumably b_mvd). */
1777 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1778 if( x264_mb_partition_listX_table[0][part] ) \
1780 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1781 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1785 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1786 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1788 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1790 if( x264_mb_partition_listX_table[1][part] ) \
1792 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1793 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1797 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1798 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1800 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
/* Cache refs/MVs for B 8x8 partition i. DIRECT sub-partitions load the
 * direct MVs instead of searched ones; b_mvd controls whether MVD entries
 * are also zeroed (its use in the branch body is on elided lines). */
1803 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1807 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1809 x264_mb_load_mv_direct8x8( h, i );
1812 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1813 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1814 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1819 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Cache refs/MVs for B 16x8 half i (top: i=0, bottom: i=1). */
1822 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1824 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Cache refs/MVs for B 8x16 half i (left: i=0, right: i=1). */
1826 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1828 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B-frame 8x8 analysis: for each 8x8 partition run an L0 and an L1 search
 * (seeded by the 16x16 MVs, using the per-list 16x16 refs), form a BI cost
 * from the weighted average, and pick the cheapest of L0/L1/BI/DIRECT per
 * partition. Total -> a->i_cost8x8bi. */
1832 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1834 uint8_t **p_fref[2] =
1835 { h->mb.pic.p_fref[0][a->l0.i_ref],
1836 h->mb.pic.p_fref[1][a->l1.i_ref] };
1837 ALIGNED_8( uint8_t pix[2][8*8] );
1840 /* XXX Needed for x264_mb_predict_mv */
1841 h->mb.i_partition = D_8x8;
1845 for( i = 0; i < 4; i++ )
1850 int i_part_cost_bi = 0;
1851 int stride[2] = {8,8};
1854 for( l = 0; l < 2; l++ )
1856 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1857 x264_me_t *m = &lX->me8x8[i];
1859 m->i_pixel = PIXEL_8x8;
1860 m->p_cost_mv = a->p_cost_mv;
1862 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1863 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1865 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1866 x264_me_search( h, m, &lX->me16x16.mv, 1 );
1868 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
/* fetch this list's prediction for the BI average below */
1871 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1872 m->mv[0], m->mv[1], 8, 8 );
1873 i_part_cost_bi += m->cost_mv;
1874 /* FIXME: ref cost */
1876 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1877 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1878 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1879 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1880 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
/* choose the cheapest of L0 / L1 / BI / DIRECT for this partition */
1882 i_part_cost = a->l0.me8x8[i].cost;
1883 h->mb.i_sub_partition[i] = D_L0_8x8;
1884 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1885 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1886 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1887 a->i_cost8x8bi += i_part_cost;
1889 /* XXX Needed for x264_mb_predict_mv */
1890 x264_mb_cache_mv_b8x8( h, a, i, 0 );
1894 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B-frame 16x8 analysis: two horizontal halves, each searched in both
 * lists (candidates taken from the covered 8x8 MVs), with a BI option.
 * Per-half winner recorded in a->i_mb_partition16x8[]; combined MB type
 * and cost in a->i_mb_type16x8 / a->i_cost16x8bi. */
1897 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1899 uint8_t **p_fref[2] =
1900 { h->mb.pic.p_fref[0][a->l0.i_ref],
1901 h->mb.pic.p_fref[1][a->l1.i_ref] };
1902 ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1903 ALIGNED_4( int16_t mvc[2][2] );
1906 h->mb.i_partition = D_16x8;
1907 a->i_cost16x8bi = 0;
1909 for( i = 0; i < 2; i++ )
1912 int i_part_cost_bi = 0;
1913 int stride[2] = {16,16};
1916 /* TODO: check only the list(s) that were used in b8x8? */
1917 for( l = 0; l < 2; l++ )
1919 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1920 x264_me_t *m = &lX->me16x8[i];
1922 m->i_pixel = PIXEL_16x8;
1923 m->p_cost_mv = a->p_cost_mv;
1925 LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1926 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
/* seed with the MVs of the two 8x8 blocks this half covers */
1928 *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
1929 *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
1931 x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
1932 x264_me_search( h, m, mvc, 2 );
1935 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1936 m->mv[0], m->mv[1], 16, 8 );
1937 /* FIXME: ref cost */
1938 i_part_cost_bi += m->cost_mv;
1940 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1941 i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1943 i_part_cost = a->l0.me16x8[i].cost;
1944 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1945 if( a->l1.me16x8[i].cost < i_part_cost )
1947 i_part_cost = a->l1.me16x8[i].cost;
1948 a->i_mb_partition16x8[i] = D_L1_8x8;
1950 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1952 i_part_cost = i_part_cost_bi;
1953 a->i_mb_partition16x8[i] = D_BI_8x8;
1955 a->i_cost16x8bi += i_part_cost;
1957 x264_mb_cache_mv_b16x8( h, a, i, 0 );
/* map the two per-half list choices (>>2 maps D_L0/L1/BI_8x8 to 0/1/2)
 * to a combined B macroblock type, then add its signalling cost */
1961 a->i_mb_type16x8 = B_L0_L0
1962 + (a->i_mb_partition16x8[0]>>2) * 3
1963 + (a->i_mb_partition16x8[1]>>2);
1964 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
/* B-frame 8x16 analysis: vertical counterpart of b16x8 above. Candidates
 * come from the vertically-stacked 8x8 blocks (i and i+2). Results land
 * in a->i_mb_partition8x16[], a->i_mb_type8x16 and a->i_cost8x16bi. */
1967 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1969 uint8_t **p_fref[2] =
1970 { h->mb.pic.p_fref[0][a->l0.i_ref],
1971 h->mb.pic.p_fref[1][a->l1.i_ref] };
1972 ALIGNED_8( uint8_t pix[2][8*16] );
1973 ALIGNED_4( int16_t mvc[2][2] );
1976 h->mb.i_partition = D_8x16;
1977 a->i_cost8x16bi = 0;
1979 for( i = 0; i < 2; i++ )
1982 int i_part_cost_bi = 0;
1983 int stride[2] = {8,8};
1986 for( l = 0; l < 2; l++ )
1988 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1989 x264_me_t *m = &lX->me8x16[i];
1991 m->i_pixel = PIXEL_8x16;
1992 m->p_cost_mv = a->p_cost_mv;
1994 LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1995 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
/* seed with the MVs of the two 8x8 blocks this half covers */
1997 *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
1998 *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
2000 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
2001 x264_me_search( h, m, mvc, 2 );
2004 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
2005 m->mv[0], m->mv[1], 8, 16 );
2006 /* FIXME: ref cost */
2007 i_part_cost_bi += m->cost_mv;
2010 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
2011 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2013 i_part_cost = a->l0.me8x16[i].cost;
2014 a->i_mb_partition8x16[i] = D_L0_8x8;
2015 if( a->l1.me8x16[i].cost < i_part_cost )
2017 i_part_cost = a->l1.me8x16[i].cost;
2018 a->i_mb_partition8x16[i] = D_L1_8x8;
2020 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2022 i_part_cost = i_part_cost_bi;
2023 a->i_mb_partition8x16[i] = D_BI_8x8;
2025 a->i_cost8x16bi += i_part_cost;
2027 x264_mb_cache_mv_b8x16( h, a, i, 0 );
/* combine per-half list choices into a B MB type; same mapping as 16x8 */
2031 a->i_mb_type8x16 = B_L0_L0
2032 + (a->i_mb_partition8x16[0]>>2) * 3
2033 + (a->i_mb_partition8x16[1]>>2);
2034 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
/* P-frame RD refinement: re-score SATD-competitive partition modes with
 * full rate-distortion cost, replacing each candidate's SATD cost with its
 * RD cost (or COST_MAX if it wasn't close enough to be worth evaluating).
 * i_satd is the best SATD-based cost found so far. */
2037 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
/* only RD-evaluate modes within 25% of the best SATD cost */
2039 int thresh = i_satd * 5/4;
2041 h->mb.i_type = P_L0;
2042 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2044 h->mb.i_partition = D_16x16;
2045 x264_analyse_update_cache( h, a );
2046 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2049 if( a->l0.i_cost16x8 <= thresh )
2051 h->mb.i_partition = D_16x8;
2052 x264_analyse_update_cache( h, a );
2053 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
/* over threshold: disqualify the mode from final selection */
2056 a->l0.i_cost16x8 = COST_MAX;
2058 if( a->l0.i_cost8x16 <= thresh )
2060 h->mb.i_partition = D_8x16;
2061 x264_analyse_update_cache( h, a );
2062 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2065 a->l0.i_cost8x16 = COST_MAX;
2067 if( a->l0.i_cost8x8 <= thresh )
2069 h->mb.i_type = P_8x8;
2070 h->mb.i_partition = D_8x8;
2071 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2074 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2075 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2076 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2077 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2078 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2079 * for future blocks are those left over from previous RDO calls. */
2080 for( i = 0; i < 4; i++ )
/* per-8x8: RD-compare the sub-partition shapes that are within 25%
 * of the cheapest SATD shape; keep the RD winner in i_sub_partition */
2082 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2083 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2084 int subtype, btype = D_L0_8x8;
2085 uint64_t bcost = COST_MAX64;
2086 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
2089 if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2091 h->mb.i_sub_partition[i] = subtype;
2092 x264_mb_cache_mv_p8x8( h, a, i );
2093 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2094 COPY2_IF_LT( bcost, cost, btype, subtype );
2096 h->mb.i_sub_partition[i] = btype;
2097 x264_mb_cache_mv_p8x8( h, a, i );
2101 x264_analyse_update_cache( h, a );
2102 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2105 a->l0.i_cost8x8 = COST_MAX;
/* B-frame RD refinement: re-score each SATD-competitive B mode (DIRECT,
 * L0/L1/BI 16x16, 8x8, 16x8, 8x16) with full RD cost, caching results in
 * the corresponding a->i_rd* fields (COST_MAX marks "not yet evaluated"). */
2108 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
/* psy-RD widens the threshold slightly (18/16 instead of 17/16) */
2110 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2112 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2114 h->mb.i_type = B_DIRECT;
2115 /* Assumes direct/skip MC is still in fdec */
2116 /* Requires b-rdo to be done before intra analysis */
2117 h->mb.b_skip_mc = 1;
2118 x264_analyse_update_cache( h, a );
2119 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2120 h->mb.b_skip_mc = 0;
2123 //FIXME not all the update_cache calls are needed
2124 h->mb.i_partition = D_16x16;
2126 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2128 h->mb.i_type = B_L0_L0;
2129 x264_analyse_update_cache( h, a );
2130 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2134 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2136 h->mb.i_type = B_L1_L1;
2137 x264_analyse_update_cache( h, a );
2138 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2142 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2144 h->mb.i_type = B_BI_BI;
2145 x264_analyse_update_cache( h, a );
2146 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2150 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2152 h->mb.i_type = B_8x8;
2153 h->mb.i_partition = D_8x8;
2154 x264_analyse_update_cache( h, a );
2155 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* clear any skip flags the RD pass may have set in the cache */
2156 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2160 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2162 h->mb.i_type = a->i_mb_type16x8;
2163 h->mb.i_partition = D_16x8;
2164 x264_analyse_update_cache( h, a );
2165 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2169 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2171 h->mb.i_type = a->i_mb_type8x16;
2172 h->mb.i_partition = D_8x16;
2173 x264_analyse_update_cache( h, a );
2174 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* Re-run SATD-based bidirectional refinement on every partition that ended up
 * bipredicted, using the bipred weight for the chosen L0/L1 reference pair.
 * No-op for intra macroblocks.  Dispatches on the final partition size. */
2178 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2180 const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
/* Intra MBs have no motion to refine. */
2183 if( IS_INTRA(h->mb.i_type) )
2186 switch( h->mb.i_partition )
/* 16x16: refine only if the whole MB is bipredicted. */
2189 if( h->mb.i_type == B_BI_BI )
2190 x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
/* 16x8: refine each of the two halves that is D_BI_8x8. */
2193 for( i=0; i<2; i++ )
2194 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2195 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
/* 8x16: same, per vertical half. */
2198 for( i=0; i<2; i++ )
2199 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2200 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
/* 8x8: per sub-partition. */
2203 for( i=0; i<4; i++ )
2204 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2205 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
/* Choose the 4x4 vs 8x8 DCT for this MB by comparing SA8D (8x8 transform
 * proxy) against SATD (4x4 transform proxy) of fenc vs the motion-compensated
 * fdec.  Only runs when 8x8 transform is allowed and enabled and the MB is not
 * lossless.  Sets h->mb.b_skip_mc so macroblock_encode can reuse the MC done
 * here (the MC call itself is on a line elided from this extract). */
2210 static inline void x264_mb_analyse_transform( x264_t *h )
2212 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2214 int i_cost4, i_cost8;
2215 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
/* sa8d approximates the residual cost under the 8x8 transform... */
2218 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2219 h->mb.pic.p_fdec[0], FDEC_STRIDE );
/* ...and satd under the 4x4 transform. */
2220 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2221 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2223 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
/* MC output is now valid in fdec; don't redo it at encode time. */
2224 h->mb.b_skip_mc = 1;
/* RD-based transform-size decision: toggle b_transform_8x8, RD-cost the MB
 * with the other transform, and keep whichever is cheaper.  On improvement,
 * *i_rd is updated (on an elided line) and *i_satd is rescaled by the RD-cost
 * ratio so later SATD-vs-RD comparisons stay consistent; otherwise the toggle
 * is undone. */
2228 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2230 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2233 x264_analyse_update_cache( h, a );
/* Try the opposite transform size. */
2234 h->mb.b_transform_8x8 ^= 1;
2235 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2236 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2238 if( *i_rd >= i_rd8 )
/* Scale the SATD estimate proportionally to the RD improvement
 * (64-bit intermediate avoids overflow in the product). */
2241 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
/* Other transform was worse: revert the toggle. */
2245 h->mb.b_transform_8x8 ^= 1;
2249 /* Rate-distortion optimal QP selection.
2250 * FIXME: More than half of the benefit of this function seems to be
2251 * in the way it improves the coding of chroma DC (by decimating or
2252 * finding a better way to code a single DC coefficient.)
2253 * There must be a more efficient way to get that portion of the benefit
2254 * without doing full QP-RD, but RD-decimation doesn't seem to do the trick. */
/* QP-RD: hill-climb the macroblock QP in both directions from the rate-control
 * QP, keeping the QP with the lowest full RD cost.  The previous MB's QP is
 * always tried (cheap qp_delta).  The `failures` counter (declared/reset on
 * elided lines) allows a bounded number of non-improving steps before a
 * direction is abandoned. */
2256 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2258 int bcost, cost, direction, failures, prevcost, origcost;
2259 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2260 int last_qp_tried = 0;
/* Baseline: RD cost at the unmodified QP. */
2261 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2263 /* If CBP is already zero, don't raise the quantizer any higher. */
2264 for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2266 /* Without psy-RD, require monotonicity when moving quant away from previous
2267 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2268 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2269 * allow 2 failures when moving quant towards previous quant.
2270 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2271 int threshold = (!!h->mb.i_psy_rd);
2272 /* Raise the threshold for failures if we're moving towards the last QP. */
2273 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2274 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
/* Restart the walk from the original QP for this direction. */
2276 h->mb.i_qp = orig_qp;
2278 prevcost = origcost;
2279 h->mb.i_qp += direction;
2280 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
/* The last MB's QP gets tried in this walk anyway; remember that so the
 * final fallback below can be skipped. */
2282 if( h->mb.i_last_qp == h->mb.i_qp )
2284 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2285 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2286 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2288 /* We can't assume that the costs are monotonic over QPs.
2289 * Tie case-as-failure seems to give better results. */
2290 if( cost < prevcost )
2296 if( failures > threshold )
/* CBP hit zero while raising QP: going higher cannot help. */
2298 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2300 h->mb.i_qp += direction;
2304 /* Always try the last block's QP. */
2305 if( !last_qp_tried )
2307 h->mb.i_qp = h->mb.i_last_qp;
2308 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2309 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2310 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
/* Commit the winning QP (assignment of bqp to i_qp is on an elided line)
 * and keep the chroma QP table entry in sync. */
2314 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2316 /* Check transform again; decision from before may no longer be optimal. */
2317 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2318 x264_mb_transform_8x8_allowed( h ) )
2320 h->mb.b_transform_8x8 ^= 1;
2321 cost = x264_rd_cost_mb( h, a->i_lambda2 );
/* Revert if the other transform did not win (comparison on elided line). */
2323 h->mb.b_transform_8x8 ^= 1;
2327 /*****************************************************************************
2328 * x264_macroblock_analyse:
2329 *****************************************************************************/
/* Top-level per-macroblock mode decision.  Chooses QP, then dispatches on the
 * slice type: intra analysis for I slices; skip-probe + inter/intra analysis
 * (with optional RD refinement) for P slices; direct/skip handling plus full
 * L0/L1/BI partition search for B slices.  Writes the final decision into
 * h->mb.i_type / i_partition / i_sub_partition and the MB cache.
 * NOTE(review): this extract has brace/declaration lines elided (e.g. the
 * declarations of i, i_type, b_skip, m, i_thresh16x8); embedded numbers are
 * the original file's line numbers. */
2330 void x264_macroblock_analyse( x264_t *h )
2332 x264_mb_analysis_t analysis;
2333 int i_cost = COST_MAX;
/* ---- QP selection (ratecontrol + optional adaptive quant) ---- */
2336 h->mb.i_qp = x264_ratecontrol_qp( h );
2337 if( h->param.rc.i_aq_mode )
2339 x264_adaptive_quant( h );
2340 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2341 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2342 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2343 h->mb.i_qp = h->mb.i_last_qp;
2346 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2348 /*--------------------------- Do the analysis ---------------------------*/
/* ============================ I slice ============================ */
2349 if( h->sh.i_type == SLICE_TYPE_I )
2351 if( analysis.i_mbrd )
2352 x264_mb_cache_fenc_satd( h );
2353 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2354 if( analysis.i_mbrd )
2355 x264_intra_rd( h, &analysis, COST_MAX );
/* Pick the cheapest of I16x16 / I4x4 / I8x8 / PCM by (RD-adjusted) SATD. */
2357 i_cost = analysis.i_satd_i16x16;
2358 h->mb.i_type = I_16x16;
2359 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2360 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2361 if( analysis.i_satd_pcm < i_cost )
2362 h->mb.i_type = I_PCM;
2364 else if( analysis.i_mbrd >= 2 )
2365 x264_intra_rd_refine( h, &analysis );
/* ============================ P slice ============================ */
2367 else if( h->sh.i_type == SLICE_TYPE_P )
2371 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2373 /* Fast P_SKIP detection */
2374 analysis.b_try_pskip = 0;
2375 if( h->param.analyse.b_fast_pskip )
/* In sliced-threads mode a skip MV pointing past the completed region of
 * the reference cannot be taken (branch body elided). */
2377 if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2378 // FIXME don't need to check this if the reference frame is done
2380 else if( h->param.analyse.i_subpel_refine >= 3 )
2381 analysis.b_try_pskip = 1;
/* Heuristic: only probe skip early if a neighbor was skipped. */
2382 else if( h->mb.i_mb_type_left == P_SKIP ||
2383 h->mb.i_mb_type_top == P_SKIP ||
2384 h->mb.i_mb_type_topleft == P_SKIP ||
2385 h->mb.i_mb_type_topright == P_SKIP )
2386 b_skip = x264_macroblock_probe_pskip( h );
2389 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
/* Skip confirmed: commit P_SKIP and bail out of further analysis. */
2393 h->mb.i_type = P_SKIP;
2394 h->mb.i_partition = D_16x16;
2395 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
/* ---- Full P-slice inter analysis ---- */
2399 const unsigned int flags = h->param.analyse.inter;
2403 int i_satd_inter, i_satd_intra;
2405 x264_mb_analyse_load_costs( h, &analysis );
2407 x264_mb_analyse_inter_p16x16( h, &analysis );
/* 16x16 search may itself have detected skip. */
2409 if( h->mb.i_type == P_SKIP )
2412 if( flags & X264_ANALYSE_PSUB16x16 )
2414 if( h->param.analyse.b_mixed_references )
2415 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2417 x264_mb_analyse_inter_p8x8( h, &analysis );
2420 /* Select best inter mode */
2422 i_partition = D_16x16;
2423 i_cost = analysis.l0.me16x16.cost;
2425 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2426 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2429 i_partition = D_8x8;
2430 i_cost = analysis.l0.i_cost8x8;
/* Sub-8x8 partitions: per 8x8 block, try 4x4 and, where profitable,
 * 8x4 / 4x8 splits; fold the winner back into the total cost. */
2433 if( flags & X264_ANALYSE_PSUB8x8 )
2435 for( i = 0; i < 4; i++ )
2437 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2438 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2440 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2441 h->mb.i_sub_partition[i] = D_L0_4x4;
2443 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2444 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2445 h->mb.i_sub_partition[i], D_L0_8x4 );
2447 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2448 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2449 h->mb.i_sub_partition[i], D_L0_4x8 );
2451 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2453 x264_mb_cache_mv_p8x8( h, &analysis, i );
2455 analysis.l0.i_cost8x8 = i_cost;
2459 /* Now do 16x8/8x16 */
2460 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2461 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2462 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2464 x264_mb_analyse_inter_p16x8( h, &analysis );
2465 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2467 x264_mb_analyse_inter_p8x16( h, &analysis );
2468 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2471 h->mb.i_partition = i_partition;
/* ---- Sub-pel refinement of the winning partition (non-RD path) ---- */
2474 //FIXME mb_type costs?
2475 if( analysis.i_mbrd )
2479 else if( i_partition == D_16x16 )
2481 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2482 i_cost = analysis.l0.me16x16.cost;
2484 else if( i_partition == D_16x8 )
2486 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2487 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2488 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2490 else if( i_partition == D_8x16 )
2492 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2493 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2494 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2496 else if( i_partition == D_8x8 )
2500 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2502 switch( h->mb.i_sub_partition[i8x8] )
2505 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2506 i_cost += analysis.l0.me8x8[i8x8].cost;
2509 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2510 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2511 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2512 analysis.l0.me8x4[i8x8][1].cost;
2515 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2516 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2517 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2518 analysis.l0.me4x8[i8x8][1].cost;
2522 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2523 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2524 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2525 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2526 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2527 analysis.l0.me4x4[i8x8][1].cost +
2528 analysis.l0.me4x4[i8x8][2].cost +
2529 analysis.l0.me4x4[i8x8][3].cost;
2532 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
/* ---- Intra-in-P analysis; with chroma-ME, chroma SATD is added so the
 * intra and inter totals stay comparable. ---- */
2538 if( h->mb.b_chroma_me )
2540 x264_mb_analyse_intra_chroma( h, &analysis );
2541 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2542 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2543 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2544 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2547 x264_mb_analyse_intra( h, &analysis, i_cost );
2549 i_satd_inter = i_cost;
2550 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2551 analysis.i_satd_i8x8,
2552 analysis.i_satd_i4x4 );
/* ---- RD path: re-rank partitions by true RD cost. ---- */
2554 if( analysis.i_mbrd )
2556 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2558 i_partition = D_16x16;
2559 i_cost = analysis.l0.i_rd16x16;
2560 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2561 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2562 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2563 h->mb.i_type = i_type;
2564 h->mb.i_partition = i_partition;
2565 if( i_cost < COST_MAX )
2566 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2567 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
/* Final inter-vs-intra decision. */
2570 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2571 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2572 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2573 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2575 h->mb.i_type = i_type;
/* ---- mbrd>=2: qpel-RD refinement of the final P mode. ---- */
2577 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2579 if( IS_INTRA( h->mb.i_type ) )
2581 x264_intra_rd_refine( h, &analysis );
2583 else if( i_partition == D_16x16 )
2585 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2586 analysis.l0.me16x16.cost = i_cost;
2587 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2589 else if( i_partition == D_16x8 )
2591 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2592 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2593 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2594 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2595 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2596 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2598 else if( i_partition == D_8x16 )
2600 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2601 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2602 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2603 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2604 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2605 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2607 else if( i_partition == D_8x8 )
2610 x264_analyse_update_cache( h, &analysis );
2611 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2613 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2615 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2617 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2619 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2620 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2622 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2624 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2625 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2627 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2629 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2630 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2631 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2632 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
/* ============================ B slice ============================ */
2639 else if( h->sh.i_type == SLICE_TYPE_B )
2641 int i_bskip_cost = COST_MAX;
2644 if( analysis.i_mbrd )
2645 x264_mb_cache_fenc_satd( h );
2647 h->mb.i_type = B_SKIP;
2648 if( h->mb.b_direct_auto_write )
2650 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2651 for( i = 0; i < 2; i++ )
2654 h->sh.b_direct_spatial_mv_pred ^= 1;
2655 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2656 if( analysis.b_direct_available )
2661 b_skip = x264_macroblock_probe_bskip( h );
2663 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2670 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2672 if( analysis.b_direct_available )
2674 if( !h->mb.b_direct_auto_write )
2676 if( analysis.i_mbrd )
/* With mbrd, decide skip by SSD against a minimal non-skip bit cost. */
2678 i_bskip_cost = ssd_mb( h );
2679 /* 6 = minimum cavlc cost of a non-skipped MB */
2680 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2682 else if( !h->mb.b_direct_auto_write )
2684 /* Conditioning the probe on neighboring block types
2685 * doesn't seem to help speed or quality. */
2686 b_skip = x264_macroblock_probe_bskip( h );
/* ---- Full B-slice inter analysis ---- */
2692 const unsigned int flags = h->param.analyse.inter;
2696 h->mb.b_skip_mc = 0;
2698 x264_mb_analyse_load_costs( h, &analysis );
2700 /* select best inter mode */
2701 /* direct must be first */
2702 if( analysis.b_direct_available )
2703 x264_mb_analyse_inter_direct( h, &analysis );
2705 x264_mb_analyse_inter_b16x16( h, &analysis );
2708 i_partition = D_16x16;
2709 i_cost = analysis.l0.me16x16.cost;
2710 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2711 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2712 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
/* Early B_SKIP exit: if direct is close to the best and the skip SSD cost
 * beats every RD-scored 16x16 mode, commit skip now. */
2714 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2716 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2717 if( i_bskip_cost < analysis.i_rd16x16direct &&
2718 i_bskip_cost < analysis.i_rd16x16bi &&
2719 i_bskip_cost < analysis.l0.i_rd16x16 &&
2720 i_bskip_cost < analysis.l1.i_rd16x16 )
2722 h->mb.i_type = B_SKIP;
2723 x264_analyse_update_cache( h, &analysis );
2728 if( flags & X264_ANALYSE_BSUB16x16 )
2730 x264_mb_analyse_inter_b8x8( h, &analysis );
2731 if( analysis.i_cost8x8bi < i_cost )
2734 i_partition = D_8x8;
2735 i_cost = analysis.i_cost8x8bi;
/* Try 16x8 only when the 8x8 sub-partitions suggest horizontal coherence. */
2737 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2738 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2740 x264_mb_analyse_inter_b16x8( h, &analysis );
2741 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2742 i_type, analysis.i_mb_type16x8,
2743 i_partition, D_16x8 );
/* And 8x16 for vertical coherence. */
2745 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2746 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2748 x264_mb_analyse_inter_b8x16( h, &analysis );
2749 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2750 i_type, analysis.i_mb_type8x16,
2751 i_partition, D_8x16 );
/* ---- Sub-pel refinement (non-RD path); mode-bit costs are subtracted
 * around refinement so the ME cost compared is motion-only. ---- */
2756 if( analysis.i_mbrd )
2761 else if( i_partition == D_16x16 )
2763 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2764 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2765 if( i_type == B_L0_L0 )
2767 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2768 i_cost = analysis.l0.me16x16.cost
2769 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2771 else if( i_type == B_L1_L1 )
2773 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2774 i_cost = analysis.l1.me16x16.cost
2775 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2777 else if( i_type == B_BI_BI )
2779 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2780 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2783 else if( i_partition == D_16x8 )
2785 for( i=0; i<2; i++ )
2787 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2788 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2789 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2790 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2793 else if( i_partition == D_8x16 )
2795 for( i=0; i<2; i++ )
2797 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2798 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2799 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2800 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2803 else if( i_partition == D_8x8 )
2805 for( i=0; i<4; i++ )
2808 int i_part_cost_old;
2810 int i_part_type = h->mb.i_sub_partition[i];
2811 int b_bidir = (i_part_type == D_BI_8x8);
2813 if( i_part_type == D_DIRECT_8x8 )
/* Refine L0 component if this sub-partition uses list 0... */
2815 if( x264_mb_partition_listX_table[0][i_part_type] )
2817 m = &analysis.l0.me8x8[i];
2818 i_part_cost_old = m->cost;
2819 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2820 m->cost -= i_type_cost;
2821 x264_me_refine_qpel( h, m );
2823 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
/* ...and the L1 component if it uses list 1. */
2825 if( x264_mb_partition_listX_table[1][i_part_type] )
2827 m = &analysis.l1.me8x8[i];
2828 i_part_cost_old = m->cost;
2829 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2830 m->cost -= i_type_cost;
2831 x264_me_refine_qpel( h, m );
2833 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2835 /* TODO: update mvp? */
2839 i_satd_inter = i_cost;
/* ---- RD re-ranking of all B modes, including skip. ---- */
2841 if( analysis.i_mbrd )
2843 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2845 i_cost = i_bskip_cost;
2846 i_partition = D_16x16;
2847 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2848 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2849 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2850 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2851 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2852 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2853 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2855 h->mb.i_type = i_type;
2856 h->mb.i_partition = i_partition;
/* ---- Intra-in-B and final decision. ---- */
2859 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2861 if( analysis.i_mbrd )
2863 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2864 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2867 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2868 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2869 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2870 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2872 h->mb.i_type = i_type;
2873 h->mb.i_partition = i_partition;
2875 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2876 x264_intra_rd_refine( h, &analysis );
2877 if( h->mb.i_subpel_refine >= 5 )
2878 x264_refine_bidir( h, &analysis );
/* ---- mbrd>=2: qpel/bidir RD refinement of the final B inter mode. ---- */
2880 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2882 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2883 x264_analyse_update_cache( h, &analysis );
2885 if( i_partition == D_16x16 )
2887 if( i_type == B_L0_L0 )
2889 analysis.l0.me16x16.cost = i_cost;
2890 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2892 else if( i_type == B_L1_L1 )
2894 analysis.l1.me16x16.cost = i_cost;
2895 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2897 else if( i_type == B_BI_BI )
2898 x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
2900 else if( i_partition == D_16x8 )
2902 for( i = 0; i < 2; i++ )
2904 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2905 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2906 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2907 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2908 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2909 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2910 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2913 else if( i_partition == D_8x16 )
2915 for( i = 0; i < 2; i++ )
2917 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2918 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2919 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2920 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2921 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2922 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2923 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2926 else if( i_partition == D_8x8 )
2928 for( i = 0; i < 4; i++ )
2930 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
2931 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
2932 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
2933 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
2934 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2935 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
/* ---- Common epilogue: commit decision to the MB cache and run the
 * remaining per-MB decisions (transform size, QP-RD, trellis flags). ---- */
2942 x264_analyse_update_cache( h, &analysis );
2944 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
2945 * without realizing it. Check for this and account for it if necessary. */
2946 if( analysis.i_mbrd >= 2 )
2948 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
2949 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
2950 int list = check_mv_lists[h->mb.i_type] - 1;
2951 if( list >= 0 && h->mb.i_partition != D_16x16 &&
2952 *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[0]] == *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[12]] &&
2953 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
2954 h->mb.i_partition = D_16x16;
2957 if( !analysis.i_mbrd )
2958 x264_mb_analyse_transform( h );
2960 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
2961 x264_mb_analyse_qp_rd( h, &analysis );
2963 h->mb.b_trellis = h->param.analyse.i_trellis;
2964 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
2965 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
2966 x264_psy_trellis_init( h, 0 );
/* Trellis/noise-reduction change coefficient decisions, so cached intra
 * skip shortcuts are invalidated. */
2967 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
2968 h->mb.i_skip_intra = 0;
2971 /*-------------------- Update MB from the analysis ----------------------*/
/* Write the analysis decision (prediction modes, refs, MVs) into the MB cache
 * so that encoding/RD-costing sees the chosen mode.  Dispatches on i_type,
 * then on i_partition for inter types.  Ends with a sliced-threads sanity
 * check that recovers to intra if an MV points past the completed region of
 * a reference frame.  (Case labels/braces are on lines elided from this
 * extract; embedded numbers are the original file's line numbers.) */
2972 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
2976 switch( h->mb.i_type )
/* I_4x4: one predicted mode per 4x4 block. */
2979 for( i = 0; i < 16; i++ )
2980 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
2982 x264_mb_analyse_intra_chroma( h, a );
/* I_8x8: one predicted mode per 8x8 block. */
2985 for( i = 0; i < 4; i++ )
2986 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
2988 x264_mb_analyse_intra_chroma( h, a );
/* I_16x16: single whole-MB prediction mode. */
2991 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
2992 x264_mb_analyse_intra_chroma( h, a );
/* P_L0: store ref + MV per chosen partition shape. */
2999 switch( h->mb.i_partition )
3002 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3003 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3007 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3008 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3009 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3010 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3014 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3015 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3016 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3017 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3021 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
/* P_8x8: per-8x8 refs, sub-partition MVs via the p8x8 cache helper. */
3027 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3028 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3029 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3030 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3031 for( i = 0; i < 4; i++ )
3032 x264_mb_cache_mv_p8x8( h, a, i );
/* P_SKIP: ref 0 with the predicted skip MV. */
3037 h->mb.i_partition = D_16x16;
3038 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3039 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
/* B_SKIP / B_DIRECT: load direct-predicted MVs per 8x8 block. */
3045 x264_mb_load_mv_direct8x8( h, 0 );
3046 x264_mb_load_mv_direct8x8( h, 1 );
3047 x264_mb_load_mv_direct8x8( h, 2 );
3048 x264_mb_load_mv_direct8x8( h, 3 );
/* B_8x8: cache each sub-partition's list-0/1 data. */
3052 /* optimize: cache might not need to be rewritten */
3053 for( i = 0; i < 4; i++ )
3054 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3057 default: /* the rest of the B types */
3058 switch( h->mb.i_partition )
3061 switch( h->mb.i_type )
/* B_L0_L0: list 0 carries the motion, list 1 is marked unused (-1). */
3064 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3065 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3067 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3068 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3069 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
/* B_L1_L1: mirror image -- list 0 unused, list 1 active. */
3072 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3073 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3074 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3076 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3077 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
/* B_BI_BI: both lists active. */
3080 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3081 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3083 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3084 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3089 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3090 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3093 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3094 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3097 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
/* Sliced-threads sanity check: verify the bottom-row MV does not reference
 * rows of the reference frame that another thread has not finished yet.
 * On violation, log diagnostics and fall back to intra 16x16. */
3103 if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
3106 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3109 int ref = h->mb.cache.ref[l][x264_scan8[0]];
3112 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
3113 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3115 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3116 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3117 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3118 h->mb.cache.mv[l][x264_scan8[15]][0],
3119 h->mb.cache.mv[l][x264_scan8[15]][1] );
3120 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3121 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3122 fprintf(stderr, "completed: %d \n", completed );
3123 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3124 x264_mb_analyse_intra( h, a, COST_MAX );
3125 h->mb.i_type = I_16x16;
3126 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3127 x264_mb_analyse_intra_chroma( h, a );
3134 #include "slicetype.c"