1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
29 #include "common/common.h"
30 #include "common/cpu.h"
31 #include "macroblock.h"
33 #include "ratecontrol.h"
/* Per-reference-list (L0/L1) motion-analysis scratch state: one x264_me_t per
 * partition shape plus the cost of each partitioning.
 * NOTE(review): this listing is a fragment -- the "typedef struct" opener and
 * several members (e.g. the 16x16/8x8/16x8/8x16 me structs and their costs)
 * are not visible here; the dropped lines are implied by the gaps in the
 * baked-in line numbers. */
46 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
47 ALIGNED_4( int16_t mvc[32][5][2] );
/* 4x4 sub-partitions: 4 per 8x8 block, for each of the 4 8x8 blocks. */
51 int i_cost4x4[4]; /* cost per 8x8 partition */
52 x264_me_t me4x4[4][4];
/* 8x4 sub-partitions: 2 per 8x8 block. */
55 int i_cost8x4[4]; /* cost per 8x8 partition */
56 x264_me_t me8x4[4][2];
/* 4x8 sub-partitions: 2 per 8x8 block. */
59 int i_cost4x8[4]; /* cost per 8x8 partition */
60 x264_me_t me4x8[4][2];
70 } x264_mb_analysis_list_t;
/* Top-level per-macroblock analysis context: lambda/QP in use, intra mode
 * costs and decisions, and the inter (L0/L1/bi/direct) results.
 * NOTE(review): fragment -- the struct opener and several members (lambda
 * fields, i_satd_i16x16, 8x8/4x4 intra costs, 16x16/8x8 inter costs, etc.)
 * fall in the dropped line ranges. */
74 /* conduct the analysis using this lamda and QP */
/* Bit-cost lookup tables for coding a reference index in list 0 / list 1. */
79 uint16_t *p_cost_ref0;
80 uint16_t *p_cost_ref1;
85 /* Take some shortcuts in intra search if intra is deemed unlikely */
/* Per-mode SATD scores, kept so RD refinement can threshold cheaply later. */
91 int i_satd_i16x16_dir[7];
96 int i_satd_i8x8_dir[12][4];
100 int i_predict4x4[16];
105 int i_satd_i8x8chroma;
106 int i_satd_i8x8chroma_dir[4];
107 int i_predict8x8chroma;
109 /* II: Inter part P/B frame */
110 x264_mb_analysis_list_t l0;
111 x264_mb_analysis_list_t l1;
113 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
114 int i_cost16x16direct;
116 int i_cost8x8direct[4];
125 int i_mb_partition16x8[2]; /* mb_partition_e */
126 int i_mb_partition8x16[2];
127 int i_mb_type16x8; /* mb_class_e */
130 int b_direct_available;
132 } x264_mb_analysis_t;
/* Lagrangian lambda per QP (0-51): lambda = pow(2,qp/6-2), rounded.
 * Used to weight the bit cost of modes/MVs against distortion. */
const int x264_lambda_tab[52] = {
   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
   1, 1, 1, 1,              /*  8-11 */
   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
  16,18,20,23,25,29,32,36,  /* 36-43 */
  40,45,51,57,64,72,81,91   /* 44-51 */
};
/* Squared lambda per QP, in 8.8 fixed point: lambda2 = pow(lambda,2) * .9 * 256.
 * Used for RD cost (distortion + lambda2 * bits >> 8). */
const int x264_lambda2_tab[52] = {
       14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
       91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
      580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
     3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
    23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
   148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
   943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
};
/* Fractional part of 2^x: x264_exp2_lut[i] ~= (2^(i/64) - 1) * 256,
 * i.e. a 6-bit-input lookup for the mantissa of a fixed-point exp2. */
const uint8_t x264_exp2_lut[64] = {
      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
};
/* Fractional part of log2: x264_log2_lut[i] = log2(1 + i/128),
 * a 7-bit-mantissa lookup used to approximate log2 without libm calls. */
const float x264_log2_lut[128] = {
    0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
    0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
    0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
    0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
    0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
    0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
    0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
    0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
    0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
    0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
    0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
    0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
    0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
    0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
    0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
    0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
};
/* Avoid an int/float conversion: maps a leading-zero count (0-31) straight to
 * the integer part of log2 as a float, i.e. x264_log2_lz_lut[clz(x)] = 31-clz(x). */
const float x264_log2_lz_lut[32] = {
    31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
};
// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
// [0] = inter, [1] = intra; indexed by QP (0-51).
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      46,      58,      73,      92,     117,     147,
          185,     233,     294,     370,     466,     587,
          740,     932,    1174,    1480,    1864,    2349,
         2959,    3728,    4697,    5918,    7457,    9395,
        11837,   14914,   18790,   23674,   29828,   37581,
        47349,   59656,   75163,   94699,  119313,  150326,
       189399,  238627,  300652,  378798,  477255,  601304,
       757596,  954511, 1202608, 1515192, 1909022, 2405217,
      3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      27,      34,      43,      54,      68,      86,
          108,     136,     172,     216,     273,     343,
          433,     545,     687,     865,    1090,    1374,
         1731,    2180,    2747,    3461,    4361,    5494,
         6922,    8721,   10988,   13844,   17442,   21976,
        27688,   34885,   43953,   55377,   69771,   87906,
       110755,  139543,  175813,  221511,  279087,  351627,
       443023,  558174,  703255,  886046, 1116348, 1406511,
      1772093, 2232697, 2813022, 3544186 }
};
/* Chroma lambda2 scale factor (8.8 fixed point) indexed by the luma-chroma QP
 * delta biased by +12; entry 36 (2^6 * 1024) saturates at the uint16_t max.
 * NOTE(review): the final entry and terminator were missing from this listing
 * and are restored from upstream x264 -- verify against the repository. */
static const uint16_t x264_chroma_lambda2_offset_tab[] = {
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};
222 /* TODO: calculate CABAC costs */
223 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
224 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
/* CAVLC bit cost of each 16x8/8x16 B-partition type. */
static const int i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
/* CAVLC bit cost of each B sub-macroblock (8x8 sub-partition) type. */
static const int i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
232 static const int i_sub_mb_p_cost_table[4] = {
/* Forward declaration: writes the chosen analysis back into the MB cache. */
236 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
/* Shared ref-index bit-cost table: [lambda][num_refs_clipped][ref_idx].
 * 92 covers every distinct lambda value (max x264_lambda_tab entry is 91). */
238 static uint16_t x264_cost_ref[92][3][33];
/* Serializes writers to x264_cost_ref across encoder threads. */
239 static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
/* Allocate and fill the lambda-scaled bit-cost tables for this QP:
 * MV costs (qpel-indexed, symmetric around 0), ref-index costs, and -- for
 * exhaustive motion estimation -- per-qpel-phase fullpel MV cost tables.
 * NOTE(review): fragment -- the early-return when already initialized, the
 * loop/brace structure, "return 0" and the CHECKED_MALLOC "fail:" path fall
 * in dropped lines; the baked-in numbering shows the gaps. */
241 int x264_analyse_init_costs( x264_t *h, int qp )
244 int lambda = x264_lambda_tab[qp];
245 if( h->cost_mv[lambda] )
247 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
248 CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
/* Bias the pointer to the table's midpoint so it can be indexed by signed MV deltas. */
249 h->cost_mv[lambda] += 2*4*2048;
250 for( i = 0; i <= 2*4*2048; i++ )
252 h->cost_mv[lambda][-i] =
253 h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
/* x264_cost_ref is static/shared across threads, hence the mutex. */
255 x264_pthread_mutex_lock( &cost_ref_mutex );
256 for( i = 0; i < 3; i++ )
257 for( j = 0; j < 33; j++ )
258 x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
259 x264_pthread_mutex_unlock( &cost_ref_mutex );
/* ESA/TESA also need fullpel cost tables, one per qpel phase j. */
260 if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
264 CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
265 h->cost_mv_fpel[lambda][j] += 2*2048;
266 for( i = -2*2048; i < 2*2048; i++ )
267 h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
/* Free every per-lambda cost table allocated by x264_analyse_init_costs,
 * un-biasing the midpoint-offset pointers before freeing.
 * NOTE(review): fragment -- braces and the null-check on h->cost_mv[i] fall
 * in dropped lines. */
275 void x264_analyse_free_costs( x264_t *h )
278 for( i = 0; i < 92; i++ )
/* Undo the "+= 2*4*2048" midpoint bias applied at allocation time. */
281 x264_free( h->cost_mv[i] - 2*4*2048 );
282 if( h->cost_mv_fpel[i][0] )
283 for( j = 0; j < 4; j++ )
284 x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
288 /* initialize an array of lambda*nbits for all possible mvs */
289 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
291 a->p_cost_mv = h->cost_mv[a->i_lambda];
292 a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
293 a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
/* Per-macroblock analysis setup: pick lambdas/QP, configure trellis and
 * chroma-ME flags, clamp the MV search range (including cross-thread sync on
 * reference frame completion), reset all inter costs, and decide whether a
 * fast-intra shortcut applies.
 * NOTE(review): heavy fragment -- many braces, else-branches and cost resets
 * fall in dropped lines (see gaps in the baked-in numbering). */
296 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
298 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
300 /* mbrd == 1 -> RD mode decision */
301 /* mbrd == 2 -> RD refinement */
302 /* mbrd == 3 -> QPRD */
303 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
305 /* conduct the analysis using this lamda and QP */
306 a->i_qp = h->mb.i_qp = i_qp;
307 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
309 a->i_lambda = x264_lambda_tab[i_qp];
310 a->i_lambda2 = x264_lambda2_tab[i_qp];
/* trellis=1 is RD-only; trellis=2 applies everywhere. */
312 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
313 if( h->param.analyse.i_trellis )
315 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
316 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
317 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
318 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
320 h->mb.i_psy_rd_lambda = a->i_lambda;
321 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
322 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
324 h->mb.i_me_method = h->param.analyse.i_me_method;
325 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
326 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
327 && h->mb.i_subpel_refine >= 5;
329 h->mb.b_transform_8x8 = 0;
330 h->mb.b_noise_reduction = 0;
336 a->i_satd_i8x8chroma = COST_MAX;
338 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
339 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
/* NOTE(review): this is part of a fast_intra/skip-intra assignment whose
 * left-hand side fell in a dropped line. */
343 h->mb.b_lossless ? 0 :
345 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
347 /* II: Inter part P/B frame */
348 if( h->sh.i_type != SLICE_TYPE_I )
351 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
352 // limit motion search to a slightly smaller range than the theoretical limit,
353 // since the search may go a few iterations past its given range
354 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
356 /* Calculate max allowed MV range */
357 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
/* Horizontal range: qpel units, allowing 24-pixel overreach past frame edges. */
358 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
359 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
360 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
361 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
362 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
363 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
/* Vertical range only needs recomputing once per row. */
364 if( h->mb.i_mb_x == 0)
366 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
367 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
368 int thread_mvy_range = i_fmv_range;
370 if( h->param.i_threads > 1 )
372 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
373 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
374 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
376 x264_frame_t **fref = i ? h->fref1 : h->fref0;
377 int i_ref = i ? h->i_ref1 : h->i_ref0;
378 for( j=0; j<i_ref; j++ )
/* Block until each reference thread has decoded enough rows, then clamp
 * vertical search to what is actually available. */
380 x264_frame_cond_wait( fref[j], thresh );
381 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
384 if( h->param.b_deterministic )
385 thread_mvy_range = h->param.analyse.i_mv_range_thread;
386 if( h->mb.b_interlaced )
387 thread_mvy_range >>= 1;
390 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
391 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
392 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
393 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
394 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
395 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
396 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
/* Reset all inter costs so the searches below only fill what they try. */
402 a->l0.i_cost8x8 = COST_MAX;
404 for( i = 0; i < 4; i++ )
408 a->l0.i_cost4x8[i] = COST_MAX;
412 a->l0.i_cost8x16 = COST_MAX;
413 if( h->sh.i_type == SLICE_TYPE_B )
417 a->l1.i_cost8x8 = COST_MAX;
419 for( i = 0; i < 4; i++ )
424 a->i_cost8x8direct[i] = COST_MAX;
435 a->i_cost16x16direct =
438 a->i_cost8x16bi = COST_MAX;
441 /* Fast intra decision */
442 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
/* Intra stays enabled if any neighbor (or the colocated MB in P slices) is
 * intra, or if the frame so far is at least ~1/3 intra. */
444 if( IS_INTRA( h->mb.i_mb_type_left )
445 || IS_INTRA( h->mb.i_mb_type_top )
446 || IS_INTRA( h->mb.i_mb_type_topleft )
447 || IS_INTRA( h->mb.i_mb_type_topright )
448 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
449 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
450 { /* intra is likely */ }
/* Fill mode[] with the 16x16 intra prediction modes usable given neighbor
 * availability (top/left/topleft), falling back to the appropriate DC
 * variant when neighbors are missing.
 * NOTE(review): fragment -- the *pi_count assignments and the else-branches
 * for left-only / top-only / neither fall in dropped lines. */
466 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
468 int b_top = i_neighbour & MB_TOP;
469 int b_left = i_neighbour & MB_LEFT;
470 if( b_top && b_left )
472 /* top and left available */
473 *mode++ = I_PRED_16x16_V;
474 *mode++ = I_PRED_16x16_H;
475 *mode++ = I_PRED_16x16_DC;
/* Plane prediction additionally needs the top-left sample. */
477 if( i_neighbour & MB_TOPLEFT )
479 /* top left available*/
480 *mode++ = I_PRED_16x16_P;
487 *mode++ = I_PRED_16x16_DC_LEFT;
488 *mode++ = I_PRED_16x16_H;
494 *mode++ = I_PRED_16x16_DC_TOP;
495 *mode++ = I_PRED_16x16_V;
/* No neighbors at all: only the 128-fill DC mode is legal. */
501 *mode = I_PRED_16x16_DC_128;
/* Fill mode[] with the available 8x8 chroma intra prediction modes, mirroring
 * the 16x16 luma availability logic.
 * NOTE(review): fragment -- *pi_count assignments and else-branches fall in
 * dropped lines. */
507 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
509 int b_top = i_neighbour & MB_TOP;
510 int b_left = i_neighbour & MB_LEFT;
511 if( b_top && b_left )
513 /* top and left available */
514 *mode++ = I_PRED_CHROMA_V;
515 *mode++ = I_PRED_CHROMA_H;
516 *mode++ = I_PRED_CHROMA_DC;
518 if( i_neighbour & MB_TOPLEFT )
520 /* top left available */
521 *mode++ = I_PRED_CHROMA_P;
528 *mode++ = I_PRED_CHROMA_DC_LEFT;
529 *mode++ = I_PRED_CHROMA_H;
535 *mode++ = I_PRED_CHROMA_DC_TOP;
536 *mode++ = I_PRED_CHROMA_V;
/* No neighbors: 128-fill DC only. */
542 *mode = I_PRED_CHROMA_DC_128;
/* Fill mode[] with the available 4x4 (also used for 8x8) intra prediction
 * modes for one block, given its neighbor availability flags.
 * NOTE(review): fragment -- *pi_count assignments and else-branch structure
 * fall in dropped lines. */
548 static void predict_4x4_mode_available( unsigned int i_neighbour,
549 int *mode, int *pi_count )
551 int b_top = i_neighbour & MB_TOP;
552 int b_left = i_neighbour & MB_LEFT;
553 if( b_top && b_left )
556 *mode++ = I_PRED_4x4_DC;
557 *mode++ = I_PRED_4x4_H;
558 *mode++ = I_PRED_4x4_V;
559 *mode++ = I_PRED_4x4_DDL;
/* Diagonal-down-right / vertical-right / horizontal-down need the top-left sample. */
560 if( i_neighbour & MB_TOPLEFT )
562 *mode++ = I_PRED_4x4_DDR;
563 *mode++ = I_PRED_4x4_VR;
564 *mode++ = I_PRED_4x4_HD;
567 *mode++ = I_PRED_4x4_VL;
568 *mode++ = I_PRED_4x4_HU;
572 *mode++ = I_PRED_4x4_DC_LEFT;
573 *mode++ = I_PRED_4x4_H;
574 *mode++ = I_PRED_4x4_HU;
579 *mode++ = I_PRED_4x4_DC_TOP;
580 *mode++ = I_PRED_4x4_V;
581 *mode++ = I_PRED_4x4_DDL;
582 *mode++ = I_PRED_4x4_VL;
/* No neighbors: 128-fill DC only. */
587 *mode++ = I_PRED_4x4_DC_128;
592 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
/* Cache the zigzag-scanned DCT of the source block (vs. a zero prediction)
 * so psy-trellis can compare candidate coefficients against it.
 * NOTE(review): fragment -- braces and the "int i" declaration fall in
 * dropped lines. */
593 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
595 ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[64] );
596 ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
597 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
600 if( do_both_dct || h->mb.b_transform_8x8 )
602 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
603 for( i = 0; i < 4; i++ )
604 h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
606 if( do_both_dct || !h->mb.b_transform_8x8 )
608 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
609 for( i = 0; i < 16; i++ )
610 h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
614 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
/* For each 4x4 (SATD) and 8x8 (SA8D) sub-block of the source luma, store its
 * AC energy (transform cost vs. zero, with the DC/SAD part subtracted out).
 * NOTE(review): fragment -- braces, the "uint8_t *fenc" declaration and an
 * early return fall in dropped lines. */
615 static inline void x264_mb_cache_fenc_satd( x264_t *h )
617 ALIGNED_16( static uint8_t zero[16] ) = {0};
619 int x, y, satd_sum = 0, sa8d_sum = 0;
620 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
621 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
622 if( !h->mb.i_psy_rd )
624 for( y = 0; y < 4; y++ )
625 for( x = 0; x < 4; x++ )
627 fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
/* SATD>>0 minus SAD>>1 removes the DC contribution from the 4x4 score. */
628 h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
629 - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
630 satd_sum += h->mb.pic.fenc_satd[y][x];
632 for( y = 0; y < 2; y++ )
633 for( x = 0; x < 2; x++ )
635 fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
636 h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
637 - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
638 sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
640 h->mb.pic.fenc_satd_sum = satd_sum;
641 h->mb.pic.fenc_sa8d_sum = sa8d_sum;
/* Select the best 8x8 chroma intra prediction mode by SATD + mode-bit cost,
 * using the merged x3 SATD kernel when all of V/H/DC/P are available, else a
 * per-mode loop.  Result is stored in a->i_predict8x8chroma and the MB cache.
 * NOTE(review): fragment -- braces, local declarations (i, i_max, i_satd,
 * predict_mode[]) and the early return fall in dropped lines. */
644 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
650 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
652 uint8_t *p_dstc[2], *p_srcc[2];
/* Already computed for this MB: skip. */
654 if( a->i_satd_i8x8chroma < COST_MAX )
657 /* 8x8 prediction selection for chroma */
658 p_dstc[0] = h->mb.pic.p_fdec[1];
659 p_dstc[1] = h->mb.pic.p_fdec[2];
660 p_srcc[0] = h->mb.pic.p_fenc[1];
661 p_srcc[1] = h->mb.pic.p_fenc[2];
663 predict_8x8chroma_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
664 a->i_satd_i8x8chroma = COST_MAX;
665 if( i_max == 4 && b_merged_satd )
667 int satdu[4], satdv[4];
/* x3 kernel scores V/H/DC at once; plane (P) is done separately. */
668 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
669 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
670 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
671 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
672 satdu[I_PRED_CHROMA_P] =
673 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
674 satdv[I_PRED_CHROMA_P] =
675 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
677 for( i=0; i<i_max; i++ )
679 int i_mode = predict_mode[i];
680 int i_satd = satdu[i_mode] + satdv[i_mode]
681 + a->i_lambda * bs_size_ue(i_mode);
683 a->i_satd_i8x8chroma_dir[i] = i_satd;
684 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
/* Fallback: predict and score each available mode individually. */
689 for( i=0; i<i_max; i++ )
692 int i_mode = predict_mode[i];
694 /* we do the prediction */
695 if( h->mb.b_lossless )
696 x264_predict_lossless_8x8_chroma( h, i_mode );
699 h->predict_8x8c[i_mode]( p_dstc[0] );
700 h->predict_8x8c[i_mode]( p_dstc[1] );
703 /* we calculate the cost */
704 i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
705 p_srcc[0], FENC_STRIDE ) +
706 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
707 p_srcc[1], FENC_STRIDE ) +
708 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
710 a->i_satd_i8x8chroma_dir[i] = i_satd;
711 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
715 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
/* Full luma intra analysis: score I16x16, then (if enabled and not pruned by
 * i_satd_inter thresholds) I8x8 and I4x4, picking the best mode per block and
 * encoding each sub-block in place so later blocks can predict from it.
 * Results land in a->i_satd_i16x16 / i_satd_i8x8 / i_satd_i4x4 and the
 * corresponding i_predict* fields.
 * NOTE(review): heavy fragment -- braces, local declarations (i, idx, i_max,
 * i_satd, i_cost, x, y, predict_mode[]), else-branches and several early
 * exits fall in dropped lines. */
718 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
720 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
721 uint8_t *p_src = h->mb.pic.p_fenc[0];
722 uint8_t *p_dst = h->mb.pic.p_fdec[0];
727 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
729 /*---------------- Try all mode and calculate their score ---------------*/
731 /* 16x16 prediction selection */
732 predict_16x16_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
734 if( b_merged_satd && i_max == 4 )
/* x3 kernel scores V/H/DC at once; plane (P) is scored separately. */
736 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
737 h->predict_16x16[I_PRED_16x16_P]( p_dst );
738 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
739 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
742 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
743 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
/* Fallback: predict and score each available 16x16 mode individually. */
748 for( i = 0; i < i_max; i++ )
751 int i_mode = predict_mode[i];
753 if( h->mb.b_lossless )
754 x264_predict_lossless_16x16( h, i_mode );
756 h->predict_16x16[i_mode]( p_dst );
758 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
759 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
760 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
761 a->i_satd_i16x16_dir[i_mode] = i_satd;
765 if( h->sh.i_type == SLICE_TYPE_B )
766 /* cavlc mb type prefix */
767 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
/* Fast-intra bailout: intra clearly losing to inter, skip 8x8/4x4 search. */
768 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
771 /* 8x8 prediction selection */
772 if( flags & X264_ANALYSE_I8x8 )
774 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
775 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
776 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
778 h->mb.i_cbp_luma = 0;
779 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
781 // FIXME some bias like in i4x4?
782 if( h->sh.i_type == SLICE_TYPE_B )
783 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
785 for( idx = 0;; idx++ )
789 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
790 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
791 int i_best = COST_MAX;
792 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
794 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
795 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
797 if( b_merged_satd && i_max == 9 )
/* Favor the predicted mode: it codes in 1 bit instead of 4. */
800 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
801 satd[i_pred_mode] -= 3 * a->i_lambda;
802 for( i=2; i>=0; i-- )
804 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
805 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
812 for( ; i<i_max; i++ )
815 int i_mode = predict_mode[i];
817 if( h->mb.b_lossless )
818 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
820 h->predict_8x8[i_mode]( p_dst_by, edge );
822 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
823 + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
825 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
826 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
830 if( idx == 3 || i_cost > i_satd_thresh )
833 /* we need to encode this block now (for next ones) */
834 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
835 x264_mb_encode_i8x8( h, idx, a->i_qp );
837 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
842 a->i_satd_i8x8 = i_cost;
/* Snapshot decoded pixels/nnz/cbp so RD can restore them if I8x8 loses. */
843 if( h->mb.i_skip_intra )
845 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
846 h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
847 h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
848 h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
849 h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
850 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
851 if( h->mb.i_skip_intra == 2 )
852 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
/* Aborted early: extrapolate the partial cost over the remaining blocks. */
857 static const uint16_t cost_div_fix8[3] = {1024,512,341};
858 a->i_satd_i8x8 = COST_MAX;
859 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
861 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
865 /* 4x4 prediction selection */
866 if( flags & X264_ANALYSE_I4x4 )
869 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
870 h->mb.i_cbp_luma = 0;
871 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
873 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
875 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
876 if( h->sh.i_type == SLICE_TYPE_B )
877 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
879 for( idx = 0;; idx++ )
881 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
882 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
883 int i_best = COST_MAX;
884 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
886 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
888 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
889 /* emulate missing topright samples */
890 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
892 if( b_merged_satd && i_max >= 6 )
895 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
896 satd[i_pred_mode] -= 3 * a->i_lambda;
897 for( i=2; i>=0; i-- )
898 COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
899 a->i_predict4x4[idx], i );
905 for( ; i<i_max; i++ )
908 int i_mode = predict_mode[i];
909 if( h->mb.b_lossless )
910 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
912 h->predict_4x4[i_mode]( p_dst_by );
914 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
915 p_src_by, FENC_STRIDE )
916 + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
918 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
922 if( i_cost > i_satd_thresh || idx == 15 )
925 /* we need to encode this block now (for next ones) */
926 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
927 x264_mb_encode_i4x4( h, idx, a->i_qp );
929 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
933 a->i_satd_i4x4 = i_cost;
/* Same snapshot trick as I8x8: save state so RD can restore I4x4 cheaply. */
934 if( h->mb.i_skip_intra )
936 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
937 h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
938 h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
939 h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
940 h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
941 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
942 if( h->mb.i_skip_intra == 2 )
943 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
947 a->i_satd_i4x4 = COST_MAX;
/* Re-score the SATD-selected intra candidates (I16x16, I4x4, I8x8) with full
 * RD cost, skipping any whose SATD already exceeds i_satd_thresh.  Costs are
 * replaced in-place in *a; losers are set to COST_MAX.
 * NOTE(review): fragment -- braces and the "else" lines preceding each
 * COST_MAX assignment fall in dropped lines. */
951 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
953 if( a->i_satd_i16x16 <= i_satd_thresh )
955 h->mb.i_type = I_16x16;
956 x264_analyse_update_cache( h, a );
957 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
960 a->i_satd_i16x16 = COST_MAX;
962 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
964 h->mb.i_type = I_4x4;
965 x264_analyse_update_cache( h, a );
966 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
969 a->i_satd_i4x4 = COST_MAX;
971 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
973 h->mb.i_type = I_8x8;
974 x264_analyse_update_cache( h, a );
975 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
/* Remember the cbp of the winning 8x8 encode for later refinement. */
976 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
979 a->i_satd_i8x8 = COST_MAX;
/* RD refinement of the already-chosen intra mode: re-test alternative
 * prediction modes (16x16, chroma, then per-block 4x4 or 8x8) under full RD
 * cost, keeping a mode only if it beats the incumbent.  SATD scores cached
 * during analysis are used as thresholds to skip hopeless candidates.
 * NOTE(review): heavy fragment -- braces, several local declarations (i, j,
 * idx, x, y, pels_v, i_nnz, p_dst_by) and some restore/save lines fall in
 * dropped lines. */
982 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
984 uint8_t *p_dst = h->mb.pic.p_fdec[0];
987 int i_max, i_mode, i_thresh;
988 uint64_t i_satd, i_best;
990 h->mb.i_skip_intra = 0;
992 if( h->mb.i_type == I_16x16 )
994 int old_pred_mode = a->i_predict16x16;
/* Only bother re-testing modes within ~12.5% of the incumbent's SATD. */
995 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
996 i_best = a->i_satd_i16x16;
997 predict_16x16_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
998 for( i = 0; i < i_max; i++ )
1000 int i_mode = predict_mode[i];
1001 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
1003 h->mb.i_intra16x16_pred_mode = i_mode;
1004 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
1005 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1009 /* RD selection for chroma prediction */
1010 predict_8x8chroma_mode_available( h->mb.i_neighbour_intra, predict_mode, &i_max );
1013 i_thresh = a->i_satd_i8x8chroma * 5/4;
/* Compact predict_mode[] down to the candidates worth an RD test. */
1015 for( i = j = 0; i < i_max; i++ )
1016 if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
1017 predict_mode[i] != a->i_predict8x8chroma )
1019 predict_mode[j++] = predict_mode[i];
1025 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1026 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1027 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1028 * coefs for the current chroma mode are still around, so we only
1029 * have to recount the bits. */
1030 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1031 for( i = 0; i < i_max; i++ )
1033 i_mode = predict_mode[i];
1034 if( h->mb.b_lossless )
1035 x264_predict_lossless_8x8_chroma( h, i_mode );
1038 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
1039 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
1041 /* if we've already found a mode that needs no residual, then
1042 * probably any mode with a residual will be worse.
1043 * so avoid dct on the remaining modes to improve speed. */
1044 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1045 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1047 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1048 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1052 if( h->mb.i_type == I_4x4 )
1054 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1056 for( idx = 0; idx < 16; idx++ )
1058 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1059 i_best = COST_MAX64;
1061 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
1063 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1064 /* emulate missing topright samples */
1065 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1067 for( i = 0; i < i_max; i++ )
1069 i_mode = predict_mode[i];
1070 if( h->mb.b_lossless )
1071 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1073 h->predict_4x4[i_mode]( p_dst_by );
1074 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1076 if( i_best > i_satd )
1078 a->i_predict4x4[idx] = i_mode;
/* Save the winning block's decoded pixels and nnz so they can be
 * restored after later (losing) candidates overwrite them. */
1080 pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
1081 pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
1082 pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
1083 pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
1084 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1088 *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
1089 *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
1090 *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
1091 *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
1092 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1094 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1097 else if( h->mb.i_type == I_8x8 )
1099 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1100 for( idx = 0; idx < 4; idx++ )
1102 uint64_t pels_h = 0;
1107 int cbp_luma_new = 0;
1108 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1110 i_best = COST_MAX64;
1114 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1115 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
1116 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1118 for( i = 0; i < i_max; i++ )
1120 i_mode = predict_mode[i];
1121 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1123 if( h->mb.b_lossless )
1124 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1126 h->predict_8x8[i_mode]( p_dst_by, edge );
1127 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1128 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1130 if( i_best > i_satd )
1132 a->i_predict8x8[idx] = i_mode;
1133 cbp_luma_new = h->mb.i_cbp_luma;
/* Only the bottom row / right column of the 8x8 block are needed for
 * neighbor prediction, so only those pixels are saved/restored. */
1136 pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
1138 for( j=0; j<7; j++ )
1139 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1140 i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
1141 i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
1144 a->i_cbp_i8x8_luma = cbp_luma_new;
1145 *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
1147 for( j=0; j<7; j++ )
1148 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1149 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
1150 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
1152 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* Fill an x264_me_t with encode-side (source) plane pointers and strides.
 * (xoff,yoff) is the luma pixel offset of the partition inside the MB;
 * the chroma offsets are halved, consistent with 4:2:0 subsampling. */
#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
/* Fill an x264_me_t with reference-frame plane pointers for one (list,ref).
 * Planes 0-3 share the luma stride (presumably full-pel plus half-pel luma
 * planes — TODO confirm against the frame layout); planes 4-5 use the chroma
 * stride with halved offsets. Also hooks up the integral-image pointer used
 * by the motion search. */
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];
/* Lambda-scaled bit cost of coding reference index 'ref' in list 'list',
 * looked up from the analysis context's precomputed tables. */
#define REF_COST(list, ref) \
    (a->p_cost_ref##list[ref])
/* P-frame 16x16 analysis: run a motion search against every list-0 reference
 * frame and keep the cheapest result in a->l0.me16x16. May early-terminate
 * the whole MB as P_SKIP when the best motion is cheap and nearly equal to
 * the predicted skip MV. */
static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
    ALIGNED_4( int16_t mvc[8][2] );
    int i_halfpel_thresh = INT_MAX;
    /* only use the halfpel early-out threshold when >1 reference exists */
    int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
    /* 16x16 Search on all ref frame */
    m.i_pixel = PIXEL_16x16;
    m.p_cost_mv = a->p_cost_mv;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
    a->l0.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
        const int i_ref_cost = REF_COST( 0, i_ref );
        /* bias the threshold by the ref cost so all refs compete fairly;
         * re-added after the search below */
        i_halfpel_thresh -= i_ref_cost;
        m.i_ref_cost = i_ref_cost;
        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
        /* early termination
         * SSD threshold would probably be better than SATD */
        && m.cost-m.cost_mv < 300*a->i_lambda
        && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
          + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
        && x264_macroblock_probe_pskip( h ) )
            /* residual cost is tiny and the MV is within 1 qpel of the skip
             * predictor: commit to P_SKIP and stop analysing */
            h->mb.i_type = P_SKIP;
            x264_analyse_update_cache( h, a );
            /* with sliced threads, MVs must stay inside the thread's rows */
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
        m.cost += i_ref_cost;
        i_halfpel_thresh += i_ref_cost;
        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
        /* save mv for predicting neighbors */
        *(uint32_t*)a->l0.mvc[i_ref][0] =
        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
    h->mb.i_type = P_L0;
        x264_mb_cache_fenc_satd( h );
    /* if the winner is ref 0 with exactly the pskip MV, evaluate the RD cost
     * and downgrade to P_SKIP when no coefficients survive */
    if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
        h->mb.i_partition = D_16x16;
        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
        if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
            h->mb.i_type = P_SKIP;
/* P-frame 8x8 analysis with per-partition reference selection: each of the
 * four 8x8 blocks searches all (possibly pruned) list-0 refs and keeps its
 * own best ref. Total cost accumulates into a->l0.i_cost8x8. */
static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int i_halfpel_thresh = INT_MAX;
    /* halfpel threshold deliberately disabled here (commented out) */
    int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
    int i_maxref = h->mb.pic.i_fref[0]-1;
    h->mb.i_partition = D_8x8;
    /* early termination: if 16x16 chose ref 0, then evalute no refs older
     * than those used by the neighbors */
    if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
        h->mb.i_mb_type_top && h->mb.i_mb_type_left )
        /* raise i_maxref to cover every ref used by the top/left neighbors
         * (scan8 offsets: top row and left column of this MB) */
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
    /* seed each ref's MV candidate list with its stored 16x16 MV */
    for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
        *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
    for( i = 0; i < 4; i++ )
        x264_me_t *l0m = &a->l0.me8x8[i];
        m.i_pixel = PIXEL_8x8;
        m.p_cost_mv = a->p_cost_mv;
        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
            const int i_ref_cost = REF_COST( 0, i_ref );
            i_halfpel_thresh -= i_ref_cost;
            m.i_ref_cost = i_ref_cost;
            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            /* cache the candidate ref so MV prediction sees it */
            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
            m.cost += i_ref_cost;
            i_halfpel_thresh += i_ref_cost;
            /* record this block's MV as a candidate for later partitions */
            *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P-frame 8x8 analysis using a single shared reference (the 16x16 winner):
 * each 8x8 block does one motion search against that ref. Cheaper than the
 * mixed-ref variant above. */
static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
    const int i_ref = a->l0.me16x16.i_ref;
    /* ref 0 is free to code under CAVLC P_8x8, so its cost is dropped */
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    /* seed the candidate list with the 16x16 MV */
    *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
    for( i = 0; i < 4; i++ )
        x264_me_t *m = &a->l0.me8x8[i];
        m->i_pixel = PIXEL_8x8;
        m->p_cost_mv = a->p_cost_mv;
        m->i_ref_cost = i_ref_cost;
        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );
        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
        /* append this block's MV to the candidates for the next blocks */
        *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P-frame 16x8 analysis: for each of the two 16x8 halves, search only the
 * refs already chosen by the corresponding pair of 8x8 blocks (1 or 2 refs),
 * reusing their MVs as search candidates. */
static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;
    for( i = 0; i < 2; i++ )
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
        /* skip the duplicate search when both 8x8 halves agreed on the ref */
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
        m.i_pixel = PIXEL_16x8;
        m.p_cost_mv = a->p_cost_mv;
        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
            const int i_ref = ref8[j];
            const int i_ref_cost = REF_COST( 0, i_ref );
            m.i_ref_cost = i_ref_cost;
            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            x264_me_search( h, &m, mvc, 3 );
            m.cost += i_ref_cost;
            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P-frame 8x16 analysis: mirror of the 16x8 case — each vertical half
 * searches only the refs chosen by its two stacked 8x8 blocks. */
static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;
    for( i = 0; i < 2; i++ )
        x264_me_t *l0m = &a->l0.me8x16[i];
        /* 8x8 blocks i and i+2 are the top/bottom of this vertical half */
        const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
        m.i_pixel = PIXEL_8x16;
        m.p_cost_mv = a->p_cost_mv;
        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
            const int i_ref = ref8[j];
            const int i_ref_cost = REF_COST( 0, i_ref );
            m.i_ref_cost = i_ref_cost;
            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            x264_me_search( h, &m, mvc, 3 );
            m.cost += i_ref_cost;
            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
/* Chroma cost of one 8x8 region under a sub-8x8 luma partitioning: performs
 * chroma MC for each sub-block's MV into a scratch buffer, then returns the
 * summed U+V mbcmp cost against the source. U goes in pix1, V in pix2
 * (interleaved in one 16-wide buffer, hence stride 16). */
static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
    ALIGNED_8( uint8_t pix1[16*8] );
    uint8_t *pix2 = pix1+8;
    const int i_stride = h->mb.pic.i_stride[1];
    /* chroma offsets of this 8x8 luma block: 4x4 chroma region (4:2:0) */
    const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
    const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
    /* MC one chroma sub-block from both chroma ref planes (U=4, V=5) */
    #define CHROMA4x4MC( width, height, me, x, y ) \
        h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
        h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );
    if( pixel == PIXEL_4x4 )
        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
    else if( pixel == PIXEL_8x4 )
        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
        /* remaining case: PIXEL_4x8 */
        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
    return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
         + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
/* Sub-8x8 analysis, 4x4 partitions: motion-search the four 4x4 blocks of
 * 8x8 block i8x8 against that block's chosen ref, seeded from the 8x8 MV.
 * Accumulates cost (plus ref and sub-partition signalling cost, and chroma
 * cost when chroma-ME is on) into a->l0.i_cost4x4[i8x8]. */
static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    for( i4x4 = 0; i4x4 < 4; i4x4++ )
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        /* only the first sub-block uses the 8x8 MV as an extra candidate */
        const int i_mvc = (i4x4 == 0);
        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
        m->i_pixel = PIXEL_4x4;
        m->p_cost_mv = a->p_cost_mv;
        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* Sub-8x8 analysis, 8x4 partitions: search the two 8x4 halves of 8x8 block
 * i8x8, seeding the first from the top-left 4x4 MV. Accumulates into
 * a->l0.i_cost8x4[i8x8]. */
static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    for( i8x4 = 0; i8x4 < 2; i8x4++ )
        /* 8x4 halves occupy 4x4-block indices idx and idx+1 */
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i8x4 == 0);
        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
        m->i_pixel = PIXEL_8x4;
        m->p_cost_mv = a->p_cost_mv;
        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* Sub-8x8 analysis, 4x8 partitions: vertical mirror of the 8x4 case.
 * Accumulates into a->l0.i_cost4x8[i8x8]. */
static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    for( i4x8 = 0; i4x8 < 2; i4x8++ )
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x8 == 0);
        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
        m->i_pixel = PIXEL_4x8;
        m->p_cost_mv = a->p_cost_mv;
        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* B-frame direct-mode cost: compares the already-reconstructed direct
 * prediction in fdec against the source, per 8x8 block and for the whole
 * 16x16, adding the mode-signalling lambda costs. */
static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    uint8_t **p_fdec = h->mb.pic.p_fdec;
    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    for( i = 0; i < 4; i++ )
        const int x = (i&1)*8;
        const int y = (i>>1)*8;
        /* per-8x8 cost is also folded into the 16x16 total */
        a->i_cost16x16direct +=
        a->i_cost8x8direct[i] =
            h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
        a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
/* Weighted biprediction average of two reference blocks into 'pix',
 * using the precomputed bipred weight for the current (l0,l1) ref pair. */
#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
    h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
/* B-frame 16x16 analysis: best L0 ref, best L1 ref, then the BI cost from
 * averaging both predictions. Ref costs are subtracted from the stored
 * single-list costs so smaller partitions can reuse them without re-adding. */
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
    ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
    ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
    uint8_t *src0, *src1;
    int stride0 = 16, stride1 = 16;
    ALIGNED_4( int16_t mvc[9][2] );
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
    /* 16x16 Search on all ref frame */
    m.i_pixel = PIXEL_16x16;
    m.p_cost_mv = a->p_cost_mv;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
    /* --- list 0 search over all refs --- */
    a->l0.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
        m.cost += REF_COST( 0, i_ref );
        if( m.cost < a->l0.me16x16.cost )
            a->l0.i_ref = i_ref;
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
        /* save mv for predicting neighbors */
        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
    /* subtract ref cost, so we don't have to add it for the other MB types */
    a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
    /* --- list 1 search over all refs --- */
    i_halfpel_thresh = INT_MAX;
    p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
    a->l1.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
        x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
        m.cost += REF_COST( 1, i_ref );
        if( m.cost < a->l1.me16x16.cost )
            a->l1.i_ref = i_ref;
            h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
        /* save mv for predicting neighbors */
        *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
    /* subtract ref cost, so we don't have to add it for the other MB types */
    a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
    /* Set global ref, needed for other modes? */
    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
    /* get cost of BI mode */
    src0 = h->mc.get_ref( pix0, &stride0,
                          h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
                          a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
    src1 = h->mc.get_ref( pix1, &stride1,
                          h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
                          a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                     + REF_COST( 0, a->l0.i_ref )
                     + REF_COST( 1, a->l1.i_ref )
                     + a->l0.me16x16.cost_mv
                     + a->l1.me16x16.cost_mv;
    /* mode-signalling lambda costs */
    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* Write the chosen sub-partition MVs of P-frame 8x8 block i into the MB MV
 * cache, dispatching on the sub-partition type chosen by analysis. */
static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
    /* (x,y) of this 8x8 block in 4x4-block units */
    const int x = 2*(i%2);
    const int y = 2*(i/2);
    switch( h->mb.i_sub_partition[i] )
            /* D_L0_8x8: one MV covers the whole 8x8 */
            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
            /* D_L0_8x4: two stacked 8x4 MVs */
            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
            /* D_L0_4x8: two side-by-side 4x8 MVs */
            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
            /* D_L0_4x4: four individual 4x4 MVs */
            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
            /* unreachable unless the sub-partition table is corrupted */
            x264_log( h, X264_LOG_ERROR, "internal error\n" );
/* Cache refs+MVs for a bidirectionally-partitioned block: for each list,
 * either store the search result (when the partition uses that list) or
 * mark the list unused (ref -1, zero MV/MVD). */
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    if( x264_mb_partition_listX_table[0][part] ) \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 0, 0 ); \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
    if( x264_mb_partition_listX_table[1][part] ) \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
        x264_macroblock_cache_mv(  h, x,y,dx,dy, 1, 0 ); \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
/* Cache MVs/refs for B-frame 8x8 block i: direct blocks reload the direct
 * prediction (and clear MVDs / set skip when b_mvd), others go through
 * CACHE_MV_BI with the chosen sub-partition. */
static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
        x264_mb_load_mv_direct8x8( h, i );
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 0, 0 );
            x264_macroblock_cache_mvd(  h, x, y, 2, 2, 1, 0 );
            x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
        CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Cache MVs/refs for B-frame 16x8 half i (top: i=0, bottom: i=1). */
static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Cache MVs/refs for B-frame 8x16 half i (left: i=0, right: i=1). */
static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B-frame 8x8 analysis: for each 8x8 block, search L0 and L1 (fixed refs
 * chosen by the 16x16 pass), compute the BI average cost, then pick the
 * cheapest of L0/L1/BI/direct per block. */
static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_8( uint8_t pix[2][8*8] );
    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    for( i = 0; i < 4; i++ )
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        for( l = 0; l < 2; l++ )
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x8[i];
            m->i_pixel = PIXEL_8x8;
            m->p_cost_mv = a->p_cost_mv;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            /* single candidate: this list's 16x16 MV */
            x264_me_search( h, m, &lX->me16x16.mv, 1 );
            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
            /* keep the fetched prediction for the BI average below */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8 );
            i_part_cost_bi += m->cost_mv;
            /* FIXME: ref cost */
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
        /* choose the cheapest sub-partition mode for this block */
        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;
        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B-frame 16x8 analysis: both halves search L0 and L1 (candidates come from
 * the corresponding 8x8 MVs), compute the BI cost, and choose L0/L1/BI per
 * half. The combined MB type is derived from the two halves' choices. */
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[2][2] );
    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;
    for( i = 0; i < 2; i++ )
        int i_part_cost_bi = 0;
        int stride[2] = {16,16};
        /* TODO: check only the list(s) that were used in b8x8? */
        for( l = 0; l < 2; l++ )
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me16x8[i];
            m->i_pixel = PIXEL_16x8;
            m->p_cost_mv = a->p_cost_mv;
            LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
            /* candidates: the two 8x8 MVs covering this half */
            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
            x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 16, 8 );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
        if( a->l1.me16x8[i].cost < i_part_cost )
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        a->i_cost16x8bi += i_part_cost;
        x264_mb_cache_mv_b16x8( h, a, i, 0 );
    /* derive the MB type from the per-half list choices (L0=0,L1=1,BI=2) */
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
/* B-frame 8x16 analysis: vertical mirror of the 16x8 case; uses the stacked
 * 8x8 MVs (blocks i and i+2) as search candidates for each half. */
static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_8( uint8_t pix[2][8*16] );
    ALIGNED_4( int16_t mvc[2][2] );
    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;
    for( i = 0; i < 2; i++ )
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        for( l = 0; l < 2; l++ )
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x16[i];
            m->i_pixel = PIXEL_8x16;
            m->p_cost_mv = a->p_cost_mv;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
            /* candidates: the two stacked 8x8 MVs of this half */
            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 16 );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;
        if( a->l1.me8x16[i].cost < i_part_cost )
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        a->i_cost8x16bi += i_part_cost;
        x264_mb_cache_mv_b8x16( h, a, i, 0 );
    /* derive the MB type from the per-half list choices (L0=0,L1=1,BI=2);
     * shares the 16x8 cost table */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
/* P-frame RD refinement: re-score the SATD-selected partitions with true RD
 * cost, but only those within 5/4 of the best SATD cost (candidates outside
 * the threshold are marked COST_MAX so they can never win). */
static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
    int thresh = i_satd * 5/4;
    h->mb.i_type = P_L0;
    /* 16x16 uses a looser 3/2 threshold and may already have an RD score
     * from the pskip check in p16x16 analysis */
    if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
        h->mb.i_partition = D_16x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    if( a->l0.i_cost16x8 <= thresh )
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->l0.i_cost16x8 = COST_MAX;
    if( a->l0.i_cost8x16 <= thresh )
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->l0.i_cost8x16 = COST_MAX;
    if( a->l0.i_cost8x8 <= thresh )
        h->mb.i_type = P_8x8;
        h->mb.i_partition = D_8x8;
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
             * for future blocks are those left over from previous RDO calls. */
            for( i = 0; i < 4; i++ )
                /* per-8x8: RD-score each sub-partition within 5/4 of the
                 * best SATD cost; D_L0_8x8 is forced if nothing qualified */
                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
                int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
                int subtype, btype = D_L0_8x8;
                uint64_t bcost = COST_MAX64;
                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
                    if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
                    h->mb.i_sub_partition[i] = subtype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                    COPY2_IF_LT( bcost, cost, btype, subtype );
                /* re-cache the winning sub-partition's MVs */
                h->mb.i_sub_partition[i] = btype;
                x264_mb_cache_mv_p8x8( h, a, i );
        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->l0.i_cost8x8 = COST_MAX;
/* B-frame RD refinement: RD-score each candidate mode (direct, L0, L1, BI,
 * 8x8, 16x8, 8x16) whose SATD cost is within the threshold and whose RD
 * score has not already been computed. */
static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
    /* slightly looser threshold when psy-RD is active */
    int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
        h->mb.i_type = B_DIRECT;
        /* Assumes direct/skip MC is still in fdec */
        /* Requires b-rdo to be done before intra analysis */
        h->mb.b_skip_mc = 1;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
        h->mb.b_skip_mc = 0;
    //FIXME not all the update_cache calls are needed
    h->mb.i_partition = D_16x16;
    if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
        h->mb.i_type = B_L0_L0;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
        h->mb.i_type = B_L1_L1;
        x264_analyse_update_cache( h, a );
        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
        h->mb.i_type = B_BI_BI;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
        h->mb.i_type = B_8x8;
        h->mb.i_partition = D_8x8;
        x264_analyse_update_cache( h, a );
        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
        /* clear skip flags that the 8x8 cache pass may have set */
        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
    if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
        h->mb.i_type = a->i_mb_type16x8;
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
    if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
        h->mb.i_type = a->i_mb_type8x16;
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* SATD-based bidirectional refinement: for whatever partition the analysis
 * settled on, jointly re-optimize the L0/L1 MV pairs of every sub-partition
 * that ended up bi-predicted. Intra MBs have no MVs to refine, so bail out.
 * (The switch's case labels are elided in this view; each branch corresponds
 * to one value of h->mb.i_partition.) */
2180 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
/* Bipred weight for the current reference pair, used by all refinements. */
2182 const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2185 if( IS_INTRA(h->mb.i_type) )
2188 switch( h->mb.i_partition )
2191 if( h->mb.i_type == B_BI_BI )
2192 x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
2195 for( i=0; i<2; i++ )
2196 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2197 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2200 for( i=0; i<2; i++ )
2201 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2202 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2205 for( i=0; i<4; i++ )
2206 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2207 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
/* Cheap (non-RD) 8x8-vs-4x4 transform decision for the final mode:
 * compare SA8D (proxy for 8x8 transform cost) against SATD (proxy for 4x4)
 * on the luma prediction residual and pick the smaller.
 * Only runs when 8x8 transform is both allowed and enabled, and not in
 * lossless mode. */
2212 static inline void x264_mb_analyse_transform( x264_t *h )
2214 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2216 int i_cost4, i_cost8;
2217 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2220 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2221 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2222 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2223 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2225 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
/* The MC performed for this comparison is final; let encode skip redoing it. */
2226 h->mb.b_skip_mc = 1;
/* RD-based transform-size decision: flip b_transform_8x8, measure full RD,
 * and keep whichever transform size gives the lower RD cost (tie keeps the
 * flipped size, since *i_rd >= i_rd8 accepts equality).
 * On acceptance, *i_rd is updated and *i_satd is rescaled proportionally so
 * later SATD-based comparisons stay consistent with the new RD score. */
2230 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2232 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2235 x264_analyse_update_cache( h, a );
2236 h->mb.b_transform_8x8 ^= 1;
2237 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2238 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2240 if( *i_rd >= i_rd8 )
/* Scale SATD by the RD improvement ratio (64-bit intermediate avoids overflow). */
2243 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
/* Rejected: flip back to the original transform size. */
2247 h->mb.b_transform_8x8 ^= 1;
2251 /* Rate-distortion optimal QP selection.
2252 * FIXME: More than half of the benefit of this function seems to be
2253 * in the way it improves the coding of chroma DC (by decimating or
2254 * finding a better way to code a single DC coefficient.)
2255 * There must be a more efficient way to get that portion of the benefit
2256 * without doing full QP-RD, but RD-decimation doesn't seem to do the
/* Hill-climb the MB QP in both directions from the predicted QP, measuring
 * full RD at each step; allow a bounded number of non-improving steps
 * (more slack with psy-RD / when moving toward the previous MB's QP),
 * always also try the previous MB's QP, then commit the best QP found. */
2258 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2260 int bcost, cost, direction, failures, prevcost, origcost;
2261 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2262 int last_qp_tried = 0;
2263 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2265 /* If CBP is already zero, don't raise the quantizer any higher. */
2266 for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2268 /* Without psy-RD, require monotonicity when moving quant away from previous
2269 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2270 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2271 * allow 2 failures when moving quant towards previous quant.
2272 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2273 int threshold = (!!h->mb.i_psy_rd);
2274 /* Raise the threshold for failures if we're moving towards the last QP. */
2275 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2276 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
/* Restart each direction from the original QP. */
2278 h->mb.i_qp = orig_qp;
2280 prevcost = origcost;
2281 h->mb.i_qp += direction;
2282 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
/* The previous MB's QP is handled separately below. */
2284 if( h->mb.i_last_qp == h->mb.i_qp )
/* Chroma QP must be kept in sync with the luma QP under test. */
2286 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2287 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2288 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2290 /* We can't assume that the costs are monotonic over QPs.
2291 * Tie case-as-failure seems to give better results. */
2292 if( cost < prevcost )
2298 if( failures > threshold )
/* Once CBP hits zero, raising QP further can't reduce the rate. */
2300 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2302 h->mb.i_qp += direction;
2306 /* Always try the last block's QP. */
2307 if( !last_qp_tried )
2309 h->mb.i_qp = h->mb.i_last_qp;
2310 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2311 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2312 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
/* Commit the best QP (elided line presumably assigns h->mb.i_qp = bqp). */
2316 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2318 /* Check transform again; decision from before may no longer be optimal. */
2319 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2320 x264_mb_transform_8x8_allowed( h ) )
2322 h->mb.b_transform_8x8 ^= 1;
2323 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2325 h->mb.b_transform_8x8 ^= 1;
2329 /*****************************************************************************
2330 * x264_macroblock_analyse:
/* Top-level per-macroblock mode decision. Chooses QP, then dispatches on
 * slice type: I slices do intra-only analysis; P slices do P_SKIP probing
 * followed by inter/intra SATD analysis and optional RD refinement; B slices
 * do B_SKIP/direct probing followed by bidirectional analysis. Finally
 * updates the MB cache and decides transform size / trellis settings. */
2331 *****************************************************************************/
2332 void x264_macroblock_analyse( x264_t *h )
2334 x264_mb_analysis_t analysis;
2335 int i_cost = COST_MAX;
/* QP selection: ratecontrol baseline, then per-MB adaptive quant. */
2338 h->mb.i_qp = x264_ratecontrol_qp( h );
2339 if( h->param.rc.i_aq_mode )
2341 x264_adaptive_quant( h );
2342 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2343 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2344 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2345 h->mb.i_qp = h->mb.i_last_qp;
2348 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2350 /*--------------------------- Do the analysis ---------------------------*/
2351 if( h->sh.i_type == SLICE_TYPE_I )
2353 if( analysis.i_mbrd )
2354 x264_mb_cache_fenc_satd( h );
2355 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2356 if( analysis.i_mbrd )
2357 x264_intra_rd( h, &analysis, COST_MAX );
/* Pick the cheapest intra mode; I_PCM wins only if even cheaper. */
2359 i_cost = analysis.i_satd_i16x16;
2360 h->mb.i_type = I_16x16;
2361 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2362 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2363 if( analysis.i_satd_pcm < i_cost )
2364 h->mb.i_type = I_PCM;
2366 else if( analysis.i_mbrd >= 2 )
2367 x264_intra_rd_refine( h, &analysis );
2369 else if( h->sh.i_type == SLICE_TYPE_P )
2373 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2375 /* Fast P_SKIP detection */
2376 analysis.b_try_pskip = 0;
2377 if( h->param.analyse.b_fast_pskip )
2379 if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2380 // FIXME don't need to check this if the reference frame is done
2382 else if( h->param.analyse.i_subpel_refine >= 3 )
2383 analysis.b_try_pskip = 1;
/* Only probe pskip early when a neighbor was P_SKIP. */
2384 else if( h->mb.i_mb_type_left == P_SKIP ||
2385 h->mb.i_mb_type_top == P_SKIP ||
2386 h->mb.i_mb_type_topleft == P_SKIP ||
2387 h->mb.i_mb_type_topright == P_SKIP )
2388 b_skip = x264_macroblock_probe_pskip( h );
2391 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
/* Skip taken: nothing else to analyse. */
2395 h->mb.i_type = P_SKIP;
2396 h->mb.i_partition = D_16x16;
2397 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
/* Full inter analysis path. */
2401 const unsigned int flags = h->param.analyse.inter;
2405 int i_satd_inter, i_satd_intra;
2407 x264_mb_analyse_load_costs( h, &analysis );
2409 x264_mb_analyse_inter_p16x16( h, &analysis );
/* 16x16 analysis may itself have detected a skip. */
2411 if( h->mb.i_type == P_SKIP )
2414 if( flags & X264_ANALYSE_PSUB16x16 )
2416 if( h->param.analyse.b_mixed_references )
2417 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2419 x264_mb_analyse_inter_p8x8( h, &analysis );
2422 /* Select best inter mode */
2424 i_partition = D_16x16;
2425 i_cost = analysis.l0.me16x16.cost;
2427 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2428 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2431 i_partition = D_8x8;
2432 i_cost = analysis.l0.i_cost8x8;
/* 8x8 won; optionally try splitting each 8x8 into 8x4/4x8/4x4. */
2435 if( flags & X264_ANALYSE_PSUB8x8 )
2437 for( i = 0; i < 4; i++ )
2439 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2440 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2442 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2443 h->mb.i_sub_partition[i] = D_L0_4x4;
2445 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2446 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2447 h->mb.i_sub_partition[i], D_L0_8x4 );
2449 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2450 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2451 h->mb.i_sub_partition[i], D_L0_4x8 );
2453 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2455 x264_mb_cache_mv_p8x8( h, &analysis, i );
2457 analysis.l0.i_cost8x8 = i_cost;
2461 /* Now do 16x8/8x16 */
/* Threshold derived from the MV costs of the two diagonal 8x8 blocks. */
2462 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2463 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2464 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2466 x264_mb_analyse_inter_p16x8( h, &analysis );
2467 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2469 x264_mb_analyse_inter_p8x16( h, &analysis );
2470 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2473 h->mb.i_partition = i_partition;
2476 //FIXME mb_type costs?
/* Subpel (qpel) refinement of the winning partition's MVs.
 * With mbrd, refinement is handled later in the RD stage instead. */
2477 if( analysis.i_mbrd )
2481 else if( i_partition == D_16x16 )
2483 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2484 i_cost = analysis.l0.me16x16.cost;
2486 else if( i_partition == D_16x8 )
2488 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2489 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2490 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2492 else if( i_partition == D_8x16 )
2494 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2495 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2496 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2498 else if( i_partition == D_8x8 )
2502 for( i8x8 = 0; i8x8 < 4; i8x8++ )
/* Refine each 8x8 according to its sub-partition (case labels elided). */
2504 switch( h->mb.i_sub_partition[i8x8] )
2507 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2508 i_cost += analysis.l0.me8x8[i8x8].cost;
2511 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2512 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2513 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2514 analysis.l0.me8x4[i8x8][1].cost;
2517 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2518 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2519 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2520 analysis.l0.me4x8[i8x8][1].cost;
2524 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2525 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2526 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2527 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2528 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2529 analysis.l0.me4x4[i8x8][1].cost +
2530 analysis.l0.me4x4[i8x8][2].cost +
2531 analysis.l0.me4x4[i8x8][3].cost;
2534 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
/* Intra candidates for the inter/intra decision. With chroma ME, fold the
 * intra-chroma cost into each intra luma cost so the comparison is fair. */
2540 if( h->mb.b_chroma_me )
2542 x264_mb_analyse_intra_chroma( h, &analysis );
2543 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2544 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2545 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2546 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2549 x264_mb_analyse_intra( h, &analysis, i_cost );
2551 i_satd_inter = i_cost;
2552 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2553 analysis.i_satd_i8x8,
2554 analysis.i_satd_i4x4 );
/* RD stage: re-score candidate partitions with true RD costs. */
2556 if( analysis.i_mbrd )
2558 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2560 i_partition = D_16x16;
2561 i_cost = analysis.l0.i_rd16x16;
2562 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2563 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2564 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2565 h->mb.i_type = i_type;
2566 h->mb.i_partition = i_partition;
2567 if( i_cost < COST_MAX )
2568 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2569 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
/* Final inter-vs-intra decision. */
2572 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2573 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2574 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2575 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2577 h->mb.i_type = i_type;
/* mbrd>=2: qpel-RD refinement of the winning mode. */
2579 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2581 if( IS_INTRA( h->mb.i_type ) )
2583 x264_intra_rd_refine( h, &analysis );
2585 else if( i_partition == D_16x16 )
2587 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2588 analysis.l0.me16x16.cost = i_cost;
2589 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2591 else if( i_partition == D_16x8 )
2593 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2594 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2595 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2596 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2597 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2598 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2600 else if( i_partition == D_8x16 )
2602 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2603 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2604 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2605 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2606 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2607 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2609 else if( i_partition == D_8x8 )
2612 x264_analyse_update_cache( h, &analysis );
2613 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2615 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2617 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2619 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2621 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2622 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2624 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2626 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2627 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2629 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2631 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2632 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2633 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2634 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2641 else if( h->sh.i_type == SLICE_TYPE_B )
2643 int i_bskip_cost = COST_MAX;
2646 if( analysis.i_mbrd )
2647 x264_mb_cache_fenc_satd( h );
2649 h->mb.i_type = B_SKIP;
2650 if( h->mb.b_direct_auto_write )
2652 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2653 for( i = 0; i < 2; i++ )
2656 h->sh.b_direct_spatial_mv_pred ^= 1;
2657 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2658 if( analysis.b_direct_available )
2663 b_skip = x264_macroblock_probe_bskip( h );
2665 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2672 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
/* B_SKIP decision: with mbrd use an SSD threshold, otherwise probe. */
2674 if( analysis.b_direct_available )
2676 if( !h->mb.b_direct_auto_write )
2678 if( analysis.i_mbrd )
2680 i_bskip_cost = ssd_mb( h );
2681 /* 6 = minimum cavlc cost of a non-skipped MB */
2682 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2684 else if( !h->mb.b_direct_auto_write )
2686 /* Conditioning the probe on neighboring block types
2687 * doesn't seem to help speed or quality. */
2688 b_skip = x264_macroblock_probe_bskip( h );
/* Full B-frame inter analysis path. */
2694 const unsigned int flags = h->param.analyse.inter;
2698 h->mb.b_skip_mc = 0;
2700 x264_mb_analyse_load_costs( h, &analysis );
2702 /* select best inter mode */
2703 /* direct must be first */
2704 if( analysis.b_direct_available )
2705 x264_mb_analyse_inter_direct( h, &analysis );
2707 x264_mb_analyse_inter_b16x16( h, &analysis );
2710 i_partition = D_16x16;
2711 i_cost = analysis.l0.me16x16.cost;
2712 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2713 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2714 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
/* Early B_SKIP exit: if skip's SSD beats every RD-tested 16x16 mode. */
2716 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2718 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2719 if( i_bskip_cost < analysis.i_rd16x16direct &&
2720 i_bskip_cost < analysis.i_rd16x16bi &&
2721 i_bskip_cost < analysis.l0.i_rd16x16 &&
2722 i_bskip_cost < analysis.l1.i_rd16x16 )
2724 h->mb.i_type = B_SKIP;
2725 x264_analyse_update_cache( h, &analysis );
2730 if( flags & X264_ANALYSE_BSUB16x16 )
2732 x264_mb_analyse_inter_b8x8( h, &analysis );
2733 if( analysis.i_cost8x8bi < i_cost )
2736 i_partition = D_8x8;
2737 i_cost = analysis.i_cost8x8bi;
/* Try 16x8/8x16 only when the 8x8 sub-partitions suggest mergeability. */
2739 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2740 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2742 x264_mb_analyse_inter_b16x8( h, &analysis );
2743 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2744 i_type, analysis.i_mb_type16x8,
2745 i_partition, D_16x8 );
2747 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2748 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2750 x264_mb_analyse_inter_b8x16( h, &analysis );
2751 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2752 i_type, analysis.i_mb_type8x16,
2753 i_partition, D_8x16 );
/* Subpel refinement for B modes (skipped when mbrd handles it later). */
2758 if( analysis.i_mbrd )
2763 else if( i_partition == D_16x16 )
/* Strip the mode-cost term before refining, re-add after. */
2765 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2766 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2767 if( i_type == B_L0_L0 )
2769 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2770 i_cost = analysis.l0.me16x16.cost
2771 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2773 else if( i_type == B_L1_L1 )
2775 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2776 i_cost = analysis.l1.me16x16.cost
2777 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2779 else if( i_type == B_BI_BI )
2781 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2782 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2785 else if( i_partition == D_16x8 )
2787 for( i=0; i<2; i++ )
2789 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2790 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2791 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2792 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2795 else if( i_partition == D_8x16 )
2797 for( i=0; i<2; i++ )
2799 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2800 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2801 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2802 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2805 else if( i_partition == D_8x8 )
2807 for( i=0; i<4; i++ )
2810 int i_part_cost_old;
2812 int i_part_type = h->mb.i_sub_partition[i];
2813 int b_bidir = (i_part_type == D_BI_8x8);
2815 if( i_part_type == D_DIRECT_8x8 )
/* Refine whichever lists this sub-partition uses (L0 and/or L1). */
2817 if( x264_mb_partition_listX_table[0][i_part_type] )
2819 m = &analysis.l0.me8x8[i];
2820 i_part_cost_old = m->cost;
2821 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2822 m->cost -= i_type_cost;
2823 x264_me_refine_qpel( h, m );
2825 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2827 if( x264_mb_partition_listX_table[1][i_part_type] )
2829 m = &analysis.l1.me8x8[i];
2830 i_part_cost_old = m->cost;
2831 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2832 m->cost -= i_type_cost;
2833 x264_me_refine_qpel( h, m );
2835 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2837 /* TODO: update mvp? */
2841 i_satd_inter = i_cost;
/* B RD stage: re-rank all candidate modes by true RD cost. */
2843 if( analysis.i_mbrd )
2845 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2847 i_cost = i_bskip_cost;
2848 i_partition = D_16x16;
2849 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2850 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2851 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2852 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2853 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2854 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2855 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2857 h->mb.i_type = i_type;
2858 h->mb.i_partition = i_partition;
/* Intra candidates and final inter/intra decision for B slices. */
2861 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2863 if( analysis.i_mbrd )
2865 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2866 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2869 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2870 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2871 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2872 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2874 h->mb.i_type = i_type;
2875 h->mb.i_partition = i_partition;
2877 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2878 x264_intra_rd_refine( h, &analysis );
2879 if( h->mb.i_subpel_refine >= 5 )
2880 x264_refine_bidir( h, &analysis );
/* mbrd>=2: qpel/bidir RD refinement for inter B modes
 * (i_type strictly between B_DIRECT and B_SKIP). */
2882 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2884 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2885 x264_analyse_update_cache( h, &analysis );
2887 if( i_partition == D_16x16 )
2889 if( i_type == B_L0_L0 )
2891 analysis.l0.me16x16.cost = i_cost;
2892 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2894 else if( i_type == B_L1_L1 )
2896 analysis.l1.me16x16.cost = i_cost;
2897 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2899 else if( i_type == B_BI_BI )
2900 x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
2902 else if( i_partition == D_16x8 )
2904 for( i = 0; i < 2; i++ )
2906 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2907 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2908 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2909 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2910 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2911 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2912 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2915 else if( i_partition == D_8x16 )
2917 for( i = 0; i < 2; i++ )
2919 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2920 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2921 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2922 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2923 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2924 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2925 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2928 else if( i_partition == D_8x8 )
2930 for( i = 0; i < 4; i++ )
2932 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
2933 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
2934 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
2935 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
2936 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2937 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
/* Final bookkeeping common to all slice types. */
2944 x264_analyse_update_cache( h, &analysis );
2946 /* In rare cases we can end up qpel-RDing our way back to a larger partition size
2947 * without realizing it. Check for this and account for it if necessary. */
2948 if( analysis.i_mbrd >= 2 )
2950 /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
2951 static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
2952 int list = check_mv_lists[h->mb.i_type] - 1;
/* If the first and last 4x4 blocks share mv+ref, all partitions collapsed
 * to the same motion — promote back to D_16x16. */
2953 if( list >= 0 && h->mb.i_partition != D_16x16 &&
2954 *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[0]] == *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[12]] &&
2955 h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
2956 h->mb.i_partition = D_16x16;
2959 if( !analysis.i_mbrd )
2960 x264_mb_analyse_transform( h );
2962 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
2963 x264_mb_analyse_qp_rd( h, &analysis );
2965 h->mb.b_trellis = h->param.analyse.i_trellis;
2966 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
2967 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
2968 x264_psy_trellis_init( h, 0 );
2969 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
2970 h->mb.i_skip_intra = 0;
2973 /*-------------------- Update MB from the analysis ----------------------*/
/* Commit the decisions held in the analysis struct into the MB cache
 * (prediction modes for intra; refs and MVs for inter) so that encoding and
 * RD costing see the chosen mode. Dispatches on h->mb.i_type, then on
 * h->mb.i_partition (case labels are elided in this view). Also performs a
 * sanity check, under threaded encoding, that no MV points below the
 * completed region of its reference frame; on violation it logs diagnostics
 * and falls back to I_16x16 intra. */
2974 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
2978 switch( h->mb.i_type )
/* Intra types: store per-block prediction modes + chroma mode. */
2981 for( i = 0; i < 16; i++ )
2982 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
2984 x264_mb_analyse_intra_chroma( h, a );
2987 for( i = 0; i < 4; i++ )
2988 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
2990 x264_mb_analyse_intra_chroma( h, a );
2993 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
2994 x264_mb_analyse_intra_chroma( h, a );
/* P_L0: fill ref/MV cache for the selected partition shape. */
3001 switch( h->mb.i_partition )
3004 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
3005 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3009 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
3010 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
3011 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
3012 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
3016 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
3017 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
3018 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
3019 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
3023 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
/* P_8x8: refs per 8x8 block, MVs via the per-block helper. */
3029 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3030 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3031 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3032 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3033 for( i = 0; i < 4; i++ )
3034 x264_mb_cache_mv_p8x8( h, a, i );
/* P_SKIP: whole MB uses ref 0 and the predicted skip MV. */
3039 h->mb.i_partition = D_16x16;
3040 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3041 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
/* B direct/skip: load direct-predicted MVs per 8x8 block. */
3047 x264_mb_load_mv_direct8x8( h, 0 );
3048 x264_mb_load_mv_direct8x8( h, 1 );
3049 x264_mb_load_mv_direct8x8( h, 2 );
3050 x264_mb_load_mv_direct8x8( h, 3 );
3054 /* optimize: cache might not need to be rewritten */
3055 for( i = 0; i < 4; i++ )
3056 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3059 default: /* the rest of the B types */
3060 switch( h->mb.i_partition )
3063 switch( h->mb.i_type )
/* B_L0_L0: L0 gets real ref/MV, L1 is marked unused (ref -1, mv/mvd 0). */
3066 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3067 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3069 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3070 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3071 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
/* B_L1_L1: mirror image — L0 unused, L1 real. */
3074 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3075 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3076 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3078 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3079 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
/* B_BI_BI: both lists populated. */
3082 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3083 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3085 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3086 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3091 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3092 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3095 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3096 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3099 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
/* Threaded-encoding sanity check: verify the bottom-most MV row does not
 * reference lines of the ref frame that another thread hasn't finished. */
3105 if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
3108 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3111 int ref = h->mb.cache.ref[l][x264_scan8[0]];
3114 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
3115 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3117 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3118 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3119 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3120 h->mb.cache.mv[l][x264_scan8[15]][0],
3121 h->mb.cache.mv[l][x264_scan8[15]][1] );
3122 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3123 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3124 fprintf(stderr, "completed: %d \n", completed );
3125 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
/* Recovery: redo intra analysis and force I_16x16 so no MV is emitted. */
3126 x264_mb_analyse_intra( h, a, COST_MAX );
3127 h->mb.i_type = I_16x16;
3128 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3129 x264_mb_analyse_intra_chroma( h, a );
3136 #include "slicetype.c"