1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
31 #include "common/common.h"
32 #include "common/cpu.h"
33 #include "macroblock.h"
35 #include "ratecontrol.h"
/* NOTE(review): the opening `typedef struct` lines of both structures below are
 * elided in this view; only member tails are visible. Do not assume field order
 * beyond what is shown. */
48 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
49 ALIGNED_4( int16_t mvc[32][5][2] );
53 int i_cost4x4[4]; /* cost per 8x8 partition */
54 x264_me_t me4x4[4][4];
57 int i_cost8x4[4]; /* cost per 8x8 partition */
58 x264_me_t me8x4[4][2];
61 int i_cost4x8[4]; /* cost per 8x8 partition */
62 x264_me_t me4x8[4][2];
/* per-list (L0/L1) motion analysis scratch state */
72 } x264_mb_analysis_list_t;
76 /* conduct the analysis using this lambda and QP */
81 uint16_t *p_cost_ref0;
82 uint16_t *p_cost_ref1;
87 /* Take some shortcuts in intra search if intra is deemed unlikely */
/* I: Intra part — SATD scores per candidate prediction mode */
93 int i_satd_i16x16_dir[7];
98 int i_satd_i8x8_dir[12][4];
102 int i_predict4x4[16];
107 int i_satd_i8x8chroma;
108 int i_satd_i8x8chroma_dir[4];
109 int i_predict8x8chroma;
111 /* II: Inter part P/B frame */
112 x264_mb_analysis_list_t l0;
113 x264_mb_analysis_list_t l1;
115 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
116 int i_cost16x16direct;
118 int i_cost8x8direct[4];
127 int i_mb_partition16x8[2]; /* mb_partition_e */
128 int i_mb_partition8x16[2];
129 int i_mb_type16x8; /* mb_class_e */
132 int b_direct_available;
/* full per-macroblock analysis context, reset for each MB */
134 } x264_mb_analysis_t;
136 /* lambda = pow(2,qp/6-2) */
/* Mode-decision lambda per QP (0..51); values are the rounded formula above.
 * NOTE(review): the closing brace of this initializer is elided in this view. */
137 const int x264_lambda_tab[52] = {
138 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
139 1, 1, 1, 1, /* 8-11 */
140 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
141 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
142 6, 7, 8, 9,10,11,13,14, /* 28-35 */
143 16,18,20,23,25,29,32,36, /* 36-43 */
144 40,45,51,57,64,72,81,91 /* 44-51 */
147 /* lambda2 = pow(lambda,2) * .9 * 256 */
/* RD lambda (squared, fixed-point *256) per QP, used for rate-distortion costs.
 * NOTE(review): closing brace elided in this view. */
148 const int x264_lambda2_tab[52] = {
149 14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
150 91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
151 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
152 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
153 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
154 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
155 943718, 1189010, 1498059, 1887436 /* 48 - 51 */
/* Fractional part of 2^x: lut[i] ~= (2^(i/64) - 1) * 256, for fast fixed-point exp2.
 * NOTE(review): closing brace elided in this view. */
158 const uint8_t x264_exp2_lut[64] = {
159 1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 34, 37, 40, 44, 47,
160 50, 53, 57, 60, 64, 67, 71, 74, 78, 81, 85, 89, 93, 96, 100, 104,
161 108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
162 177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
/* Fractional part of log2: lut[i] = log2(1 + i/128), tabulated for fast log2.
 * NOTE(review): closing brace elided in this view. */
165 const float x264_log2_lut[128] = {
166 0.00000, 0.01123, 0.02237, 0.03342, 0.04439, 0.05528, 0.06609, 0.07682,
167 0.08746, 0.09803, 0.10852, 0.11894, 0.12928, 0.13955, 0.14975, 0.15987,
168 0.16993, 0.17991, 0.18982, 0.19967, 0.20945, 0.21917, 0.22882, 0.23840,
169 0.24793, 0.25739, 0.26679, 0.27612, 0.28540, 0.29462, 0.30378, 0.31288,
170 0.32193, 0.33092, 0.33985, 0.34873, 0.35755, 0.36632, 0.37504, 0.38370,
171 0.39232, 0.40088, 0.40939, 0.41785, 0.42626, 0.43463, 0.44294, 0.45121,
172 0.45943, 0.46761, 0.47573, 0.48382, 0.49185, 0.49985, 0.50779, 0.51570,
173 0.52356, 0.53138, 0.53916, 0.54689, 0.55459, 0.56224, 0.56986, 0.57743,
174 0.58496, 0.59246, 0.59991, 0.60733, 0.61471, 0.62205, 0.62936, 0.63662,
175 0.64386, 0.65105, 0.65821, 0.66534, 0.67243, 0.67948, 0.68650, 0.69349,
176 0.70044, 0.70736, 0.71425, 0.72110, 0.72792, 0.73471, 0.74147, 0.74819,
177 0.75489, 0.76155, 0.76818, 0.77479, 0.78136, 0.78790, 0.79442, 0.80090,
178 0.80735, 0.81378, 0.82018, 0.82655, 0.83289, 0.83920, 0.84549, 0.85175,
179 0.85798, 0.86419, 0.87036, 0.87652, 0.88264, 0.88874, 0.89482, 0.90087,
180 0.90689, 0.91289, 0.91886, 0.92481, 0.93074, 0.93664, 0.94251, 0.94837,
181 0.95420, 0.96000, 0.96578, 0.97154, 0.97728, 0.98299, 0.98868, 0.99435,
184 /* Avoid an int/float conversion. */
/* Maps a count-leading-zeros result (0..31) to the integer part of log2,
 * stored as float so callers add it to x264_log2_lut without a conversion. */
185 const float x264_log2_lz_lut[32] = {
186 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
189 // should the intra and inter lambdas be different?
190 // I'm just matching the behaviour of deadzone quant.
/* Per-QP squared lambdas for trellis quantization: [0] = inter, [1] = intra. */
191 static const int x264_trellis_lambda2_tab[2][52] = {
192 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
193 { 46, 58, 73, 92, 117, 147,
194 185, 233, 294, 370, 466, 587,
195 740, 932, 1174, 1480, 1864, 2349,
196 2959, 3728, 4697, 5918, 7457, 9395,
197 11837, 14914, 18790, 23674, 29828, 37581,
198 47349, 59656, 75163, 94699, 119313, 150326,
199 189399, 238627, 300652, 378798, 477255, 601304,
200 757596, 954511, 1202608, 1515192, 1909022, 2405217,
201 3030384, 3818045, 4810435, 6060769 },
202 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
203 { 27, 34, 43, 54, 68, 86,
204 108, 136, 172, 216, 273, 343,
205 433, 545, 687, 865, 1090, 1374,
206 1731, 2180, 2747, 3461, 4361, 5494,
207 6922, 8721, 10988, 13844, 17442, 21976,
208 27688, 34885, 43953, 55377, 69771, 87906,
209 110755, 139543, 175813, 221511, 279087, 351627,
210 443023, 558174, 703255, 886046, 1116348, 1406511,
211 1772093, 2232697, 2813022, 3544186 }
/* Chroma lambda2 scaling (fixed-point, 256 = 1.0) indexed by the luma/chroma QP
 * difference plus 12; used when psy is enabled (see x264_mb_analyse_init).
 * NOTE(review): closing brace and possibly trailing entries elided in this view. */
214 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
215 16, 20, 25, 32, 40, 50,
216 64, 80, 101, 128, 161, 203,
217 256, 322, 406, 512, 645, 812,
218 1024, 1290, 1625, 2048, 2580, 3250,
219 4096, 5160, 6501, 8192, 10321, 13003,
220 16384, 20642, 26007, 32768, 41285, 52015,
224 /* TODO: calculate CABAC costs */
/* CAVLC bit-cost prefixes (in bits, multiplied by lambda at use sites) for
 * macroblock / sub-macroblock types in B and P slices.
 * NOTE(review): closing braces and the 4-entry initializer of
 * i_sub_mb_p_cost_table are elided in this view. */
225 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
226 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
228 static const int i_mb_b16x8_cost_table[17] = {
229 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
231 static const int i_sub_mb_b_cost_table[13] = {
232 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
234 static const int i_sub_mb_p_cost_table[4] = {
/* Forward declaration: commits the chosen analysis result into the MB cache. */
238 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
240 /* Indexed by lambda instead of qp because, due to rounding,
241 * some quantizers share lambdas. This saves memory. */
/* Lazily-allocated full-pel MV cost tables (one per lambda, per qpel phase)
 * and precomputed reference-index costs; filled by x264_mb_analyse_load_costs. */
242 uint16_t *x264_cost_mv_fpel[92][4];
243 uint16_t x264_cost_ref[92][3][33];
245 /* initialize an array of lambda*nbits for all possible mvs */
/* Lazily builds (and caches, keyed by lambda) the per-component MV bit-cost
 * table and the ref-index cost tables, then points `a` at them. Returns an
 * error status on allocation failure (CHECKED_MALLOC's error path is elided
 * in this view). NOTE(review): opening brace and several declarations/braces
 * are elided from this dump. */
246 static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
248 static int16_t *p_cost_mv[92];
251 if( !p_cost_mv[a->i_lambda] )
254 /* could be faster, but isn't called many times */
255 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
256 CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) );
/* recentre the pointer so indices -2*4*2048 .. +2*4*2048 are valid */
257 p_cost_mv[a->i_lambda] += 2*4*2048;
258 for( i = 0; i <= 2*4*2048; i++ )
260 p_cost_mv[a->i_lambda][-i] =
261 p_cost_mv[a->i_lambda][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
263 for( i = 0; i < 3; i++ )
264 for( j = 0; j < 33; j++ )
265 x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
267 a->p_cost_mv = p_cost_mv[a->i_lambda];
268 a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
269 a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
271 /* FIXME is this useful for all me methods? */
272 if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
276 CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) );
277 x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
/* NOTE(review): half-open bound leaves index +2*2048 (the last allocated
 * entry) unwritten — confirm against upstream whether this is intentional. */
278 for( i = -2*2048; i < 2*2048; i++ )
279 x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
/* Resets the per-macroblock analysis context for the given QP: derives lambdas,
 * trellis/psy settings, motion-vector search ranges (clamped to frame borders
 * and, in threaded encodes, to how much of each reference frame is decoded),
 * initializes all partition costs to COST_MAX, and decides whether intra modes
 * can be mostly skipped ("fast intra"). NOTE(review): many braces and a few
 * statements are elided from this dump; annotations below cover visible code only. */
287 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
289 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
291 /* mbrd == 1 -> RD mode decision */
292 /* mbrd == 2 -> RD refinement */
293 /* mbrd == 3 -> QPRD */
294 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
296 /* conduct the analysis using this lambda and QP */
297 a->i_qp = h->mb.i_qp = i_qp;
298 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
300 a->i_lambda = x264_lambda_tab[i_qp];
301 a->i_lambda2 = x264_lambda2_tab[i_qp];
303 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
304 if( h->param.analyse.i_trellis )
/* [luma/chroma][inter/intra] trellis lambdas, from the appropriate QP */
306 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
307 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
308 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
309 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
311 h->mb.i_psy_rd_lambda = a->i_lambda;
312 /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
313 h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
315 h->mb.i_me_method = h->param.analyse.i_me_method;
316 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
317 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
318 && h->mb.i_subpel_refine >= 5;
320 h->mb.b_transform_8x8 = 0;
321 h->mb.b_noise_reduction = 0;
/* invalidate cached chroma intra score from any previous MB */
327 a->i_satd_i8x8chroma = COST_MAX;
329 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
330 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
/* NOTE(review): the left-hand side of this assignment is elided in this dump */
334 h->mb.b_lossless ? 0 :
336 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
338 /* II: Inter part P/B frame */
339 if( h->sh.i_type != SLICE_TYPE_I )
342 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
343 // limit motion search to a slightly smaller range than the theoretical limit,
344 // since the search may go a few iterations past its given range
345 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
347 /* Calculate max allowed MV range */
348 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
349 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
350 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
351 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
352 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
353 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
354 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
/* vertical range only needs recomputing at the start of each MB row */
355 if( h->mb.i_mb_x == 0)
357 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
358 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
359 int thread_mvy_range = i_fmv_range;
361 if( h->param.i_threads > 1 )
363 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
364 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
/* wait until every reference frame (both lists for B slices) has been
 * reconstructed far enough down to cover the allowed search range */
365 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
367 x264_frame_t **fref = i ? h->fref1 : h->fref0;
368 int i_ref = i ? h->i_ref1 : h->i_ref0;
369 for( j=0; j<i_ref; j++ )
371 x264_frame_cond_wait( fref[j], thresh );
372 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
375 if( h->param.b_deterministic )
376 thread_mvy_range = h->param.analyse.i_mv_range_thread;
377 if( h->mb.b_interlaced )
378 thread_mvy_range >>= 1;
381 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
382 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
/* NOTE(review): upper bound here is i_fmv_range, not i_fmv_range-1 as in
 * CLIP_FMV — confirm against upstream whether this asymmetry is intended */
383 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
384 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
385 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
386 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
387 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
/* invalidate all partition costs so analysis recomputes them */
393 a->l0.i_cost8x8 = COST_MAX;
395 for( i = 0; i < 4; i++ )
399 a->l0.i_cost4x8[i] = COST_MAX;
403 a->l0.i_cost8x16 = COST_MAX;
404 if( h->sh.i_type == SLICE_TYPE_B )
408 a->l1.i_cost8x8 = COST_MAX;
410 for( i = 0; i < 4; i++ )
415 a->i_cost8x8direct[i] = COST_MAX;
426 a->i_cost16x16direct =
429 a->i_cost8x16bi = COST_MAX;
432 /* Fast intra decision */
/* only after a few MBs so the frame statistics below are meaningful */
433 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
435 if( IS_INTRA( h->mb.i_mb_type_left )
436 || IS_INTRA( h->mb.i_mb_type_top )
437 || IS_INTRA( h->mb.i_mb_type_topleft )
438 || IS_INTRA( h->mb.i_mb_type_topright )
439 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
440 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
441 { /* intra is likely */ }
/* Fills mode[] with the 16x16 intra prediction modes that are legal given the
 * availability of neighbouring macroblocks; *pi_count receives the mode count
 * (the assignments to *pi_count are elided in this dump, as are the braces of
 * the if/else ladder). */
457 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
459 int b_top = i_neighbour & MB_TOP;
460 int b_left = i_neighbour & MB_LEFT;
461 if( b_top && b_left )
463 /* top and left available */
464 *mode++ = I_PRED_16x16_V;
465 *mode++ = I_PRED_16x16_H;
466 *mode++ = I_PRED_16x16_DC;
468 if( i_neighbour & MB_TOPLEFT )
470 /* top left available */
471 *mode++ = I_PRED_16x16_P;
/* left only */
478 *mode++ = I_PRED_16x16_DC_LEFT;
479 *mode++ = I_PRED_16x16_H;
/* top only */
485 *mode++ = I_PRED_16x16_DC_TOP;
486 *mode++ = I_PRED_16x16_V;
/* no neighbours: only the fixed-128 DC mode is legal */
492 *mode = I_PRED_16x16_DC_128;
/* Same as predict_16x16_mode_available but for the 8x8 chroma prediction modes;
 * *pi_count receives the mode count (its assignments and the if/else braces
 * are elided in this dump). */
498 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
500 int b_top = i_neighbour & MB_TOP;
501 int b_left = i_neighbour & MB_LEFT;
502 if( b_top && b_left )
504 /* top and left available */
505 *mode++ = I_PRED_CHROMA_V;
506 *mode++ = I_PRED_CHROMA_H;
507 *mode++ = I_PRED_CHROMA_DC;
509 if( i_neighbour & MB_TOPLEFT )
511 /* top left available */
512 *mode++ = I_PRED_CHROMA_P;
/* left only */
519 *mode++ = I_PRED_CHROMA_DC_LEFT;
520 *mode++ = I_PRED_CHROMA_H;
/* top only */
526 *mode++ = I_PRED_CHROMA_DC_TOP;
527 *mode++ = I_PRED_CHROMA_V;
/* no neighbours: only the fixed-128 DC mode is legal */
533 *mode = I_PRED_CHROMA_DC_128;
/* Fills mode[] with the legal 4x4 (also used for 8x8) intra prediction modes
 * for one sub-block, given its neighbour availability; *pi_count receives the
 * count (its assignments and the if/else braces are elided in this dump). */
539 static void predict_4x4_mode_available( unsigned int i_neighbour,
540 int *mode, int *pi_count )
542 int b_top = i_neighbour & MB_TOP;
543 int b_left = i_neighbour & MB_LEFT;
544 if( b_top && b_left )
547 *mode++ = I_PRED_4x4_DC;
548 *mode++ = I_PRED_4x4_H;
549 *mode++ = I_PRED_4x4_V;
550 *mode++ = I_PRED_4x4_DDL;
/* the diagonal-down-right family additionally needs the topleft sample */
551 if( i_neighbour & MB_TOPLEFT )
553 *mode++ = I_PRED_4x4_DDR;
554 *mode++ = I_PRED_4x4_VR;
555 *mode++ = I_PRED_4x4_HD;
558 *mode++ = I_PRED_4x4_VL;
559 *mode++ = I_PRED_4x4_HU;
/* left only */
563 *mode++ = I_PRED_4x4_DC_LEFT;
564 *mode++ = I_PRED_4x4_H;
565 *mode++ = I_PRED_4x4_HU;
/* top only */
570 *mode++ = I_PRED_4x4_DC_TOP;
571 *mode++ = I_PRED_4x4_V;
572 *mode++ = I_PRED_4x4_DDL;
573 *mode++ = I_PRED_4x4_VL;
/* no neighbours: only the fixed-128 DC mode is legal */
578 *mode++ = I_PRED_4x4_DC_128;
583 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
/* Precomputes zigzag-scanned DCT coefficients of the source block (against a
 * zero prediction) for psy-trellis; results go into h->mb.pic.fenc_dct8/dct4. */
584 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
586 ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
587 ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
588 ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
591 if( do_both_dct || h->mb.b_transform_8x8 )
593 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
594 for( i = 0; i < 4; i++ )
595 h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
597 if( do_both_dct || !h->mb.b_transform_8x8 )
599 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
600 for( i = 0; i < 16; i++ )
601 h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
605 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
/* Caches per-4x4 SATD and per-8x8 SA8D of the source macroblock (measured
 * against zero, DC contribution removed) plus their sums, for later psy-RD
 * comparisons. Early-outs when psy-RD is disabled (the return statement is
 * elided in this dump). */
606 static inline void x264_mb_cache_fenc_satd( x264_t *h )
608 ALIGNED_16( static uint8_t zero[16] ) = {0};
610 int x, y, satd_sum = 0, sa8d_sum = 0;
611 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
612 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
613 if( !h->mb.i_psy_rd )
615 for( y = 0; y < 4; y++ )
616 for( x = 0; x < 4; x++ )
618 fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
/* SATD minus half the SAD ~= AC-only SATD (removes the DC term) */
619 h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
620 - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
621 satd_sum += h->mb.pic.fenc_satd[y][x];
623 for( y = 0; y < 2; y++ )
624 for( x = 0; x < 2; x++ )
626 fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
627 h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
628 - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
629 sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
631 h->mb.pic.fenc_satd_sum = satd_sum;
632 h->mb.pic.fenc_sa8d_sum = sa8d_sum;
/* Selects the best 8x8 chroma intra prediction mode by SATD + mode-bit cost.
 * Uses the merged x3 SATD primitive when all four basic modes are available,
 * otherwise falls back to testing each candidate mode individually. Caches the
 * result in a->i_satd_i8x8chroma so repeat calls on the same MB early-out
 * (the early return is elided in this dump, as are several braces). */
635 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
641 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
643 uint8_t *p_dstc[2], *p_srcc[2];
645 if( a->i_satd_i8x8chroma < COST_MAX )
648 /* 8x8 prediction selection for chroma */
649 p_dstc[0] = h->mb.pic.p_fdec[1];
650 p_dstc[1] = h->mb.pic.p_fdec[2];
651 p_srcc[0] = h->mb.pic.p_fenc[1];
652 p_srcc[1] = h->mb.pic.p_fenc[2];
654 predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
655 a->i_satd_i8x8chroma = COST_MAX;
656 if( i_max == 4 && b_merged_satd )
/* fast path: x3 primitive scores V/H/DC at once; planar is done separately */
658 int satdu[4], satdv[4];
659 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
660 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
661 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
662 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
663 satdu[I_PRED_CHROMA_P] =
664 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
665 satdv[I_PRED_CHROMA_P] =
666 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
668 for( i=0; i<i_max; i++ )
670 int i_mode = predict_mode[i];
671 int i_satd = satdu[i_mode] + satdv[i_mode]
672 + a->i_lambda * bs_size_ue(i_mode);
674 a->i_satd_i8x8chroma_dir[i] = i_satd;
675 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
/* slow path: predict and score each candidate mode for both chroma planes */
680 for( i=0; i<i_max; i++ )
683 int i_mode = predict_mode[i];
685 /* we do the prediction */
686 if( h->mb.b_lossless )
687 x264_predict_lossless_8x8_chroma( h, i_mode );
690 h->predict_8x8c[i_mode]( p_dstc[0] );
691 h->predict_8x8c[i_mode]( p_dstc[1] );
694 /* we calculate the cost */
695 i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
696 p_srcc[0], FENC_STRIDE ) +
697 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
698 p_srcc[1], FENC_STRIDE ) +
699 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
701 a->i_satd_i8x8chroma_dir[i] = i_satd;
702 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
706 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
/* Full luma intra analysis: evaluates I16x16 always, then I8x8 and I4x4 when
 * enabled by the analysis flags, storing the best SATD cost and mode choice of
 * each partition size in `a`. i_satd_inter is the best inter cost so far and is
 * used for early termination thresholds. NOTE(review): many braces, early
 * `return`s and some declarations are elided from this dump; annotations
 * below cover the visible code only. */
709 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
711 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
712 uint8_t *p_src = h->mb.pic.p_fenc[0];
713 uint8_t *p_dst = h->mb.pic.p_fdec[0];
718 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
720 /*---------------- Try all mode and calculate their score ---------------*/
722 /* 16x16 prediction selection */
723 predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
725 if( b_merged_satd && i_max == 4 )
/* fast path: V/H/DC scored at once; planar separately */
727 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
728 h->predict_16x16[I_PRED_16x16_P]( p_dst );
729 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
730 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
733 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
734 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
/* slow path: predict and score each available 16x16 mode */
739 for( i = 0; i < i_max; i++ )
742 int i_mode = predict_mode[i];
744 if( h->mb.b_lossless )
745 x264_predict_lossless_16x16( h, i_mode );
747 h->predict_16x16[i_mode]( p_dst );
749 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
750 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
751 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
752 a->i_satd_i16x16_dir[i_mode] = i_satd;
756 if( h->sh.i_type == SLICE_TYPE_B )
757 /* cavlc mb type prefix */
758 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
/* fast-intra bailout: intra clearly losing to inter, skip I8x8/I4x4 */
759 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
762 /* 8x8 prediction selection */
763 if( flags & X264_ANALYSE_I8x8 )
765 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
766 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
767 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
769 h->mb.i_cbp_luma = 0;
770 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
772 // FIXME some bias like in i4x4?
773 if( h->sh.i_type == SLICE_TYPE_B )
774 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
/* exits via the break below when all 4 blocks are done or threshold exceeded */
776 for( idx = 0;; idx++ )
780 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
781 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
782 int i_best = COST_MAX;
783 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
785 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
786 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
788 if( b_merged_satd && i_max == 9 )
/* the predicted mode costs fewer bits: 1 vs 4 bits -> bias of 3*lambda */
791 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
792 satd[i_pred_mode] -= 3 * a->i_lambda;
793 for( i=2; i>=0; i-- )
795 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
796 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
803 for( ; i<i_max; i++ )
806 int i_mode = predict_mode[i];
808 if( h->mb.b_lossless )
809 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
811 h->predict_8x8[i_mode]( p_dst_by, edge );
813 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
814 + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
816 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
817 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
821 if( idx == 3 || i_cost > i_satd_thresh )
824 /* we need to encode this block now (for next ones) */
825 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
826 x264_mb_encode_i8x8( h, idx, a->i_qp );
828 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* all 4 blocks were evaluated: record the cost and snapshot decode state */
833 a->i_satd_i8x8 = i_cost;
834 if( h->mb.i_skip_intra )
836 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
837 h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
838 h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
839 h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
840 h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
841 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
842 if( h->mb.i_skip_intra == 2 )
843 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
/* aborted early: extrapolate the partial cost to a full-MB estimate */
848 static const uint16_t cost_div_fix8[3] = {1024,512,341};
849 a->i_satd_i8x8 = COST_MAX;
850 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
852 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
856 /* 4x4 prediction selection */
857 if( flags & X264_ANALYSE_I4x4 )
860 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
861 h->mb.i_cbp_luma = 0;
862 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
/* relax the threshold slightly unless fast-intra is active */
864 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
866 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
867 if( h->sh.i_type == SLICE_TYPE_B )
868 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
870 for( idx = 0;; idx++ )
872 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
873 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
874 int i_best = COST_MAX;
875 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
877 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
879 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
880 /* emulate missing topright samples */
881 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
883 if( b_merged_satd && i_max >= 6 )
886 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
887 satd[i_pred_mode] -= 3 * a->i_lambda;
888 for( i=2; i>=0; i-- )
889 COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
890 a->i_predict4x4[idx], i );
896 for( ; i<i_max; i++ )
899 int i_mode = predict_mode[i];
900 if( h->mb.b_lossless )
901 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
903 h->predict_4x4[i_mode]( p_dst_by );
905 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
906 p_src_by, FENC_STRIDE )
907 + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
909 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
913 if( i_cost > i_satd_thresh || idx == 15 )
916 /* we need to encode this block now (for next ones) */
917 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
918 x264_mb_encode_i4x4( h, idx, a->i_qp );
920 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
/* all 16 blocks were evaluated: record the cost and snapshot decode state */
924 a->i_satd_i4x4 = i_cost;
925 if( h->mb.i_skip_intra )
927 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
928 h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
929 h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
930 h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
931 h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
932 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
933 if( h->mb.i_skip_intra == 2 )
934 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
938 a->i_satd_i4x4 = COST_MAX;
/* Replaces the SATD scores of the surviving intra modes (I16x16/I4x4/I8x8)
 * with true RD costs, but only for candidates within i_satd_thresh; the rest
 * are disqualified by setting their score to COST_MAX. Braces of the if/else
 * pairs are elided in this dump. */
942 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
944 if( a->i_satd_i16x16 <= i_satd_thresh )
946 h->mb.i_type = I_16x16;
947 x264_analyse_update_cache( h, a );
948 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
951 a->i_satd_i16x16 = COST_MAX;
953 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
955 h->mb.i_type = I_4x4;
956 x264_analyse_update_cache( h, a );
957 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
960 a->i_satd_i4x4 = COST_MAX;
962 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
964 h->mb.i_type = I_8x8;
965 x264_analyse_update_cache( h, a );
966 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
/* remember the luma CBP chosen by the I8x8 RD pass for the refine stage */
967 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
970 a->i_satd_i8x8 = COST_MAX;
/* RD refinement of an already-chosen intra macroblock: re-tests alternative
 * prediction modes (within a SATD-based threshold of the winner) using true RD
 * cost, for the luma partition that won (I16x16/I4x4/I8x8) and for chroma.
 * For the per-block (4x4/8x8) cases, the winning block's decoded pixels and
 * non-zero-count cache entries are saved/restored so later blocks predict from
 * the best candidate. NOTE(review): many braces, loops over x/y and some
 * declarations are elided from this dump; annotations cover visible code only. */
973 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
975 uint8_t *p_dst = h->mb.pic.p_fdec[0];
978 int i_max, i_mode, i_thresh;
979 uint64_t i_satd, i_best;
981 h->mb.i_skip_intra = 0;
983 if( h->mb.i_type == I_16x16 )
985 int old_pred_mode = a->i_predict16x16;
/* only modes within 9/8 of the winner's SATD are worth an RD evaluation */
986 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
987 i_best = a->i_satd_i16x16;
988 predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
989 for( i = 0; i < i_max; i++ )
991 int i_mode = predict_mode[i];
992 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
994 h->mb.i_intra16x16_pred_mode = i_mode;
995 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
996 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
1000 /* RD selection for chroma prediction */
1001 predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
1004 i_thresh = a->i_satd_i8x8chroma * 5/4;
/* compact predict_mode[] to the candidates worth re-testing */
1006 for( i = j = 0; i < i_max; i++ )
1007 if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
1008 predict_mode[i] != a->i_predict8x8chroma )
1010 predict_mode[j++] = predict_mode[i];
1016 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
1017 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
1018 /* the previous thing encoded was x264_intra_rd(), so the pixels and
1019 * coefs for the current chroma mode are still around, so we only
1020 * have to recount the bits. */
1021 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
1022 for( i = 0; i < i_max; i++ )
1024 i_mode = predict_mode[i];
1025 if( h->mb.b_lossless )
1026 x264_predict_lossless_8x8_chroma( h, i_mode );
1029 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
1030 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
1032 /* if we've already found a mode that needs no residual, then
1033 * probably any mode with a residual will be worse.
1034 * so avoid dct on the remaining modes to improve speed. */
1035 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
1036 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
1038 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
1039 h->mb.i_cbp_chroma = i_cbp_chroma_best;
1043 if( h->mb.i_type == I_4x4 )
1045 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1047 for( idx = 0; idx < 16; idx++ )
1049 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1050 i_best = COST_MAX64;
1052 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
1054 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1055 /* emulate missing topright samples */
1056 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1058 for( i = 0; i < i_max; i++ )
1060 i_mode = predict_mode[i];
1061 if( h->mb.b_lossless )
1062 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1064 h->predict_4x4[i_mode]( p_dst_by );
1065 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1067 if( i_best > i_satd )
/* new winner: snapshot its decoded pixels and nnz for later restore */
1069 a->i_predict4x4[idx] = i_mode;
1071 pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
1072 pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
1073 pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
1074 pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
1075 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
/* restore the winner's state so subsequent blocks predict from it */
1079 *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
1080 *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
1081 *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
1082 *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
1083 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1085 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1088 else if( h->mb.i_type == I_8x8 )
1090 ALIGNED_ARRAY_16( uint8_t, edge,[33] );
1091 for( idx = 0; idx < 4; idx++ )
1093 uint64_t pels_h = 0;
1098 int cbp_luma_new = 0;
1099 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1101 i_best = COST_MAX64;
1105 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1106 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
1107 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1109 for( i = 0; i < i_max; i++ )
1111 i_mode = predict_mode[i];
1112 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1114 if( h->mb.b_lossless )
1115 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1117 h->predict_8x8[i_mode]( p_dst_by, edge );
1118 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1119 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1121 if( i_best > i_satd )
/* new winner: snapshot its bottom row, right column and nnz */
1123 a->i_predict8x8[idx] = i_mode;
1124 cbp_luma_new = h->mb.i_cbp_luma;
1127 pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
1129 for( j=0; j<7; j++ )
1130 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1131 i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
1132 i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
/* restore the winner's edge pixels/nnz for subsequent blocks' prediction */
1135 a->i_cbp_i8x8_luma = cbp_luma_new;
1136 *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
1138 for( j=0; j<7; j++ )
1139 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1140 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
1141 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
1143 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* LOAD_FENC: point an x264_me_t at the encode-side (source) pixels of the
 * partition at luma offset (xoff,yoff) within the current macroblock.
 * Chroma offsets are halved ( >>1 ), consistent with 4:2:0 subsampling. */
1148 #define LOAD_FENC( m, src, xoff, yoff) \
1149 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1150 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1151 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1152 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1153 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
/* LOAD_HPELS: point an x264_me_t at the reference planes of (list,ref) for
 * the partition at (xoff,yoff).  Planes 0-3 use the luma stride, planes 4-5
 * use the chroma stride with halved offsets; also sets the integral-image
 * pointer used by some motion-estimation methods.
 * NOTE(review): plane roles (fpel/half-pel luma vs. chroma) follow from the
 * stride/offset arithmetic here; exact plane layout is defined elsewhere. */
1155 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1156 (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1157 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1158 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1159 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1160 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1161 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1162 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];
/* REF_COST: lambda-scaled bit cost of coding reference index 'ref' in the
 * given list; reads the analysis context's per-list cost tables, so it is
 * only valid inside functions that have 'a' in scope. */
1164 #define REF_COST(list, ref) \
1165 (a->p_cost_ref##list[ref])
/* P 16x16 analysis: motion-search every list-0 reference frame, keep the
 * cheapest result in a->l0.me16x16, cache the chosen ref/MV, and try an
 * early P_SKIP exit when the ref-0 result lands (within 1 qpel) on the
 * predicted skip MV with a small residual cost.
 * NOTE(review): this excerpt has interior lines elided (the embedded
 * original line numbers are discontinuous) — in particular the head of the
 * skip-test condition and the loop braces are not visible here. */
1167 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1171 ALIGNED_4( int16_t mvc[8][2] );
1172 int i_halfpel_thresh = INT_MAX;
1173 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1175 /* 16x16 Search on all ref frame */
1176 m.i_pixel = PIXEL_16x16;
1177 m.p_cost_mv = a->p_cost_mv;
1178 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1180 a->l0.me16x16.cost = INT_MAX;
1181 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1183 const int i_ref_cost = REF_COST( 0, i_ref );
/* exclude the per-ref bits from the halfpel threshold; re-added below */
1184 i_halfpel_thresh -= i_ref_cost;
1185 m.i_ref_cost = i_ref_cost;
1188 /* search with ref */
1189 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1190 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1191 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1192 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1194 /* early termination
1195 * SSD threshold would probably be better than SATD */
1198 && m.cost-m.cost_mv < 300*a->i_lambda
1199 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1200 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1201 && x264_macroblock_probe_pskip( h ) )
1203 h->mb.i_type = P_SKIP;
1204 x264_analyse_update_cache( h, a );
1205 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1209 m.cost += i_ref_cost;
1210 i_halfpel_thresh += i_ref_cost;
1212 if( m.cost < a->l0.me16x16.cost )
1213 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1215 /* save mv for predicting neighbors */
1216 *(uint32_t*)a->l0.mvc[i_ref][0] =
1217 *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1220 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
1221 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1223 h->mb.i_type = P_L0;
1226 x264_mb_cache_fenc_satd( h );
/* if the best 16x16 result coincides with the skip predictor on ref 0,
 * pre-compute its RD cost now while the cache already matches */
1227 if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
1229 h->mb.i_partition = D_16x16;
1230 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1231 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* P 8x8 analysis with per-partition reference selection: each of the four
 * 8x8 blocks is searched over refs 0..i_maxref and may pick its own ref.
 * Sums the four costs into a->l0.i_cost8x8 and forces all sub-partitions
 * to D_L0_8x8.
 * NOTE(review): interior lines are elided in this excerpt (embedded
 * numbering is discontinuous).  The neighbor-based clamp below presumably
 * resets i_maxref before the X264_MAX chain (otherwise the chain could only
 * raise it) — confirm against the full file. */
1236 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1240 uint8_t **p_fenc = h->mb.pic.p_fenc;
1241 int i_halfpel_thresh = INT_MAX;
/* halfpel threshold deliberately disabled here (commented-out condition) */
1242 int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1244 int i_maxref = h->mb.pic.i_fref[0]-1;
1246 h->mb.i_partition = D_8x8;
1248 /* early termination: if 16x16 chose ref 0, then evalute no refs older
1249 * than those used by the neighbors */
1250 if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
1251 h->mb.i_mb_type_top && h->mb.i_mb_type_left )
1254 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
1255 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
1256 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
1257 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
1258 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
1259 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
/* seed each ref's MV candidate list with that ref's cached 16x16 MV */
1262 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1263 *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
1265 for( i = 0; i < 4; i++ )
1267 x264_me_t *l0m = &a->l0.me8x8[i];
1271 m.i_pixel = PIXEL_8x8;
1272 m.p_cost_mv = a->p_cost_mv;
1274 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1275 l0m->cost = INT_MAX;
1276 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1278 const int i_ref_cost = REF_COST( 0, i_ref );
1279 i_halfpel_thresh -= i_ref_cost;
1280 m.i_ref_cost = i_ref_cost;
1283 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
/* ref must be cached before predicting the MV for this partition */
1284 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1285 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1286 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1288 m.cost += i_ref_cost;
1289 i_halfpel_thresh += i_ref_cost;
1290 *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
1292 if( m.cost < l0m->cost )
1293 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1295 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1296 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1299 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1302 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1303 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1304 /* P_8x8 ref0 has no ref cost */
1305 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1306 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1307 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1308 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1309 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 8x8 analysis with a single shared reference: all four 8x8 partitions
 * reuse the reference chosen by the 16x16 search, seeded with its MV.
 * Cheaper than the mixed-ref variant above.  (Some interior lines are
 * elided in this excerpt.) */
1312 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1314 const int i_ref = a->l0.me16x16.i_ref;
/* ref 0 without cabac costs nothing to code, hence the 0 */
1315 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1316 uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
1317 uint8_t **p_fenc = h->mb.pic.p_fenc;
1319 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1322 /* XXX Needed for x264_mb_predict_mv */
1323 h->mb.i_partition = D_8x8;
1326 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
1328 for( i = 0; i < 4; i++ )
1330 x264_me_t *m = &a->l0.me8x8[i];
1334 m->i_pixel = PIXEL_8x8;
1335 m->p_cost_mv = a->p_cost_mv;
1336 m->i_ref_cost = i_ref_cost;
1339 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1340 LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
1341 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1342 x264_me_search( h, m, mvc, i_mvc );
1344 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
/* feed this partition's MV to later partitions as a candidate */
1346 *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
1350 m->cost += i_ref_cost;
1351 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1354 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1355 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1356 /* theoretically this should include 4*ref_cost,
1357 * but 3 seems a better approximation of cabac. */
1358 if( h->param.b_cabac )
1359 a->l0.i_cost8x8 -= i_ref_cost;
1360 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1361 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 16x8 analysis: for each half, try only the reference(s) chosen by the
 * corresponding pair of 8x8 partitions (1 search if both agreed, else 2),
 * using the 16x16 and 8x8 MVs as candidates.  (Some interior lines are
 * elided in this excerpt.) */
1364 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1367 uint8_t **p_fenc = h->mb.pic.p_fenc;
1368 ALIGNED_4( int16_t mvc[3][2] );
1371 /* XXX Needed for x264_mb_predict_mv */
1372 h->mb.i_partition = D_16x8;
1374 for( i = 0; i < 2; i++ )
1376 x264_me_t *l0m = &a->l0.me16x8[i];
1377 const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
1378 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1380 m.i_pixel = PIXEL_16x8;
1381 m.p_cost_mv = a->p_cost_mv;
1383 LOAD_FENC( &m, p_fenc, 0, 8*i );
1384 l0m->cost = INT_MAX;
1385 for( j = 0; j < i_ref8s; j++ )
1387 const int i_ref = ref8[j];
1388 const int i_ref_cost = REF_COST( 0, i_ref );
1389 m.i_ref_cost = i_ref_cost;
1392 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1393 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1394 *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
1395 *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
1397 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1398 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1399 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1400 x264_me_search( h, &m, mvc, 3 );
1402 m.cost += i_ref_cost;
1404 if( m.cost < l0m->cost )
1405 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1407 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1408 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1411 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P 8x16 analysis: vertical-half twin of the 16x8 case — candidate refs
 * come from the vertically adjacent 8x8 partitions (i and i+2).  (Some
 * interior lines are elided in this excerpt.) */
1414 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1417 uint8_t **p_fenc = h->mb.pic.p_fenc;
1418 ALIGNED_4( int16_t mvc[3][2] );
1421 /* XXX Needed for x264_mb_predict_mv */
1422 h->mb.i_partition = D_8x16;
1424 for( i = 0; i < 2; i++ )
1426 x264_me_t *l0m = &a->l0.me8x16[i];
1427 const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
1428 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1430 m.i_pixel = PIXEL_8x16;
1431 m.p_cost_mv = a->p_cost_mv;
1433 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1434 l0m->cost = INT_MAX;
1435 for( j = 0; j < i_ref8s; j++ )
1437 const int i_ref = ref8[j];
1438 const int i_ref_cost = REF_COST( 0, i_ref );
1439 m.i_ref_cost = i_ref_cost;
1442 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1443 *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
1444 *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
1446 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1447 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1448 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1449 x264_me_search( h, &m, mvc, 3 );
1451 m.cost += i_ref_cost;
1453 if( m.cost < l0m->cost )
1454 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
1456 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1457 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1460 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
/* Chroma cost for sub-8x8 luma partitions: motion-compensate the two chroma
 * planes (U into pix1, V into pix2, interleaved in one 16-wide buffer) with
 * each sub-partition's MV, then score both 4x4 chroma blocks against the
 * encode pixels.  Returns the summed U+V mbcmp cost.
 * 'or'/'oe' are the chroma block offsets into reference/encode planes. */
1463 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1465 ALIGNED_8( uint8_t pix1[16*8] );
1466 uint8_t *pix2 = pix1+8;
1467 const int i_stride = h->mb.pic.i_stride[1];
1468 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1469 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
1471 #define CHROMA4x4MC( width, height, me, x, y ) \
1472 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
1473 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );
1475 if( pixel == PIXEL_4x4 )
1477 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
1478 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
1479 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
1480 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
1482 else if( pixel == PIXEL_8x4 )
1484 CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
1485 CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
/* remaining case: PIXEL_4x8 */
1489 CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
1490 CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
1493 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1494 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
/* P 4x4 analysis within 8x8 block i8x8: search each of the four 4x4
 * sub-blocks (first one seeded with the enclosing 8x8 MV), then sum costs
 * plus ref and sub-partition signalling bits into a->l0.i_cost4x4[i8x8];
 * optionally adds chroma cost when chroma ME is enabled. */
1497 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1499 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1500 uint8_t **p_fenc = h->mb.pic.p_fenc;
1501 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1504 /* XXX Needed for x264_mb_predict_mv */
1505 h->mb.i_partition = D_8x8;
1507 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1509 const int idx = 4*i8x8 + i4x4;
1510 const int x4 = block_idx_x[idx];
1511 const int y4 = block_idx_y[idx];
/* only the first sub-block uses the 8x8 MV as an extra candidate */
1512 const int i_mvc = (i4x4 == 0);
1514 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1516 m->i_pixel = PIXEL_4x4;
1517 m->p_cost_mv = a->p_cost_mv;
1519 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1520 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1522 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1523 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1525 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1527 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1528 a->l0.me4x4[i8x8][1].cost +
1529 a->l0.me4x4[i8x8][2].cost +
1530 a->l0.me4x4[i8x8][3].cost +
1531 REF_COST( 0, i_ref ) +
1532 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1533 if( h->mb.b_chroma_me )
1534 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* P 8x4 analysis within 8x8 block i8x8: two 8x4 searches (first seeded with
 * the top-left 4x4 MV), cost accumulated into a->l0.i_cost8x4[i8x8] with
 * ref and sub-partition bits; optional chroma cost. */
1537 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1539 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1540 uint8_t **p_fenc = h->mb.pic.p_fenc;
1541 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1544 /* XXX Needed for x264_mb_predict_mv */
1545 h->mb.i_partition = D_8x8;
1547 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1549 const int idx = 4*i8x8 + 2*i8x4;
1550 const int x4 = block_idx_x[idx];
1551 const int y4 = block_idx_y[idx];
1552 const int i_mvc = (i8x4 == 0);
1554 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1556 m->i_pixel = PIXEL_8x4;
1557 m->p_cost_mv = a->p_cost_mv;
1559 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1560 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1562 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1563 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1565 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1567 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1568 REF_COST( 0, i_ref ) +
1569 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1570 if( h->mb.b_chroma_me )
1571 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* P 4x8 analysis within 8x8 block i8x8: vertical twin of the 8x4 case; two
 * 4x8 searches, cost accumulated into a->l0.i_cost4x8[i8x8] with ref and
 * sub-partition bits; optional chroma cost. */
1574 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1576 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1577 uint8_t **p_fenc = h->mb.pic.p_fenc;
1578 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1581 /* XXX Needed for x264_mb_predict_mv */
1582 h->mb.i_partition = D_8x8;
1584 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1586 const int idx = 4*i8x8 + i4x8;
1587 const int x4 = block_idx_x[idx];
1588 const int y4 = block_idx_y[idx];
1589 const int i_mvc = (i4x8 == 0);
1591 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1593 m->i_pixel = PIXEL_4x8;
1594 m->p_cost_mv = a->p_cost_mv;
1596 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1597 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1599 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1600 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1602 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1604 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1605 REF_COST( 0, i_ref ) +
1606 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1607 if( h->mb.b_chroma_me )
1608 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* B-direct cost: scores the already-reconstructed direct prediction in fdec
 * against fenc, per 8x8 block and for the whole 16x16, adding mode
 * signalling bits.  Requires the direct MC to still be present in fdec. */
1611 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1613 /* Assumes that fdec still contains the results of
1614 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1616 uint8_t **p_fenc = h->mb.pic.p_fenc;
1617 uint8_t **p_fdec = h->mb.pic.p_fdec;
1620 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1621 for( i = 0; i < 4; i++ )
1623 const int x = (i&1)*8;
1624 const int y = (i>>1)*8;
/* per-8x8 cost is accumulated into the 16x16 total in the same expression */
1625 a->i_cost16x16direct +=
1626 a->i_cost8x8direct[i] =
1627 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
1630 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
/* WEIGHTED_AVG: bipred-weighted average of two predictions into 'pix',
 * using the weight table for the currently selected L0/L1 refs. */
1634 #define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
1636 h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
/* B 16x16 analysis: full list-0 and list-1 reference searches (keeping the
 * best of each, with ref cost subtracted afterwards so other B modes can
 * reuse the raw ME cost), then a weighted-average BI prediction scored
 * against fenc.  Mode bits are added to all three costs at the end.
 * (Some interior lines are elided in this excerpt.) */
1639 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1641 ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
1642 ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
1643 uint8_t *src0, *src1;
1644 int stride0 = 16, stride1 = 16;
1648 ALIGNED_4( int16_t mvc[9][2] );
1649 int i_halfpel_thresh = INT_MAX;
1650 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1652 /* 16x16 Search on all ref frame */
1653 m.i_pixel = PIXEL_16x16;
1654 m.p_cost_mv = a->p_cost_mv;
1655 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1658 a->l0.me16x16.cost = INT_MAX;
1659 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1661 /* search with ref */
1662 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1663 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1664 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1665 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1668 m.cost += REF_COST( 0, i_ref );
1670 if( m.cost < a->l0.me16x16.cost )
1672 a->l0.i_ref = i_ref;
1673 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1676 /* save mv for predicting neighbors */
1677 *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1679 /* subtract ref cost, so we don't have to add it for the other MB types */
1680 a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
/* now the same search over list 1 */
1683 i_halfpel_thresh = INT_MAX;
1684 p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1685 a->l1.me16x16.cost = INT_MAX;
1686 for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1688 /* search with ref */
1689 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1690 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1691 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1692 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1695 m.cost += REF_COST( 1, i_ref );
1697 if( m.cost < a->l1.me16x16.cost )
1699 a->l1.i_ref = i_ref;
1700 h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1703 /* save mv for predicting neighbors */
1704 *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1706 /* subtract ref cost, so we don't have to add it for the other MB types */
1707 a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
1709 /* Set global ref, needed for other modes? */
1710 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
1711 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
1713 /* get cost of BI mode */
1714 src0 = h->mc.get_ref( pix0, &stride0,
1715 h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1716 a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
1717 src1 = h->mc.get_ref( pix1, &stride1,
1718 h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1719 a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
1721 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1723 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1724 + REF_COST( 0, a->l0.i_ref )
1725 + REF_COST( 1, a->l1.i_ref )
1726 + a->l0.me16x16.cost_mv
1727 + a->l1.me16x16.cost_mv;
/* mode signalling bits */
1730 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1731 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1732 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* Write the MVs of 8x8 partition i into the MB cache according to its
 * sub-partition type (8x8 / 8x4 / 4x8 / 4x4).  (x,y) are in 4x4-block
 * units; case labels are elided in this excerpt. */
1735 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1737 const int x = 2*(i%2);
1738 const int y = 2*(i/2);
1740 switch( h->mb.i_sub_partition[i] )
1743 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
1746 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1747 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
1750 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1751 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
1754 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1755 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1756 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1757 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
1760 x264_log( h, X264_LOG_ERROR, "internal error\n" );
/* CACHE_MV_BI: cache ref+MV for each list used by 'part'; for an unused
 * list, cache ref -1 and a zero MV (and, conditionally, a zero MVD —
 * the guard for that line is elided in this excerpt). */
1765 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1766 if( x264_mb_partition_listX_table[0][part] ) \
1768 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1769 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1773 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1774 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1776 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1778 if( x264_mb_partition_listX_table[1][part] ) \
1780 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1781 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1785 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1786 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1788 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
/* Cache MVs for B 8x8 partition i: direct partitions reload the direct MVs
 * (and, for the b_mvd path, zero the MVDs and mark skip); all other types
 * go through CACHE_MV_BI. */
1791 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1795 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1797 x264_mb_load_mv_direct8x8( h, i );
1800 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1801 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1802 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1807 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Cache MVs/refs for 16x8 half i of a B macroblock. */
1810 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1812 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Cache MVs/refs for 8x16 half i of a B macroblock. */
1814 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1816 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B 8x8 analysis: for each 8x8 block, search L0 and L1 (seeded with the
 * corresponding 16x16 MV), build the weighted BI prediction, and pick the
 * cheapest of L0/L1/BI/DIRECT per block.  Totals go to a->i_cost8x8bi.
 * Ref costs are not included (see FIXME).  (Some interior lines are elided
 * in this excerpt.) */
1820 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1822 uint8_t **p_fref[2] =
1823 { h->mb.pic.p_fref[0][a->l0.i_ref],
1824 h->mb.pic.p_fref[1][a->l1.i_ref] };
1825 ALIGNED_8( uint8_t pix[2][8*8] );
1828 /* XXX Needed for x264_mb_predict_mv */
1829 h->mb.i_partition = D_8x8;
1833 for( i = 0; i < 4; i++ )
1838 int i_part_cost_bi = 0;
1839 int stride[2] = {8,8};
1842 for( l = 0; l < 2; l++ )
1844 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1845 x264_me_t *m = &lX->me8x8[i];
1847 m->i_pixel = PIXEL_8x8;
1848 m->p_cost_mv = a->p_cost_mv;
1850 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1851 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
1853 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1854 x264_me_search( h, m, &lX->me16x16.mv, 1 );
1856 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
/* fetch this list's prediction for the BI average */
1859 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1860 m->mv[0], m->mv[1], 8, 8 );
1861 i_part_cost_bi += m->cost_mv;
1862 /* FIXME: ref cost */
1864 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1865 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1866 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1867 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1868 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1870 i_part_cost = a->l0.me8x8[i].cost;
1871 h->mb.i_sub_partition[i] = D_L0_8x8;
1872 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1873 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1874 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1875 a->i_cost8x8bi += i_part_cost;
1877 /* XXX Needed for x264_mb_predict_mv */
1878 x264_mb_cache_mv_b8x8( h, a, i, 0 );
1882 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B 16x8 analysis: each half searched on L0 and L1 (seeded with both 8x8
 * MVs of that half), BI scored from the weighted average; cheapest of
 * L0/L1/BI picked per half.  Also derives the composite MB type
 * (a->i_mb_type16x8) from the two halves' choices.  (Some interior lines
 * are elided in this excerpt.) */
1885 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1887 uint8_t **p_fref[2] =
1888 { h->mb.pic.p_fref[0][a->l0.i_ref],
1889 h->mb.pic.p_fref[1][a->l1.i_ref] };
1890 ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
1891 ALIGNED_4( int16_t mvc[2][2] );
1894 h->mb.i_partition = D_16x8;
1895 a->i_cost16x8bi = 0;
1897 for( i = 0; i < 2; i++ )
1900 int i_part_cost_bi = 0;
1901 int stride[2] = {16,16};
1904 /* TODO: check only the list(s) that were used in b8x8? */
1905 for( l = 0; l < 2; l++ )
1907 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1908 x264_me_t *m = &lX->me16x8[i];
1910 m->i_pixel = PIXEL_16x8;
1911 m->p_cost_mv = a->p_cost_mv;
1913 LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1914 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
1916 *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
1917 *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
1919 x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
1920 x264_me_search( h, m, mvc, 2 );
1923 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1924 m->mv[0], m->mv[1], 16, 8 );
1925 /* FIXME: ref cost */
1926 i_part_cost_bi += m->cost_mv;
1928 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1929 i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1931 i_part_cost = a->l0.me16x8[i].cost;
1932 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1933 if( a->l1.me16x8[i].cost < i_part_cost )
1935 i_part_cost = a->l1.me16x8[i].cost;
1936 a->i_mb_partition16x8[i] = D_L1_8x8;
1938 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1940 i_part_cost = i_part_cost_bi;
1941 a->i_mb_partition16x8[i] = D_BI_8x8;
1943 a->i_cost16x8bi += i_part_cost;
1945 x264_mb_cache_mv_b16x8( h, a, i, 0 );
/* map the two per-half list choices to a single MB type */
1949 a->i_mb_type16x8 = B_L0_L0
1950 + (a->i_mb_partition16x8[0]>>2) * 3
1951 + (a->i_mb_partition16x8[1]>>2);
1952 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
/* B 8x16 analysis: vertical twin of b16x8 — candidate MVs come from the
 * vertically adjacent 8x8 results (i and i+2); cheapest of L0/L1/BI per
 * half, composite type in a->i_mb_type8x16.  (Some interior lines are
 * elided in this excerpt.) */
1955 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1957 uint8_t **p_fref[2] =
1958 { h->mb.pic.p_fref[0][a->l0.i_ref],
1959 h->mb.pic.p_fref[1][a->l1.i_ref] };
1960 ALIGNED_8( uint8_t pix[2][8*16] );
1961 ALIGNED_4( int16_t mvc[2][2] );
1964 h->mb.i_partition = D_8x16;
1965 a->i_cost8x16bi = 0;
1967 for( i = 0; i < 2; i++ )
1970 int i_part_cost_bi = 0;
1971 int stride[2] = {8,8};
1974 for( l = 0; l < 2; l++ )
1976 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1977 x264_me_t *m = &lX->me8x16[i];
1979 m->i_pixel = PIXEL_8x16;
1980 m->p_cost_mv = a->p_cost_mv;
1982 LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1983 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
1985 *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
1986 *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
1988 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1989 x264_me_search( h, m, mvc, 2 );
1992 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1993 m->mv[0], m->mv[1], 8, 16 );
1994 /* FIXME: ref cost */
1995 i_part_cost_bi += m->cost_mv;
1998 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1999 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
2001 i_part_cost = a->l0.me8x16[i].cost;
2002 a->i_mb_partition8x16[i] = D_L0_8x8;
2003 if( a->l1.me8x16[i].cost < i_part_cost )
2005 i_part_cost = a->l1.me8x16[i].cost;
2006 a->i_mb_partition8x16[i] = D_L1_8x8;
2008 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
2010 i_part_cost = i_part_cost_bi;
2011 a->i_mb_partition8x16[i] = D_BI_8x8;
2013 a->i_cost8x16bi += i_part_cost;
2015 x264_mb_cache_mv_b8x16( h, a, i, 0 );
/* map the two per-half list choices to a single MB type */
2019 a->i_mb_type8x16 = B_L0_L0
2020 + (a->i_mb_partition8x16[0]>>2) * 3
2021 + (a->i_mb_partition8x16[1]>>2);
2022 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
/* P-frame RD refinement: re-evaluate each partition mode whose SATD cost is
 * within a threshold of the best (5/4, or 3/2 for 16x16) using full RD,
 * overwriting the SATD costs with RD costs; modes over threshold get
 * COST_MAX so they can't win.  For P_8x8 with sub-8x8 analysis enabled,
 * also RD-selects each block's sub-partition.  (Some interior lines,
 * including else-branches, are elided in this excerpt.) */
2025 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
2027 int thresh = i_satd * 5/4;
2029 h->mb.i_type = P_L0;
2030 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
2032 h->mb.i_partition = D_16x16;
2033 x264_analyse_update_cache( h, a );
2034 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2037 if( a->l0.i_cost16x8 <= thresh )
2039 h->mb.i_partition = D_16x8;
2040 x264_analyse_update_cache( h, a );
2041 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2044 a->l0.i_cost16x8 = COST_MAX;
2046 if( a->l0.i_cost8x16 <= thresh )
2048 h->mb.i_partition = D_8x16;
2049 x264_analyse_update_cache( h, a );
2050 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2053 a->l0.i_cost8x16 = COST_MAX;
2055 if( a->l0.i_cost8x8 <= thresh )
2057 h->mb.i_type = P_8x8;
2058 h->mb.i_partition = D_8x8;
2059 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2062 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2063 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2064 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2065 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2066 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2067 * for future blocks are those left over from previous RDO calls. */
2068 for( i = 0; i < 4; i++ )
2070 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
2071 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2072 int subtype, btype = D_L0_8x8;
2073 uint64_t bcost = COST_MAX64;
2074 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
/* D_L0_8x8 is always RD-checked as a fallback even if over threshold */
2077 if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2079 h->mb.i_sub_partition[i] = subtype;
2080 x264_mb_cache_mv_p8x8( h, a, i );
2081 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2082 COPY2_IF_LT( bcost, cost, btype, subtype );
2084 h->mb.i_sub_partition[i] = btype;
2085 x264_mb_cache_mv_p8x8( h, a, i );
2089 x264_analyse_update_cache( h, a );
2090 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2093 a->l0.i_cost8x8 = COST_MAX;
/* B-frame RD refinement: for each candidate B mode (direct, L0, L1, BI,
 * 8x8, 16x8, 8x16) whose SATD cost is within threshold and whose RD cost
 * has not been computed yet, update the cache and compute the real RD cost.
 * Threshold widens slightly under psy-RD.  (Some interior lines are elided
 * in this excerpt.) */
2096 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2098 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2100 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2102 h->mb.i_type = B_DIRECT;
2103 /* Assumes direct/skip MC is still in fdec */
2104 /* Requires b-rdo to be done before intra analysis */
2105 h->mb.b_skip_mc = 1;
2106 x264_analyse_update_cache( h, a );
2107 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2108 h->mb.b_skip_mc = 0;
2111 //FIXME not all the update_cache calls are needed
2112 h->mb.i_partition = D_16x16;
2114 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2116 h->mb.i_type = B_L0_L0;
2117 x264_analyse_update_cache( h, a );
2118 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2122 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2124 h->mb.i_type = B_L1_L1;
2125 x264_analyse_update_cache( h, a );
2126 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2130 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2132 h->mb.i_type = B_BI_BI;
2133 x264_analyse_update_cache( h, a );
2134 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2138 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2140 h->mb.i_type = B_8x8;
2141 h->mb.i_partition = D_8x8;
2142 x264_analyse_update_cache( h, a );
2143 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* clear skip flags possibly set by direct sub-partitions */
2144 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2148 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2150 h->mb.i_type = a->i_mb_type16x8;
2151 h->mb.i_partition = D_16x8;
2152 x264_analyse_update_cache( h, a );
2153 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2157 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2159 h->mb.i_type = a->i_mb_type8x16;
2160 h->mb.i_partition = D_8x16;
2161 x264_analyse_update_cache( h, a );
2162 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
2166 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2168 const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2171 if( IS_INTRA(h->mb.i_type) )
2174 switch( h->mb.i_partition )
2177 if( h->mb.i_type == B_BI_BI )
2178 x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
2181 for( i=0; i<2; i++ )
2182 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2183 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
2186 for( i=0; i<2; i++ )
2187 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2188 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
2191 for( i=0; i<4; i++ )
2192 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2193 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
2198 static inline void x264_mb_analyse_transform( x264_t *h )
2200 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2202 int i_cost4, i_cost8;
2203 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2206 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2207 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2208 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2209 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2211 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2212 h->mb.b_skip_mc = 1;
2216 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2218 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2221 x264_analyse_update_cache( h, a );
2222 h->mb.b_transform_8x8 ^= 1;
2223 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2224 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2226 if( *i_rd >= i_rd8 )
2229 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
2233 h->mb.b_transform_8x8 ^= 1;
2237 /* Rate-distortion optimal QP selection.
2238 * FIXME: More than half of the benefit of this function seems to be
2239 * in the way it improves the coding of chroma DC (by decimating or
2240 * finding a better way to code a single DC coefficient.)
2241 * There must be a more efficient way to get that portion of the benefit
2242 * without doing full QP-RD, but RD-decimation doesn't seem to do the trick. */
2244 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2246 int bcost, cost, direction, failures, prevcost, origcost;
2247 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2248 int last_qp_tried = 0;
2249 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2251 /* If CBP is already zero, don't raise the quantizer any higher. */
2252 for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
2254 /* Without psy-RD, require monotonicity when moving quant away from previous
2255 * macroblock's quant; allow 1 failure when moving quant towards previous quant.
2256 * With psy-RD, allow 1 failure when moving quant away from previous quant,
2257 * allow 2 failures when moving quant towards previous quant.
2258 * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
2259 int threshold = (!!h->mb.i_psy_rd);
2260 /* Raise the threshold for failures if we're moving towards the last QP. */
2261 if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
2262 ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
2264 h->mb.i_qp = orig_qp;
2266 prevcost = origcost;
2267 h->mb.i_qp += direction;
2268 while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
2270 if( h->mb.i_last_qp == h->mb.i_qp )
2272 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2273 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2274 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2276 /* We can't assume that the costs are monotonic over QPs.
2277 * Tie case-as-failure seems to give better results. */
2278 if( cost < prevcost )
2284 if( failures > threshold )
2286 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2288 h->mb.i_qp += direction;
2292 /* Always try the last block's QP. */
2293 if( !last_qp_tried )
2295 h->mb.i_qp = h->mb.i_last_qp;
2296 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2297 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2298 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2302 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2304 /* Check transform again; decision from before may no longer be optimal. */
2305 if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
2306 x264_mb_transform_8x8_allowed( h ) )
2308 h->mb.b_transform_8x8 ^= 1;
2309 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2311 h->mb.b_transform_8x8 ^= 1;
2315 /*****************************************************************************
2316 * x264_macroblock_analyse:
2317 *****************************************************************************/
2318 int x264_macroblock_analyse( x264_t *h )
2320 x264_mb_analysis_t analysis;
2321 int i_cost = COST_MAX;
2324 h->mb.i_qp = x264_ratecontrol_qp( h );
2325 if( h->param.rc.i_aq_mode )
2327 x264_adaptive_quant( h );
2328 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2329 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2330 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2331 h->mb.i_qp = h->mb.i_last_qp;
2334 x264_mb_analyse_init( h, &analysis, h->mb.i_qp );
2336 /*--------------------------- Do the analysis ---------------------------*/
2337 if( h->sh.i_type == SLICE_TYPE_I )
2339 if( analysis.i_mbrd )
2340 x264_mb_cache_fenc_satd( h );
2341 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2342 if( analysis.i_mbrd )
2343 x264_intra_rd( h, &analysis, COST_MAX );
2345 i_cost = analysis.i_satd_i16x16;
2346 h->mb.i_type = I_16x16;
2347 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2348 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2349 if( analysis.i_satd_pcm < i_cost )
2350 h->mb.i_type = I_PCM;
2352 else if( analysis.i_mbrd >= 2 )
2353 x264_intra_rd_refine( h, &analysis );
2355 else if( h->sh.i_type == SLICE_TYPE_P )
2359 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2361 /* Fast P_SKIP detection */
2362 analysis.b_try_pskip = 0;
2363 if( h->param.analyse.b_fast_pskip )
2365 if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2366 // FIXME don't need to check this if the reference frame is done
2368 else if( h->param.analyse.i_subpel_refine >= 3 )
2369 analysis.b_try_pskip = 1;
2370 else if( h->mb.i_mb_type_left == P_SKIP ||
2371 h->mb.i_mb_type_top == P_SKIP ||
2372 h->mb.i_mb_type_topleft == P_SKIP ||
2373 h->mb.i_mb_type_topright == P_SKIP )
2374 b_skip = x264_macroblock_probe_pskip( h );
2377 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
2381 h->mb.i_type = P_SKIP;
2382 h->mb.i_partition = D_16x16;
2383 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
2387 const unsigned int flags = h->param.analyse.inter;
2391 int i_satd_inter, i_satd_intra;
2393 if( x264_mb_analyse_load_costs( h, &analysis ) )
2396 x264_mb_analyse_inter_p16x16( h, &analysis );
2398 if( h->mb.i_type == P_SKIP )
2401 if( flags & X264_ANALYSE_PSUB16x16 )
2403 if( h->param.analyse.b_mixed_references )
2404 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2406 x264_mb_analyse_inter_p8x8( h, &analysis );
2409 /* Select best inter mode */
2411 i_partition = D_16x16;
2412 i_cost = analysis.l0.me16x16.cost;
2414 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2415 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2418 i_partition = D_8x8;
2419 i_cost = analysis.l0.i_cost8x8;
2422 if( flags & X264_ANALYSE_PSUB8x8 )
2424 for( i = 0; i < 4; i++ )
2426 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2427 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2429 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2430 h->mb.i_sub_partition[i] = D_L0_4x4;
2432 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2433 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2434 h->mb.i_sub_partition[i], D_L0_8x4 );
2436 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2437 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2438 h->mb.i_sub_partition[i], D_L0_4x8 );
2440 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2442 x264_mb_cache_mv_p8x8( h, &analysis, i );
2444 analysis.l0.i_cost8x8 = i_cost;
2448 /* Now do 16x8/8x16 */
2449 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2450 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2451 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2453 x264_mb_analyse_inter_p16x8( h, &analysis );
2454 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2456 x264_mb_analyse_inter_p8x16( h, &analysis );
2457 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2460 h->mb.i_partition = i_partition;
2463 //FIXME mb_type costs?
2464 if( analysis.i_mbrd )
2468 else if( i_partition == D_16x16 )
2470 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2471 i_cost = analysis.l0.me16x16.cost;
2473 else if( i_partition == D_16x8 )
2475 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2476 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2477 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2479 else if( i_partition == D_8x16 )
2481 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2482 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2483 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2485 else if( i_partition == D_8x8 )
2489 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2491 switch( h->mb.i_sub_partition[i8x8] )
2494 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2495 i_cost += analysis.l0.me8x8[i8x8].cost;
2498 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2499 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2500 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2501 analysis.l0.me8x4[i8x8][1].cost;
2504 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2505 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2506 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2507 analysis.l0.me4x8[i8x8][1].cost;
2511 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2512 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2513 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2514 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2515 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2516 analysis.l0.me4x4[i8x8][1].cost +
2517 analysis.l0.me4x4[i8x8][2].cost +
2518 analysis.l0.me4x4[i8x8][3].cost;
2521 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
2527 if( h->mb.b_chroma_me )
2529 x264_mb_analyse_intra_chroma( h, &analysis );
2530 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
2531 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2532 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2533 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2536 x264_mb_analyse_intra( h, &analysis, i_cost );
2538 i_satd_inter = i_cost;
2539 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2540 analysis.i_satd_i8x8,
2541 analysis.i_satd_i4x4 );
2543 if( analysis.i_mbrd )
2545 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2547 i_partition = D_16x16;
2548 i_cost = analysis.l0.i_rd16x16;
2549 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2550 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2551 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2552 h->mb.i_type = i_type;
2553 h->mb.i_partition = i_partition;
2554 if( i_cost < COST_MAX )
2555 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2556 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2559 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2560 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2561 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2562 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2564 h->mb.i_type = i_type;
2566 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2568 if( IS_INTRA( h->mb.i_type ) )
2570 x264_intra_rd_refine( h, &analysis );
2572 else if( i_partition == D_16x16 )
2574 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2575 analysis.l0.me16x16.cost = analysis.l0.i_rd16x16;
2576 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2578 else if( i_partition == D_16x8 )
2580 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2581 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2582 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2583 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2584 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2585 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2587 else if( i_partition == D_8x16 )
2589 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2590 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2591 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2592 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2593 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2594 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2596 else if( i_partition == D_8x8 )
2599 x264_analyse_update_cache( h, &analysis );
2600 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2602 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2604 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2606 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2608 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2609 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2611 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2613 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2614 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2616 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2618 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2619 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2620 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2621 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
2628 else if( h->sh.i_type == SLICE_TYPE_B )
2630 int i_bskip_cost = COST_MAX;
2633 if( analysis.i_mbrd )
2634 x264_mb_cache_fenc_satd( h );
2636 h->mb.i_type = B_SKIP;
2637 if( h->mb.b_direct_auto_write )
2639 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2640 for( i = 0; i < 2; i++ )
2643 h->sh.b_direct_spatial_mv_pred ^= 1;
2644 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2645 if( analysis.b_direct_available )
2650 b_skip = x264_macroblock_probe_bskip( h );
2652 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2659 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2661 if( analysis.b_direct_available )
2663 if( !h->mb.b_direct_auto_write )
2665 if( analysis.i_mbrd )
2667 i_bskip_cost = ssd_mb( h );
2668 /* 6 = minimum cavlc cost of a non-skipped MB */
2669 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2671 else if( !h->mb.b_direct_auto_write )
2673 /* Conditioning the probe on neighboring block types
2674 * doesn't seem to help speed or quality. */
2675 b_skip = x264_macroblock_probe_bskip( h );
2681 const unsigned int flags = h->param.analyse.inter;
2685 h->mb.b_skip_mc = 0;
2687 if( x264_mb_analyse_load_costs( h, &analysis ) )
2690 /* select best inter mode */
2691 /* direct must be first */
2692 if( analysis.b_direct_available )
2693 x264_mb_analyse_inter_direct( h, &analysis );
2695 x264_mb_analyse_inter_b16x16( h, &analysis );
2698 i_partition = D_16x16;
2699 i_cost = analysis.l0.me16x16.cost;
2700 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2701 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2702 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
2704 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2706 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2707 if( i_bskip_cost < analysis.i_rd16x16direct &&
2708 i_bskip_cost < analysis.i_rd16x16bi &&
2709 i_bskip_cost < analysis.l0.i_rd16x16 &&
2710 i_bskip_cost < analysis.l1.i_rd16x16 )
2712 h->mb.i_type = B_SKIP;
2713 x264_analyse_update_cache( h, &analysis );
2718 if( flags & X264_ANALYSE_BSUB16x16 )
2720 x264_mb_analyse_inter_b8x8( h, &analysis );
2721 if( analysis.i_cost8x8bi < i_cost )
2724 i_partition = D_8x8;
2725 i_cost = analysis.i_cost8x8bi;
2727 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2728 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2730 x264_mb_analyse_inter_b16x8( h, &analysis );
2731 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2732 i_type, analysis.i_mb_type16x8,
2733 i_partition, D_16x8 );
2735 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2736 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2738 x264_mb_analyse_inter_b8x16( h, &analysis );
2739 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2740 i_type, analysis.i_mb_type8x16,
2741 i_partition, D_8x16 );
2746 if( analysis.i_mbrd )
2751 else if( i_partition == D_16x16 )
2753 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2754 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2755 if( i_type == B_L0_L0 )
2757 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2758 i_cost = analysis.l0.me16x16.cost
2759 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2761 else if( i_type == B_L1_L1 )
2763 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2764 i_cost = analysis.l1.me16x16.cost
2765 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2767 else if( i_type == B_BI_BI )
2769 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2770 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2773 else if( i_partition == D_16x8 )
2775 for( i=0; i<2; i++ )
2777 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2778 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2779 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2780 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2783 else if( i_partition == D_8x16 )
2785 for( i=0; i<2; i++ )
2787 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2788 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2789 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2790 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2793 else if( i_partition == D_8x8 )
2795 for( i=0; i<4; i++ )
2798 int i_part_cost_old;
2800 int i_part_type = h->mb.i_sub_partition[i];
2801 int b_bidir = (i_part_type == D_BI_8x8);
2803 if( i_part_type == D_DIRECT_8x8 )
2805 if( x264_mb_partition_listX_table[0][i_part_type] )
2807 m = &analysis.l0.me8x8[i];
2808 i_part_cost_old = m->cost;
2809 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2810 m->cost -= i_type_cost;
2811 x264_me_refine_qpel( h, m );
2813 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2815 if( x264_mb_partition_listX_table[1][i_part_type] )
2817 m = &analysis.l1.me8x8[i];
2818 i_part_cost_old = m->cost;
2819 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2820 m->cost -= i_type_cost;
2821 x264_me_refine_qpel( h, m );
2823 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2825 /* TODO: update mvp? */
2829 i_satd_inter = i_cost;
2831 if( analysis.i_mbrd )
2833 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2835 i_cost = i_bskip_cost;
2836 i_partition = D_16x16;
2837 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2838 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2839 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2840 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2841 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2842 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2843 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2845 h->mb.i_type = i_type;
2846 h->mb.i_partition = i_partition;
2849 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2851 if( analysis.i_mbrd )
2853 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
2854 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2857 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2858 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2859 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2860 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2862 h->mb.i_type = i_type;
2863 h->mb.i_partition = i_partition;
2865 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2866 x264_intra_rd_refine( h, &analysis );
2867 if( h->mb.i_subpel_refine >= 5 )
2868 x264_refine_bidir( h, &analysis );
2870 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2872 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2873 x264_analyse_update_cache( h, &analysis );
2875 if( i_partition == D_16x16 )
2877 if( i_type == B_L0_L0 )
2879 analysis.l0.me16x16.cost = analysis.l0.i_rd16x16;
2880 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2882 else if( i_type == B_L1_L1 )
2884 analysis.l1.me16x16.cost = analysis.l1.i_rd16x16;
2885 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2887 else if( i_type == B_BI_BI )
2888 x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
2890 else if( i_partition == D_16x8 )
2892 for( i = 0; i < 2; i++ )
2894 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2895 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2896 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2897 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2898 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2899 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2900 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2903 else if( i_partition == D_8x16 )
2905 for( i = 0; i < 2; i++ )
2907 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2908 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2909 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2910 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2911 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2912 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2913 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2916 else if( i_partition == D_8x8 )
2918 for( i = 0; i < 4; i++ )
2920 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
2921 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
2922 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
2923 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
2924 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2925 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
2932 x264_analyse_update_cache( h, &analysis );
2934 if( !analysis.i_mbrd )
2935 x264_mb_analyse_transform( h );
2937 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
2938 x264_mb_analyse_qp_rd( h, &analysis );
2940 h->mb.b_trellis = h->param.analyse.i_trellis;
2941 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
2942 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
2943 x264_psy_trellis_init( h, 0 );
2944 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
2945 h->mb.i_skip_intra = 0;
2949 /*-------------------- Update MB from the analysis ----------------------*/
2950 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
2954 switch( h->mb.i_type )
2957 for( i = 0; i < 16; i++ )
2958 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
2960 x264_mb_analyse_intra_chroma( h, a );
2963 for( i = 0; i < 4; i++ )
2964 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
2966 x264_mb_analyse_intra_chroma( h, a );
2969 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
2970 x264_mb_analyse_intra_chroma( h, a );
2977 switch( h->mb.i_partition )
2980 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
2981 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
2985 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
2986 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
2987 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
2988 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
2992 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
2993 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
2994 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
2995 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
2999 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
3005 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
3006 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
3007 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
3008 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
3009 for( i = 0; i < 4; i++ )
3010 x264_mb_cache_mv_p8x8( h, a, i );
3015 h->mb.i_partition = D_16x16;
3016 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
3017 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
3023 x264_mb_load_mv_direct8x8( h, 0 );
3024 x264_mb_load_mv_direct8x8( h, 1 );
3025 x264_mb_load_mv_direct8x8( h, 2 );
3026 x264_mb_load_mv_direct8x8( h, 3 );
3030 /* optimize: cache might not need to be rewritten */
3031 for( i = 0; i < 4; i++ )
3032 x264_mb_cache_mv_b8x8( h, a, i, 1 );
3035 default: /* the rest of the B types */
3036 switch( h->mb.i_partition )
3039 switch( h->mb.i_type )
3042 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3043 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3045 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
3046 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
3047 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
3050 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
3051 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
3052 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
3054 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3055 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3058 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
3059 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
3061 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
3062 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
3067 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
3068 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
3071 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
3072 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3075 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
3081 if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
3084 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3087 int ref = h->mb.cache.ref[l][x264_scan8[0]];
3090 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
3091 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3093 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3094 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3095 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3096 h->mb.cache.mv[l][x264_scan8[15]][0],
3097 h->mb.cache.mv[l][x264_scan8[15]][1] );
3098 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3099 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3100 fprintf(stderr, "completed: %d \n", completed );
3101 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
3102 x264_mb_analyse_intra( h, a, COST_MAX );
3103 h->mb.i_type = I_16x16;
3104 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3105 x264_mb_analyse_intra_chroma( h, a );
3112 #include "slicetype.c"