/*****************************************************************************
 * analyse.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
#define _ISOC99_SOURCE
#include <math.h>
#include <limits.h>
#ifndef _MSC_VER
#include <unistd.h>
#endif

#include "common/common.h"
#include "common/cpu.h"
#include "macroblock.h"
#include "me.h"
#include "ratecontrol.h"

typedef struct
{
    /* 16x16 */
    int       i_ref;
    int       i_rd16x16;
    x264_me_t me16x16;

    /* 8x8 */
    int       i_cost8x8;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
    x264_me_t me8x8[4];

    /* Sub 4x4 */
    int       i_cost4x4[4]; /* cost per 8x8 partition */
    x264_me_t me4x4[4][4];

    /* Sub 8x4 */
    int       i_cost8x4[4]; /* cost per 8x8 partition */
    x264_me_t me8x4[4][2];

    /* Sub 4x8 */
    int       i_cost4x8[4]; /* cost per 8x8 partition */
    x264_me_t me4x8[4][2];

    /* 16x8 */
    int       i_cost16x8;
    x264_me_t me16x8[2];

    /* 8x16 */
    int       i_cost8x16;
    x264_me_t me8x16[2];
} x264_mb_analysis_list_t;

typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_qp;
    int i_lambda;
    int i_lambda2;
    int i_mbrd;
    uint16_t *p_cost_mv;
    uint16_t *p_cost_ref0;
    uint16_t *p_cost_ref1;

    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_try_pskip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    int i_satd_i8x8_dir[12][4];
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;

    /* Chroma part */
    int i_satd_i8x8chroma;
    int i_satd_i8x8chroma_dir[4];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;

    int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;
} x264_mb_analysis_t;

/* lambda = pow(2,qp/6-2) */
const int x264_lambda_tab[52] = {
   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
   1, 1, 1, 1,              /*  8-11 */
   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
  16,18,20,23,25,29,32,36,  /* 36-43 */
  40,45,51,57,64,72,81,91   /* 44-51 */
};
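/* lambda weights the predicted bit cost against SATD distortion in mode
 * decision (cost = satd + lambda * bits); e.g. at QP 34, 2^(34/6-2) ~= 12.7,
 * which the table stores rounded to 13. */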

/* lambda2 = pow(lambda,2) * .9 * 256 */
const int x264_lambda2_tab[52] = {
    14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
    91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
    580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
    3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
    23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
    148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
    943718, 1189010, 1498059, 1887436 /* 48 - 51 */
};
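/* lambda2 is the RD lagrange multiplier in .8 fixed point: bit costs are
 * folded into an SSD-domain cost as (bits * lambda2 + 128) >> 8, as in the
 * PCM cost precomputed in x264_mb_analyse_init() below. */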

// should the intra and inter lambdas be different?
// I'm just matching the behaviour of deadzone quant.
static const int x264_trellis_lambda2_tab[2][52] = {
    // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      46,      58,      73,      92,     117,     147,
          185,     233,     294,     370,     466,     587,
          740,     932,    1174,    1480,    1864,    2349,
         2959,    3728,    4697,    5918,    7457,    9395,
        11837,   14914,   18790,   23674,   29828,   37581,
        47349,   59656,   75163,   94699,  119313,  150326,
       189399,  238627,  300652,  378798,  477255,  601304,
       757596,  954511, 1202608, 1515192, 1909022, 2405217,
      3030384, 3818045, 4810435, 6060769 },
    // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
    {      27,      34,      43,      54,      68,      86,
          108,     136,     172,     216,     273,     343,
          433,     545,     687,     865,    1090,    1374,
         1731,    2180,    2747,    3461,    4361,    5494,
         6922,    8721,   10988,   13844,   17442,   21976,
        27688,   34885,   43953,   55377,   69771,   87906,
       110755,  139543,  175813,  221511,  279087,  351627,
       443023,  558174,  703255,  886046, 1116348, 1406511,
      1772093, 2232697, 2813022, 3544186 }
};

static const uint16_t x264_chroma_lambda2_offset_tab[] = {
       16,    20,    25,    32,    40,    50,
       64,    80,   101,   128,   161,   203,
      256,   322,   406,   512,   645,   812,
     1024,  1290,  1625,  2048,  2580,  3250,
     4096,  5160,  6501,  8192, 10321, 13003,
    16384, 20642, 26007, 32768, 41285, 52015,
    65535
};

/* TODO: calculate CABAC costs */
static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const int i_mb_b16x8_cost_table[17] = {
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const int i_sub_mb_b_cost_table[13] = {
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const int i_sub_mb_p_cost_table[4] = {
    5, 3, 3, 1
};

static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

/* Indexed by lambda instead of qp because, due to rounding,
 * some quantizers share lambdas.  This saves memory. */
uint16_t *x264_cost_mv_fpel[92][4];
uint16_t x264_cost_ref[92][3][33];

/* initialize an array of lambda*nbits for all possible mvs */
static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    static int16_t *p_cost_mv[92];
    int i, j;

    if( !p_cost_mv[a->i_lambda] )
    {
        /* could be faster, but isn't called many times */
        /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
        CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) );
        p_cost_mv[a->i_lambda] += 2*4*2048;
        for( i = 0; i <= 2*4*2048; i++ )
        {
            p_cost_mv[a->i_lambda][-i] =
            p_cost_mv[a->i_lambda][i]  = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
        }
        for( i = 0; i < 3; i++ )
            for( j = 0; j < 33; j++ )
                x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
    }
    a->p_cost_mv = p_cost_mv[a->i_lambda];
    a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];

    /* FIXME is this useful for all me methods? */
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
    {
        for( j = 0; j < 4; j++ )
        {
            CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) );
            x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
            for( i = -2*2048; i < 2*2048; i++ )
                x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
        }
    }
    return 0;
fail:
    return -1;
}
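/* x264_cost_mv_fpel splits the qpel mv cost table into its four quarter-pel
 * phases, each sampled at full-pel steps, so the exhaustive full-pel searches
 * (ESA/TESA) can fetch candidate costs without striding through qpel. */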

static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
    int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
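    /* i.e. subme 6-7 enables mbrd 1, subme 8-9 mbrd 2, subme 10 mbrd 3;
     * B-frames need subme one level higher to reach mbrd 1 and 2. */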

    /* conduct the analysis using this lambda and QP */
    a->i_qp = h->mb.i_qp = i_qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];

    a->i_lambda = x264_lambda_tab[i_qp];
    a->i_lambda2 = x264_lambda2_tab[i_qp];

    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    {
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
    }
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
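    /* the offset table is indexed by the luma-minus-chroma QP delta plus 12;
     * with equal QPs, x264_chroma_lambda2_offset_tab[12] == 256, i.e. a unity
     * scale factor in .8 fixed point. */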

    h->mb.i_me_method = h->param.analyse.i_me_method;
    h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
    h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
                        && h->mb.i_subpel_refine >= 5;

    h->mb.b_transform_8x8 = 0;
    h->mb.b_noise_reduction = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   =
    a->i_satd_i8x8chroma = COST_MAX;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
    a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;

    a->b_fast_intra = 0;
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
        h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
        h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
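        /* all mv limits are in quarter-pel units (hence the factor of 4); the
         * +/-24 term lets the search reach up to 24 pixels into the padded
         * border beyond the frame edge. */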
        if( h->mb.i_mb_x == 0 )
        {
            int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
            int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
            int thread_mvy_range = i_fmv_range;

            if( h->param.i_threads > 1 )
            {
                int j;
                int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                {
                    x264_frame_t **fref = i ? h->fref1 : h->fref0;
                    int i_ref = i ? h->i_ref1 : h->i_ref0;
                    for( j = 0; j < i_ref; j++ )
                    {
                        x264_frame_cond_wait( fref[j], thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
                    }
                }
                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( h->mb.b_interlaced )
                    thread_mvy_range >>= 1;
            }

            h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
            h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
            h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
            h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
            h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
            h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
            h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
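            /* with frame-parallel threading a reference frame may still be
             * mid-reconstruction: x264_frame_cond_wait() above blocks until
             * enough rows are done, and the vertical range is clamped so no
             * prediction reads past fref->i_lines_completed. */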
        }
#undef CLIP_FMV

        a->l0.i_rd16x16 =
        a->l0.i_cost8x8 = COST_MAX;

        for( i = 0; i < 4; i++ )
        {
            a->l0.i_cost4x4[i] =
            a->l0.i_cost8x4[i] =
            a->l0.i_cost4x8[i] = COST_MAX;
        }

        a->l0.i_cost16x8 =
        a->l0.i_cost8x16 = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.i_cost8x8 = COST_MAX;

            for( i = 0; i < 4; i++ )
            {
                a->l1.i_cost4x4[i] =
                a->l1.i_cost8x4[i] =
                a->l1.i_cost4x8[i] =
                a->i_cost8x8direct[i] = COST_MAX;
            }

            a->i_cost16x16direct =
            a->i_cost8x16bi = COST_MAX;
        }

        /* Fast intra decision */
        if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            if( IS_INTRA( h->mb.i_mb_type_left )
                || IS_INTRA( h->mb.i_mb_type_top )
                || IS_INTRA( h->mb.i_mb_type_topleft )
                || IS_INTRA( h->mb.i_mb_type_topright )
                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
            { /* intra is likely */ }
            else
                a->b_fast_intra = 1;
        }
    }
}

static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
{
    if( i_neighbour & MB_TOPLEFT )
    {
        /* top and left available */
        *mode++ = I_PRED_16x16_V;
        *mode++ = I_PRED_16x16_H;
        *mode++ = I_PRED_16x16_DC;
        *mode++ = I_PRED_16x16_P;
        *pi_count = 4;
    }
    else if( i_neighbour & MB_LEFT )
    {
        /* left available */
        *mode++ = I_PRED_16x16_DC_LEFT;
        *mode++ = I_PRED_16x16_H;
        *pi_count = 2;
    }
    else if( i_neighbour & MB_TOP )
    {
        /* top available */
        *mode++ = I_PRED_16x16_DC_TOP;
        *mode++ = I_PRED_16x16_V;
        *pi_count = 2;
    }
    else
    {
        /* none available */
        *mode = I_PRED_16x16_DC_128;
        *pi_count = 1;
    }
}

static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
{
    if( i_neighbour & MB_TOPLEFT )
    {
        /* top and left available */
        *mode++ = I_PRED_CHROMA_V;
        *mode++ = I_PRED_CHROMA_H;
        *mode++ = I_PRED_CHROMA_DC;
        *mode++ = I_PRED_CHROMA_P;
        *pi_count = 4;
    }
    else if( i_neighbour & MB_LEFT )
    {
        /* left available */
        *mode++ = I_PRED_CHROMA_DC_LEFT;
        *mode++ = I_PRED_CHROMA_H;
        *pi_count = 2;
    }
    else if( i_neighbour & MB_TOP )
    {
        /* top available */
        *mode++ = I_PRED_CHROMA_DC_TOP;
        *mode++ = I_PRED_CHROMA_V;
        *pi_count = 2;
    }
    else
    {
        /* none available */
        *mode = I_PRED_CHROMA_DC_128;
        *pi_count = 1;
    }
}

static void predict_4x4_mode_available( unsigned int i_neighbour,
                                        int *mode, int *pi_count )
{
    int b_l = i_neighbour & MB_LEFT;
    int b_t = i_neighbour & MB_TOP;

    if( b_l && b_t )
    {
        *pi_count = 6;
        *mode++ = I_PRED_4x4_DC;
        *mode++ = I_PRED_4x4_H;
        *mode++ = I_PRED_4x4_V;
        *mode++ = I_PRED_4x4_DDL;
        if( i_neighbour & MB_TOPLEFT )
        {
            *mode++ = I_PRED_4x4_DDR;
            *mode++ = I_PRED_4x4_VR;
            *mode++ = I_PRED_4x4_HD;
            *pi_count += 3;
        }
        *mode++ = I_PRED_4x4_VL;
        *mode++ = I_PRED_4x4_HU;
    }
    else if( b_l )
    {
        *mode++ = I_PRED_4x4_DC_LEFT;
        *mode++ = I_PRED_4x4_H;
        *mode++ = I_PRED_4x4_HU;
        *pi_count = 3;
    }
    else if( b_t )
    {
        *mode++ = I_PRED_4x4_DC_TOP;
        *mode++ = I_PRED_4x4_V;
        *mode++ = I_PRED_4x4_DDL;
        *mode++ = I_PRED_4x4_VL;
        *pi_count = 4;
    }
    else
    {
        *mode++ = I_PRED_4x4_DC_128;
        *pi_count = 1;
    }
}

/* For trellis=2, we need to do this for both sizes of DCT; for trellis=1 we only need to use it on the chosen mode. */
static inline void x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
    ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
    ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
    ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
    int i;

    if( do_both_dct || h->mb.b_transform_8x8 )
    {
        h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
        for( i = 0; i < 4; i++ )
            h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
    }
    if( do_both_dct || !h->mb.b_transform_8x8 )
    {
        h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
        for( i = 0; i < 16; i++ )
            h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
    }
}

/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
static inline void x264_mb_cache_fenc_satd( x264_t *h )
{
    ALIGNED_16( static uint8_t zero[16] ) = {0};
    uint8_t *fenc;
    int x, y, satd_sum = 0, sa8d_sum = 0;
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
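    /* each cached score below is SATD with the DC term removed: the SAD of a
     * block against zero is the absolute pixel sum, i.e. the Hadamard DC
     * coefficient, so subtracting sad>>1 (4x4 satd is normalized by /2) or
     * sad>>2 (8x8 sa8d, /4) leaves only the AC energy. */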
    if( !h->mb.i_psy_rd )
        return;

    for( y = 0; y < 4; y++ )
        for( x = 0; x < 4; x++ )
        {
            fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
            h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
                                      - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
            satd_sum += h->mb.pic.fenc_satd[y][x];
        }
    for( y = 0; y < 2; y++ )
        for( x = 0; x < 2; x++ )
        {
            fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
            h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
                                      - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
            sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
        }
    h->mb.pic.fenc_satd_sum = satd_sum;
    h->mb.pic.fenc_sa8d_sum = sa8d_sum;
}

static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
    int i, i_max;
    int predict_mode[9];
    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
    uint8_t *p_dstc[2], *p_srcc[2];

    if( a->i_satd_i8x8chroma < COST_MAX )
        return;

    /* 8x8 prediction selection for chroma */
    p_dstc[0] = h->mb.pic.p_fdec[1];
    p_dstc[1] = h->mb.pic.p_fdec[2];
    p_srcc[0] = h->mb.pic.p_fenc[1];
    p_srcc[1] = h->mb.pic.p_fenc[2];

    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
    a->i_satd_i8x8chroma = COST_MAX;
    if( i_max == 4 && b_merged_satd )
    {
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
        h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
        h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
        h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
        satdu[I_PRED_CHROMA_P] =
            h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
        satdv[I_PRED_CHROMA_P] =
            h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
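        /* the x3 primitive scores only V, H and DC in one pass, so the planar
         * mode was predicted and scored separately with a full mbcmp call. */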
        for( i = 0; i < i_max; i++ )
        {
            int i_mode = predict_mode[i];
            int i_satd = satdu[i_mode] + satdv[i_mode]
                       + a->i_lambda * bs_size_ue(i_mode);

            a->i_satd_i8x8chroma_dir[i] = i_satd;
            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }
    else
    {
        for( i = 0; i < i_max; i++ )
        {
            int i_satd;
            int i_mode = predict_mode[i];

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_8x8_chroma( h, i_mode );
            else
            {
                h->predict_8x8c[i_mode]( p_dstc[0] );
                h->predict_8x8c[i_mode]( p_dstc[1] );
            }

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
                                               p_srcc[0], FENC_STRIDE ) +
                     h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
                                               p_srcc[1], FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );

            a->i_satd_i8x8chroma_dir[i] = i_satd;
            COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
}

static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    uint8_t *p_src = h->mb.pic.p_fenc[0];
    uint8_t *p_dst = h->mb.pic.p_fdec[0];

    int i, idx;
    int i_max;
    int predict_mode[9];
    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;

    /*---------------- Try all modes and calculate their scores ---------------*/

    /* 16x16 prediction selection */
    predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );

    if( b_merged_satd && i_max == 4 )
    {
        h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
        h->predict_16x16[I_PRED_16x16_P]( p_dst );
        a->i_satd_i16x16_dir[I_PRED_16x16_P] =
            h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
        for( i = 0; i < 4; i++ )
        {
            int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
            COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
        }
    }
    else
    {
        for( i = 0; i < i_max; i++ )
        {
            int i_satd;
            int i_mode = predict_mode[i];

            if( h->mb.b_lossless )
                x264_predict_lossless_16x16( h, i_mode );
            else
                h->predict_16x16[i_mode]( p_dst );

            i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
            COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
            a->i_satd_i16x16_dir[i_mode] = i_satd;
        }
    }

    if( h->sh.i_type == SLICE_TYPE_B )
        /* cavlc mb type prefix */
        a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
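    /* with fast intra, skip the 8x8/4x4 analysis entirely when even the best
     * 16x16 intra mode already costs more than twice the best inter cost. */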
    if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
        return;

    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
        int i_cost = 0;
        h->mb.i_cbp_luma = 0;
        b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;

        // FIXME some bias like in i4x4?
        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];

        for( idx = 0;; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );

            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            if( b_merged_satd && i_max == 9 )
            {
                int satd[9];
                h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                satd[i_pred_mode] -= 3 * a->i_lambda;
                for( i = 2; i >= 0; i-- )
                {
                    int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
                    COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                }
                i = 3;
            }
            else
                i = 0;

            for( ; i < i_max; i++ )
            {
                int i_satd;
                int i_mode = predict_mode[i];

                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                else
                    h->predict_8x8[i_mode]( p_dst_by, edge );

                i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);

                COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
            }
            i_cost += i_best;

            if( idx == 3 || i_cost > i_satd_thresh )
                break;

            /* we need to encode this block now (for next ones) */
            h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
            x264_mb_encode_i8x8( h, idx, a->i_qp );

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }

        if( idx == 3 )
        {
            a->i_satd_i8x8 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
                h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
                h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
                h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
            }
        }
        else
        {
            static const uint16_t cost_div_fix8[3] = {1024,512,341};
            a->i_satd_i8x8 = COST_MAX;
            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
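            /* cost_div_fix8[idx] ~= 256 * 4/(idx+1): if the loop broke after
             * only idx+1 of the four 8x8 blocks, extrapolate their cost to a
             * whole-MB estimate so it stays comparable with the other modes. */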
        }

        if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
            return;
    }

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    {
        int i_cost;
        int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
        h->mb.i_cbp_luma = 0;
        b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;

        i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;

        i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];

        for( idx = 0;; idx++ )
        {
            uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );

            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
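            /* per the H.264 spec, unavailable top-right samples are replaced
             * by replicating the rightmost top sample; the 0x01010101 multiply
             * splats that byte across the 4-pixel row. */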

            if( b_merged_satd && i_max >= 6 )
            {
                int satd[9];
                h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                satd[i_pred_mode] -= 3 * a->i_lambda;
                for( i = 2; i >= 0; i-- )
                    COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
                                 a->i_predict4x4[idx], i );
                i = 3;
            }
            else
                i = 0;

            for( ; i < i_max; i++ )
            {
                int i_satd;
                int i_mode = predict_mode[i];
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                else
                    h->predict_4x4[i_mode]( p_dst_by );

                i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
                                                   p_src_by, FENC_STRIDE )
                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);

                COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
            }
            i_cost += i_best;

            if( i_cost > i_satd_thresh || idx == 15 )
                break;

            /* we need to encode this block now (for next ones) */
            h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
            x264_mb_encode_i4x4( h, idx, a->i_qp );

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
        if( idx == 15 )
        {
            a->i_satd_i4x4 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
                h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
                h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
                h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
                h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
            }
        }
        else
            a->i_satd_i4x4 = COST_MAX;
    }
}

static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
{
    if( a->i_satd_i16x16 <= i_satd_thresh )
    {
        h->mb.i_type = I_16x16;
        x264_analyse_update_cache( h, a );
        a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i16x16 = COST_MAX;

    if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
    {
        h->mb.i_type = I_4x4;
        x264_analyse_update_cache( h, a );
        a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->i_satd_i4x4 = COST_MAX;

    if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
    {
        h->mb.i_type = I_8x8;
        x264_analyse_update_cache( h, a );
        a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
        a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
    }
    else
        a->i_satd_i8x8 = COST_MAX;
}

static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t *p_dst = h->mb.pic.p_fdec[0];

    int i, j, idx, x, y;
    int i_max, i_mode, i_thresh;
    uint64_t i_satd, i_best;
    int predict_mode[9];

    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
        i_best = a->i_satd_i16x16;
        predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
        for( i = 0; i < i_max; i++ )
        {
            int i_mode = predict_mode[i];
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction */
    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
    if( i_max > 1 )
    {
        i_thresh = a->i_satd_i8x8chroma * 5/4;

        for( i = j = 0; i < i_max; i++ )
            if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
                predict_mode[i] != a->i_predict8x8chroma )
            {
                predict_mode[j++] = predict_mode[i];
            }
        i_max = j;

        if( i_max > 0 )
        {
            int i_cbp_chroma_best = h->mb.i_cbp_chroma;
            int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
            /* the previous thing encoded was x264_intra_rd(), so the pixels and
             * coefs for the current chroma mode are still around, so we only
             * have to recount the bits. */
            i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode[i];
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8_chroma( h, i_mode );
                else
                {
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
                    h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
                }
                /* if we've already found a mode that needs no residual, then
                 * probably any mode with a residual will be worse.
                 * so avoid dct on the remaining modes to improve speed. */
                i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
            }
            h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
            h->mb.i_cbp_chroma = i_cbp_chroma_best;
        }
    }

    if( h->mb.i_type == I_4x4 )
    {
        uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
        int i_nnz = 0;
        for( idx = 0; idx < 16; idx++ )
        {
            uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
            i_best = COST_MAX64;

            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                /* emulate missing topright samples */
                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;

            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode[i];
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
                else
                    h->predict_4x4[i_mode]( p_dst_by );
                i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
                    pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
                    pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
                    pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
                    i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
                }
            }

            *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
            *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
            *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
            *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
            h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
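            /* pels[] and i_nnz hold the reconstruction of the cheapest RD mode
             * tried so far; restoring them after the loop leaves the winner's
             * pixels and nnz state in place without re-encoding the block. */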
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_16( uint8_t, edge,[33] );
        for( idx = 0; idx < 4; idx++ )
        {
            uint64_t pels_h = 0;
            uint8_t pels_v[7];
            uint16_t i_nnz[2];
            uint8_t *p_dst_by;
            int cbp_luma_new = 0;
            i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;

            i_best = COST_MAX64;
            x = idx&1;
            y = idx>>1;

            p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( i = 0; i < i_max; i++ )
            {
                i_mode = predict_mode[i];
                if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
                    continue;
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                else
                    h->predict_8x8[i_mode]( p_dst_by, edge );
                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
                    if( !(idx&1) )
                        for( j = 0; j < 7; j++ )
                            pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
                    i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
                    i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
                }
            }
            a->i_cbp_i8x8_luma = cbp_luma_new;
            *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
            if( !(idx&1) )
                for( j = 0; j < 7; j++ )
                    p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}

#define LOAD_FENC( m, src, xoff, yoff) \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];

#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];

#define REF_COST(list, ref) \
    (a->p_cost_ref##list[ref])
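/* REF_COST is the lambda-scaled cost of coding a ref idx with truncated
 * Exp-Golomb (te); see x264_mb_analyse_load_costs(), which zeroes the cost
 * when only one reference is active. */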

static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_ref, i_mvc;
    ALIGNED_4( int16_t mvc[8][2] );
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;

    /* 16x16 Search on all ref frames */
    m.i_pixel = PIXEL_16x16;
    m.p_cost_mv = a->p_cost_mv;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
    {
        const int i_ref_cost = REF_COST( 0, i_ref );
        i_halfpel_thresh -= i_ref_cost;
        m.i_ref_cost = i_ref_cost;

        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );

        /* early termination
         * SSD threshold would probably be better than SATD */
        if( i_ref == 0
            && a->b_try_pskip
            && m.cost-m.cost_mv < 300*a->i_lambda
            &&  abs(m.mv[0]-h->mb.cache.pskip_mv[0])
              + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
            && x264_macroblock_probe_pskip( h ) )
        {
            h->mb.i_type = P_SKIP;
            x264_analyse_update_cache( h, a );
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
            return;
        }

        m.cost += i_ref_cost;
        i_halfpel_thresh += i_ref_cost;

        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );

        /* save mv for predicting neighbors */
        *(uint32_t*)a->l0.mvc[i_ref][0] =
        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
    }

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );

    h->mb.i_type = P_L0;
    if( a->i_mbrd )
    {
        x264_mb_cache_fenc_satd( h );
        if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
            a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
        }
    }
}

static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_ref, i;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;

    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
        h->mb.i_mb_type_top && h->mb.i_mb_type_left )
    {
        i_maxref = 0;
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
        i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
    }

    for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
        *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];

    for( i = 0; i < 4; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x8[i];
        const int x8 = i%2;
        const int y8 = i/2;

        m.i_pixel = PIXEL_8x8;
        m.p_cost_mv = a->p_cost_mv;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
        {
            const int i_ref_cost = REF_COST( 0, i_ref );
            i_halfpel_thresh -= i_ref_cost;
            m.i_ref_cost = i_ref_cost;

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );

            m.cost += i_ref_cost;
            i_halfpel_thresh += i_ref_cost;
            *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }
        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
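    /* with CAVLC, an all-ref0 8x8 partitioning can be signalled as the
     * dedicated P_8x8ref0 mb type, whose sub-partitions carry no ref idx,
     * hence the refund of the four ref costs above. */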
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
    const int i_ref = a->l0.me16x16.i_ref;
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    int i_mvc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];
    int i;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    i_mvc = 1;
    *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;

    for( i = 0; i < 4; i++ )
    {
        x264_me_t *m = &a->l0.me8x8[i];
        const int x8 = i%2;
        const int y8 = i/2;

        m->i_pixel = PIXEL_8x8;
        m->p_cost_mv = a->p_cost_mv;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
        i_mvc++;

        /* mb type cost */
        m->cost += i_ref_cost;
        m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
}

static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );
    int i, j;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;
        m.p_cost_mv = a->p_cost_mv;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            const int i_ref_cost = REF_COST( 0, i_ref );
            m.i_ref_cost = i_ref_cost;

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            x264_me_search( h, &m, mvc, 3 );

            m.cost += i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }
        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    }

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
}

static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );
    int i, j;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;

    for( i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_8x16;
        m.p_cost_mv = a->p_cost_mv;

        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            const int i_ref_cost = REF_COST( 0, i_ref );
            m.i_ref_cost = i_ref_cost;

            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            x264_me_search( h, &m, mvc, 3 );

            m.cost += i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }
        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    }

    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}

static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
{
    ALIGNED_8( uint8_t pix1[16*8] );
    uint8_t *pix2 = pix1+8;
    const int i_stride = h->mb.pic.i_stride[1];
    const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
    const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
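    /* pix1 and pix2 share one 16-stride scratch buffer: U is interpolated
     * into the left 8 columns and V (pix2 = pix1+8) into the right 8, so each
     * plane is compared against fenc at stride 16 below. */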

#define CHROMA4x4MC( width, height, me, x, y ) \
    h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
    h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );

    if( pixel == PIXEL_4x4 )
    {
        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
        CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
    }
    else if( pixel == PIXEL_8x4 )
    {
        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
        CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
    }
    else
    {
        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
        CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
    }

    return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
         + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
}

static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    int i4x4;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( i4x4 = 0; i4x4 < 4; i4x4++ )
    {
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x4 == 0);

        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];

        m->i_pixel = PIXEL_4x4;
        m->p_cost_mv = a->p_cost_mv;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    }
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
}

static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    int i8x4;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( i8x4 = 0; i8x4 < 2; i8x4++ )
    {
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i8x4 == 0);

        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];

        m->i_pixel = PIXEL_8x4;
        m->p_cost_mv = a->p_cost_mv;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    }
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me )
        a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
}

static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    uint8_t **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;
    int i4x8;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( i4x8 = 0; i4x8 < 2; i4x8++ )
    {
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x8 == 0);

        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];

        m->i_pixel = PIXEL_4x8;
        m->p_cost_mv = a->p_cost_mv;

        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    }
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me )
        a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
}

static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
{
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */

    uint8_t **p_fenc = h->mb.pic.p_fenc;
    uint8_t **p_fdec = h->mb.pic.p_fdec;
    int i;

    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    for( i = 0; i < 4; i++ )
    {
        const int x = (i&1)*8;
        const int y = (i>>1)*8;
        a->i_cost16x16direct +=
        a->i_cost8x8direct[i] =
            h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );

        /* mb type cost */
        a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
    }
}

#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
{ \
    h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
}

static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
    ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
    uint8_t *src0, *src1;
    int stride0 = 16, stride1 = 16;

    x264_me_t m;
    int i_ref, i_mvc;
    ALIGNED_4( int16_t mvc[9][2] );
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;

    /* 16x16 Search on all ref frames */
    m.i_pixel = PIXEL_16x16;
    m.p_cost_mv = a->p_cost_mv;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    /* ME for List 0 */
    a->l0.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
    {
        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );

        /* add ref cost */
        m.cost += REF_COST( 0, i_ref );

        if( m.cost < a->l0.me16x16.cost )
        {
            a->l0.i_ref = i_ref;
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
        }

        /* save mv for predicting neighbors */
        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
    }
    /* subtract ref cost, so we don't have to add it for the other MB types */
    a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );

    /* ME for List 1 */
    i_halfpel_thresh = INT_MAX;
    p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
    a->l1.me16x16.cost = INT_MAX;
    for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
    {
        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
        x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
        x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
        x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );

        /* add ref cost */
        m.cost += REF_COST( 1, i_ref );

        if( m.cost < a->l1.me16x16.cost )
        {
            a->l1.i_ref = i_ref;
            h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
        }

        /* save mv for predicting neighbors */
        *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
    }
    /* subtract ref cost, so we don't have to add it for the other MB types */
    a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );

    /* Set global ref, needed for other modes? */
    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );

    /* get cost of BI mode */
    src0 = h->mc.get_ref( pix0, &stride0,
                          h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
                          a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
    src1 = h->mc.get_ref( pix1, &stride1,
                          h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
                          a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );

    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );

    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                     + REF_COST( 0, a->l0.i_ref )
                     + REF_COST( 1, a->l1.i_ref )
                     + a->l0.me16x16.cost_mv
                     + a->l1.me16x16.cost_mv;
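    /* both list costs had their ref cost stripped after the searches above, so
     * the bidirectional cost re-adds the two ref costs explicitly. */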

    /* mb type cost */
    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
}

static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
{
    const int x = 2*(i%2);
    const int y = 2*(i/2);

    switch( h->mb.i_sub_partition[i] )
    {
        case D_L0_8x8:
            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
            break;
        case D_L0_8x4:
            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
            break;
        case D_L0_4x8:
            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
            break;
        case D_L0_4x4:
            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
            break;
        default:
            x264_log( h, X264_LOG_ERROR, "internal error\n" );
            break;
    }
}

#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    if( x264_mb_partition_listX_table[0][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
    } \
    if( x264_mb_partition_listX_table[1][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
    }

static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    int x = 2*(i%2);
    int y = 2*(i/2);

    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
    {
        x264_mb_load_mv_direct8x8( h, i );
        if( b_mvd )
        {
            x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
            x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
            x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
        }
    }
    else
    {
        CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
    }
}
static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
}
static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
}

static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_8( uint8_t pix[2][8*8] );
    int i, l;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;
    a->i_cost8x8bi = 0;

    for( i = 0; i < 4; i++ )
    {
        const int x8 = i%2;
        const int y8 = i/2;
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x8[i];

            m->i_pixel = PIXEL_8x8;
            m->p_cost_mv = a->p_cost_mv;

            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );

            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, &lX->me16x16.mv, 1 );

            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8 );
            i_part_cost_bi += m->cost_mv;
            /* FIXME: ref cost */
        }
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                        + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        x264_mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[2][2] );
    int i, l;

    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;

    for( i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {16,16};
        uint8_t *src[2];

        /* TODO: check only the list(s) that were used in b8x8? */
        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me16x8[i];

            m->i_pixel = PIXEL_16x8;
            m->p_cost_mv = a->p_cost_mv;

            LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );

            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;

            x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 16, 8 );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        }
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );

        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
        if( a->l1.me16x8[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        }
        a->i_cost16x8bi += i_part_cost;

        x264_mb_cache_mv_b16x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
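    /* D_L0_8x8, D_L1_8x8 and D_BI_8x8 are 3, 7 and 11 in the partition enum,
     * so (partition>>2) maps each half to 0/1/2; the two halves then index
     * the 3x3 block of two-partition mb types starting at B_L0_L0. */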
1909 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
{
    uint8_t **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.i_ref],
          h->mb.pic.p_fref[1][a->l1.i_ref] };
    ALIGNED_8( uint8_t pix[2][8*16] );
    ALIGNED_4( int16_t mvc[2][2] );
    int i, l;

    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;

    for( i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        int stride[2] = {8,8};
        uint8_t *src[2];

        for( l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x16[i];

            m->i_pixel = PIXEL_8x16;
            m->p_cost_mv = a->p_cost_mv;

            LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );

            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;

            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 16 );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        }
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );

        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;
        if( a->l1.me8x16[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        }
        a->i_cost8x16bi += i_part_cost;

        x264_mb_cache_mv_b8x16( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
}
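
/* P-frame RD refinement: only partitions whose SATD landed within 25% of
 * the best score (thresh = i_satd * 5/4) are re-measured with the full RD
 * metric; the rest are ruled out as COST_MAX. E.g. with a best SATD of
 * 1000, a 16x8 candidate at 1200 gets an RD call but one at 1300 does not.
 * 16x16 uses the looser 3/2 bound, presumably because it is the cheapest
 * mode to test and to code. */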
static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
{
    int thresh = i_satd * 5/4;

    h->mb.i_type = P_L0;
    if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
    {
        h->mb.i_partition = D_16x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    a->l0.me16x16.cost = a->l0.i_rd16x16;

    if( a->l0.i_cost16x8 <= thresh )
    {
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost16x8 = COST_MAX;

    if( a->l0.i_cost8x16 <= thresh )
    {
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x16 = COST_MAX;

    if( a->l0.i_cost8x8 <= thresh )
    {
        h->mb.i_type = P_8x8;
        h->mb.i_partition = D_8x8;
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
        {
            int i;
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
             * for future blocks are those left over from previous RDO calls. */
            for( i = 0; i < 4; i++ )
            {
                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
                int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
                int subtype, btype = D_L0_8x8;
                uint64_t bcost = COST_MAX64;
                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
                {
                    uint64_t cost;
                    if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
                        continue;
                    h->mb.i_sub_partition[i] = subtype;
                    x264_mb_cache_mv_p8x8( h, a, i );
                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                    COPY2_IF_LT( bcost, cost, btype, subtype );
                }
                h->mb.i_sub_partition[i] = btype;
                x264_mb_cache_mv_p8x8( h, a, i );
            }
        }
        else
            x264_analyse_update_cache( h, a );
        a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x8 = COST_MAX;
}
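
/* The B-frame version below uses a tighter threshold, i_satd_inter * 17/16,
 * widened by a further 1/16 when psy-RD is active (the !!h->mb.i_psy_rd
 * term), apparently because psy-RD makes SATD a noisier predictor of the
 * final RD score. The *_rd fields double as "already measured" flags: any
 * value other than COST_MAX means the mode was scored on an earlier call. */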
static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;

    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
    {
        h->mb.i_type = B_DIRECT;
        /* Assumes direct/skip MC is still in fdec */
        /* Requires b-rdo to be done before intra analysis */
        h->mb.b_skip_mc = 1;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
        h->mb.b_skip_mc = 0;
    }

    //FIXME not all the update_cache calls are needed
    h->mb.i_partition = D_16x16;
    /* L0 */
    if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L0_L0;
        x264_analyse_update_cache( h, a );
        a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* L1 */
    if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L1_L1;
        x264_analyse_update_cache( h, a );
        a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* BI */
    if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
    {
        h->mb.i_type = B_BI_BI;
        x264_analyse_update_cache( h, a );
        a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x8 */
    if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
    {
        h->mb.i_type = B_8x8;
        h->mb.i_partition = D_8x8;
        x264_analyse_update_cache( h, a );
        a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
    }

    /* 16x8 */
    if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type16x8;
        h->mb.i_partition = D_16x8;
        x264_analyse_update_cache( h, a );
        a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x16 */
    if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type8x16;
        h->mb.i_partition = D_8x16;
        x264_analyse_update_cache( h, a );
        a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
    }
}
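
/* Bidirectional SATD refinement, invoked from x264_macroblock_analyse at
 * subme >= 5: any partition that was decided as BI gets its L0 and L1
 * motion vectors jointly re-optimized against the weighted-average
 * prediction rather than each list independently. */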
static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
{
    const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
    int i;

    if( IS_INTRA(h->mb.i_type) )
        return;

    switch( h->mb.i_partition )
    {
        case D_16x16:
            if( h->mb.i_type == B_BI_BI )
                x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
            break;
        case D_16x8:
            for( i=0; i<2; i++ )
                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
            break;
        case D_8x16:
            for( i=0; i<2; i++ )
                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
            break;
        case D_8x8:
            for( i=0; i<4; i++ )
                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
            break;
    }
}
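
/* Heuristic (non-RD) transform size decision: score the 16x16 luma
 * prediction residual with SATD (4x4 Hadamard) and SA8D (8x8 Hadamard) and
 * pick the 8x8 transform when the 8x8-based metric is cheaper, as a proxy
 * for actually coding the residual both ways. */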
static inline void x264_mb_analyse_transform( x264_t *h )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
    {
        int i_cost4, i_cost8;
        /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
        x264_mb_mc( h );

        i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );
        i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
                                             h->mb.pic.p_fdec[0], FDEC_STRIDE );

        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
        h->mb.b_skip_mc = 1;
    }
}
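
/* RD transform decision: re-encode the macroblock with the transform size
 * flipped and keep the flip only if it lowers the RD cost. When it does,
 * *i_satd is rescaled by i_rd8/i_rd so that SATD-domain scores held by the
 * caller remain comparable with the improved RD score. */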
static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
    {
        int i_rd8;
        x264_analyse_update_cache( h, a );
        h->mb.b_transform_8x8 ^= 1;
        /* FIXME only luma is needed, but the score for comparison already includes chroma */
        i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );

        if( *i_rd >= i_rd8 )
        {
            if( *i_rd > 0 )
                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
            *i_rd = i_rd8;
        }
        else
            h->mb.b_transform_8x8 ^= 1;
    }
}
/* Rate-distortion optimal QP selection.
 * FIXME: More than half of the benefit of this function seems to be
 * in the way it improves the coding of chroma DC (by decimating or
 * finding a better way to code a single DC coefficient.)
 * There must be a more efficient way to get that portion of the benefit
 * without doing full QP-RD, but RD-decimation doesn't seem to do the
 * trick. */
static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
{
    int bcost, cost, direction, failures, prevcost, origcost;
    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
    int last_qp_tried = 0;
    origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );

    /* If CBP is already zero, don't raise the quantizer any higher. */
    for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
    {
        /* Without psy-RD, require monotonicity when moving quant away from previous
         * macroblock's quant; allow 1 failure when moving quant towards previous quant.
         * With psy-RD, allow 1 failure when moving quant away from previous quant,
         * allow 2 failures when moving quant towards previous quant.
         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
        int threshold = (!!h->mb.i_psy_rd);
        /* Raise the threshold for failures if we're moving towards the last QP. */
        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
            ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
            threshold++;
        h->mb.i_qp = orig_qp;
        failures = 0;
        prevcost = origcost;
        h->mb.i_qp += direction;
        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max )
        {
            if( h->mb.i_last_qp == h->mb.i_qp )
                last_qp_tried = 1;
            h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
            cost = x264_rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );

            /* We can't assume that the costs are monotonic over QPs.
             * Treating a tie as a failure seems to give better results. */
            if( cost < prevcost )
                failures = 0;
            else
                failures++;
            prevcost = cost;

            if( failures > threshold )
                break;
            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
                break;
            h->mb.i_qp += direction;
        }
    }

    /* Always try the last block's QP. */
    if( !last_qp_tried )
    {
        h->mb.i_qp = h->mb.i_last_qp;
        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
    }

    h->mb.i_qp = bqp;
    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];

    /* Check transform again; decision from before may no longer be optimal. */
    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
        x264_mb_transform_8x8_allowed( h ) )
    {
        h->mb.b_transform_8x8 ^= 1;
        cost = x264_rd_cost_mb( h, a->i_lambda2 );
        if( cost > bcost )
            h->mb.b_transform_8x8 ^= 1;
    }
}
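
/* Illustrative walk of the QP-RD loop above (values assumed): starting at
 * QP 30 with a nonzero CBP, the search first raises QP (31, 32, ...) until
 * the RD cost worsens more than `threshold` times in a row or CBP hits
 * zero, then walks downward the same way; the best QP seen wins, and the
 * previous MB's QP is always sampled once so the cheap qp_delta == 0 case
 * is never overlooked. */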
/*****************************************************************************
 * x264_macroblock_analyse:
 *****************************************************************************/
int x264_macroblock_analyse( x264_t *h )
{
    x264_mb_analysis_t analysis;
    int i_cost = COST_MAX;
    int i;

    h->mb.i_qp = x264_ratecontrol_qp( h );
    if( h->param.rc.i_aq_mode )
    {
        x264_adaptive_quant( h );
        /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
         * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
        if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
            h->mb.i_qp = h->mb.i_last_qp;
    }

    x264_mb_analyse_init( h, &analysis, h->mb.i_qp );

    /*--------------------------- Do the analysis ---------------------------*/
    if( h->sh.i_type == SLICE_TYPE_I )
    {
        if( analysis.i_mbrd )
            x264_mb_cache_fenc_satd( h );
        x264_mb_analyse_intra( h, &analysis, COST_MAX );
        if( analysis.i_mbrd )
            x264_intra_rd( h, &analysis, COST_MAX );

        i_cost = analysis.i_satd_i16x16;
        h->mb.i_type = I_16x16;
        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
        if( analysis.i_satd_pcm < i_cost )
            h->mb.i_type = I_PCM;
        else if( analysis.i_mbrd >= 2 )
            x264_intra_rd_refine( h, &analysis );
    }
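    /* P slices: first try to prove P_SKIP cheaply (probing immediately only
     * when a neighbour was skipped, or deferring the probe to the 16x16
     * search at subme >= 3 via b_try_pskip), then run the partition search,
     * intra analysis, and the optional RD passes. */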
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        int b_skip = 0;

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );

        /* Fast P_SKIP detection */
        analysis.b_try_pskip = 0;
        if( h->param.analyse.b_fast_pskip )
        {
            if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
                // FIXME don't need to check this if the reference frame is done
                {}
            else if( h->param.analyse.i_subpel_refine >= 3 )
                analysis.b_try_pskip = 1;
            else if( h->mb.i_mb_type_left == P_SKIP ||
                     h->mb.i_mb_type_top == P_SKIP ||
                     h->mb.i_mb_type_topleft == P_SKIP ||
                     h->mb.i_mb_type_topright == P_SKIP )
                b_skip = x264_macroblock_probe_pskip( h );
        }

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );

        if( b_skip )
        {
            h->mb.i_type = P_SKIP;
            h->mb.i_partition = D_16x16;
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
        }
        else
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_thresh16x8;
            int i_satd_inter, i_satd_intra;

            if( x264_mb_analyse_load_costs( h, &analysis ) )
                return -1;

            x264_mb_analyse_inter_p16x16( h, &analysis );

            if( h->mb.i_type == P_SKIP )
                return 0;

            if( flags & X264_ANALYSE_PSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
                else
                    x264_mb_analyse_inter_p8x8( h, &analysis );
            }

            /* Select best inter mode */
            i_type = P_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;

            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
            {
                i_type = P_8x8;
                i_partition = D_8x8;
                i_cost = analysis.l0.i_cost8x8;

                /* Do sub 8x8 */
                if( flags & X264_ANALYSE_PSUB8x8 )
                {
                    for( i = 0; i < 4; i++ )
                    {
                        x264_mb_analyse_inter_p4x4( h, &analysis, i );
                        if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
                        {
                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                            h->mb.i_sub_partition[i] = D_L0_4x4;

                            x264_mb_analyse_inter_p8x4( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
                                         h->mb.i_sub_partition[i], D_L0_8x4 );

                            x264_mb_analyse_inter_p4x8( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
                                         h->mb.i_sub_partition[i], D_L0_4x8 );

                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
                        }
                        x264_mb_cache_mv_p8x8( h, &analysis, i );
                    }
                    analysis.l0.i_cost8x8 = i_cost;
                }
            }

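            /* The 16x8/8x16 searches below run only when 8x8 came within
             * i_thresh16x8 of 16x16 -- the MV cost of 8x8 blocks 1 and 2,
             * which is roughly the signalling that a rectangular partition
             * could save relative to four 8x8 blocks. */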
            /* Now do 16x8/8x16 */
            i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
            if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
            {
                x264_mb_analyse_inter_p16x8( h, &analysis );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );

                x264_mb_analyse_inter_p8x16( h, &analysis );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
            }

            h->mb.i_partition = i_partition;

            /* refine qpel */
            //FIXME mb_type costs?
            if( analysis.i_mbrd )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                i_cost = analysis.l0.me16x16.cost;
            }
            else if( i_partition == D_16x8 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
            }
            else if( i_partition == D_8x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
            }
            else if( i_partition == D_8x8 )
            {
                int i8x8;
                i_cost = 0;
                for( i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    switch( h->mb.i_sub_partition[i8x8] )
                    {
                        case D_L0_8x8:
                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
                            i_cost += analysis.l0.me8x8[i8x8].cost;
                            break;
                        case D_L0_8x4:
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
                                      analysis.l0.me8x4[i8x8][1].cost;
                            break;
                        case D_L0_4x8:
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
                                      analysis.l0.me4x8[i8x8][1].cost;
                            break;
                        case D_L0_4x4:
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
                                      analysis.l0.me4x4[i8x8][1].cost +
                                      analysis.l0.me4x4[i8x8][2].cost +
                                      analysis.l0.me4x4[i8x8][3].cost;
                            break;
                        default:
                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
                            break;
                    }
                }
            }

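            /* With chroma ME, the inter cost already includes chroma
             * distortion, so the intra search threshold is lowered by the
             * best chroma intra cost and the same amount is then added back
             * to each intra score, keeping both sides of the comparison in
             * the same units. */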
            if( h->mb.b_chroma_me )
            {
                x264_mb_analyse_intra_chroma( h, &analysis );
                x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
                analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
                analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
                analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
            }
            else
                x264_mb_analyse_intra( h, &analysis, i_cost );

            i_satd_inter = i_cost;
            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
                                      analysis.i_satd_i8x8,
                                      analysis.i_satd_i4x4 );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
                i_type = P_L0;
                i_partition = D_16x16;
                i_cost = analysis.l0.me16x16.cost;
                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
                if( i_cost < COST_MAX )
                    x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;

            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    x264_intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
                    h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    int i8x8;
                    x264_analyse_update_cache( h, &analysis );
                    for( i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
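    /* B slices: direct/skip handling comes first. With direct=auto, both
     * spatial and temporal direct prediction are probed and the per-frame
     * i_direct_score totals steer which flavour later frames will prefer;
     * with RD on, B_SKIP is accepted outright when its SSD cost stays below
     * roughly six bits' worth of lambda2. */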
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            x264_mb_cache_fenc_satd( h );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB; the +128 >> 8
                 * rounds the fixed-point lambda2 (scaled by 256) back into SSD units. */
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                b_skip = x264_macroblock_probe_bskip( h );
            }
        }

        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter = 0; // shut up uninitialized warning
            h->mb.b_skip_mc = 0;

            if( x264_mb_analyse_load_costs( h, &analysis ) )
                return -1;

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                x264_mb_analyse_inter_direct( h, &analysis );

            x264_mb_analyse_inter_b16x16( h, &analysis );

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                x264_mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    x264_analyse_update_cache( h, &analysis );
                    return 0;
                }
            }

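            /* Sub-16x16 B analysis: 8x8 runs first; 16x8 (rows) and 8x16
             * (columns) are attempted only when neighbouring 8x8
             * sub-partitions chose the same mode, which hints that a
             * rectangular partition could describe the area with less
             * overhead. */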
            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                x264_mb_analyse_inter_b8x8( h, &analysis );
                if( analysis.i_cost8x8bi < i_cost )
                {
                    i_type = B_8x8;
                    i_partition = D_8x8;
                    i_cost = analysis.i_cost8x8bi;

                    if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
                        h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
                    {
                        x264_mb_analyse_inter_b16x8( h, &analysis );
                        COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
                                     i_type, analysis.i_mb_type16x8,
                                     i_partition, D_16x8 );
                    }
                    if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
                        h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
                    {
                        x264_mb_analyse_inter_b8x16( h, &analysis );
                        COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
                                     i_type, analysis.i_mb_type8x16,
                                     i_partition, D_8x16 );
                    }
                }
            }

            if( analysis.i_mbrd )
            {
                /* refine later */
            }
            /* refine qpel */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( i=0; i<2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( i=0; i<2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( i=0; i<4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }

            if( analysis.i_mbrd )
            {
                i_satd_inter = i_cost;
                x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            x264_mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                x264_intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                x264_refine_bidir( h, &analysis );

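            /* With full RD (mbrd >= 2) and a non-direct inter type, each
             * chosen partition gets a final qpel-RD or bidir-RD motion
             * refinement against the exact bitstream cost; the cache is
             * rewritten first so the RD measurement reflects the final
             * mode decision. */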
            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
                x264_analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    else if( i_type == B_L1_L1 )
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    else if( i_type == B_BI_BI )
                        x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
                }
                else if( i_partition == D_16x8 )
                {
                    for( i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                    }
                }
            }
        }
    }

    x264_analyse_update_cache( h, &analysis );

    if( !analysis.i_mbrd )
        x264_mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        x264_mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        x264_psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;

    return 0;
}

/*-------------------- Update MB from the analysis ----------------------*/
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    int i;

    switch( h->mb.i_type )
    {
        case I_4x4:
            for( i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );

            x264_mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            x264_mb_analyse_intra_chroma( h, a );
            break;

        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;

                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;

                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;

                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( i = 0; i < 4; i++ )
                x264_mb_cache_mv_p8x8( h, a, i );
            break;

        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            x264_mb_load_mv_direct8x8( h, 0 );
            x264_mb_load_mv_direct8x8( h, 1 );
            x264_mb_load_mv_direct8x8( h, 2 );
            x264_mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( i = 0; i < 4; i++ )
                x264_mb_cache_mv_b8x8( h, a, i, 1 );
            break;

        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );

                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    x264_mb_cache_mv_b16x8( h, a, 0, 1 );
                    x264_mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    x264_mb_cache_mv_b8x16( h, a, 0, 1 );
                    x264_mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }

#ifndef NDEBUG
    if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
    {
        int l;
        for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                fprintf(stderr, "mb type: %d \n", h->mb.i_type);
                fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
                        h->mb.cache.mv[l][x264_scan8[15]][0],
                        h->mb.cache.mv[l][x264_scan8[15]][1] );
                fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
                fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                fprintf(stderr, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                x264_mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                x264_mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}

#include "slicetype.c"