1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
7 * Loren Merritt <lorenm@u.washington.edu>
8 * Fiona Glaser <fiona@x264.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define _ISOC99_SOURCE
32 #include "common/common.h"
33 #include "common/cpu.h"
34 #include "macroblock.h"
36 #include "ratecontrol.h"
49 /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
50 DECLARE_ALIGNED_4( int16_t mvc[32][5][2] );
54 int i_cost4x4[4]; /* cost per 8x8 partition */
55 x264_me_t me4x4[4][4];
58 int i_cost8x4[4]; /* cost per 8x8 partition */
59 x264_me_t me8x4[4][2];
62 int i_cost4x8[4]; /* cost per 8x8 partition */
63 x264_me_t me4x8[4][2];
73 } x264_mb_analysis_list_t;
77 /* conduct the analysis using this lamda and QP */
82 uint16_t *p_cost_ref0;
83 uint16_t *p_cost_ref1;
88 /* Take some shortcuts in intra search if intra is deemed unlikely */
94 int i_satd_i16x16_dir[7];
99 int i_satd_i8x8_dir[12][4];
103 int i_predict4x4[16];
108 int i_satd_i8x8chroma;
109 int i_satd_i8x8chroma_dir[4];
110 int i_predict8x8chroma;
112 /* II: Inter part P/B frame */
113 x264_mb_analysis_list_t l0;
114 x264_mb_analysis_list_t l1;
116 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
117 int i_cost16x16direct;
119 int i_cost8x8direct[4];
128 int i_mb_partition16x8[2]; /* mb_partition_e */
129 int i_mb_partition8x16[2];
130 int i_mb_type16x8; /* mb_class_e */
133 int b_direct_available;
135 } x264_mb_analysis_t;
137 /* lambda = pow(2,qp/6-2) */
/* Per-QP Lagrange multiplier used to weight bit costs against distortion
 * in mode decision; indexed by QP 0..51.
 * NOTE(review): the terminating "};" of this table is elided in this listing. */
138 const int x264_lambda_tab[52] = {
139 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
140 1, 1, 1, 1, /* 8-11 */
141 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
142 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
143 6, 7, 8, 9,10,11,13,14, /* 28-35 */
144 16,18,20,23,25,29,32,36, /* 36-43 */
145 40,45,51,57,64,72,81,91 /* 44-51 */
148 /* lambda2 = pow(lambda,2) * .9 * 256 */
/* Squared lambda (fixed-point, scaled by 256) used for RD cost
 * (distortion + lambda2*bits); indexed by QP 0..51.
 * NOTE(review): the terminating "};" of this table is elided in this listing. */
149 const int x264_lambda2_tab[52] = {
150 14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */
151 91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */
152 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */
153 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */
154 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32 - 39 */
155 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
156 943718, 1189010, 1498059, 1887436 /* 48 - 51 */
159 // should the intra and inter lambdas be different?
160 // I'm just matching the behaviour of deadzone quant.
/* Lambda2 tables for trellis quantization, indexed [0]=inter / [1]=intra,
 * then by QP 0..51.  Consumed in x264_mb_analyse_init() to fill
 * h->mb.i_trellis_lambda2 for both luma and chroma QPs. */
161 static const int x264_trellis_lambda2_tab[2][52] = {
162 // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
163 { 46, 58, 73, 92, 117, 147,
164 185, 233, 294, 370, 466, 587,
165 740, 932, 1174, 1480, 1864, 2349,
166 2959, 3728, 4697, 5918, 7457, 9395,
167 11837, 14914, 18790, 23674, 29828, 37581,
168 47349, 59656, 75163, 94699, 119313, 150326,
169 189399, 238627, 300652, 378798, 477255, 601304,
170 757596, 954511, 1202608, 1515192, 1909022, 2405217,
171 3030384, 3818045, 4810435, 6060769 },
172 // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
173 { 27, 34, 43, 54, 68, 86,
174 108, 136, 172, 216, 273, 343,
175 433, 545, 687, 865, 1090, 1374,
176 1731, 2180, 2747, 3461, 4361, 5494,
177 6922, 8721, 10988, 13844, 17442, 21976,
178 27688, 34885, 43953, 55377, 69771, 87906,
179 110755, 139543, 175813, 221511, 279087, 351627,
180 443023, 558174, 703255, 886046, 1116348, 1406511,
181 1772093, 2232697, 2813022, 3544186 }
/* Multiplier (fixed-point /256) applied to the chroma lambda2 when psy-RD is
 * on; indexed by (luma_qp - chroma_qp + 12) in x264_mb_analyse_init().
 * NOTE(review): the tail of this table and its "};" are elided in this listing. */
184 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
185 16, 20, 25, 32, 40, 50,
186 64, 80, 101, 128, 161, 203,
187 256, 322, 406, 512, 645, 812,
188 1024, 1290, 1625, 2048, 2580, 3250,
189 4096, 5160, 6501, 8192, 10321, 13003,
190 16384, 20642, 26007, 32768, 41285, 52015,
194 /* TODO: calculate CABAC costs */
/* Approximate CAVLC bit costs (in bits, multiplied by lambda at use sites)
 * for signalling macroblock / sub-macroblock types in B and P slices.
 * NOTE(review): closing "};" lines and the body of i_sub_mb_p_cost_table are
 * elided in this listing. */
195 static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
196 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
198 static const int i_mb_b16x8_cost_table[17] = {
199 0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
201 static const int i_sub_mb_b_cost_table[13] = {
202 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
204 static const int i_sub_mb_p_cost_table[4] = {
208 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
210 /* Indexed by lambda instead of qp because, due to rounding,
211 * some quantizers share lambdas. This saves memory. */
/* Global cost caches, lazily filled by x264_mb_analyse_load_costs():
 * - x264_cost_mv_fpel[lambda][qpel_phase]: fullpel MV bit costs (ESA/TESA).
 * - x264_cost_ref[lambda][num_refs_bucket][ref_idx]: ref-index bit costs. */
212 uint16_t *x264_cost_mv_fpel[92][4];
213 uint16_t x264_cost_ref[92][3][33];
215 /* initialize an array of lambda*nbits for all possible mvs */
/* Fill a->p_cost_mv / p_cost_ref0 / p_cost_ref1 with lambda-weighted bit
 * costs for motion vectors and reference indices, allocating and caching the
 * per-lambda tables on first use (cached across calls via function statics
 * and the globals above).
 * NOTE(review): this listing elides several original lines here (braces,
 * the declarations of i/j, the j-loop header for the fpel tables); code is
 * kept byte-identical to the listing. */
216 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
218 static int16_t *p_cost_mv[92];
221 if( !p_cost_mv[a->i_lambda] )
224 /* could be faster, but isn't called many times */
225 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
226 p_cost_mv[a->i_lambda] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) );
/* center the pointer so it can be indexed by signed mv components */
227 p_cost_mv[a->i_lambda] += 2*4*2048;
228 for( i = 0; i <= 2*4*2048; i++ )
/* cost model: lambda * (2*log2(mv+1) + 0.718 + (mv!=0)), rounded */
230 p_cost_mv[a->i_lambda][-i] =
231 p_cost_mv[a->i_lambda][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
/* ref-index costs: te(v) coded, free when only one ref is active (i==0) */
233 for( i = 0; i < 3; i++ )
234 for( j = 0; j < 33; j++ )
235 x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
237 a->p_cost_mv = p_cost_mv[a->i_lambda];
238 a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
239 a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
241 /* FIXME is this useful for all me methods? */
242 if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
/* one fullpel table per quarter-pel phase j, sampled from the qpel table */
246 x264_cost_mv_fpel[a->i_lambda][j] = x264_malloc( (4*2048 + 1) * sizeof(int16_t) );
247 x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
248 for( i = -2*2048; i < 2*2048; i++ )
249 x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
/* Per-macroblock analysis setup: derive lambdas and trellis/psy parameters
 * from i_qp, clamp the allowed MV search range (horizontally per-MB,
 * vertically per-row, further restricted by sliced-threads progress), and
 * reset the candidate-mode cost fields to COST_MAX.
 * NOTE(review): this listing elides many original lines in this function
 * (braces, several cost-field resets, the function tail after the fast-intra
 * condition); code is kept byte-identical to the listing. */
254 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
256 int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
258 /* mbrd == 1 -> RD mode decision */
259 /* mbrd == 2 -> RD refinement */
260 /* mbrd == 3 -> QPRD */
261 a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10);
263 /* conduct the analysis using this lamda and QP */
264 a->i_qp = h->mb.i_qp = i_qp;
265 h->mb.i_chroma_qp = h->chroma_qp_table[i_qp];
267 a->i_lambda = x264_lambda_tab[i_qp];
268 a->i_lambda2 = x264_lambda2_tab[i_qp];
/* trellis-2 additionally requires some form of RD mode decision */
270 h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
271 if( h->param.analyse.i_trellis )
/* [luma/chroma][inter/intra] trellis lambdas */
273 h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp];
274 h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp];
275 h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp];
276 h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp];
278 h->mb.i_psy_rd_lambda = a->i_lambda;
279 /* Adjusting chroma lambda based on QP offset hurts PSNR, so we'll leave it as part of psy-RD. */
280 h->mb.i_chroma_lambda2_offset = h->mb.i_psy_rd ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256;
282 h->mb.i_me_method = h->param.analyse.i_me_method;
283 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
284 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
285 && h->mb.i_subpel_refine >= 5;
287 h->mb.b_transform_8x8 = 0;
288 h->mb.b_noise_reduction = 0;
/* invalidate cached intra-chroma cost from a previous QP's analysis */
294 a->i_satd_i8x8chroma = COST_MAX;
296 /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
297 a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
/* NOTE(review): the assignment target of these two condition lines is elided
 * in this listing — presumably an i_skip/fast-decision flag; confirm against
 * the full source. */
301 h->mb.b_lossless ? 0 :
303 !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;
305 /* II: Inter part P/B frame */
306 if( h->sh.i_type != SLICE_TYPE_I )
/* full-pel units *4 = quarter-pel search range */
309 int i_fmv_range = 4 * h->param.analyse.i_mv_range;
310 // limit motion search to a slightly smaller range than the theoretical limit,
311 // since the search may go a few iterations past its given range
312 int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
314 /* Calculate max allowed MV range */
315 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
/* horizontal range: up to 24 pixels (qpel*4) beyond the frame edge */
316 h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
317 h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
318 h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
319 h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
320 h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
321 h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
/* vertical range only needs recomputing once per MB row */
322 if( h->mb.i_mb_x == 0)
324 int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff;
325 int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff;
326 int thread_mvy_range = i_fmv_range;
328 if( h->param.i_threads > 1 )
330 int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16;
331 int thresh = pix_y + h->param.analyse.i_mv_range_thread;
/* wait until each reference frame (both lists for B slices) has been
 * reconstructed far enough down for the requested vertical range */
332 for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
334 x264_frame_t **fref = i ? h->fref1 : h->fref0;
335 int i_ref = i ? h->i_ref1 : h->i_ref0;
336 for( j=0; j<i_ref; j++ )
338 x264_frame_cond_wait( fref[j], thresh );
339 thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
/* deterministic mode must not depend on thread scheduling progress */
342 if( h->param.b_deterministic )
343 thread_mvy_range = h->param.analyse.i_mv_range_thread;
344 if( h->mb.b_interlaced )
345 thread_mvy_range >>= 1;
348 h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
349 h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 );
350 h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
351 h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
352 h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
353 h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
354 h->mb.mv_max_fpel[1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
/* reset inter candidate costs (further resets elided in this listing) */
360 a->l0.i_cost8x8 = COST_MAX;
362 for( i = 0; i < 4; i++ )
366 a->l0.i_cost4x8[i] = COST_MAX;
370 a->l0.i_cost8x16 = COST_MAX;
371 if( h->sh.i_type == SLICE_TYPE_B )
375 a->l1.i_cost8x8 = COST_MAX;
377 for( i = 0; i < 4; i++ )
382 a->i_cost8x8direct[i] = COST_MAX;
393 a->i_cost16x16direct =
396 a->i_cost8x16bi = COST_MAX;
399 /* Fast intra decision */
/* only after a few MBs, so the frame intra stats below are meaningful */
400 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
402 if( IS_INTRA( h->mb.i_mb_type_left )
403 || IS_INTRA( h->mb.i_mb_type_top )
404 || IS_INTRA( h->mb.i_mb_type_topleft )
405 || IS_INTRA( h->mb.i_mb_type_topright )
406 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
407 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
408 { /* intra is likely */ }
/* List the 16x16 intra prediction modes usable given which neighbouring
 * macroblocks exist (i_neighbour bitmask); writes modes into *mode and the
 * count into *pi_count.
 * NOTE(review): the "*pi_count = N;" lines and braces are elided in this
 * listing; code is kept byte-identical to the listing. */
424 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
426 if( i_neighbour & MB_TOPLEFT )
428 /* top and left available */
429 *mode++ = I_PRED_16x16_V;
430 *mode++ = I_PRED_16x16_H;
431 *mode++ = I_PRED_16x16_DC;
432 *mode++ = I_PRED_16x16_P;
435 else if( i_neighbour & MB_LEFT )
/* only left available: DC predicts from left samples only */
438 *mode++ = I_PRED_16x16_DC_LEFT;
439 *mode++ = I_PRED_16x16_H;
442 else if( i_neighbour & MB_TOP )
/* only top available */
445 *mode++ = I_PRED_16x16_DC_TOP;
446 *mode++ = I_PRED_16x16_V;
/* no neighbours: flat DC-128 is the only legal mode */
452 *mode = I_PRED_16x16_DC_128;
/* List the 8x8 chroma intra prediction modes usable given the available
 * neighbours; mirrors predict_16x16_mode_available() but with chroma modes.
 * NOTE(review): "*pi_count = N;" lines and braces are elided in this listing. */
458 static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
460 if( i_neighbour & MB_TOPLEFT )
462 /* top and left available */
463 *mode++ = I_PRED_CHROMA_V;
464 *mode++ = I_PRED_CHROMA_H;
465 *mode++ = I_PRED_CHROMA_DC;
466 *mode++ = I_PRED_CHROMA_P;
469 else if( i_neighbour & MB_LEFT )
/* only left available */
472 *mode++ = I_PRED_CHROMA_DC_LEFT;
473 *mode++ = I_PRED_CHROMA_H;
476 else if( i_neighbour & MB_TOP )
/* only top available */
479 *mode++ = I_PRED_CHROMA_DC_TOP;
480 *mode++ = I_PRED_CHROMA_V;
/* no neighbours */
486 *mode = I_PRED_CHROMA_DC_128;
/* List the 4x4 (also reused for 8x8) intra prediction modes usable given the
 * available neighbours of the sub-block.
 * NOTE(review): the branch structure (if/else on b_l and b_t), the
 * "*pi_count = N;" lines and braces are elided in this listing; code is kept
 * byte-identical to the listing. */
492 static void predict_4x4_mode_available( unsigned int i_neighbour,
493 int *mode, int *pi_count )
495 int b_l = i_neighbour & MB_LEFT;
496 int b_t = i_neighbour & MB_TOP;
/* both left and top available: full mode set */
501 *mode++ = I_PRED_4x4_DC;
502 *mode++ = I_PRED_4x4_H;
503 *mode++ = I_PRED_4x4_V;
504 *mode++ = I_PRED_4x4_DDL;
/* diagonal-down-right family additionally needs the top-left sample */
505 if( i_neighbour & MB_TOPLEFT )
507 *mode++ = I_PRED_4x4_DDR;
508 *mode++ = I_PRED_4x4_VR;
509 *mode++ = I_PRED_4x4_HD;
512 *mode++ = I_PRED_4x4_VL;
513 *mode++ = I_PRED_4x4_HU;
/* left only */
517 *mode++ = I_PRED_4x4_DC_LEFT;
518 *mode++ = I_PRED_4x4_H;
519 *mode++ = I_PRED_4x4_HU;
/* top only */
524 *mode++ = I_PRED_4x4_DC_TOP;
525 *mode++ = I_PRED_4x4_V;
526 *mode++ = I_PRED_4x4_DDL;
527 *mode++ = I_PRED_4x4_VL;
/* no neighbours */
532 *mode++ = I_PRED_4x4_DC_128;
537 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
/* Cache zigzag-scanned DCT coefficients of the raw source luma block
 * (fenc vs. a zero reference) for psy-trellis, in 8x8 and/or 4x4 form. */
538 static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
540 DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
541 DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
/* static zero block used as the "prediction" so the DCT is of the source itself */
542 DECLARE_ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
545 if( do_both_dct || h->mb.b_transform_8x8 )
547 h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero );
548 for( i = 0; i < 4; i++ )
549 h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] );
551 if( do_both_dct || !h->mb.b_transform_8x8 )
553 h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero );
554 for( i = 0; i < 16; i++ )
555 h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] );
559 /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
/* Fills h->mb.pic.fenc_satd (per 4x4), fenc_sa8d (per 8x8) and their sums;
 * each score is the transform cost of the raw source against zero, with the
 * DC part subtracted (sad>>1 for satd, sad>>2 for sa8d).
 * NOTE(review): the early-return taken when !h->mb.i_psy_rd, and the fenc
 * declaration, are elided in this listing. */
560 static inline void x264_mb_cache_fenc_satd( x264_t *h )
562 DECLARE_ALIGNED_16( static uint8_t zero[16] ) = {0};
564 int x, y, satd_sum = 0, sa8d_sum = 0;
565 if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
566 x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
567 if( !h->mb.i_psy_rd )
569 for( y = 0; y < 4; y++ )
570 for( x = 0; x < 4; x++ )
572 fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
573 h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
574 - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
575 satd_sum += h->mb.pic.fenc_satd[y][x];
577 for( y = 0; y < 2; y++ )
578 for( x = 0; x < 2; x++ )
580 fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
581 h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
582 - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
583 sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
585 h->mb.pic.fenc_satd_sum = satd_sum;
586 h->mb.pic.fenc_sa8d_sum = sa8d_sum;
/* Choose the best 8x8 chroma intra prediction mode: evaluate every available
 * mode on both chroma planes (merged x3 SATD fast path when all 4 modes are
 * available), store per-mode costs in a->i_satd_i8x8chroma_dir[] and the
 * winner in a->i_predict8x8chroma / a->i_satd_i8x8chroma.
 * NOTE(review): several lines (i/i_max declarations, the early return after
 * the cached-cost check, the else before the scalar loop) are elided in this
 * listing; code is kept byte-identical to the listing. */
589 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
595 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
597 uint8_t *p_dstc[2], *p_srcc[2];
/* result already computed for this MB at this QP: reuse the cache */
599 if( a->i_satd_i8x8chroma < COST_MAX )
602 /* 8x8 prediction selection for chroma */
603 p_dstc[0] = h->mb.pic.p_fdec[1];
604 p_dstc[1] = h->mb.pic.p_fdec[2];
605 p_srcc[0] = h->mb.pic.p_fenc[1];
606 p_srcc[1] = h->mb.pic.p_fenc[2];
608 predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
609 a->i_satd_i8x8chroma = COST_MAX;
/* fast path: one call scores V/H/DC on each plane; plane mode done separately */
610 if( i_max == 4 && b_merged_satd )
612 int satdu[4], satdv[4];
613 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
614 h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
615 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
616 h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
617 satdu[I_PRED_CHROMA_P] =
618 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
619 satdv[I_PRED_CHROMA_P] =
620 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
622 for( i=0; i<i_max; i++ )
624 int i_mode = predict_mode[i];
/* cost = both-plane distortion + lambda * mode signalling bits */
625 int i_satd = satdu[i_mode] + satdv[i_mode]
626 + a->i_lambda * bs_size_ue(i_mode);
628 a->i_satd_i8x8chroma_dir[i] = i_satd;
629 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
/* scalar path: predict and score each available mode individually */
634 for( i=0; i<i_max; i++ )
637 int i_mode = predict_mode[i];
639 /* we do the prediction */
640 if( h->mb.b_lossless )
641 x264_predict_lossless_8x8_chroma( h, i_mode );
644 h->predict_8x8c[i_mode]( p_dstc[0] );
645 h->predict_8x8c[i_mode]( p_dstc[1] );
648 /* we calculate the cost */
649 i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
650 p_srcc[0], FENC_STRIDE ) +
651 h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
652 p_srcc[1], FENC_STRIDE ) +
653 a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
655 a->i_satd_i8x8chroma_dir[i] = i_satd;
656 COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
660 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
/* Luma intra analysis: score I_16x16 always, then I_8x8 and I_4x4 when
 * enabled by the analysis flags, filling a->i_satd_i16x16 / i_satd_i8x8 /
 * i_satd_i4x4 (and the per-direction arrays) with SATD+bits costs.
 * i_satd_inter is the best inter cost so far and is used for early
 * termination thresholds.
 * NOTE(review): this listing elides many original lines in this function
 * (local declarations such as i/idx/i_max/i_satd/i_cost/x/y, braces, some
 * loop headers and early-out returns); code is kept byte-identical to the
 * listing. */
663 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
665 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
666 uint8_t *p_src = h->mb.pic.p_fenc[0];
667 uint8_t *p_dst = h->mb.pic.p_fdec[0];
672 int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
674 /*---------------- Try all mode and calculate their score ---------------*/
676 /* 16x16 prediction selection */
677 predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
/* fast path: merged V/H/DC scoring, plane mode scored separately */
679 if( b_merged_satd && i_max == 4 )
681 h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
682 h->predict_16x16[I_PRED_16x16_P]( p_dst );
683 a->i_satd_i16x16_dir[I_PRED_16x16_P] =
684 h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
/* add mode-signalling bits and keep the best */
687 int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
688 COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
/* scalar path over the available modes */
693 for( i = 0; i < i_max; i++ )
696 int i_mode = predict_mode[i];
698 if( h->mb.b_lossless )
699 x264_predict_lossless_16x16( h, i_mode );
701 h->predict_16x16[i_mode]( p_dst );
703 i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
704 a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
705 COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
706 a->i_satd_i16x16_dir[i_mode] = i_satd;
710 if( h->sh.i_type == SLICE_TYPE_B )
711 /* cavlc mb type prefix */
712 a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16];
/* give up on intra entirely if 16x16 is already far worse than inter */
713 if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter )
716 /* 8x8 prediction selection */
717 if( flags & X264_ANALYSE_I8x8 )
719 DECLARE_ALIGNED_16( uint8_t edge[33] );
/* use sa8d only when mbcmp is satd; otherwise keep the configured metric */
720 x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
721 int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
723 h->mb.i_cbp_luma = 0;
724 b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless;
726 // FIXME some bias like in i4x4?
727 if( h->sh.i_type == SLICE_TYPE_B )
728 i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
/* one 8x8 partition per iteration; exit conditions appear at the bottom */
730 for( idx = 0;; idx++ )
734 uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
735 uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
736 int i_best = COST_MAX;
737 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
739 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
740 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
742 if( b_merged_satd && i_max == 9 )
745 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
/* predicted mode costs 1 bit instead of 4: bias of 3*lambda */
746 satd[i_pred_mode] -= 3 * a->i_lambda;
747 for( i=2; i>=0; i-- )
749 int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
750 COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
/* remaining (or all, on the scalar path) modes */
757 for( ; i<i_max; i++ )
760 int i_mode = predict_mode[i];
762 if( h->mb.b_lossless )
763 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
765 h->predict_8x8[i_mode]( p_dst_by, edge );
767 i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
768 + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
770 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
771 a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
775 if( idx == 3 || i_cost > i_satd_thresh )
778 /* we need to encode this block now (for next ones) */
779 h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
780 x264_mb_encode_i8x8( h, idx, a->i_qp );
782 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
787 a->i_satd_i8x8 = i_cost;
/* snapshot the fully-analysed I8x8 reconstruction so a later winner can
 * skip re-encoding */
788 if( h->mb.i_skip_intra )
790 h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
791 h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
792 h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
793 h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
794 h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
795 h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
796 if( h->mb.i_skip_intra == 2 )
797 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
/* early-terminated: extrapolate the partial cost over the whole MB */
802 static const uint16_t cost_div_fix8[3] = {1024,512,341};
803 a->i_satd_i8x8 = COST_MAX;
804 i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
806 if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
810 /* 4x4 prediction selection */
811 if( flags & X264_ANALYSE_I4x4 )
814 int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
815 h->mb.i_cbp_luma = 0;
816 b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
/* fast-intra gets a tighter threshold */
818 i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
820 i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
821 if( h->sh.i_type == SLICE_TYPE_B )
822 i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
824 for( idx = 0;; idx++ )
826 uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
827 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
828 int i_best = COST_MAX;
829 int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
831 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
833 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
834 /* emulate missing topright samples */
835 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
837 if( b_merged_satd && i_max >= 6 )
840 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
/* same 1-bit-vs-4-bit predicted-mode bias as in the 8x8 path */
841 satd[i_pred_mode] -= 3 * a->i_lambda;
842 for( i=2; i>=0; i-- )
843 COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
844 a->i_predict4x4[idx], i );
850 for( ; i<i_max; i++ )
853 int i_mode = predict_mode[i];
854 if( h->mb.b_lossless )
855 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
857 h->predict_4x4[i_mode]( p_dst_by );
859 i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
860 p_src_by, FENC_STRIDE )
861 + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
863 COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
867 if( i_cost > i_satd_thresh || idx == 15 )
870 /* we need to encode this block now (for next ones) */
871 h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
872 x264_mb_encode_i4x4( h, idx, a->i_qp );
874 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
878 a->i_satd_i4x4 = i_cost;
/* snapshot the I4x4 reconstruction, as in the I8x8 path above */
879 if( h->mb.i_skip_intra )
881 h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
882 h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
883 h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
884 h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
885 h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
886 h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
887 if( h->mb.i_skip_intra == 2 )
888 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
892 a->i_satd_i4x4 = COST_MAX;
/* Re-score the intra candidates (I_16x16, I_4x4, I_8x8) with full RD cost
 * instead of SATD, skipping any whose SATD already exceeds i_satd_thresh;
 * rejected candidates are marked COST_MAX.
 * NOTE(review): braces and the "else" lines pairing each COST_MAX assignment
 * with its if are elided in this listing. */
896 static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
898 if( a->i_satd_i16x16 <= i_satd_thresh )
900 h->mb.i_type = I_16x16;
901 x264_analyse_update_cache( h, a );
902 a->i_satd_i16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
905 a->i_satd_i16x16 = COST_MAX;
907 if( a->i_satd_i4x4 <= i_satd_thresh && a->i_satd_i4x4 < COST_MAX )
909 h->mb.i_type = I_4x4;
910 x264_analyse_update_cache( h, a );
911 a->i_satd_i4x4 = x264_rd_cost_mb( h, a->i_lambda2 );
914 a->i_satd_i4x4 = COST_MAX;
916 if( a->i_satd_i8x8 <= i_satd_thresh && a->i_satd_i8x8 < COST_MAX )
918 h->mb.i_type = I_8x8;
919 x264_analyse_update_cache( h, a );
920 a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
/* remember the luma CBP of the I8x8 encode for later refinement */
921 a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
924 a->i_satd_i8x8 = COST_MAX;
/* After the MB type has been chosen, RD-refine the individual intra
 * prediction modes: re-try alternative 16x16 / chroma / 4x4 / 8x8 modes
 * whose earlier SATD cost was within a threshold of the winner, keeping a
 * mode only when its full RD cost improves. Best reconstructed pixels and
 * nnz values are saved/restored so the final state matches the winner.
 * NOTE(review): many lines are elided in this listing (declarations of
 * i/j/idx/p_dst_by/x/y/i_nnz/pels_v, braces, some restore lines); code is
 * kept byte-identical to the listing. */
927 static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
929 uint8_t *p_dst = h->mb.pic.p_fdec[0];
932 int i_max, i_mode, i_thresh;
933 uint64_t i_satd, i_best;
935 h->mb.i_skip_intra = 0;
937 if( h->mb.i_type == I_16x16 )
939 int old_pred_mode = a->i_predict16x16;
/* only modes within 9/8 of the SATD winner are worth an RD re-check */
940 i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8;
941 i_best = a->i_satd_i16x16;
942 predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
943 for( i = 0; i < i_max; i++ )
945 int i_mode = predict_mode[i];
946 if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
948 h->mb.i_intra16x16_pred_mode = i_mode;
949 i_satd = x264_rd_cost_mb( h, a->i_lambda2 );
950 COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
954 /* RD selection for chroma prediction */
955 predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
958 i_thresh = a->i_satd_i8x8chroma * 5/4;
/* compact the candidate list: keep only cheap modes other than the winner */
960 for( i = j = 0; i < i_max; i++ )
961 if( a->i_satd_i8x8chroma_dir[i] < i_thresh &&
962 predict_mode[i] != a->i_predict8x8chroma )
964 predict_mode[j++] = predict_mode[i];
970 int i_cbp_chroma_best = h->mb.i_cbp_chroma;
/* chroma RD uses the chroma QP's lambda2 */
971 int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
972 /* the previous thing encoded was x264_intra_rd(), so the pixels and
973 * coefs for the current chroma mode are still around, so we only
974 * have to recount the bits. */
975 i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
976 for( i = 0; i < i_max; i++ )
978 i_mode = predict_mode[i];
979 if( h->mb.b_lossless )
980 x264_predict_lossless_8x8_chroma( h, i_mode );
983 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
984 h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
986 /* if we've already found a mode that needs no residual, then
987 * probably any mode with a residual will be worse.
988 * so avoid dct on the remaining modes to improve speed. */
989 i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
990 COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
992 h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
993 h->mb.i_cbp_chroma = i_cbp_chroma_best;
997 if( h->mb.i_type == I_4x4 )
999 uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
1001 for( idx = 0; idx < 16; idx++ )
1003 uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
1004 i_best = COST_MAX64;
1006 predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
1008 if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
1009 /* emulate missing topright samples */
1010 *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
1012 for( i = 0; i < i_max; i++ )
1014 i_mode = predict_mode[i];
1015 if( h->mb.b_lossless )
1016 x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
1018 h->predict_4x4[i_mode]( p_dst_by );
1019 i_satd = x264_rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );
1021 if( i_best > i_satd )
1023 a->i_predict4x4[idx] = i_mode;
/* save the reconstruction/nnz of the new best so it can be restored
 * after worse modes overwrite the decode buffer */
1025 pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
1026 pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
1027 pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
1028 pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
1029 i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
1033 *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
1034 *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
1035 *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
1036 *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
1037 h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
1039 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
1042 else if( h->mb.i_type == I_8x8 )
1044 DECLARE_ALIGNED_16( uint8_t edge[33] );
1045 for( idx = 0; idx < 4; idx++ )
/* only the bottom row / right column of each 8x8 block influence later
 * blocks' prediction, so only those pixels need saving */
1047 uint64_t pels_h = 0;
1052 int cbp_luma_new = 0;
1053 i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
1055 i_best = COST_MAX64;
1059 p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
1060 predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
1061 h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
1063 for( i = 0; i < i_max; i++ )
1065 i_mode = predict_mode[i];
1066 if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
1068 if( h->mb.b_lossless )
1069 x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
1071 h->predict_8x8[i_mode]( p_dst_by, edge );
/* restore the I8x8 winner's CBP before each candidate encode */
1072 h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
1073 i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
1075 if( i_best > i_satd )
1077 a->i_predict8x8[idx] = i_mode;
1078 cbp_luma_new = h->mb.i_cbp_luma;
1081 pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
1083 for( j=0; j<7; j++ )
1084 pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
1085 i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
1086 i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
1089 a->i_cbp_i8x8_luma = cbp_luma_new;
1090 *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
1092 for( j=0; j<7; j++ )
1093 p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
1094 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
1095 *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
1097 x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
/* Point an x264_me_t at the source (fenc) planes for a sub-block at
 * (xoff,yoff) luma pixels; chroma offsets are halved (4:2:0). */
1102 #define LOAD_FENC( m, src, xoff, yoff) \
1103 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
1104 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
1105 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
1106 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; \
1107 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE];
/* Point an x264_me_t at a reference frame's planes: four luma half-pel
 * planes [0..3], two chroma planes [4..5], plus the integral image used by
 * ESA/TESA search. */
1109 #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
1110 (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
1111 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
1112 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
1113 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
1114 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1115 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
1116 (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]];
/* Lambda-weighted bit cost of coding reference index `ref` in list `list`
 * (looked up from the tables built in x264_mb_analyse_load_costs). */
1118 #define REF_COST(list, ref) \
1119 (a->p_cost_ref##list[ref])
/* P-slice 16x16 motion search over all list-0 reference frames: for each
 * ref, predict the MV, run ME, optionally early-out to P_SKIP, and keep the
 * cheapest result in a->l0.me16x16. Also stores each ref's MV for neighbour
 * prediction and, under RD modes, computes the 16x16 RD cost when the winner
 * matches the skip MV.
 * NOTE(review): several lines are elided in this listing (declarations of
 * m/i_ref/i_mvc, the full early-termination condition head, braces, and the
 * condition guarding the trailing RD block); code is kept byte-identical to
 * the listing. */
1121 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
1125 DECLARE_ALIGNED_4( int16_t mvc[8][2] );
1126 int i_halfpel_thresh = INT_MAX;
/* only prune half-pel refinement when there is more than one reference */
1127 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1129 /* 16x16 Search on all ref frame */
1130 m.i_pixel = PIXEL_16x16;
1131 m.p_cost_mv = a->p_cost_mv;
1132 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
1134 a->l0.me16x16.cost = INT_MAX;
1135 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1137 const int i_ref_cost = REF_COST( 0, i_ref );
/* bias the threshold by the ref cost so comparisons are ref-fair */
1138 i_halfpel_thresh -= i_ref_cost;
1139 m.i_ref_cost = i_ref_cost;
1142 /* search with ref */
1143 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1144 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1145 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1146 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1148 /* early termination
1149 * SSD threshold would probably be better than SATD */
1152 && m.cost-m.cost_mv < 300*a->i_lambda
1153 && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
1154 + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
1155 && x264_macroblock_probe_pskip( h ) )
1157 h->mb.i_type = P_SKIP;
1158 x264_analyse_update_cache( h, a );
1159 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1163 m.cost += i_ref_cost;
1164 i_halfpel_thresh += i_ref_cost;
1166 if( m.cost < a->l0.me16x16.cost )
1167 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1169 /* save mv for predicting neighbors */
1170 *(uint32_t*)a->l0.mvc[i_ref][0] =
1171 *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1174 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
/* with sliced threads the MV must stay within the completed region */
1175 assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
1177 h->mb.i_type = P_L0;
1180 x264_mb_cache_fenc_satd( h );
1181 if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
1183 h->mb.i_partition = D_16x16;
1184 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
1185 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
/* P 8x8 analysis with mixed references: each of the four 8x8 partitions may
 * pick its own list-0 reference.  Searches refs 0..i_maxref per partition,
 * accumulates the total into a->l0.i_cost8x8, and marks all sub-partitions
 * as D_L0_8x8. */
1190 static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
1194 uint8_t **p_fenc = h->mb.pic.p_fenc;
1195 int i_halfpel_thresh = INT_MAX;
/* halfpel threshold deliberately disabled for 8x8 (commented out) */
1196 int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : */NULL;
1198 int i_maxref = h->mb.pic.i_fref[0]-1;
1200 h->mb.i_partition = D_8x8;
1202 /* early termination: if 16x16 chose ref 0, then evaluate no refs older
1203 * than those used by the neighbors */
1204 if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 &&
1205 h->mb.i_mb_type_top && h->mb.i_mb_type_left )
/* NOTE(review): as visible, these X264_MAX updates can only raise i_maxref
 * from its initial value; upstream resets i_maxref to 0 first inside this
 * branch — confirm that reset is present before relying on this cap */
1208 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] );
1209 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] );
1210 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] );
1211 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] );
1212 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] );
1213 i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] );
/* seed each reference's MV candidate list with its stored 16x16 MV */
1216 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1217 *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
1219 for( i = 0; i < 4; i++ )
1221 x264_me_t *l0m = &a->l0.me8x8[i];
1225 m.i_pixel = PIXEL_8x8;
1226 m.p_cost_mv = a->p_cost_mv;
1228 LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
1229 l0m->cost = INT_MAX;
1230 for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
1232 const int i_ref_cost = REF_COST( 0, i_ref );
1233 i_halfpel_thresh -= i_ref_cost;
1234 m.i_ref_cost = i_ref_cost;
1237 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
/* cache the candidate ref so MV prediction sees it */
1238 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
1239 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1240 x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
1242 m.cost += i_ref_cost;
1243 i_halfpel_thresh += i_ref_cost;
/* store this partition's MV as a candidate for later partitions */
1244 *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
1246 if( m.cost < l0m->cost )
1247 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
/* commit the winner's MV and ref for neighbor prediction */
1249 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
1250 x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
1253 l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1256 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1257 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1258 /* P_8x8 ref0 has no ref cost */
1259 if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
1260 a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
1261 a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
1262 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1263 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 8x8 analysis with a single reference (the one chosen by 16x16):
 * searches the four 8x8 partitions against that ref only, seeding each
 * search with the 16x16 MV and previously found 8x8 MVs. */
1266 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
1268 const int i_ref = a->l0.me16x16.i_ref;
/* in CAVLC, ref 0 on P_8x8 costs no bits, so its cost is zeroed here */
1269 const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
1270 uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
1271 uint8_t **p_fenc = h->mb.pic.p_fenc;
1273 int16_t (*mvc)[2] = a->l0.mvc[i_ref];
1276 /* XXX Needed for x264_mb_predict_mv */
1277 h->mb.i_partition = D_8x8;
/* candidate 0 is the 16x16 MV; each searched partition appends its MV */
1280 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
1282 for( i = 0; i < 4; i++ )
1284 x264_me_t *m = &a->l0.me8x8[i];
1288 m->i_pixel = PIXEL_8x8;
1289 m->p_cost_mv = a->p_cost_mv;
1290 m->i_ref_cost = i_ref_cost;
1293 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
1294 LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 );
1295 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1296 x264_me_search( h, m, mvc, i_mvc );
1298 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
1300 *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
1304 m->cost += i_ref_cost;
1305 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
1308 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
1309 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
1310 /* theoretically this should include 4*ref_cost,
1311 * but 3 seems a better approximation of cabac. */
1312 if( h->param.b_cabac )
1313 a->l0.i_cost8x8 -= i_ref_cost;
1314 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
1315 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
/* P 16x8 analysis: for each of the two 16x8 partitions, try the reference(s)
 * already chosen by the two underlying 8x8 blocks (1 or 2 candidates),
 * seeding the search with stored 16x16 and 8x8 MVs. */
1318 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
1321 uint8_t **p_fenc = h->mb.pic.p_fenc;
1322 DECLARE_ALIGNED_4( int16_t mvc[3][2] );
1325 /* XXX Needed for x264_mb_predict_mv */
1326 h->mb.i_partition = D_16x8;
1328 for( i = 0; i < 2; i++ )
1330 x264_me_t *l0m = &a->l0.me16x8[i];
/* refs used by the two 8x8 halves of this 16x8 partition */
1331 const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref };
1332 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1334 m.i_pixel = PIXEL_16x8;
1335 m.p_cost_mv = a->p_cost_mv;
1337 LOAD_FENC( &m, p_fenc, 0, 8*i );
1338 l0m->cost = INT_MAX;
1339 for( j = 0; j < i_ref8s; j++ )
1341 const int i_ref = ref8[j];
1342 const int i_ref_cost = REF_COST( 0, i_ref );
1343 m.i_ref_cost = i_ref_cost;
1346 /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
1347 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1348 *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
1349 *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
1351 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
1352 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
1353 x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
1354 x264_me_search( h, &m, mvc, 3 );
1356 m.cost += i_ref_cost;
1358 if( m.cost < l0m->cost )
1359 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
/* commit the winner for neighbor MV/ref prediction */
1361 x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
1362 x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
1365 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P 8x16 analysis: mirror of the 16x8 case — each 8x16 partition tries the
 * reference(s) of its two vertically-stacked 8x8 blocks. */
1368 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
1371 uint8_t **p_fenc = h->mb.pic.p_fenc;
1372 DECLARE_ALIGNED_4( int16_t mvc[3][2] );
1375 /* XXX Needed for x264_mb_predict_mv */
1376 h->mb.i_partition = D_8x16;
1378 for( i = 0; i < 2; i++ )
1380 x264_me_t *l0m = &a->l0.me8x16[i];
/* refs used by the top and bottom 8x8 of this 8x16 partition */
1381 const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref };
1382 const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;
1384 m.i_pixel = PIXEL_8x16;
1385 m.p_cost_mv = a->p_cost_mv;
1387 LOAD_FENC( &m, p_fenc, 8*i, 0 );
1388 l0m->cost = INT_MAX;
1389 for( j = 0; j < i_ref8s; j++ )
1391 const int i_ref = ref8[j];
1392 const int i_ref_cost = REF_COST( 0, i_ref );
1393 m.i_ref_cost = i_ref_cost;
1396 *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
1397 *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
1398 *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
1400 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
1401 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
1402 x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
1403 x264_me_search( h, &m, mvc, 3 );
1405 m.cost += i_ref_cost;
1407 if( m.cost < l0m->cost )
1408 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
/* commit the winner for neighbor MV/ref prediction */
1410 x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
1411 x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
1414 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
/* Chroma cost for sub-8x8 luma partitions: motion-compensate the chroma
 * sub-blocks of 8x8 block i8x8 (using the already-found luma MVs for the
 * given sub-partition shape), then return the U+V comparison cost against
 * the encoded frame.  pix1 holds U, pix2 (pix1+8) holds V, interleaved in a
 * 16-wide scratch buffer. */
1417 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
1419 DECLARE_ALIGNED_8( uint8_t pix1[16*8] );
1420 uint8_t *pix2 = pix1+8;
1421 const int i_stride = h->mb.pic.i_stride[1];
/* chroma offsets of this 8x8 block in the reference (or) and fenc (oe);
 * 4*(i8x8&1) selects left/right, 2*(i8x8&2)*stride selects top/bottom */
1422 const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
1423 const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
/* MC one chroma sub-block of the given size from both chroma planes */
1425 #define CHROMA4x4MC( width, height, me, x, y ) \
1426 h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
1427 h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );
1429 if( pixel == PIXEL_4x4 )
1431 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
1432 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 );
1433 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 );
1434 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
1436 else if( pixel == PIXEL_8x4 )
1438 CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
1439 CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
/* remaining case: PIXEL_4x8 */
1443 CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
1444 CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
1447 return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
1448 + h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
/* Search the four 4x4 sub-partitions of 8x8 block i8x8 against the ref
 * chosen for that 8x8 block; total cost (plus ref and mode bits, plus
 * optional chroma cost) goes to a->l0.i_cost4x4[i8x8]. */
1451 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1453 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1454 uint8_t **p_fenc = h->mb.pic.p_fenc;
1455 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1458 /* XXX Needed for x264_mb_predict_mv */
1459 h->mb.i_partition = D_8x8;
1461 for( i4x4 = 0; i4x4 < 4; i4x4++ )
1463 const int idx = 4*i8x8 + i4x4;
1464 const int x4 = block_idx_x[idx];
1465 const int y4 = block_idx_y[idx];
/* only the first sub-block seeds from the 8x8 MV; later ones rely on mvp */
1466 const int i_mvc = (i4x4 == 0);
1468 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
1470 m->i_pixel = PIXEL_4x4;
1471 m->p_cost_mv = a->p_cost_mv;
1473 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1474 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1476 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1477 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
1479 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
1481 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
1482 a->l0.me4x4[i8x8][1].cost +
1483 a->l0.me4x4[i8x8][2].cost +
1484 a->l0.me4x4[i8x8][3].cost +
1485 REF_COST( 0, i_ref ) +
1486 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
1487 if( h->mb.b_chroma_me )
1488 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* Search the two 8x4 sub-partitions of 8x8 block i8x8 (same ref as the 8x8
 * block), seeding from the 4x4 results; total goes to a->l0.i_cost8x4. */
1491 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1493 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1494 uint8_t **p_fenc = h->mb.pic.p_fenc;
1495 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1498 /* XXX Needed for x264_mb_predict_mv */
1499 h->mb.i_partition = D_8x8;
1501 for( i8x4 = 0; i8x4 < 2; i8x4++ )
1503 const int idx = 4*i8x8 + 2*i8x4;
1504 const int x4 = block_idx_x[idx];
1505 const int y4 = block_idx_y[idx];
1506 const int i_mvc = (i8x4 == 0);
1508 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
1510 m->i_pixel = PIXEL_8x4;
1511 m->p_cost_mv = a->p_cost_mv;
1513 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1514 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1516 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
1517 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1519 x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
1521 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
1522 REF_COST( 0, i_ref ) +
1523 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
1524 if( h->mb.b_chroma_me )
1525 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* Search the two 4x8 sub-partitions of 8x8 block i8x8 (same ref as the 8x8
 * block), seeding from the 4x4 results; total goes to a->l0.i_cost4x8. */
1528 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
1530 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
1531 uint8_t **p_fenc = h->mb.pic.p_fenc;
1532 const int i_ref = a->l0.me8x8[i8x8].i_ref;
1535 /* XXX Needed for x264_mb_predict_mv */
1536 h->mb.i_partition = D_8x8;
1538 for( i4x8 = 0; i4x8 < 2; i4x8++ )
1540 const int idx = 4*i8x8 + i4x8;
1541 const int x4 = block_idx_x[idx];
1542 const int y4 = block_idx_y[idx];
1543 const int i_mvc = (i4x8 == 0);
1545 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
1547 m->i_pixel = PIXEL_4x8;
1548 m->p_cost_mv = a->p_cost_mv;
1550 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
1551 LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
1553 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
1554 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
1556 x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
1558 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
1559 REF_COST( 0, i_ref ) +
1560 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
1561 if( h->mb.b_chroma_me )
1562 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* Cost B_DIRECT by comparing fenc against the direct-mode prediction that a
 * prior x264_mb_mc left in fdec; fills a->i_cost16x16direct and the four
 * per-8x8 direct costs (each with its sub-partition mode bits). */
1565 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
1567 /* Assumes that fdec still contains the results of
1568 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
1570 uint8_t **p_fenc = h->mb.pic.p_fenc;
1571 uint8_t **p_fdec = h->mb.pic.p_fdec;
1574 a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
1575 for( i = 0; i < 4; i++ )
1577 const int x = (i&1)*8;
1578 const int y = (i>>1)*8;
/* 16x16 direct cost is the sum of the four raw 8x8 comparisons */
1579 a->i_cost16x16direct +=
1580 a->i_cost8x8direct[i] =
1581 h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE );
/* mode bits are added only to the per-8x8 costs, not the 16x16 total */
1584 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
/* Bipred-weighted average of two prediction sources into 'pix', using the
 * weight table entry for the currently selected L0/L1 references. */
1588 #define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
1590 h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
/* B-macroblock 16x16 analysis: search all list-0 refs, then all list-1
 * refs, then build the BI prediction from the two winners and cost it.
 * Results: a->l0.me16x16, a->l1.me16x16 (ref costs subtracted back out so
 * other modes can add their own), and a->i_cost16x16bi. */
1593 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
1595 DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
1596 DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
1597 uint8_t *src0, *src1;
1598 int stride0 = 16, stride1 = 16;
1602 DECLARE_ALIGNED_4( int16_t mvc[9][2] );
1603 int i_halfpel_thresh = INT_MAX;
1604 int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
1606 /* 16x16 Search on all ref frame */
1607 m.i_pixel = PIXEL_16x16;
1608 m.p_cost_mv = a->p_cost_mv;
1609 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
/* --- list 0 search --- */
1612 a->l0.me16x16.cost = INT_MAX;
1613 for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
1615 /* search with ref */
1616 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
1617 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
1618 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
1619 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1622 m.cost += REF_COST( 0, i_ref );
1624 if( m.cost < a->l0.me16x16.cost )
1626 a->l0.i_ref = i_ref;
1627 h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
1630 /* save mv for predicting neighbors */
1631 *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1633 /* subtract ref cost, so we don't have to add it for the other MB types */
1634 a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
/* --- list 1 search (same machinery, fresh halfpel threshold) --- */
1637 i_halfpel_thresh = INT_MAX;
1638 p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL;
1639 a->l1.me16x16.cost = INT_MAX;
1640 for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ )
1642 /* search with ref */
1643 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 );
1644 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
1645 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
1646 x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
1649 m.cost += REF_COST( 1, i_ref );
1651 if( m.cost < a->l1.me16x16.cost )
1653 a->l1.i_ref = i_ref;
1654 h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
1657 /* save mv for predicting neighbors */
1658 *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
1660 /* subtract ref cost, so we don't have to add it for the other MB types */
1661 a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
1663 /* Set global ref, needed for other modes? */
1664 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
1665 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
1667 /* get cost of BI mode */
1668 src0 = h->mc.get_ref( pix0, &stride0,
1669 h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
1670 a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
1671 src1 = h->mc.get_ref( pix1, &stride1,
1672 h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1673 a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
/* weighted average of the two predictions, result in pix0 */
1675 h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1677 a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
1678 + REF_COST( 0, a->l0.i_ref )
1679 + REF_COST( 1, a->l1.i_ref )
1680 + a->l0.me16x16.cost_mv
1681 + a->l1.me16x16.cost_mv;
/* add macroblock-type bits for each candidate mode */
1684 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1685 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1686 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* Write the MVs of 8x8 block i into the MB cache according to its chosen
 * sub-partition shape.  (The switch's case labels are not visible in this
 * view; each group of cache calls corresponds to one D_L0_* sub-mode.) */
1689 static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
1691 const int x = 2*(i%2);
1692 const int y = 2*(i/2);
1694 switch( h->mb.i_sub_partition[i] )
/* D_L0_8x8: one MV covers the whole 8x8 */
1697 x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
/* D_L0_8x4: one MV per 8x4 half */
1700 x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
1701 x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
/* D_L0_4x8: one MV per 4x8 half */
1704 x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
1705 x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
/* D_L0_4x4: one MV per 4x4 quarter */
1708 x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
1709 x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
1710 x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
1711 x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
/* default: unreachable sub-partition value */
1714 x264_log( h, X264_LOG_ERROR, "internal error\n" );
/* CACHE_MV_BI( x, y, dx, dy, me0, me1, part ):
 * Write MVs and refs for a B partition into the MB cache.  For each list
 * used by 'part' (per x264_mb_partition_listX_table), store that list's
 * ref and MV; for an unused list, store ref -1 and a zero MV (and, in the
 * b_mvd path, a zero MVD). */
1719 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1720 if( x264_mb_partition_listX_table[0][part] ) \
1722 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1723 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
1727 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1728 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
1730 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
1732 if( x264_mb_partition_listX_table[1][part] ) \
1734 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1735 x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
1739 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1740 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
1742 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
/* Cache MVs/refs for B 8x8 block i: direct sub-blocks load the direct MVs
 * (and clear MVDs / set skip when b_mvd); otherwise defer to CACHE_MV_BI
 * with the block's chosen sub-partition. */
1745 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1749 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1751 x264_mb_load_mv_direct8x8( h, i );
/* b_mvd path (guard not visible here): direct blocks carry no MVD */
1754 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
1755 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
1756 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1761 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Cache MVs/refs for B 16x8 partition i via CACHE_MV_BI. */
1764 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1766 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Cache MVs/refs for B 8x16 partition i via CACHE_MV_BI. */
1768 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1770 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B 8x8 analysis: for each 8x8 block, search L0 and L1 (using the 16x16
 * refs chosen earlier), build the BI prediction, then pick the cheapest of
 * L0 / L1 / BI / DIRECT per block.  Total goes to a->i_cost8x8bi. */
1774 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1776 uint8_t **p_fref[2] =
1777 { h->mb.pic.p_fref[0][a->l0.i_ref],
1778 h->mb.pic.p_fref[1][a->l1.i_ref] };
1779 DECLARE_ALIGNED_8( uint8_t pix[2][8*8] );
1782 /* XXX Needed for x264_mb_predict_mv */
1783 h->mb.i_partition = D_8x8;
1787 for( i = 0; i < 4; i++ )
1792 int i_part_cost_bi = 0;
1793 int stride[2] = {8,8};
1796 for( l = 0; l < 2; l++ )
1798 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1799 x264_me_t *m = &lX->me8x8[i];
1801 m->i_pixel = PIXEL_8x8;
1802 m->p_cost_mv = a->p_cost_mv;
1804 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1805 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 );
/* seed each list's search with that list's 16x16 MV */
1807 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1808 x264_me_search( h, m, &lX->me16x16.mv, 1 );
1810 x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
/* fetch this list's prediction for the BI average */
1813 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1814 m->mv[0], m->mv[1], 8, 8 );
1815 i_part_cost_bi += m->cost_mv;
1816 /* FIXME: ref cost */
1818 h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1819 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
1820 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1821 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1822 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
/* pick the cheapest sub-mode for this 8x8 block */
1824 i_part_cost = a->l0.me8x8[i].cost;
1825 h->mb.i_sub_partition[i] = D_L0_8x8;
1826 COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
1827 COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
1828 COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
1829 a->i_cost8x8bi += i_part_cost;
1831 /* XXX Needed for x264_mb_predict_mv */
1832 x264_mb_cache_mv_b8x8( h, a, i, 0 );
/* macroblock-type bits for B_8x8 */
1836 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B 16x8 analysis: for each of the two 16x8 partitions, search L0 and L1
 * (seeded with the corresponding 8x8 MVs), build the BI prediction, and
 * choose L0 / L1 / BI per partition.  Also derives a->i_mb_type16x8 from
 * the per-partition choices and adds its mode bits. */
1839 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1841 uint8_t **p_fref[2] =
1842 { h->mb.pic.p_fref[0][a->l0.i_ref],
1843 h->mb.pic.p_fref[1][a->l1.i_ref] };
1844 DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
1845 DECLARE_ALIGNED_4( int16_t mvc[2][2] );
1848 h->mb.i_partition = D_16x8;
1849 a->i_cost16x8bi = 0;
1851 for( i = 0; i < 2; i++ )
1854 int i_part_cost_bi = 0;
1855 int stride[2] = {16,16};
1858 /* TODO: check only the list(s) that were used in b8x8? */
1859 for( l = 0; l < 2; l++ )
1861 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1862 x264_me_t *m = &lX->me16x8[i];
1864 m->i_pixel = PIXEL_16x8;
1865 m->p_cost_mv = a->p_cost_mv;
1867 LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1868 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
/* candidates: the two 8x8 MVs underlying this 16x8 partition */
1870 *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
1871 *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
1873 x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
1874 x264_me_search( h, m, mvc, 2 );
1877 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1878 m->mv[0], m->mv[1], 16, 8 );
1879 /* FIXME: ref cost */
1880 i_part_cost_bi += m->cost_mv;
1882 h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1883 i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
1885 i_part_cost = a->l0.me16x8[i].cost;
1886 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1887 if( a->l1.me16x8[i].cost < i_part_cost )
1889 i_part_cost = a->l1.me16x8[i].cost;
1890 a->i_mb_partition16x8[i] = D_L1_8x8;
/* slight bias against BI to account for its extra signalling */
1892 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1894 i_part_cost = i_part_cost_bi;
1895 a->i_mb_partition16x8[i] = D_BI_8x8;
1897 a->i_cost16x8bi += i_part_cost;
1899 x264_mb_cache_mv_b16x8( h, a, i, 0 );
/* map the two per-partition list choices to a B_* macroblock type */
1903 a->i_mb_type16x8 = B_L0_L0
1904 + (a->i_mb_partition16x8[0]>>2) * 3
1905 + (a->i_mb_partition16x8[1]>>2);
1906 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
/* B 8x16 analysis: mirror of the 16x8 case, with vertical partitions and
 * candidates taken from the vertically-stacked 8x8 MVs. */
1909 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1911 uint8_t **p_fref[2] =
1912 { h->mb.pic.p_fref[0][a->l0.i_ref],
1913 h->mb.pic.p_fref[1][a->l1.i_ref] };
1914 DECLARE_ALIGNED_8( uint8_t pix[2][8*16] );
1915 DECLARE_ALIGNED_4( int16_t mvc[2][2] );
1918 h->mb.i_partition = D_8x16;
1919 a->i_cost8x16bi = 0;
1921 for( i = 0; i < 2; i++ )
1924 int i_part_cost_bi = 0;
1925 int stride[2] = {8,8};
1928 for( l = 0; l < 2; l++ )
1930 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1931 x264_me_t *m = &lX->me8x16[i];
1933 m->i_pixel = PIXEL_8x16;
1934 m->p_cost_mv = a->p_cost_mv;
1936 LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1937 LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
/* candidates: the two 8x8 MVs underlying this 8x16 partition */
1939 *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
1940 *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
1942 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1943 x264_me_search( h, m, mvc, 2 );
1946 src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
1947 m->mv[0], m->mv[1], 8, 16 );
1948 /* FIXME: ref cost */
1949 i_part_cost_bi += m->cost_mv;
1952 h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
1953 i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
1955 i_part_cost = a->l0.me8x16[i].cost;
1956 a->i_mb_partition8x16[i] = D_L0_8x8;
1957 if( a->l1.me8x16[i].cost < i_part_cost )
1959 i_part_cost = a->l1.me8x16[i].cost;
1960 a->i_mb_partition8x16[i] = D_L1_8x8;
/* slight bias against BI to account for its extra signalling */
1962 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1964 i_part_cost = i_part_cost_bi;
1965 a->i_mb_partition8x16[i] = D_BI_8x8;
1967 a->i_cost8x16bi += i_part_cost;
1969 x264_mb_cache_mv_b8x16( h, a, i, 0 );
/* map the two per-partition list choices to a B_* macroblock type */
1973 a->i_mb_type8x16 = B_L0_L0
1974 + (a->i_mb_partition8x16[0]>>2) * 3
1975 + (a->i_mb_partition8x16[1]>>2);
1976 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
/* P-frame RD refinement: re-score each candidate partition mode with full
 * RD (x264_rd_cost_mb) if its SATD cost is within 25% of the best, replacing
 * the SATD costs in-place; modes outside the threshold are disqualified
 * with COST_MAX.  For P_8x8 with sub-8x8 enabled, also RD-selects each
 * block's sub-partition shape. */
1979 static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
1981 int thresh = i_satd * 5/4;
1983 h->mb.i_type = P_L0;
1984 if( a->l0.i_rd16x16 == COST_MAX && a->l0.me16x16.cost <= i_satd * 3/2 )
1986 h->mb.i_partition = D_16x16;
1987 x264_analyse_update_cache( h, a );
1988 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
1990 a->l0.me16x16.cost = a->l0.i_rd16x16;
1992 if( a->l0.i_cost16x8 <= thresh )
1994 h->mb.i_partition = D_16x8;
1995 x264_analyse_update_cache( h, a );
1996 a->l0.i_cost16x8 = x264_rd_cost_mb( h, a->i_lambda2 );
/* else branch (keyword elided in this view): disqualify the mode */
1999 a->l0.i_cost16x8 = COST_MAX;
2001 if( a->l0.i_cost8x16 <= thresh )
2003 h->mb.i_partition = D_8x16;
2004 x264_analyse_update_cache( h, a );
2005 a->l0.i_cost8x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2008 a->l0.i_cost8x16 = COST_MAX;
2010 if( a->l0.i_cost8x8 <= thresh )
2012 h->mb.i_type = P_8x8;
2013 h->mb.i_partition = D_8x8;
2014 if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
2017 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2018 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2019 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2020 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2021 /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection
2022 * for future blocks are those left over from previous RDO calls. */
2023 for( i = 0; i < 4; i++ )
2025 int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
/* note: this 'thresh' intentionally shadows the function-level one —
 * it is a per-8x8 threshold over the four sub-mode SATD costs */
2026 int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4;
2027 int subtype, btype = D_L0_8x8;
2028 uint64_t bcost = COST_MAX64;
2029 for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
/* always evaluate D_L0_8x8 as a fallback if nothing else qualified
 * (the 'continue' guard structure is partly elided in this view) */
2032 if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) )
2034 h->mb.i_sub_partition[i] = subtype;
2035 x264_mb_cache_mv_p8x8( h, a, i );
2036 cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
2037 COPY2_IF_LT( bcost, cost, btype, subtype );
/* restore the RD-best sub-partition's MVs into the cache */
2039 h->mb.i_sub_partition[i] = btype;
2040 x264_mb_cache_mv_p8x8( h, a, i );
2044 x264_analyse_update_cache( h, a );
2045 a->l0.i_cost8x8 = x264_rd_cost_mb( h, a->i_lambda2 );
2048 a->l0.i_cost8x8 = COST_MAX;
/* B-frame RD refinement: RD-score each candidate B mode whose SATD cost is
 * within the threshold and whose RD cost hasn't been computed yet
 * (== COST_MAX).  The threshold is slightly looser when psy-RD is on. */
2051 static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
2053 int thresh = i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16;
2055 if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
2057 h->mb.i_type = B_DIRECT;
2058 /* Assumes direct/skip MC is still in fdec */
2059 /* Requires b-rdo to be done before intra analysis */
2060 h->mb.b_skip_mc = 1;
2061 x264_analyse_update_cache( h, a );
2062 a->i_rd16x16direct = x264_rd_cost_mb( h, a->i_lambda2 );
2063 h->mb.b_skip_mc = 0;
2066 //FIXME not all the update_cache calls are needed
2067 h->mb.i_partition = D_16x16;
/* 16x16 candidates: L0, L1, BI */
2069 if( a->l0.me16x16.cost <= thresh && a->l0.i_rd16x16 == COST_MAX )
2071 h->mb.i_type = B_L0_L0;
2072 x264_analyse_update_cache( h, a );
2073 a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2077 if( a->l1.me16x16.cost <= thresh && a->l1.i_rd16x16 == COST_MAX )
2079 h->mb.i_type = B_L1_L1;
2080 x264_analyse_update_cache( h, a );
2081 a->l1.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
2085 if( a->i_cost16x16bi <= thresh && a->i_rd16x16bi == COST_MAX )
2087 h->mb.i_type = B_BI_BI;
2088 x264_analyse_update_cache( h, a );
2089 a->i_rd16x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* partitioned candidates: 8x8, 16x8, 8x16 */
2093 if( a->i_cost8x8bi <= thresh && a->i_rd8x8bi == COST_MAX )
2095 h->mb.i_type = B_8x8;
2096 h->mb.i_partition = D_8x8;
2097 x264_analyse_update_cache( h, a );
2098 a->i_rd8x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* clear skip flags that direct sub-blocks may have set */
2099 x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
2103 if( a->i_cost16x8bi <= thresh && a->i_rd16x8bi == COST_MAX )
2105 h->mb.i_type = a->i_mb_type16x8;
2106 h->mb.i_partition = D_16x8;
2107 x264_analyse_update_cache( h, a );
2108 a->i_rd16x8bi = x264_rd_cost_mb( h, a->i_lambda2 );
2112 if( a->i_cost8x16bi <= thresh && a->i_rd8x16bi == COST_MAX )
2114 h->mb.i_type = a->i_mb_type8x16;
2115 h->mb.i_partition = D_8x16;
2116 x264_analyse_update_cache( h, a );
2117 a->i_rd8x16bi = x264_rd_cost_mb( h, a->i_lambda2 );
/* After the B mode decision, jointly refine the L0/L1 MV pair of every
 * partition that ended up bipredicted (SATD-based bidir refinement).
 * No-op for intra macroblocks. */
2121 static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a )
2123 const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
2126 if( IS_INTRA(h->mb.i_type) )
2129 switch( h->mb.i_partition )
/* D_16x16 (case label elided in this view) */
2132 if( h->mb.i_type == B_BI_BI )
2133 x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight );
/* D_16x8 */
2136 for( i=0; i<2; i++ )
2137 if( a->i_mb_partition16x8[i] == D_BI_8x8 )
2138 x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
/* D_8x16 */
2141 for( i=0; i<2; i++ )
2142 if( a->i_mb_partition8x16[i] == D_BI_8x8 )
2143 x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
/* D_8x8 */
2146 for( i=0; i<4; i++ )
2147 if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2148 x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
/* Choose between 4x4 and 8x8 transform for the current inter MB by
 * comparing SA8D (8x8 transform proxy) vs SATD (4x4 proxy) of the motion-
 * compensated prediction; sets h->mb.b_transform_8x8 and marks MC as done
 * so macroblock_encode can reuse it. */
2153 static inline void x264_mb_analyse_transform( x264_t *h )
2155 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
2157 int i_cost4, i_cost8;
2158 /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. */
2161 i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2162 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2163 i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
2164 h->mb.pic.p_fdec[0], FDEC_STRIDE );
2166 h->mb.b_transform_8x8 = i_cost8 < i_cost4;
2167 h->mb.b_skip_mc = 1;
/* RD-based transform-size decision: flip b_transform_8x8, RD-score the MB,
 * and keep the flip if it wins (rescaling *i_satd proportionally so later
 * comparisons stay consistent); otherwise flip back.  The assignment of
 * *i_rd on the winning path is not visible in this view — confirm. */
2171 static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
2173 if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 )
2176 x264_analyse_update_cache( h, a );
2177 h->mb.b_transform_8x8 ^= 1;
2178 /* FIXME only luma is needed, but the score for comparison already includes chroma */
2179 i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 );
2181 if( *i_rd >= i_rd8 )
/* scale the SATD estimate by the observed RD improvement ratio */
2184 *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
/* losing flip: restore the original transform size */
2188 h->mb.b_transform_8x8 ^= 1;
2192 /* Rate-distortion optimal QP selection.
2193 * FIXME: More than half of the benefit of this function seems to be
2194 * in the way it improves the coding of chroma DC (by decimating or
2195 * finding a better way to code a single DC coefficient.)
2196 * There must be a more efficient way to get that portion of the benefit
2197 * without doing full QP-RD, but RD-decimation doesn't seem to do the
/* Full QP-RD: hill-climb the macroblock QP in both directions from the
 * ratecontrol-chosen value, keeping the QP with the lowest RD cost, with a
 * bounded tolerance for non-monotonic cost curves.  Leaves h->mb.i_qp /
 * i_chroma_qp set to the winner (bookkeeping of `failures` and the final
 * bqp assignment occur on lines elided from this excerpt). */
2199 static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
2201 int bcost, cost, direction, failures, prevcost, origcost;
2202 int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
2203 origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 );
2205 /* If CBP is already zero, don't raise the quantizer any higher. */
2206 for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 )
/* restart the search from the original QP for each direction */
2208 h->mb.i_qp = orig_qp;
2210 prevcost = origcost;
/* stay within the legal H.264 QP range [0,51] */
2211 while( h->mb.i_qp > 0 && h->mb.i_qp < 51 )
2213 h->mb.i_qp += direction;
2214 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2215 cost = x264_rd_cost_mb( h, a->i_lambda2 );
2216 COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
2218 /* We can't assume that the costs are monotonic over QPs.
2219 * Tie case-as-failure seems to give better results. */
2220 if( cost < prevcost )
2226 /* Without psy-RD, require monotonicity when lowering
2227 * quant, allow 1 failure when raising quant.
2228 * With psy-RD, allow 1 failure when lowering quant,
2229 * allow 2 failures when raising quant.
2230 * Psy-RD generally seems to result in more chaotic
2231 * RD score-vs-quantizer curves. */
2232 if( failures > ((direction + 1)>>1)+(!!h->mb.i_psy_rd) )
/* raising quant on an all-zero-CBP MB can only waste bits */
2234 if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
2240 h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
2242 /* Check transform again; decision from before may no longer be optimal. */
2243 if( h->mb.i_qp != orig_qp && x264_mb_transform_8x8_allowed( h ) &&
2244 h->param.analyse.b_transform_8x8 )
2246 h->mb.b_transform_8x8 ^= 1;
2247 cost = x264_rd_cost_mb( h, a->i_lambda2 );
/* flip back; the cost comparison that decides whether to keep the flip is
 * on a line elided from this excerpt */
2249 h->mb.b_transform_8x8 ^= 1;
2253 /*****************************************************************************
2254 * x264_macroblock_analyse:
2255 *****************************************************************************/
/* Top-level mode decision for one macroblock: picks QP, runs intra and/or
 * inter analysis according to the slice type, optionally refines with
 * subpel/RD passes, and leaves the final mode in h->mb.i_type /
 * h->mb.i_partition plus the MB cache. */
2256 void x264_macroblock_analyse( x264_t *h )
2258 x264_mb_analysis_t analysis;
2259 int i_cost = COST_MAX;
/* ---- QP selection ---- */
2262 h->mb.i_qp = x264_ratecontrol_qp( h );
2263 if( h->param.rc.i_aq_mode )
2265 x264_adaptive_quant( h );
2266 /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
2267 * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
2268 if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 )
2269 h->mb.i_qp = h->mb.i_last_qp;
2272 x264_mb_analyse_init( h, &analysis, h->mb.i_qp )
2274 /*--------------------------- Do the analysis ---------------------------*/
/* ---- I slice: intra-only decision among I_16x16 / I_4x4 / I_8x8 / I_PCM ---- */
2275 if( h->sh.i_type == SLICE_TYPE_I )
2277 if( analysis.i_mbrd )
2278 x264_mb_cache_fenc_satd( h );
2279 x264_mb_analyse_intra( h, &analysis, COST_MAX );
2280 if( analysis.i_mbrd )
2281 x264_intra_rd( h, &analysis, COST_MAX );
2283 i_cost = analysis.i_satd_i16x16;
2284 h->mb.i_type = I_16x16;
2285 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
2286 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
2287 if( analysis.i_satd_pcm < i_cost )
2288 h->mb.i_type = I_PCM;
2290 else if( analysis.i_mbrd >= 2 )
2291 x264_intra_rd_refine( h, &analysis );
/* ---- P slice ---- */
2293 else if( h->sh.i_type == SLICE_TYPE_P )
2297 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );
2299 /* Fast P_SKIP detection */
2300 analysis.b_try_pskip = 0;
2301 if( h->param.analyse.b_fast_pskip )
/* with sliced threads a pskip MV pointing past the completed rows of the
 * reference can't be used */
2303 if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] )
2304 // FIXME don't need to check this if the reference frame is done
2306 else if( h->param.analyse.i_subpel_refine >= 3 )
2307 analysis.b_try_pskip = 1;
/* probe skip early only when a neighbor already skipped */
2308 else if( h->mb.i_mb_type_left == P_SKIP ||
2309 h->mb.i_mb_type_top == P_SKIP ||
2310 h->mb.i_mb_type_topleft == P_SKIP ||
2311 h->mb.i_mb_type_topright == P_SKIP )
2312 b_skip = x264_macroblock_probe_pskip( h );
2315 h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );
/* skip confirmed: commit P_SKIP and bypass analysis */
2319 h->mb.i_type = P_SKIP;
2320 h->mb.i_partition = D_16x16;
2321 assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 );
2325 const unsigned int flags = h->param.analyse.inter;
2329 int i_satd_inter, i_satd_intra;
2331 x264_mb_analyse_load_costs( h, &analysis );
2333 x264_mb_analyse_inter_p16x16( h, &analysis );
/* 16x16 search may itself have detected a skip */
2335 if( h->mb.i_type == P_SKIP )
2338 if( flags & X264_ANALYSE_PSUB16x16 )
2340 if( h->param.analyse.b_mixed_references )
2341 x264_mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
2343 x264_mb_analyse_inter_p8x8( h, &analysis );
2346 /* Select best inter mode */
2348 i_partition = D_16x16;
2349 i_cost = analysis.l0.me16x16.cost;
2351 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2352 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
2355 i_partition = D_8x8;
2356 i_cost = analysis.l0.i_cost8x8;
/* refine each 8x8 down to 8x4/4x8/4x4 where it pays off */
2359 if( flags & X264_ANALYSE_PSUB8x8 )
2361 for( i = 0; i < 4; i++ )
2363 x264_mb_analyse_inter_p4x4( h, &analysis, i );
2364 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
2366 int i_cost8x8 = analysis.l0.i_cost4x4[i];
2367 h->mb.i_sub_partition[i] = D_L0_4x4;
2369 x264_mb_analyse_inter_p8x4( h, &analysis, i );
2370 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
2371 h->mb.i_sub_partition[i], D_L0_8x4 );
2373 x264_mb_analyse_inter_p4x8( h, &analysis, i );
2374 COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
2375 h->mb.i_sub_partition[i], D_L0_4x8 );
2377 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
2379 x264_mb_cache_mv_p8x8( h, &analysis, i );
2381 analysis.l0.i_cost8x8 = i_cost;
2385 /* Now do 16x8/8x16 */
2386 i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
2387 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
2388 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8 )
2390 x264_mb_analyse_inter_p16x8( h, &analysis );
2391 COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );
2393 x264_mb_analyse_inter_p8x16( h, &analysis );
2394 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
2397 h->mb.i_partition = i_partition;
2400 //FIXME mb_type costs?
/* ---- quarter-pel refinement of the winning P partitioning (skipped in
 * mbrd mode, which refines later under RD) ---- */
2401 if( analysis.i_mbrd )
2405 else if( i_partition == D_16x16 )
2407 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2408 i_cost = analysis.l0.me16x16.cost;
2410 else if( i_partition == D_16x8 )
2412 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
2413 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
2414 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
2416 else if( i_partition == D_8x16 )
2418 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
2419 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
2420 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
2422 else if( i_partition == D_8x8 )
2426 for( i8x8 = 0; i8x8 < 4; i8x8++ )
/* refine each sub-partition according to its chosen shape */
2428 switch( h->mb.i_sub_partition[i8x8] )
2431 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
2432 i_cost += analysis.l0.me8x8[i8x8].cost;
2435 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
2436 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
2437 i_cost += analysis.l0.me8x4[i8x8][0].cost +
2438 analysis.l0.me8x4[i8x8][1].cost;
2441 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
2442 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
2443 i_cost += analysis.l0.me4x8[i8x8][0].cost +
2444 analysis.l0.me4x8[i8x8][1].cost;
2448 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
2449 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
2450 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
2451 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
2452 i_cost += analysis.l0.me4x4[i8x8][0].cost +
2453 analysis.l0.me4x4[i8x8][1].cost +
2454 analysis.l0.me4x4[i8x8][2].cost +
2455 analysis.l0.me4x4[i8x8][3].cost;
2458 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
/* ---- intra analysis as a competitor to the best inter mode ---- */
2464 if( h->mb.b_chroma_me )
2466 x264_mb_analyse_intra_chroma( h, &analysis );
2467 x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
/* fold the chroma cost into each luma intra score for a fair comparison */
2468 analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
2469 analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
2470 analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
2473 x264_mb_analyse_intra( h, &analysis, i_cost );
2475 i_satd_inter = i_cost;
2476 i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
2477 analysis.i_satd_i8x8,
2478 analysis.i_satd_i4x4 );
/* ---- RD re-decision among the SATD-best candidates ---- */
2480 if( analysis.i_mbrd )
2482 x264_mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
2484 i_partition = D_16x16;
2485 i_cost = analysis.l0.me16x16.cost;
2486 COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
2487 COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
2488 COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
2489 h->mb.i_type = i_type;
2490 h->mb.i_partition = i_partition;
2491 if( i_cost < COST_MAX )
2492 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
/* give intra a chance only if it is within 25% of the inter RD cost */
2493 x264_intra_rd( h, &analysis, i_satd_inter * 5/4 );
2496 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2497 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2498 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2499 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2501 h->mb.i_type = i_type;
/* ---- mbrd>=2: RD-refine the final winner's motion vectors ---- */
2503 if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
2505 if( IS_INTRA( h->mb.i_type ) )
2507 x264_intra_rd_refine( h, &analysis );
2509 else if( i_partition == D_16x16 )
2511 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
2512 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2514 else if( i_partition == D_16x8 )
2516 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2517 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2518 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
2519 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
2520 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
2521 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
2523 else if( i_partition == D_8x16 )
2525 h->mb.i_sub_partition[0] = h->mb.i_sub_partition[1] =
2526 h->mb.i_sub_partition[2] = h->mb.i_sub_partition[3] = D_L0_8x8;
2527 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
2528 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
2529 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
2530 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
2532 else if( i_partition == D_8x8 )
2535 x264_analyse_update_cache( h, &analysis );
2536 for( i8x8 = 0; i8x8 < 4; i8x8++ )
2538 if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
2540 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
2542 else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
2544 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2545 x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
2547 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
2549 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2550 x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2552 else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
2554 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
2555 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
2556 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
2557 x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
/* ---- B slice ---- */
2564 else if( h->sh.i_type == SLICE_TYPE_B )
2566 int i_bskip_cost = COST_MAX;
2569 if( analysis.i_mbrd )
2570 x264_mb_cache_fenc_satd( h );
2572 h->mb.i_type = B_SKIP;
2573 if( h->mb.b_direct_auto_write )
2575 /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
2576 for( i = 0; i < 2; i++ )
2579 h->sh.b_direct_spatial_mv_pred ^= 1;
2580 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
2581 if( analysis.b_direct_available )
2586 b_skip = x264_macroblock_probe_bskip( h );
2588 h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
2595 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
2597 if( analysis.b_direct_available )
2599 if( !h->mb.b_direct_auto_write )
2601 if( analysis.i_mbrd )
/* RD skip check: compare the direct-prediction SSD against the minimum
 * cost of coding any non-skip MB */
2603 i_bskip_cost = ssd_mb( h );
2604 /* 6 = minimum cavlc cost of a non-skipped MB */
2605 b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
2607 else if( !h->mb.b_direct_auto_write )
2609 /* Conditioning the probe on neighboring block types
2610 * doesn't seem to help speed or quality. */
2611 b_skip = x264_macroblock_probe_bskip( h );
2617 const unsigned int flags = h->param.analyse.inter;
2620 int i_satd_inter = 0; // shut up uninitialized warning
2621 h->mb.b_skip_mc = 0;
2623 x264_mb_analyse_load_costs( h, &analysis );
2625 /* select best inter mode */
2626 /* direct must be first */
2627 if( analysis.b_direct_available )
2628 x264_mb_analyse_inter_direct( h, &analysis );
2630 x264_mb_analyse_inter_b16x16( h, &analysis );
2633 i_partition = D_16x16;
2634 i_cost = analysis.l0.me16x16.cost;
2635 COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
2636 COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
2637 COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );
/* early termination: if direct is nearly best and B_SKIP beats every
 * 16x16 RD cost, commit B_SKIP now */
2639 if( analysis.i_mbrd && analysis.i_cost16x16direct <= i_cost * 33/32 )
2641 x264_mb_analyse_b_rd( h, &analysis, i_cost );
2642 if( i_bskip_cost < analysis.i_rd16x16direct &&
2643 i_bskip_cost < analysis.i_rd16x16bi &&
2644 i_bskip_cost < analysis.l0.i_rd16x16 &&
2645 i_bskip_cost < analysis.l1.i_rd16x16 )
2647 h->mb.i_type = B_SKIP;
2648 x264_analyse_update_cache( h, &analysis );
2653 if( flags & X264_ANALYSE_BSUB16x16 )
2655 x264_mb_analyse_inter_b8x8( h, &analysis );
2656 if( analysis.i_cost8x8bi < i_cost )
2659 i_partition = D_8x8;
2660 i_cost = analysis.i_cost8x8bi;
/* try 16x8 only when the top/bottom halves look mergeable */
2662 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
2663 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
2665 x264_mb_analyse_inter_b16x8( h, &analysis );
2666 COPY3_IF_LT( i_cost, analysis.i_cost16x8bi,
2667 i_type, analysis.i_mb_type16x8,
2668 i_partition, D_16x8 );
/* likewise 8x16 when the left/right halves look mergeable */
2670 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
2671 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
2673 x264_mb_analyse_inter_b8x16( h, &analysis );
2674 COPY3_IF_LT( i_cost, analysis.i_cost8x16bi,
2675 i_type, analysis.i_mb_type8x16,
2676 i_partition, D_8x16 );
/* ---- quarter-pel refinement of the winning B partitioning ---- */
2681 if( analysis.i_mbrd )
2686 else if( i_partition == D_16x16 )
/* strip the mode-cost term so the refine works on pure ME cost, then
 * add it back for the mode that was actually refined */
2688 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2689 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2690 if( i_type == B_L0_L0 )
2692 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2693 i_cost = analysis.l0.me16x16.cost
2694 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
2696 else if( i_type == B_L1_L1 )
2698 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2699 i_cost = analysis.l1.me16x16.cost
2700 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
2702 else if( i_type == B_BI_BI )
2704 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
2705 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
2708 else if( i_partition == D_16x8 )
2710 for( i=0; i<2; i++ )
2712 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
2713 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
2714 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
2715 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
2718 else if( i_partition == D_8x16 )
2720 for( i=0; i<2; i++ )
2722 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
2723 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
2724 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
2725 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
2728 else if( i_partition == D_8x8 )
2730 for( i=0; i<4; i++ )
2733 int i_part_cost_old;
2735 int i_part_type = h->mb.i_sub_partition[i];
2736 int b_bidir = (i_part_type == D_BI_8x8);
/* direct sub-blocks have no MVs of their own to refine */
2738 if( i_part_type == D_DIRECT_8x8 )
2740 if( x264_mb_partition_listX_table[0][i_part_type] )
2742 m = &analysis.l0.me8x8[i];
2743 i_part_cost_old = m->cost;
2744 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
2745 m->cost -= i_type_cost;
2746 x264_me_refine_qpel( h, m );
2748 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2750 if( x264_mb_partition_listX_table[1][i_part_type] )
2752 m = &analysis.l1.me8x8[i];
2753 i_part_cost_old = m->cost;
2754 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
2755 m->cost -= i_type_cost;
2756 x264_me_refine_qpel( h, m );
2758 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
2760 /* TODO: update mvp? */
/* ---- B-slice RD re-decision, intra competition, final refinement ---- */
2764 if( analysis.i_mbrd )
2766 i_satd_inter = i_cost;
2767 x264_mb_analyse_b_rd( h, &analysis, i_satd_inter );
2769 i_cost = i_bskip_cost;
2770 i_partition = D_16x16;
2771 COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
2772 COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
2773 COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
2774 COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
2775 COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
2776 COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
2777 COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
2779 h->mb.i_type = i_type;
2780 h->mb.i_partition = i_partition;
2783 x264_mb_analyse_intra( h, &analysis, i_satd_inter );
2785 if( analysis.i_mbrd )
2787 x264_mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
/* intra is only tried if within ~6% of the inter RD cost */
2788 x264_intra_rd( h, &analysis, i_satd_inter * 17/16 );
2791 COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
2792 COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
2793 COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
2794 COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );
2796 h->mb.i_type = i_type;
2797 h->mb.i_partition = i_partition;
2799 if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
2800 x264_intra_rd_refine( h, &analysis );
2801 if( h->mb.i_subpel_refine >= 5 )
2802 x264_refine_bidir( h, &analysis );
/* mbrd>=2: RD-refine inter B modes (B_DIRECT/B_SKIP excluded) */
2804 if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
2806 const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
2807 x264_analyse_update_cache( h, &analysis );
2809 if( i_partition == D_16x16 )
2811 if( i_type == B_L0_L0 )
2812 x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
2813 else if( i_type == B_L1_L1 )
2814 x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
2815 else if( i_type == B_BI_BI )
2816 x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
2818 else if( i_partition == D_16x8 )
2820 for( i = 0; i < 2; i++ )
2822 h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
2823 if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
2824 x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
2825 else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
2826 x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
2827 else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
2828 x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
2831 else if( i_partition == D_8x16 )
2833 for( i = 0; i < 2; i++ )
2835 h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
2836 if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
2837 x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
2838 else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
2839 x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
2840 else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
2841 x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
2844 else if( i_partition == D_8x8 )
2846 for( i = 0; i < 4; i++ )
2848 if( h->mb.i_sub_partition[i] == D_L0_8x8 )
2849 x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
2850 else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
2851 x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
2852 else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
2853 x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
/* ---- commit: write the decision into the MB cache and set encode flags ---- */
2860 x264_analyse_update_cache( h, &analysis );
2862 if( !analysis.i_mbrd )
2863 x264_mb_analyse_transform( h );
2865 if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
2866 x264_mb_analyse_qp_rd( h, &analysis );
2868 h->mb.b_trellis = h->param.analyse.i_trellis;
2869 h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
2870 if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
2871 x264_psy_trellis_init( h, 0 );
/* trellis/denoise change coefficients after prediction, so intra-skip
 * shortcuts are no longer valid */
2872 if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
2873 h->mb.i_skip_intra = 0;
2876 /*-------------------- Update MB from the analysis ----------------------*/
/* Write the analysis decision (prediction modes, reference indices, motion
 * vectors) into the macroblock cache so encoding/RD can read it.  With
 * sliced threads, additionally verifies that the chosen MVs stay within the
 * completed region of the reference frames, falling back to I_16x16 if not.
 * NOTE(review): several case labels of the mode/partition switches are not
 * visible in this excerpt — confirm against the full file before editing. */
2877 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
2881 switch( h->mb.i_type )
/* intra 4x4: one prediction mode per block, plus chroma mode */
2884 for( i = 0; i < 16; i++ )
2885 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
2887 x264_mb_analyse_intra_chroma( h, a );
/* intra 8x8: one mode per 8x8 quadrant */
2890 for( i = 0; i < 4; i++ )
2891 x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
2893 x264_mb_analyse_intra_chroma( h, a );
/* intra 16x16: single luma mode */
2896 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
2897 x264_mb_analyse_intra_chroma( h, a );
/* P_L0: fill list-0 refs and MVs per partition shape */
2904 switch( h->mb.i_partition )
2907 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
2908 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
2912 x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
2913 x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
2914 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
2915 x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
2919 x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
2920 x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
2921 x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
2922 x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
2926 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
/* P_8x8: per-quadrant refs; MVs delegated to the p8x8 cache helper */
2932 x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
2933 x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
2934 x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
2935 x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
2936 for( i = 0; i < 4; i++ )
2937 x264_mb_cache_mv_p8x8( h, a, i );
/* P_SKIP: ref 0 with the predicted skip MV */
2942 h->mb.i_partition = D_16x16;
2943 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
2944 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
/* B_SKIP / B_DIRECT: load the direct-predicted MVs per 8x8 block */
2950 x264_mb_load_mv_direct8x8( h, 0 );
2951 x264_mb_load_mv_direct8x8( h, 1 );
2952 x264_mb_load_mv_direct8x8( h, 2 );
2953 x264_mb_load_mv_direct8x8( h, 3 );
/* B_8x8: rewrite the whole cache from the sub-partition decisions */
2957 /* optimize: cache might not need to be rewritten */
2958 for( i = 0; i < 4; i++ )
2959 x264_mb_cache_mv_b8x8( h, a, i, 1 );
2962 default: /* the rest of the B types */
2963 switch( h->mb.i_partition )
2966 switch( h->mb.i_type )
/* B_L0_L0: list-0 data valid, list-1 marked unused (ref -1, zero mv/mvd) */
2969 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
2970 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
2972 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
2973 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
2974 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
/* B_L1_L1: mirror image — list-0 unused, list-1 valid */
2977 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
2978 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
2979 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
2981 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
2982 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
/* B_BI_BI: both lists valid */
2985 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
2986 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
2988 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
2989 x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
2994 x264_mb_cache_mv_b16x8( h, a, 0, 1 );
2995 x264_mb_cache_mv_b16x8( h, a, 1, 1 );
2998 x264_mb_cache_mv_b8x16( h, a, 0, 1 );
2999 x264_mb_cache_mv_b8x16( h, a, 1, 1 );
3002 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
/* sliced-threads sanity check: MVs must not reference rows the other
 * threads have not finished reconstructing yet */
3008 if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) )
3011 for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
3014 int ref = h->mb.cache.ref[l][x264_scan8[0]];
3017 completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed;
3018 if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed )
3020 x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
3021 fprintf(stderr, "mb type: %d \n", h->mb.i_type);
3022 fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref,
3023 h->mb.cache.mv[l][x264_scan8[15]][0],
3024 h->mb.cache.mv[l][x264_scan8[15]][1] );
3025 fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]);
3026 fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
3027 fprintf(stderr, "completed: %d \n", completed );
3028 x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
/* fallback: re-run intra analysis and force I_16x16 so no reference is needed */
3029 x264_mb_analyse_intra( h, a, COST_MAX );
3030 h->mb.i_type = I_16x16;
3031 h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
3032 x264_mb_analyse_intra_chroma( h, a );
3039 #include "slicetype.c"