1 /*****************************************************************************
2 * analyse.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003 Laurent Aimar
5 * $Id: analyse.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
7 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 *****************************************************************************/
30 #include "common/common.h"
31 #include "common/macroblock.h"
32 #include "macroblock.h"
34 #include "ratecontrol.h"
47 int i_cost4x4[4]; /* cost per 8x8 partition */
48 x264_me_t me4x4[4][4];
51 int i_cost8x4[4]; /* cost per 8x8 partition */
52 x264_me_t me8x4[4][2];
55 int i_cost4x8[4]; /* cost per 8x8 partition */
56 x264_me_t me4x8[4][4];
66 } x264_mb_analysis_list_t;
70 /* conduct the analysis using this lambda and QP */
77 /* Take some shortcuts in intra search if intra is deemed unlikely */
80 /* Luma part 16x16 and 4x4 modes stats */
85 int i_predict4x4[4][4];
91 /* II: Inter part P/B frame */
92 x264_mb_analysis_list_t l0;
93 x264_mb_analysis_list_t l1;
95 int i_cost16x16bi; /* used the same ref and mv as l0 and l1 (at least for now) */
96 int i_cost16x16direct;
98 int i_cost8x8direct[4];
102 int i_mb_partition16x8[2]; /* mb_partition_e */
103 int i_mb_partition8x16[2];
104 int i_mb_type16x8; /* mb_class_e */
107 int b_direct_available;
109 } x264_mb_analysis_t;
111 static const int i_qp0_cost_table[52] = {
112 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */
113 1, 1, 1, 1, /* 8-11 */
114 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */
115 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */
116 6, 7, 8, 9,10,11,13,14, /* 28-35 */
117 16,18,20,23,25,29,32,36, /* 36-43 */
118 40,45,51,57,64,72,81,91 /* 44-51 */
121 static const uint8_t block_idx_x[16] = {
122 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3
124 static const uint8_t block_idx_y[16] = {
125 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3
128 /* TODO: calculate CABAC costs */
129 static const int i_mb_b_cost_table[18] = {
130 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
132 static const int i_mb_b16x8_cost_table[16] = {
133 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
135 static const int i_sub_mb_b_cost_table[13] = {
136 7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
138 static const int i_sub_mb_p_cost_table[4] = {
142 /* initialize an array of lambda*nbits for all possible mvs */
/* Lazily builds (and caches, per QP) a table mapping an mv component's
 * magnitude to its rate cost in lambda-scaled bits, then points the
 * analysis context at the table for the current QP.
 * NOTE(review): the cache is a function-local static, so it persists
 * across calls and is shared by all encoder instances in this process. */
143 static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
145 static int16_t *p_cost_mv[52];
147 if( !p_cost_mv[a->i_qp] )
149 /* could be faster, but isn't called many times */
150 /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
152 p_cost_mv[a->i_qp] = x264_malloc( (4*4*h->param.analyse.i_mv_range + 1) * sizeof(int16_t) );
/* offset the pointer to the middle of the buffer so it can be indexed
 * with negative mv differences directly */
153 p_cost_mv[a->i_qp] += 2*4*h->param.analyse.i_mv_range;
154 for( i = 0; i <= 2*4*h->param.analyse.i_mv_range; i++ )
/* cost table is symmetric: signed Exp-Golomb code length times lambda */
156 p_cost_mv[a->i_qp][-i] =
157 p_cost_mv[a->i_qp][i] = a->i_lambda * bs_size_se( i );
161 a->p_cost_mv = p_cost_mv[a->i_qp];
/* Resets the per-macroblock analysis context: picks lambda from the QP,
 * copies ME settings into the mb context, clamps the allowed MV range to
 * the frame borders, seeds all partition costs to COST_MAX, and applies
 * the fast-intra heuristic that decides whether a full intra search is
 * worth doing for this macroblock. */
164 static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
166 memset( a, 0, sizeof( x264_mb_analysis_t ) );
168 /* conduct the analysis using this lambda and QP */
170 a->i_lambda = i_qp0_cost_table[i_qp];
172 h->mb.i_me_method = h->param.analyse.i_me_method;
173 h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine;
/* chroma ME only pays off with high subpel refinement; limited to P slices here */
174 h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P
175 && h->mb.i_subpel_refine >= 5;
181 a->i_sad_i8x8 = COST_MAX;
183 /* II: Inter part P/B frame */
184 if( h->sh.i_type != SLICE_TYPE_I )
/* leave a 16-pel margin inside the configured MV range for interpolation */
187 int i_fmv_range = h->param.analyse.i_mv_range - 16;
189 /* Calculate max allowed MV range */
190 #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range )
/* horizontal fullpel range, relative to this mb's position in the row */
191 h->mb.mv_min_fpel[0] = CLIP_FMV( -16*h->mb.i_mb_x - 8 );
192 h->mb.mv_max_fpel[0] = CLIP_FMV( 16*( h->sps->i_mb_width - h->mb.i_mb_x ) - 8 );
/* qpel range is the fullpel range widened by 16 (x4 for qpel units) */
193 h->mb.mv_min[0] = 4*( h->mb.mv_min_fpel[0] - 16 );
194 h->mb.mv_max[0] = 4*( h->mb.mv_max_fpel[0] + 16 );
/* the vertical range only changes at the start of each mb row */
195 if( h->mb.i_mb_x == 0)
197 h->mb.mv_min_fpel[1] = CLIP_FMV( -16*h->mb.i_mb_y - 8 );
198 h->mb.mv_max_fpel[1] = CLIP_FMV( 16*( h->sps->i_mb_height - h->mb.i_mb_y ) - 8 );
199 h->mb.mv_min[1] = 4*( h->mb.mv_min_fpel[1] - 16 );
200 h->mb.mv_max[1] = 4*( h->mb.mv_max_fpel[1] + 16 );
/* seed every partition cost to COST_MAX so untried modes never win */
205 a->l0.i_cost8x8 = COST_MAX;
207 for( i = 0; i < 4; i++ )
211 a->l0.i_cost4x8[i] = COST_MAX;
215 a->l0.i_cost8x16 = COST_MAX;
216 if( h->sh.i_type == SLICE_TYPE_B )
219 a->l1.i_cost8x8 = COST_MAX;
221 for( i = 0; i < 4; i++ )
226 a->i_cost8x8direct[i] = COST_MAX;
233 a->i_cost16x16direct =
236 a->i_cost8x16bi = COST_MAX;
239 /* Fast intra decision */
/* only after a few MBs into the slice, so the I_4x4/I_16x16 stats are meaningful */
240 if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
242 const unsigned int i_neighbour = h->mb.i_neighbour;
/* intra is deemed likely if any available neighbour is intra-coded,
 * the co-located mb in the previous frame is intra (P slices), or
 * intra MBs have been frequent in this frame so far */
243 if( ((i_neighbour&MB_LEFT) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - 1] ))
244 || ((i_neighbour&MB_TOP) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] ))
245 || (((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT)) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] ))
246 || ((i_neighbour&MB_TOPRIGHT) && IS_INTRA( h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] ))
247 || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
248 || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_16x16])) )
249 { /* intra is likely */ }
/* Fills mode[] with the I16x16 luma prediction modes that are legal given
 * which neighbouring macroblocks are available; the count is returned
 * through pi_count. */
264 static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
266 if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
268 /* top and left available: all four 16x16 modes can be tried */
269 *mode++ = I_PRED_16x16_V;
270 *mode++ = I_PRED_16x16_H;
271 *mode++ = I_PRED_16x16_DC;
272 *mode++ = I_PRED_16x16_P;
275 else if( ( i_neighbour & MB_LEFT ) )
/* only the left neighbour exists: DC from left pixels, plus horizontal */
278 *mode++ = I_PRED_16x16_DC_LEFT;
279 *mode++ = I_PRED_16x16_H;
282 else if( ( i_neighbour & MB_TOP ) )
/* only the top neighbour exists: DC from top pixels, plus vertical */
285 *mode++ = I_PRED_16x16_DC_TOP;
286 *mode++ = I_PRED_16x16_V;
/* no neighbours at all: only the fixed-128 DC mode remains */
292 *mode = I_PRED_16x16_DC_128;
/* Fills mode[] with the 8x8 chroma prediction modes that are legal given
 * neighbour availability; mirrors predict_16x16_mode_available. The count
 * is returned through pi_count. */
298 static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int *pi_count )
300 if( ( i_neighbour & (MB_LEFT|MB_TOP) ) == (MB_LEFT|MB_TOP) )
302 /* top and left available: all four chroma modes can be tried */
303 *mode++ = I_PRED_CHROMA_V;
304 *mode++ = I_PRED_CHROMA_H;
305 *mode++ = I_PRED_CHROMA_DC;
306 *mode++ = I_PRED_CHROMA_P;
309 else if( ( i_neighbour & MB_LEFT ) )
/* only the left neighbour: DC from left, plus horizontal */
312 *mode++ = I_PRED_CHROMA_DC_LEFT;
313 *mode++ = I_PRED_CHROMA_H;
316 else if( ( i_neighbour & MB_TOP ) )
/* only the top neighbour: DC from top, plus vertical */
319 *mode++ = I_PRED_CHROMA_DC_TOP;
320 *mode++ = I_PRED_CHROMA_V;
/* no neighbours: only the fixed-128 DC mode remains */
326 *mode = I_PRED_CHROMA_DC_128;
/* Fills mode[] with the legal I4x4 prediction modes for block 'idx' of the
 * macroblock. Availability of left (b_a), top (b_b) and top-right (b_c)
 * pixels is derived by intersecting the per-block neighbour requirements
 * in needmb[] with the actually-available neighbour macroblocks. The mode
 * count is returned through pi_count. */
332 static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *mode, int *pi_count )
/* per-4x4-block mask of which neighbouring MBs must exist for the block's
 * left/top/top-right pixels to be valid (indexed in raster scan order) */
335 static const unsigned int needmb[16] =
337 MB_LEFT|MB_TOP, MB_TOP,
339 MB_TOP, MB_TOP|MB_TOPRIGHT,
347 /* FIXME even when b_c == 0 there is some case where missing pixels
348 * are emulated and thus more modes are available TODO
349 * analysis and encode should be fixed too */
350 b_a = (needmb[idx]&i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT);
351 b_b = (needmb[idx]&i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP);
352 b_c = (needmb[idx]&i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE));
/* left and top available: all modes except the top-right-dependent ones */
356 *mode++ = I_PRED_4x4_DC;
357 *mode++ = I_PRED_4x4_H;
358 *mode++ = I_PRED_4x4_V;
359 *mode++ = I_PRED_4x4_DDR;
360 *mode++ = I_PRED_4x4_VR;
361 *mode++ = I_PRED_4x4_HD;
362 *mode++ = I_PRED_4x4_HU;
/* top-right also available: the two diagonal-down-left family modes */
368 *mode++ = I_PRED_4x4_DDL;
369 *mode++ = I_PRED_4x4_VL;
373 else if( b_a && !b_b )
/* left only */
375 *mode++ = I_PRED_4x4_DC_LEFT;
376 *mode++ = I_PRED_4x4_H;
377 *mode++ = I_PRED_4x4_HU;
380 else if( !b_a && b_b )
/* top only */
382 *mode++ = I_PRED_4x4_DC_TOP;
383 *mode++ = I_PRED_4x4_V;
/* neither neighbour: fixed-128 DC only */
388 *mode++ = I_PRED_4x4_DC_128;
/* Intra luma analysis: exhaustively scores the available I16x16 modes, and
 * (if enabled) the I4x4 modes per 4x4 block, storing the best mode and its
 * lambda-weighted SATD cost in *res. i_cost_inter is the best inter cost
 * found so far and is used to bail out early when fast-intra is active.
 * NOTE(review): the 4x4 path encodes each block as it goes, since later
 * blocks predict from the reconstructed pixels of earlier ones. */
393 static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cost_inter )
395 const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
396 const int i_stride = h->mb.pic.i_stride[0];
397 uint8_t *p_src = h->mb.pic.p_fenc[0];
398 uint8_t *p_dst = h->mb.pic.p_fdec[0];
405 /*---------------- Try all mode and calculate their score ---------------*/
407 /* 16x16 prediction selection */
408 predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
409 for( i = 0; i < i_max; i++ )
414 i_mode = predict_mode[i];
416 /* we do the prediction */
417 h->predict_16x16[i_mode]( p_dst, i_stride );
419 /* we calculate the diff and get the square sum of the diff */
/* cost = SATD of the residual + lambda * bits to code the mode */
420 i_sad = h->pixf.satd[PIXEL_16x16]( p_dst, i_stride, p_src, i_stride ) +
421 res->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
422 /* if i_score is lower it is better */
423 if( res->i_sad_i16x16 > i_sad )
425 res->i_predict16x16 = i_mode;
426 res->i_sad_i16x16 = i_sad;
429 /* cavlc mb type prefix */
430 if( h->sh.i_type == SLICE_TYPE_B )
431 res->i_sad_i16x16 += res->i_lambda * i_mb_b_cost_table[I_16x16];
/* fast-intra shortcut: skip 4x4 analysis if 16x16 already looks hopeless */
433 if( res->b_fast_intra )
435 if( res->i_sad_i16x16 > 2*i_cost_inter )
439 /* 4x4 prediction selection */
440 if( flags & X264_ANALYSE_I4x4 )
443 for( idx = 0; idx < 16; idx++ )
451 i_pred_mode= x264_mb_predict_intra4x4_mode( h, idx );
452 x = block_idx_x[idx];
453 y = block_idx_y[idx];
455 p_src_by = p_src + 4 * x + 4 * y * i_stride;
456 p_dst_by = p_dst + 4 * x + 4 * y * i_stride;
459 predict_4x4_mode_available( h->mb.i_neighbour, idx, predict_mode, &i_max );
460 for( i = 0; i < i_max; i++ )
465 i_mode = predict_mode[i];
467 /* we do the prediction */
468 h->predict_4x4[i_mode]( p_dst_by, i_stride );
470 /* we calculate diff and get the square sum of the diff */
471 i_sad = h->pixf.satd[PIXEL_4x4]( p_dst_by, i_stride,
472 p_src_by, i_stride );
/* mode signalling cost: 1 bit if it matches the predicted mode, 4 otherwise */
474 i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix[i_mode] ? 1 : 4);
476 /* if i_score is lower it is better */
479 res->i_predict4x4[x][y] = i_mode;
483 res->i_sad_i4x4 += i_best;
485 /* we need to encode this mb now (for next ones) */
486 h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride );
487 x264_mb_encode_i4x4( h, idx, res->i_qp );
489 /* we need to store the 'fixed' version */
490 h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] =
491 x264_mb_pred_mode4x4_fix[res->i_predict4x4[x][y]];
493 res->i_sad_i4x4 += res->i_lambda * 24; /* from JVT (SATD0) */
494 if( h->sh.i_type == SLICE_TYPE_B )
495 res->i_sad_i4x4 += res->i_lambda * i_mb_b_cost_table[I_4x4];
/* Intra chroma analysis: tries each available 8x8 chroma prediction mode on
 * both chroma planes and records the best mode and its combined SATD cost
 * (plus lambda-weighted mode signalling bits) in *res. Returns early if a
 * chroma cost was already computed (i_sad_i8x8 < COST_MAX). */
499 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res )
506 uint8_t *p_dstc[2], *p_srcc[2];
/* already analysed earlier in this macroblock: nothing to do */
509 if( res->i_sad_i8x8 < COST_MAX )
512 /* 8x8 prediction selection for chroma */
513 p_dstc[0] = h->mb.pic.p_fdec[1];
514 p_dstc[1] = h->mb.pic.p_fdec[2];
515 p_srcc[0] = h->mb.pic.p_fenc[1];
516 p_srcc[1] = h->mb.pic.p_fenc[2];
518 i_stride[0] = h->mb.pic.i_stride[1];
519 i_stride[1] = h->mb.pic.i_stride[2];
521 predict_8x8_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
522 res->i_sad_i8x8 = COST_MAX;
523 for( i = 0; i < i_max; i++ )
528 i_mode = predict_mode[i];
530 /* we do the prediction */
/* same prediction mode is applied to both chroma planes, per the standard */
531 h->predict_8x8[i_mode]( p_dstc[0], i_stride[0] );
532 h->predict_8x8[i_mode]( p_dstc[1], i_stride[1] );
534 /* we calculate the cost */
535 i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0],
536 p_srcc[0], i_stride[0] ) +
537 h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1],
538 p_srcc[1], i_stride[1] ) +
539 res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix[i_mode] );
541 /* if i_score is lower it is better */
542 if( res->i_sad_i8x8 > i_sad )
544 res->i_predict8x8 = i_mode;
545 res->i_sad_i8x8 = i_sad;
/* LOAD_FENC: point an x264_me_t's source-plane pointers (luma + both chroma)
 * at the (xoff,yoff) offset of the encode-side picture 'src'; chroma offsets
 * are halved for 4:2:0 subsampling. */
550 #define LOAD_FENC( m, src, xoff, yoff) \
551 (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
552 (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
553 (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
554 (m)->p_fenc[1] = &(src)[1][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
555 (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
/* LOAD_HPELS: point an x264_me_t's reference-plane pointers at the same
 * offset of a reference frame; planes 0-3 are the luma half-pel planes,
 * planes 4-5 the chroma planes (half offset for 4:2:0). Assumes LOAD_FENC
 * already set (m)->i_stride[]. */
556 #define LOAD_HPELS(m, src, xoff, yoff) \
557 (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
558 (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
559 (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
560 (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
561 (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
562 (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]];
/* P-slice 16x16 motion analysis: searches every L0 reference frame, keeps
 * the cheapest result (including ref-index signalling cost) in a->l0.me16x16,
 * saves each ref's best mv for neighbour prediction, then caches the winning
 * reference for the other partition analyses. */
564 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
568 int mvc[4][2], i_mvc;
/* fullpel early-termination threshold is only useful with multiple refs */
569 int i_fullpel_thresh = INT_MAX;
570 int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
572 /* 16x16 Search on all ref frame */
573 m.i_pixel = PIXEL_16x16;
574 m.p_cost_mv = a->p_cost_mv;
575 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
577 a->l0.me16x16.cost = INT_MAX;
578 for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
580 const int i_ref_cost = a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
/* fold the ref cost into the threshold so refs compete on total cost */
581 i_fullpel_thresh -= i_ref_cost;
583 /* search with ref */
584 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
585 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
586 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
587 x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
589 m.cost += i_ref_cost;
590 i_fullpel_thresh += i_ref_cost;
592 if( m.cost < a->l0.me16x16.cost )
598 /* save mv for predicting neighbors */
599 h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
600 h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
603 /* subtract ref cost, so we don't have to add it for the other P types */
604 a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
606 /* Set global ref, needed for all others modes */
607 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
/* P-slice 8x8 motion analysis: one search per 8x8 partition against the
 * reference chosen by the 16x16 analysis, seeding the candidate list with
 * the 16x16 mv and previously-found 8x8 mvs. Total cost is accumulated
 * in a->l0.i_cost8x8. */
610 static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
612 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
613 uint8_t **p_fenc = h->mb.pic.p_fenc;
614 int mvc[5][2], i_mvc;
617 /* XXX Needed for x264_mb_predict_mv */
618 h->mb.i_partition = D_8x8;
/* first mv candidate is the 16x16 search result */
621 mvc[0][0] = a->l0.me16x16.mv[0];
622 mvc[0][1] = a->l0.me16x16.mv[1];
624 for( i = 0; i < 4; i++ )
626 x264_me_t *m = &a->l0.me8x8[i];
630 m->i_pixel = PIXEL_8x8;
631 m->p_cost_mv = a->p_cost_mv;
633 LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
634 LOAD_HPELS( m, p_fref, 8*x8, 8*y8 );
636 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
637 x264_me_search( h, m, mvc, i_mvc );
/* cache immediately so the next partition's mv prediction sees this mv */
639 x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
/* append this partition's mv as a candidate for the remaining partitions */
641 mvc[i_mvc][0] = m->mv[0];
642 mvc[i_mvc][1] = m->mv[1];
/* add the sub-partition type signalling cost */
646 m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
649 a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
650 a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
/* P-slice 16x8 motion analysis: one search per horizontal half, seeded with
 * the mvs of the two 8x8 partitions it covers. Total cost goes to
 * a->l0.i_cost16x8. Assumes x264_mb_analyse_inter_p8x8 ran first. */
653 static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
655 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
656 uint8_t **p_fenc = h->mb.pic.p_fenc;
660 /* XXX Needed for x264_mb_predict_mv */
661 h->mb.i_partition = D_16x8;
663 for( i = 0; i < 2; i++ )
665 x264_me_t *m = &a->l0.me16x8[i];
667 m->i_pixel = PIXEL_16x8;
668 m->p_cost_mv = a->p_cost_mv;
670 LOAD_FENC( m, p_fenc, 0, 8*i );
671 LOAD_HPELS( m, p_fref, 0, 8*i );
/* mv candidates: the two 8x8 partitions inside this 16x8 half */
673 mvc[0][0] = a->l0.me8x8[2*i].mv[0];
674 mvc[0][1] = a->l0.me8x8[2*i].mv[1];
675 mvc[1][0] = a->l0.me8x8[2*i+1].mv[0];
676 mvc[1][1] = a->l0.me8x8[2*i+1].mv[1];
678 x264_mb_predict_mv( h, 0, 8*i, 4, m->mvp );
679 x264_me_search( h, m, mvc, 2 );
681 x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, m->mv[0], m->mv[1] );
684 a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
/* P-slice 8x16 motion analysis: one search per vertical half, seeded with
 * the mvs of the two 8x8 partitions it covers. Total cost goes to
 * a->l0.i_cost8x16. Assumes x264_mb_analyse_inter_p8x8 ran first. */
687 static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
689 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
690 uint8_t **p_fenc = h->mb.pic.p_fenc;
694 /* XXX Needed for x264_mb_predict_mv */
695 h->mb.i_partition = D_8x16;
697 for( i = 0; i < 2; i++ )
699 x264_me_t *m = &a->l0.me8x16[i];
701 m->i_pixel = PIXEL_8x16;
702 m->p_cost_mv = a->p_cost_mv;
704 LOAD_FENC( m, p_fenc, 8*i, 0 );
705 LOAD_HPELS( m, p_fref, 8*i, 0 );
/* mv candidates: the top and bottom 8x8 partitions of this column */
707 mvc[0][0] = a->l0.me8x8[i].mv[0];
708 mvc[0][1] = a->l0.me8x8[i].mv[1];
709 mvc[1][0] = a->l0.me8x8[i+2].mv[0];
710 mvc[1][1] = a->l0.me8x8[i+2].mv[1];
712 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
713 x264_me_search( h, m, mvc, 2 );
715 x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, m->mv[0], m->mv[1] );
718 a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
/* Computes the chroma SATD cost of one 8x8 luma block's sub-partitions
 * (4x4, 8x4 or 4x8, selected by 'pixel') by motion-compensating both chroma
 * planes with the already-found luma mvs and comparing against the source.
 * Returns the summed cost for both planes. Used when b_chroma_me is on. */
721 static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
723 uint8_t pix1[8*8], pix2[8*8];
724 const int i_stride = h->mb.pic.i_stride[1];
/* chroma offset of this 8x8 luma block: 4x4 chroma samples per block */
725 const int off = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
/* MC both chroma planes (p_fref[4]=U, p_fref[5]=V) into the scratch buffers */
727 #define CHROMA4x4MC( width, height, me, x, y ) \
728 h->mc.mc_chroma( &p_fref[4][off+x+y*i_stride], i_stride, &pix1[x+y*8], 8, (me).mv[0], (me).mv[1], width, height ); \
729 h->mc.mc_chroma( &p_fref[5][off+x+y*i_stride], i_stride, &pix2[x+y*8], 8, (me).mv[0], (me).mv[1], width, height );
731 if( pixel == PIXEL_4x4 )
733 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 );
734 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 0,2 );
735 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 2,0 );
736 CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 );
738 else if( pixel == PIXEL_8x4 )
740 CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 );
741 CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 );
/* remaining case: PIXEL_4x8 */
745 CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 );
746 CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 );
749 return h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[1][off], i_stride, pix1, 8 )
750 + h->pixf.satd[PIXEL_4x4]( &h->mb.pic.p_fenc[2][off], i_stride, pix2, 8 );
/* P-slice 4x4 sub-partition analysis for one 8x8 block: searches each of
 * the four 4x4 blocks, seeding the first with the parent 8x8 mv, and sums
 * the costs (plus sub-partition signalling and optional chroma cost) into
 * a->l0.i_cost4x4[i8x8]. */
753 static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
755 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
756 uint8_t **p_fenc = h->mb.pic.p_fenc;
760 /* XXX Needed for x264_mb_predict_mv */
761 h->mb.i_partition = D_8x8;
763 for( i4x4 = 0; i4x4 < 4; i4x4++ )
765 const int idx = 4*i8x8 + i4x4;
766 const int x4 = block_idx_x[idx];
767 const int y4 = block_idx_y[idx];
/* only the first 4x4 block uses the parent 8x8 mv as a candidate */
768 const int i_mvc = (i4x4 == 0);
770 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];
772 m->i_pixel = PIXEL_4x4;
773 m->p_cost_mv = a->p_cost_mv;
775 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
776 LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
778 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
779 x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
/* cache immediately so later blocks' mv prediction sees this mv */
781 x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
784 a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
785 a->l0.me4x4[i8x8][1].cost +
786 a->l0.me4x4[i8x8][2].cost +
787 a->l0.me4x4[i8x8][3].cost +
788 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
789 if( h->mb.b_chroma_me )
790 a->l0.i_cost4x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
/* P-slice 8x4 sub-partition analysis for one 8x8 block: two searches, the
 * first seeded with this block's first 4x4 mv; costs (plus signalling and
 * optional chroma) summed into a->l0.i_cost8x4[i8x8]. Assumes the 4x4
 * analysis for this block ran first. */
793 static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
795 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
796 uint8_t **p_fenc = h->mb.pic.p_fenc;
800 /* XXX Needed for x264_mb_predict_mv */
801 h->mb.i_partition = D_8x8;
803 for( i8x4 = 0; i8x4 < 2; i8x4++ )
805 const int idx = 4*i8x8 + 2*i8x4;
806 const int x4 = block_idx_x[idx];
807 const int y4 = block_idx_y[idx];
/* only the first 8x4 half gets an mv candidate */
808 const int i_mvc = (i8x4 == 0);
810 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];
812 m->i_pixel = PIXEL_8x4;
813 m->p_cost_mv = a->p_cost_mv;
815 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
816 LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
818 x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
819 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
821 x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
824 a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
825 a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
826 if( h->mb.b_chroma_me )
827 a->l0.i_cost8x4[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
/* P-slice 4x8 sub-partition analysis for one 8x8 block: mirrors the 8x4
 * case but splits vertically; costs (plus signalling and optional chroma)
 * summed into a->l0.i_cost4x8[i8x8]. Assumes the 4x4 analysis ran first. */
830 static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
832 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.i_ref];
833 uint8_t **p_fenc = h->mb.pic.p_fenc;
837 /* XXX Needed for x264_mb_predict_mv */
838 h->mb.i_partition = D_8x8;
840 for( i4x8 = 0; i4x8 < 2; i4x8++ )
842 const int idx = 4*i8x8 + i4x8;
843 const int x4 = block_idx_x[idx];
844 const int y4 = block_idx_y[idx];
/* only the first 4x8 half gets an mv candidate */
845 const int i_mvc = (i4x8 == 0);
847 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];
849 m->i_pixel = PIXEL_4x8;
850 m->p_cost_mv = a->p_cost_mv;
852 LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
853 LOAD_HPELS( m, p_fref, 4*x4, 4*y4 );
855 x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
856 x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
858 x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
861 a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
862 a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
863 if( h->mb.b_chroma_me )
864 a->l0.i_cost4x8[i8x8] += x264_mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
/* B-slice direct-mode cost: measures the SATD between source and the
 * direct-mode prediction already left in fdec, per 8x8 block and for the
 * whole 16x16, adding the respective mode-signalling bits. */
867 static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
869 /* Assumes that fdec still contains the results of
870 * x264_mb_predict_mv_direct16x16 and x264_mb_mc */
872 uint8_t **p_fenc = h->mb.pic.p_fenc;
873 uint8_t **p_fdec = h->mb.pic.p_fdec;
874 int i_stride= h->mb.pic.i_stride[0];
877 a->i_cost16x16direct = 0;
878 for( i = 0; i < 4; i++ )
882 const int off = 8 * x8 + 8 * i_stride * y8;
/* each 8x8 direct cost also accumulates into the 16x16 direct total */
883 a->i_cost16x16direct +=
884 a->i_cost8x8direct[i] =
885 h->pixf.satd[PIXEL_8x8]( &p_fenc[0][off], i_stride, &p_fdec[0][off], i_stride );
/* per-8x8 sub-partition signalling cost */
888 a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
/* mb-level direct type signalling cost */
891 a->i_cost16x16direct += a->i_lambda * i_mb_b_cost_table[B_DIRECT];
/* WEIGHTED_AVG: average src2 into pix1 (in place) for bipred, using the
 * implicit bipred weights when b_weighted_bipred is enabled, else a plain
 * (a+b+1)>>1 average. */
894 #define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
896 if( h->param.analyse.b_weighted_bipred ) \
897 h->pixf.avg_weight[size]( pix1, stride1, src2, stride2, \
898 h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
900 h->pixf.avg[size]( pix1, stride1, src2, stride2 ); \
/* B-slice 16x16 analysis: searches all L0 and all L1 references, keeps the
 * best of each (with ref-signalling cost folded in then subtracted again
 * for reuse by other modes), then forms the BI prediction from the two
 * winners and computes its cost. Finally adds the CAVLC mb-type prefix
 * costs for B_L0_L0 / B_L1_L1 / B_BI_BI. */
903 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
905 uint8_t pix1[16*16], pix2[16*16];
908 int src2_ref, pix1_ref;
912 int mvc[5][2], i_mvc;
913 int i_fullpel_thresh = INT_MAX;
914 int *p_fullpel_thresh = h->i_ref0>1 ? &i_fullpel_thresh : NULL;
916 /* 16x16 Search on all ref frame */
917 m.i_pixel = PIXEL_16x16;
918 m.p_cost_mv = a->p_cost_mv;
919 LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );
/* --- list 0 search --- */
922 a->l0.me16x16.cost = INT_MAX;
923 for( i_ref = 0; i_ref < h->i_ref0; i_ref++ )
925 /* search with ref */
926 LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, 0 );
927 x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );
928 x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
929 x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
/* add the bits needed to signal this reference index */
932 m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, i_ref );
934 if( m.cost < a->l0.me16x16.cost )
940 /* save mv for predicting neighbors */
941 h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
942 h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
944 /* subtract ref cost, so we don't have to add it for the other MB types */
945 a->l0.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref );
/* --- list 1 search (same structure as list 0) --- */
948 i_fullpel_thresh = INT_MAX;
949 p_fullpel_thresh = h->i_ref1>1 ? &i_fullpel_thresh : NULL;
950 a->l1.me16x16.cost = INT_MAX;
951 for( i_ref = 0; i_ref < h->i_ref1; i_ref++ )
953 /* search with ref */
954 LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 0, 0 );
955 x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp );
956 x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc );
957 x264_me_search_ref( h, &m, mvc, i_mvc, p_fullpel_thresh );
960 m.cost += a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, i_ref );
962 if( m.cost < a->l1.me16x16.cost )
968 /* save mv for predicting neighbors */
969 h->mb.mvr[1][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
970 h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
972 /* subtract ref cost, so we don't have to add it for the other MB types */
973 a->l1.me16x16.cost -= a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref );
975 /* Set global ref, needed for other modes? */
976 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
977 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
979 /* get cost of BI mode */
/* mv components both even => halfpel position, get_ref avoids a full MC */
980 if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
982 /* l0 reference is halfpel, so get_ref on it will make it faster */
983 src2 = h->mc.get_ref( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
985 a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
987 h->mc.mc_luma( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
989 a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
991 src2_ref = a->l0.i_ref;
992 pix1_ref = a->l1.i_ref;
996 /* if l0 was qpel, we'll use get_ref on l1 instead */
997 h->mc.mc_luma( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
999 a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
1001 src2 = h->mc.get_ref( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
1003 a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
1005 src2_ref = a->l1.i_ref;
1006 pix1_ref = a->l0.i_ref;
/* blend the two predictions (weighted or plain average) into pix1 */
1009 if( h->param.analyse.b_weighted_bipred )
1010 h->pixf.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2,
1011 h->mb.bipred_weight[pix1_ref][src2_ref] );
1013 h->pixf.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
/* BI cost: SATD of the blend + both ref indices + both mv costs */
1015 a->i_cost16x16bi = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], pix1, 16 )
1016 + a->i_lambda * ( bs_size_te( h->sh.i_num_ref_idx_l0_active - 1, a->l0.i_ref )
1017 + bs_size_te( h->sh.i_num_ref_idx_l1_active - 1, a->l1.i_ref ) )
1018 + a->l0.me16x16.cost_mv
1019 + a->l1.me16x16.cost_mv;
/* mb type signalling costs for the three 16x16 B modes */
1022 a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
1023 a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
1024 a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
/* CACHE_MV_BI: write one B partition's ref/mv into the mb cache for each
 * list the partition type actually uses; unused lists get ref=-1 and a zero
 * mv (plus zero mvd when b_mvd is set, per the caller's flag). */
1027 #define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
1028 if( x264_mb_partition_listX_table[0][part] ) \
1030 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
1031 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, me0.mv[0], me0.mv[1] ); \
1035 x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
1036 x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0, 0 ); \
1038 x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0, 0 ); \
1040 if( x264_mb_partition_listX_table[1][part] ) \
1042 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
1043 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, me1.mv[0], me1.mv[1] ); \
1047 x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
1048 x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0, 0 ); \
1050 x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0, 0 ); \
/* Caches the chosen mv/refs for 8x8 B partition i: direct partitions load
 * the direct mvs (zeroing mvds and marking skip when b_mvd), others go
 * through CACHE_MV_BI with the partition's chosen sub-type. */
1053 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1057 if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
1059 x264_mb_load_mv_direct8x8( h, i );
/* direct partitions carry no mv difference; mark the block skippable */
1062 x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0, 0 );
1063 x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0, 0 );
1064 x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
1069 CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
/* Caches the chosen mv/refs for 16x8 B partition i (top or bottom half). */
1072 static inline void x264_mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1074 CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
/* Caches the chosen mv/refs for 8x16 B partition i (left or right half). */
1076 static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
1078 CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
/* B-slice 8x8 analysis: for each 8x8 partition, searches L0 and L1 (seeded
 * with the respective 16x16 mvs), computes the BI blend cost, then picks
 * the cheapest of L0 / L1 / BI / direct per partition. Totals accumulate
 * in a->i_cost8x8bi; chosen sub-types go to h->mb.i_sub_partition[]. */
1082 static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
1084 uint8_t **p_fref[2] =
1085 { h->mb.pic.p_fref[0][a->l0.i_ref],
1086 h->mb.pic.p_fref[1][a->l1.i_ref] };
1087 uint8_t pix[2][8*8];
1090 /* XXX Needed for x264_mb_predict_mv */
1091 h->mb.i_partition = D_8x8;
1095 for( i = 0; i < 4; i++ )
1100 int i_part_cost_bi = 0;
1102 for( l = 0; l < 2; l++ )
1104 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1105 x264_me_t *m = &lX->me8x8[i];
1107 m->i_pixel = PIXEL_8x8;
1108 m->p_cost_mv = a->p_cost_mv;
1110 LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
1111 LOAD_HPELS( m, p_fref[l], 8*x8, 8*y8 );
1113 x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
1114 x264_me_search( h, m, &lX->me16x16.mv, 1 );
/* cache so the next partition's mv prediction sees this mv */
1116 x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, l, m->mv[0], m->mv[1] );
/* BI prediction buffer for this list */
1119 h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1120 m->mv[0], m->mv[1], 8, 8 );
1121 i_part_cost_bi += m->cost_mv;
1122 /* FIXME: ref cost */
1125 WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
1126 i_part_cost_bi += h->pixf.satd[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 )
1127 + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
1128 a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1129 a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
/* pick the cheapest sub-partition type: L0, L1, BI or direct */
1131 i_part_cost = a->l0.me8x8[i].cost;
1132 h->mb.i_sub_partition[i] = D_L0_8x8;
1133 if( a->l1.me8x8[i].cost < i_part_cost )
1135 i_part_cost = a->l1.me8x8[i].cost;
1136 h->mb.i_sub_partition[i] = D_L1_8x8;
1138 if( i_part_cost_bi < i_part_cost )
1140 i_part_cost = i_part_cost_bi;
1141 h->mb.i_sub_partition[i] = D_BI_8x8;
1143 if( a->i_cost8x8direct[i] < i_part_cost )
1145 i_part_cost = a->i_cost8x8direct[i];
1146 h->mb.i_sub_partition[i] = D_DIRECT_8x8;
1148 a->i_cost8x8bi += i_part_cost;
1150 /* XXX Needed for x264_mb_predict_mv */
1151 x264_mb_cache_mv_b8x8( h, a, i, 0 );
/* mb type signalling cost */
1155 a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
/* B-slice 16x8 analysis: for each half, searches L0 and L1 (seeded with the
 * covering 8x8 mvs), computes BI blend cost, and picks L0/L1/BI per half.
 * The resulting partition pair also determines i_mb_type16x8, whose CAVLC
 * prefix cost is added to a->i_cost16x8bi. */
1158 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
1160 uint8_t **p_fref[2] =
1161 { h->mb.pic.p_fref[0][a->l0.i_ref],
1162 h->mb.pic.p_fref[1][a->l1.i_ref] };
1163 uint8_t pix[2][16*8];
1167 h->mb.i_partition = D_16x8;
1168 a->i_cost16x8bi = 0;
1170 for( i = 0; i < 2; i++ )
1173 int i_part_cost_bi = 0;
1175 /* TODO: check only the list(s) that were used in b8x8? */
1176 for( l = 0; l < 2; l++ )
1178 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1179 x264_me_t *m = &lX->me16x8[i];
1181 m->i_pixel = PIXEL_16x8;
1182 m->p_cost_mv = a->p_cost_mv;
1184 LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
1185 LOAD_HPELS( m, p_fref[l], 0, 8*i );
/* mv candidates: the two 8x8 partitions inside this half, from this list */
1187 mvc[0][0] = lX->me8x8[2*i].mv[0];
1188 mvc[0][1] = lX->me8x8[2*i].mv[1];
1189 mvc[1][0] = lX->me8x8[2*i+1].mv[0];
1190 mvc[1][1] = lX->me8x8[2*i+1].mv[1];
1192 x264_mb_predict_mv( h, 0, 8*i, 2, m->mvp );
1193 x264_me_search( h, m, mvc, 2 );
/* BI prediction buffer for this list */
1196 h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 16,
1197 m->mv[0], m->mv[1], 16, 8 );
1198 /* FIXME: ref cost */
1199 i_part_cost_bi += m->cost_mv;
1202 WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
1203 i_part_cost_bi += h->pixf.satd[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 16 );
/* pick the cheapest of L0 / L1 / BI for this half */
1205 i_part_cost = a->l0.me16x8[i].cost;
1206 a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
1207 if( a->l1.me16x8[i].cost < i_part_cost )
1209 i_part_cost = a->l1.me16x8[i].cost;
1210 a->i_mb_partition16x8[i] = D_L1_8x8;
1212 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1214 i_part_cost = i_part_cost_bi;
1215 a->i_mb_partition16x8[i] = D_BI_8x8;
1217 a->i_cost16x8bi += i_part_cost;
1220 x264_mb_cache_mv_b16x8( h, a, i, 0 );
/* derive the combined mb type from the two halves' list choices */
1224 a->i_mb_type16x8 = B_L0_L0
1225 + (a->i_mb_partition16x8[0]>>2) * 3
1226 + (a->i_mb_partition16x8[1]>>2);
1227 a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
/* B-slice 8x16 analysis: vertical-split twin of x264_mb_analyse_inter_b16x8;
 * searches L0 and L1 per half, computes BI blend cost, picks L0/L1/BI, and
 * adds the derived mb type's CAVLC prefix cost to a->i_cost8x16bi. */
1229 static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
1231 uint8_t **p_fref[2] =
1232 { h->mb.pic.p_fref[0][a->l0.i_ref],
1233 h->mb.pic.p_fref[1][a->l1.i_ref] };
1234 uint8_t pix[2][8*16];
1238 h->mb.i_partition = D_8x16;
1239 a->i_cost8x16bi = 0;
1241 for( i = 0; i < 2; i++ )
1244 int i_part_cost_bi = 0;
1246 for( l = 0; l < 2; l++ )
1248 x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
1249 x264_me_t *m = &lX->me8x16[i];
1251 m->i_pixel = PIXEL_8x16;
1252 m->p_cost_mv = a->p_cost_mv;
1254 LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
1255 LOAD_HPELS( m, p_fref[l], 8*i, 0 );
/* mv candidates: the top and bottom 8x8 partitions of this column */
1257 mvc[0][0] = lX->me8x8[i].mv[0];
1258 mvc[0][1] = lX->me8x8[i].mv[1];
1259 mvc[1][0] = lX->me8x8[i+2].mv[0];
1260 mvc[1][1] = lX->me8x8[i+2].mv[1];
1262 x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
1263 x264_me_search( h, m, mvc, 2 );
/* BI prediction buffer for this list */
1266 h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
1267 m->mv[0], m->mv[1], 8, 16 );
1268 /* FIXME: ref cost */
1269 i_part_cost_bi += m->cost_mv;
1272 WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
1273 i_part_cost_bi += h->pixf.satd[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], h->mb.pic.i_stride[0], pix[0], 8 );
/* pick the cheapest of L0 / L1 / BI for this half */
1275 i_part_cost = a->l0.me8x16[i].cost;
1276 a->i_mb_partition8x16[i] = D_L0_8x8;
1277 if( a->l1.me8x16[i].cost < i_part_cost )
1279 i_part_cost = a->l1.me8x16[i].cost;
1280 a->i_mb_partition8x16[i] = D_L1_8x8;
1282 if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
1284 i_part_cost = i_part_cost_bi;
1285 a->i_mb_partition8x16[i] = D_BI_8x8;
1287 a->i_cost8x16bi += i_part_cost;
1290 x264_mb_cache_mv_b8x16( h, a, i, 0 );
/* derive the combined mb type from the two halves' list choices */
1294 a->i_mb_type8x16 = B_L0_L0
1295 + (a->i_mb_partition8x16[0]>>2) * 3
1296 + (a->i_mb_partition8x16[1]>>2);
1297 a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
1300 /*****************************************************************************
1301 * x264_macroblock_analyse:
1302 *****************************************************************************/
1303 void x264_macroblock_analyse( x264_t *h )
/* Top-level per-macroblock mode decision.
 * 1. Pick the MB QP from ratecontrol (clamped to ±12 of the last QP).
 * 2. Depending on slice type (I/P/B), run intra and/or inter analysis,
 *    compare lambda-weighted costs and choose the MB type + partition.
 * 3. Write the winning refs/MVs/prediction modes back into the MB cache
 *    so the encoding stage can consume them. */
1305 x264_mb_analysis_t analysis;
1308 h->mb.qp[h->mb.i_mb_xy] = x264_ratecontrol_qp(h);
1310 /* prevent QP from varying too fast. FIXME what's a sane limit? */
1311 h->mb.qp[h->mb.i_mb_xy] = x264_clip3( h->mb.qp[h->mb.i_mb_xy],
1312 h->mb.i_last_qp - 12, h->mb.i_last_qp + 12 );
/* Sets lambda and per-QP state used by all the cost comparisons below. */
1315 x264_mb_analyse_init( h, &analysis, h->mb.qp[h->mb.i_mb_xy] );
1317 /*--------------------------- Do the analysis ---------------------------*/
1318 if( h->sh.i_type == SLICE_TYPE_I )
/* I slice: only intra modes are available; pick 4x4 vs 16x16 by SAD. */
1320 x264_mb_analyse_intra( h, &analysis, COST_MAX );
1322 if( analysis.i_sad_i4x4 < analysis.i_sad_i16x16 )
1323 h->mb.i_type = I_4x4;
1325 h->mb.i_type = I_16x16;
1327 else if( h->sh.i_type == SLICE_TYPE_P )
1329 const unsigned int i_neighbour = h->mb.i_neighbour;
1333 int i_intra_cost, i_intra_type;
1335 /* Fast P_SKIP detection */
/* Only probe for skip when at least one available neighbour (left, top,
 * top-left, top-right) was itself a P_SKIP — skips tend to cluster. */
1336 if( ( (i_neighbour&MB_LEFT) && h->mb.type[h->mb.i_mb_xy - 1] == P_SKIP ) ||
1337 ( (i_neighbour&MB_TOP) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride] == P_SKIP ) ||
1338 ( ((i_neighbour&(MB_TOP|MB_LEFT)) == (MB_TOP|MB_LEFT) ) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride-1 ] == P_SKIP ) ||
1339 ( (i_neighbour&MB_TOPRIGHT) && h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride+1 ] == P_SKIP ) )
1341 b_skip = x264_macroblock_probe_pskip( h );
/* Skip confirmed: no further analysis needed. */
1346 h->mb.i_type = P_SKIP;
1347 h->mb.i_partition = D_16x16;
/* Not a skip: full P-frame inter (and intra) analysis. */
1351 const unsigned int flags = h->param.analyse.inter;
1355 x264_mb_analyse_load_costs( h, &analysis );
1357 x264_mb_analyse_inter_p16x16( h, &analysis );
1358 if( flags & X264_ANALYSE_PSUB16x16 )
1359 x264_mb_analyse_inter_p8x8( h, &analysis );
1361 /* Select best inter mode */
1363 i_partition = D_16x16;
1364 i_cost = analysis.l0.me16x16.cost;
1366 if( ( flags & X264_ANALYSE_PSUB16x16 ) &&
1367 analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost )
/* 8x8 beats 16x16: adopt it and, if enabled, try splitting each 8x8
 * further into 4x4 / 8x4 / 4x8 sub-partitions. */
1372 i_partition = D_8x8;
1373 h->mb.i_sub_partition[0] = D_L0_8x8;
1374 h->mb.i_sub_partition[1] = D_L0_8x8;
1375 h->mb.i_sub_partition[2] = D_L0_8x8;
1376 h->mb.i_sub_partition[3] = D_L0_8x8;
1378 i_cost = analysis.l0.i_cost8x8;
1381 if( flags & X264_ANALYSE_PSUB8x8 )
1383 for( i = 0; i < 4; i++ )
/* 8x4/4x8 are only tried when 4x4 already beat 8x8 for this block;
 * note they compare against the 4x4 cost, not the current best. */
1385 x264_mb_analyse_inter_p4x4( h, &analysis, i );
1386 if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost )
1390 h->mb.i_sub_partition[i] = D_L0_4x4;
1391 i_cost8x8 = analysis.l0.i_cost4x4[i];
1393 x264_mb_analyse_inter_p8x4( h, &analysis, i );
1394 if( analysis.l0.i_cost8x4[i] < analysis.l0.i_cost4x4[i] )
1396 h->mb.i_sub_partition[i] = D_L0_8x4;
1397 i_cost8x8 = analysis.l0.i_cost8x4[i];
1400 x264_mb_analyse_inter_p4x8( h, &analysis, i );
1401 if( analysis.l0.i_cost4x8[i] < analysis.l0.i_cost4x4[i] )
1403 h->mb.i_sub_partition[i] = D_L0_4x8;
1404 i_cost8x8 = analysis.l0.i_cost4x8[i];
/* Replace this block's 8x8 cost with the winning sub-split cost. */
1407 i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
1412 /* Now do sub 16x8/8x16 */
1413 x264_mb_analyse_inter_p16x8( h, &analysis );
1414 if( analysis.l0.i_cost16x8 < i_cost )
1417 i_partition = D_16x8;
1418 i_cost = analysis.l0.i_cost16x8;
1421 x264_mb_analyse_inter_p8x16( h, &analysis );
1422 if( analysis.l0.i_cost8x16 < i_cost )
1425 i_partition = D_8x16;
1426 i_cost = analysis.l0.i_cost8x16;
1430 h->mb.i_type = i_type;
1431 h->mb.i_partition = i_partition;
/* Refine the chosen partition's MVs to quarter-pel precision and
 * recompute i_cost from the refined costs. */
1434 if( h->mb.i_partition == D_16x16 )
1436 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1437 i_cost = analysis.l0.me16x16.cost;
1439 else if( h->mb.i_partition == D_16x8 )
1441 x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
1442 x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
1443 i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
1445 else if( h->mb.i_partition == D_8x16 )
1447 x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
1448 x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
1449 i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
1451 else if( h->mb.i_partition == D_8x8 )
1455 for( i8x8 = 0; i8x8 < 4; i8x8++ )
/* Refine each 8x8 block according to its chosen sub-partition. */
1457 switch( h->mb.i_sub_partition[i8x8] )
1460 x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
1461 i_cost += analysis.l0.me8x8[i8x8].cost;
1464 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
1465 x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
1466 i_cost += analysis.l0.me8x4[i8x8][0].cost +
1467 analysis.l0.me8x4[i8x8][1].cost;
1470 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
1471 x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
1472 i_cost += analysis.l0.me4x8[i8x8][0].cost +
1473 analysis.l0.me4x8[i8x8][1].cost;
1477 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
1478 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
1479 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
1480 x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
1481 i_cost += analysis.l0.me4x4[i8x8][0].cost +
1482 analysis.l0.me4x4[i8x8][1].cost +
1483 analysis.l0.me4x4[i8x8][2].cost +
1484 analysis.l0.me4x4[i8x8][3].cost;
1487 x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
/* Intra vs inter: search intra with the inter cost as a threshold so
 * expensive intra searches can terminate early. */
1493 x264_mb_analyse_intra( h, &analysis, i_cost );
/* With chroma-ME enabled, intra costs must also include chroma to be
 * comparable against chroma-inclusive inter costs. */
1494 if( h->mb.b_chroma_me &&
1495 ( analysis.i_sad_i16x16 < i_cost
1496 || ( analysis.i_sad_i4x4 < i_cost )))
1498 x264_mb_analyse_intra_chroma( h, &analysis );
1499 analysis.i_sad_i16x16 += analysis.i_sad_i8x8;
1500 analysis.i_sad_i4x4 += analysis.i_sad_i8x8;
1503 i_intra_type = I_16x16;
1504 i_intra_cost = analysis.i_sad_i16x16;
1506 if( analysis.i_sad_i4x4 < i_intra_cost )
1508 i_intra_type = I_4x4;
1509 i_intra_cost = analysis.i_sad_i4x4;
1512 if( i_intra_cost < i_cost )
1514 h->mb.i_type = i_intra_type;
1515 i_cost = i_intra_cost;
/* Frame-level stats; used elsewhere (e.g. scene-cut / type decisions). */
1518 h->stat.frame.i_intra_cost += i_intra_cost;
1519 h->stat.frame.i_inter_cost += i_cost;
1522 else if( h->sh.i_type == SLICE_TYPE_B )
/* B slice: direct/skip first, then list0/list1/bi 16x16, then optional
 * sub-partitions, qpel refinement, and finally intra as fallback. */
1526 analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h );
1527 if( analysis.b_direct_available )
1529 h->mb.i_type = B_SKIP;
1532 /* Conditioning the probe on neighboring block types
1533 * doesn't seem to help speed or quality. */
1534 b_skip = x264_macroblock_probe_bskip( h );
1539 const unsigned int flags = h->param.analyse.inter;
1543 x264_mb_analyse_load_costs( h, &analysis );
1545 /* select best inter mode */
1546 /* direct must be first */
1547 if( analysis.b_direct_available )
1548 x264_mb_analyse_inter_direct( h, &analysis );
1550 x264_mb_analyse_inter_b16x16( h, &analysis );
/* 16x16 contenders: L0-only, L1-only, bi-pred, direct. */
1552 h->mb.i_type = B_L0_L0;
1553 i_partition = D_16x16;
1554 i_cost = analysis.l0.me16x16.cost;
1555 if( analysis.l1.me16x16.cost < i_cost )
1557 h->mb.i_type = B_L1_L1;
1558 i_cost = analysis.l1.me16x16.cost;
1560 if( analysis.i_cost16x16bi < i_cost )
1562 h->mb.i_type = B_BI_BI;
1563 i_cost = analysis.i_cost16x16bi;
1565 if( analysis.i_cost16x16direct < i_cost )
1567 h->mb.i_type = B_DIRECT;
1568 i_cost = analysis.i_cost16x16direct;
1571 if( flags & X264_ANALYSE_BSUB16x16 )
1573 x264_mb_analyse_inter_b8x8( h, &analysis );
1574 if( analysis.i_cost8x8bi < i_cost )
1576 h->mb.i_type = B_8x8;
1577 i_partition = D_8x8;
1578 i_cost = analysis.i_cost8x8bi;
/* Heuristic: only try 16x8 when the 8x8 analysis chose matching modes
 * within a row pair, suggesting a horizontal split could win. */
1580 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[1] ||
1581 h->mb.i_sub_partition[2] == h->mb.i_sub_partition[3] )
1583 x264_mb_analyse_inter_b16x8( h, &analysis );
1584 if( analysis.i_cost16x8bi < i_cost )
1586 i_partition = D_16x8;
1587 i_cost = analysis.i_cost16x8bi;
1588 h->mb.i_type = analysis.i_mb_type16x8;
/* Same heuristic for a vertical (8x16) split: column pairs matching. */
1591 if( h->mb.i_sub_partition[0] == h->mb.i_sub_partition[2] ||
1592 h->mb.i_sub_partition[1] == h->mb.i_sub_partition[3] )
1594 x264_mb_analyse_inter_b8x16( h, &analysis );
1595 if( analysis.i_cost8x16bi < i_cost )
1597 i_partition = D_8x16;
1598 i_cost = analysis.i_cost8x16bi;
1599 h->mb.i_type = analysis.i_mb_type8x16;
1605 h->mb.i_partition = i_partition;
/* Quarter-pel refinement of the winning B partition.  The mb_type bit
 * cost is temporarily subtracted so refinement compares raw ME costs,
 * then added back into i_cost. */
1608 if( i_partition == D_16x16 )
1610 analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1611 analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1612 if( h->mb.i_type == B_L0_L0 )
1614 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1615 i_cost = analysis.l0.me16x16.cost
1616 + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
1618 else if( h->mb.i_type == B_L1_L1 )
1620 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
1621 i_cost = analysis.l1.me16x16.cost
1622 + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
1624 else if( h->mb.i_type == B_BI_BI )
/* NOTE(review): BI refines both lists but i_cost is not recomputed
 * here, unlike the L0/L1 branches — confirm intentional. */
1626 x264_me_refine_qpel( h, &analysis.l0.me16x16 );
1627 x264_me_refine_qpel( h, &analysis.l1.me16x16 );
1630 else if( i_partition == D_16x8 )
1632 for( i=0; i<2; i++ )
/* Refine only the list(s) this half actually uses (BI refines both). */
1634 if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
1635 x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
1636 if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
1637 x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
1640 else if( i_partition == D_8x16 )
1642 for( i=0; i<2; i++ )
1644 if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
1645 x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
1646 if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
1647 x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
1650 else if( i_partition == D_8x8 )
1652 for( i=0; i<4; i++ )
1655 int i_part_cost_old;
1657 int i_part_type = h->mb.i_sub_partition[i];
1658 int b_bidir = (i_part_type == D_BI_8x8);
/* Direct 8x8 blocks have no MV of their own to refine. */
1660 if( i_part_type == D_DIRECT_8x8 )
/* Refine each list used by this sub-partition, updating the running
 * 8x8 bi cost by the delta (sub-type bits excluded during refine). */
1662 if( x264_mb_partition_listX_table[0][i_part_type] )
1664 m = &analysis.l0.me8x8[i];
1665 i_part_cost_old = m->cost;
1666 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
1667 m->cost -= i_type_cost;
1668 x264_me_refine_qpel( h, m );
1670 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
1672 if( x264_mb_partition_listX_table[1][i_part_type] )
1674 m = &analysis.l1.me8x8[i];
1675 i_part_cost_old = m->cost;
1676 i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
1677 m->cost -= i_type_cost;
1678 x264_me_refine_qpel( h, m );
1680 analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
1682 /* TODO: update mvp? */
1686 /* best intra mode */
1687 x264_mb_analyse_intra( h, &analysis, i_cost );
1689 if( analysis.i_sad_i16x16 < i_cost )
1691 h->mb.i_type = I_16x16;
1692 i_cost = analysis.i_sad_i16x16;
1694 if( analysis.i_sad_i4x4 < i_cost )
1696 h->mb.i_type = I_4x4;
1697 i_cost = analysis.i_sad_i4x4;
1702 /*-------------------- Update MB from the analysis ----------------------*/
/* Commit the decision: record the MB type and fill the MB cache with the
 * chosen prediction modes / refs / MVs for the encoding stage. */
1703 h->mb.type[h->mb.i_mb_xy] = h->mb.i_type;
1704 switch( h->mb.i_type )
/* Intra 4x4: store the 16 per-block prediction modes + chroma mode. */
1707 for( i = 0; i < 16; i++ )
1709 h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] =
1710 analysis.i_predict4x4[block_idx_x[i]][block_idx_y[i]];
1713 x264_mb_analyse_intra_chroma( h, &analysis );
1714 h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
/* Intra 16x16: store the single luma mode + chroma mode. */
1717 h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16;
1719 x264_mb_analyse_intra_chroma( h, &analysis );
1720 h->mb.i_chroma_pred_mode = analysis.i_predict8x8;
/* P_L0: one L0 ref for the whole MB; MVs per chosen partition. */
1724 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1725 switch( h->mb.i_partition )
1728 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1732 x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].mv[0], analysis.l0.me16x8[0].mv[1] );
1733 x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].mv[0], analysis.l0.me16x8[1].mv[1] );
1737 x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].mv[0], analysis.l0.me8x16[0].mv[1] );
1738 x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].mv[0], analysis.l0.me8x16[1].mv[1] );
1742 x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
/* P_8x8: MVs per 8x8 block, further split by its sub-partition. */
1748 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1749 for( i = 0; i < 4; i++ )
1751 const int x = 2*(i%2);
1752 const int y = 2*(i/2);
1754 switch( h->mb.i_sub_partition[i] )
1757 x264_macroblock_cache_mv( h, x, y, 2, 2, 0, analysis.l0.me8x8[i].mv[0], analysis.l0.me8x8[i].mv[1] );
1760 x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, analysis.l0.me8x4[i][0].mv[0], analysis.l0.me8x4[i][0].mv[1] );
1761 x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, analysis.l0.me8x4[i][1].mv[0], analysis.l0.me8x4[i][1].mv[1] );
1764 x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, analysis.l0.me4x8[i][0].mv[0], analysis.l0.me4x8[i][0].mv[1] );
1765 x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, analysis.l0.me4x8[i][1].mv[0], analysis.l0.me4x8[i][1].mv[1] );
1768 x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, analysis.l0.me4x4[i][0].mv[0], analysis.l0.me4x4[i][0].mv[1] );
1769 x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, analysis.l0.me4x4[i][1].mv[0], analysis.l0.me4x4[i][1].mv[1] );
1770 x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, analysis.l0.me4x4[i][2].mv[0], analysis.l0.me4x4[i][2].mv[1] );
1771 x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, analysis.l0.me4x4[i][3].mv[0], analysis.l0.me4x4[i][3].mv[1] );
1774 x264_log( h, X264_LOG_ERROR, "internal error\n" );
/* P_SKIP: ref 0 with the pskip MV predictor, 16x16. */
1783 x264_mb_predict_mv_pskip( h, mvp );
1785 h->mb.i_partition = D_16x16;
1786 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
1787 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, mvp[0], mvp[1] );
1792 /* nothing has changed since x264_macroblock_probe_bskip */
/* B_DIRECT: load the direct-mode MVs for all four 8x8 blocks. */
1795 x264_mb_load_mv_direct8x8( h, 0 );
1796 x264_mb_load_mv_direct8x8( h, 1 );
1797 x264_mb_load_mv_direct8x8( h, 2 );
1798 x264_mb_load_mv_direct8x8( h, 3 );
/* B_8x8: re-cache the winning per-block MVs (with mvd this time). */
1802 /* optimize: cache might not need to be rewritten */
1803 for( i = 0; i < 4; i++ )
1804 x264_mb_cache_mv_b8x8( h, &analysis, i, 1 );
1807 default: /* the rest of the B types */
1808 switch( h->mb.i_partition )
1811 switch( h->mb.i_type )
/* B_L0_L0: L0 ref+MV; L1 side explicitly invalidated (ref -1, zero
 * mv/mvd) so the cache carries no stale list-1 data. */
1814 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1815 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1817 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
1818 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0, 0 );
1819 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0, 0 );
/* B_L1_L1: mirror image — invalidate list 0, fill list 1. */
1822 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
1823 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0, 0 );
1824 x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0, 0 );
1826 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
1827 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
/* B_BI_BI: both lists carry valid refs and MVs. */
1830 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.i_ref );
1831 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.mv[0], analysis.l0.me16x16.mv[1] );
1833 x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, analysis.l1.i_ref );
1834 x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, analysis.l1.me16x16.mv[0], analysis.l1.me16x16.mv[1] );
/* 16x8 / 8x16: delegate per-half caching to the shared helpers. */
1839 x264_mb_cache_mv_b16x8( h, &analysis, 0, 1 );
1840 x264_mb_cache_mv_b16x8( h, &analysis, 1, 1 );
1843 x264_mb_cache_mv_b8x16( h, &analysis, 0, 1 );
1844 x264_mb_cache_mv_b8x16( h, &analysis, 1, 1 );
1847 x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
1853 #include "slicetype_decision.c"