X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=encoder%2Fanalyse.c;h=fecdd3bd0e9ae43e53622c248b25e3330ab39cc7;hb=58d2349dd7aad34a2cf09be081670d510657eda1;hp=c1c9314a0a38d2958179e1a63e6aa5c19026a52d;hpb=a54f4f2b77c7f77cb86232a291c802c1d993f7e7;p=x264

diff --git a/encoder/analyse.c b/encoder/analyse.c
index c1c9314a..fecdd3bd 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -24,9 +24,7 @@
 #define _ISOC99_SOURCE
 #include <math.h>
-#ifndef _MSC_VER
 #include <unistd.h>
-#endif
 
 #include "common/common.h"
 #include "common/cpu.h"
@@ -39,9 +37,9 @@
 typedef struct
 {
     /* 16x16 */
-    int i_ref;
     int i_rd16x16;
     x264_me_t me16x16;
+    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */
 
     /* 8x8 */
     int i_cost8x8;
@@ -78,15 +76,15 @@ typedef struct
     int i_lambda2;
     int i_qp;
     uint16_t *p_cost_mv;
-    uint16_t *p_cost_ref0;
-    uint16_t *p_cost_ref1;
+    uint16_t *p_cost_ref[2];
     int i_mbrd;
 
     /* I: Intra part */
     /* Take some shortcuts in intra search if intra is deemed unlikely */
     int b_fast_intra;
-    int b_try_pskip;
+    int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
+    int b_try_skip;
 
     /* Luma part */
     int i_satd_i16x16;
@@ -105,7 +103,7 @@ typedef struct
 
     /* Chroma part */
     int i_satd_i8x8chroma;
-    int i_satd_i8x8chroma_dir[4];
+    int i_satd_i8x8chroma_dir[7];
     int i_predict8x8chroma;
 
     /* II: Inter part P/B frame */
@@ -134,7 +132,7 @@ typedef struct
 } x264_mb_analysis_t;
 
 /* lambda = pow(2,qp/6-2) */
-const int x264_lambda_tab[52] = {
+const uint8_t x264_lambda_tab[52] = {
    1, 1, 1, 1, 1, 1, 1, 1,  /* 0-7 */
    1, 1, 1, 1,              /* 8-11 */
    1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
@@ -156,10 +154,10 @@ const int x264_lambda2_tab[52] = {
 };
 
 const uint8_t x264_exp2_lut[64] = {
-      1,   4,   7,  10,  13,  16,  19,  22,  25,  28,  31,  34,  37,  40,  44,  47,
-     50,  53,  57,  60,  64,  67,  71,  74,  78,  81,  85,  89,  93,  96, 100, 104,
-    108, 112, 116, 120, 124, 128, 132, 137, 141, 145, 150, 154, 159, 163, 168, 172,
-    177, 182, 186, 191, 196, 201, 206, 211, 216, 221, 226, 232, 237, 242, 248, 253,
+      0,   3,   6,   8,  11,  14,  17,  20,  23,  26,  29,  32,  36,  39,  42,  45,
+     48,  52,  55,  58,  62,  65,  69,  72,  76,  80,  83,  87,  91,  94,  98, 102,
+    106, 110, 114, 118, 122, 126, 130, 135, 139, 143, 147, 152, 156, 161, 165, 170,
+    175, 179, 184, 189, 194, 198, 203, 208, 214, 219, 224, 229, 234, 240, 245, 250
 };
 
 const float x264_log2_lut[128] = {
@@ -222,50 +220,49 @@ static const uint16_t x264_chroma_lambda2_offset_tab[] = {
 };
 
 /* TODO: calculate CABAC costs */
-static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = {
+static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] = {
     9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
 };
-static const int i_mb_b16x8_cost_table[17] = {
+static const uint8_t i_mb_b16x8_cost_table[17] = {
     0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
 };
-static const int i_sub_mb_b_cost_table[13] = {
+static const uint8_t i_sub_mb_b_cost_table[13] = {
     7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
 };
-static const int i_sub_mb_p_cost_table[4] = {
+static const uint8_t i_sub_mb_p_cost_table[4] = {
     5, 3, 3, 1
 };
 
 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 
 static uint16_t x264_cost_ref[92][3][33];
-static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
+static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 
 int x264_analyse_init_costs( x264_t *h, int qp )
 {
-    int i, j;
    int lambda = x264_lambda_tab[qp];
    if( h->cost_mv[lambda] )
        return 0;
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC(
h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) ); h->cost_mv[lambda] += 2*4*2048; - for( i = 0; i <= 2*4*2048; i++ ) + for( int i = 0; i <= 2*4*2048; i++ ) { h->cost_mv[lambda][-i] = h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f; } x264_pthread_mutex_lock( &cost_ref_mutex ); - for( i = 0; i < 3; i++ ) - for( j = 0; j < 33; j++ ) + for( int i = 0; i < 3; i++ ) + for( int j = 0; j < 33; j++ ) x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0; x264_pthread_mutex_unlock( &cost_ref_mutex ); if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] ) { - for( j=0; j<4; j++ ) + for( int j = 0; j < 4; j++ ) { CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) ); h->cost_mv_fpel[lambda][j] += 2*2048; - for( i = -2*2048; i < 2*2048; i++ ) + for( int i = -2*2048; i < 2*2048; i++ ) h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j]; } } @@ -276,34 +273,54 @@ fail: void x264_analyse_free_costs( x264_t *h ) { - int i, j; - for( i = 0; i < 92; i++ ) + for( int i = 0; i < 92; i++ ) { if( h->cost_mv[i] ) x264_free( h->cost_mv[i] - 2*4*2048 ); if( h->cost_mv_fpel[i][0] ) - for( j = 0; j < 4; j++ ) + for( int j = 0; j < 4; j++ ) x264_free( h->cost_mv_fpel[i][j] - 2*2048 ); } } +void x264_analyse_weight_frame( x264_t *h, int end ) +{ + for( int j = 0; j < h->i_ref0; j++ ) + { + if( h->sh.weight[j][0].weightfn ) + { + x264_frame_t *frame = h->fref0[j]; + int width = frame->i_width[0] + 2*PADH; + int i_padv = PADV << h->param.b_interlaced; + int offset, height; + uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH; + height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted; + offset = h->fenc->i_lines_weighted*frame->i_stride[0]; + h->fenc->i_lines_weighted += height; + if( height ) + for( int k = j; k < h->i_ref0; k++ ) + if( h->sh.weight[k][0].weightfn ) + { + uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH; + x264_weight_scale_plane( h, dst + offset, frame->i_stride[0], + src + offset, frame->i_stride[0], + width, height, &h->sh.weight[k][0] ); + } + break; + } + } +} + /* initialize an array of lambda*nbits for all possible mvs */ static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) { a->p_cost_mv = h->cost_mv[a->i_lambda]; - a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; - a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; + a->p_cost_ref[0] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; + a->p_cost_ref[1] = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; } -static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) +static void x264_mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int i_qp ) { - int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B); - - /* mbrd == 1 -> RD mode decision */ - /* mbrd == 2 -> RD refinement */ - /* mbrd == 3 -> QPRD */ - a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10); - /* conduct the analysis using this lamda and QP */ a->i_qp = h->mb.i_qp = i_qp; h->mb.i_chroma_qp = h->chroma_qp_table[i_qp]; @@ -322,11 +339,18 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) h->mb.i_psy_rd_lambda = a->i_lambda; /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. 
*/ h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256; +} + +static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) +{ + int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B); + + /* mbrd == 1 -> RD mode decision */ + /* mbrd == 2 -> RD refinement */ + /* mbrd == 3 -> QPRD */ + a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10); - h->mb.i_me_method = h->param.analyse.i_me_method; - h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine; - h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P - && h->mb.i_subpel_refine >= 5; + x264_mb_analyse_init_qp( h, a, i_qp ); h->mb.b_transform_8x8 = 0; h->mb.b_noise_reduction = 0; @@ -349,7 +373,6 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) /* II: Inter part P/B frame */ if( h->sh.i_type != SLICE_TYPE_I ) { - int i, j; int i_fmv_range = 4 * h->param.analyse.i_mv_range; // limit motion search to a slightly smaller range than the theoretical limit, // since the search may go a few iterations past its given range @@ -361,32 +384,43 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) h->mb.mv_max[0] = 4*( 16*( h->sps->i_mb_width - h->mb.i_mb_x - 1 ) + 24 ); h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] ); h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] ); + if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P ) + { + int max_x = (h->fref0[0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */ + int max_mv = max_x - 4*16*h->mb.i_mb_x; + /* If we're left of the refresh bar, don't reference right of it. */ + if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col ) + h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv ); + } h->mb.mv_min_fpel[0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border; h->mb.mv_max_fpel[0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border; - if( h->mb.i_mb_x == 0) + if( h->mb.i_mb_x == 0 ) { int mb_y = h->mb.i_mb_y >> h->sh.b_mbaff; int mb_height = h->sps->i_mb_height >> h->sh.b_mbaff; int thread_mvy_range = i_fmv_range; - if( h->param.i_threads > 1 ) + if( h->i_thread_frames > 1 ) { int pix_y = (h->mb.i_mb_y | h->mb.b_interlaced) * 16; int thresh = pix_y + h->param.analyse.i_mv_range_thread; - for( i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- ) + for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- ) { x264_frame_t **fref = i ? h->fref1 : h->fref0; int i_ref = i ? 
h->i_ref1 : h->i_ref0;
-                for( j=0; j<i_ref; j++ )
+                for( int j = 0; j < i_ref; j++ )
                 {
-                    x264_frame_cond_wait( fref[j], thresh );
-                    thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->i_lines_completed - pix_y );
+                    x264_frame_cond_wait( fref[j]->orig, thresh );
+                    thread_mvy_range = X264_MIN( thread_mvy_range, fref[j]->orig->i_lines_completed - pix_y );
                 }
             }
+
             if( h->param.b_deterministic )
                 thread_mvy_range = h->param.analyse.i_mv_range_thread;
             if( h->mb.b_interlaced )
                 thread_mvy_range >>= 1;
+
+            x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
         }
 
         h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
@@ -401,31 +435,18 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
 
         a->l0.me16x16.cost =
         a->l0.i_rd16x16    =
-        a->l0.i_cost8x8    = COST_MAX;
-
-        for( i = 0; i < 4; i++ )
-        {
-            a->l0.i_cost4x4[i] =
-            a->l0.i_cost8x4[i] =
-            a->l0.i_cost4x8[i] = COST_MAX;
-        }
-
+        a->l0.i_cost8x8    = a->l0.i_cost16x8 = a->l0.i_cost8x16 = COST_MAX;
         if( h->sh.i_type == SLICE_TYPE_B )
         {
             a->l1.me16x16.cost =
             a->l1.i_rd16x16    =
-            a->l1.i_cost8x8    = COST_MAX;
-
-            for( i = 0; i < 4; i++ )
-            {
-                a->l1.i_cost4x4[i] =
-                a->l1.i_cost8x4[i] =
-                a->l1.i_cost4x8[i] =
-                a->i_cost8x8direct[i] = COST_MAX;
-            }
-
+            a->l1.i_cost8x8    =
+            a->i_cost8x8direct[0] =
+            a->i_cost8x8direct[1] =
+            a->i_cost8x8direct[2] =
+            a->i_cost8x8direct[3] =
             a->l1.i_cost16x8   =
             a->l1.i_cost8x16   =
             a->i_rd16x16bi     =
@@ -439,16 +460,25 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
             a->i_cost16x8bi    =
             a->i_cost8x16bi    = COST_MAX;
         }
+        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
+            for( int i = 0; i < 4; i++ )
+            {
+                a->l0.i_cost4x4[i] =
+                a->l0.i_cost8x4[i] =
+                a->l0.i_cost4x8[i] = COST_MAX;
+            }
 
         /* Fast intra decision */
         if( h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
-            if( IS_INTRA( h->mb.i_mb_type_left )
-                || IS_INTRA( h->mb.i_mb_type_top )
-                || IS_INTRA( h->mb.i_mb_type_topleft )
-                || IS_INTRA( h->mb.i_mb_type_topright )
-                || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] ))
-                || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
+            /* Always run in fast-intra mode for subme < 3 */
+            if( h->mb.i_subpel_refine > 2 &&
+              ( IS_INTRA( h->mb.i_mb_type_left ) ||
+                IS_INTRA( h->mb.i_mb_type_top ) ||
+                IS_INTRA( h->mb.i_mb_type_topleft ) ||
+                IS_INTRA( h->mb.i_mb_type_topright ) ||
+                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] )) ||
+                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) )
             { /* intra is likely */ }
             else
             {
@@ -456,260 +486,140 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
             }
         }
         h->mb.b_skip_mc = 0;
+        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
+            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
+        {
+            a->b_force_intra = 1;
+            a->b_fast_intra = 0;
+        }
+        else
+            a->b_force_intra = 0;
     }
 }
 
+/* Prediction modes allowed for various combinations of neighbors. */
+/* Terminated by a -1.
*/ +/* In order, no neighbors, left, top, top/left, top/left/topleft */ +static const int8_t i16x16_mode_available[5][5] = +{ + {I_PRED_16x16_DC_128, -1, -1, -1, -1}, + {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1}, + {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1}, + {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1}, + {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1}, +}; +static const int8_t i8x8chroma_mode_available[5][5] = +{ + {I_PRED_CHROMA_DC_128, -1, -1, -1, -1}, + {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1}, + {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1}, + {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1}, + {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1}, +}; -/* - * Handle intra mb - */ -/* Max = 4 */ -static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, int *pi_count ) +static const int8_t i4x4_mode_available[5][10] = { - int b_top = i_neighbour & MB_TOP; - int b_left = i_neighbour & MB_LEFT; - if( b_top && b_left ) - { - /* top and left available */ - *mode++ = I_PRED_16x16_V; - *mode++ = I_PRED_16x16_H; - *mode++ = I_PRED_16x16_DC; - *pi_count = 3; - if( i_neighbour & MB_TOPLEFT ) - { - /* top left available*/ - *mode++ = I_PRED_16x16_P; - *pi_count = 4; - } - } - else if( b_left ) - { - /* left available*/ - *mode++ = I_PRED_16x16_DC_LEFT; - *mode++ = I_PRED_16x16_H; - *pi_count = 2; - } - else if( b_top ) - { - /* top available*/ - *mode++ = I_PRED_16x16_DC_TOP; - *mode++ = I_PRED_16x16_V; - *pi_count = 2; - } - else - { - /* none available */ - *mode = I_PRED_16x16_DC_128; - *pi_count = 1; - } + {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1}, + {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1}, + {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1}, + {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1}, +}; + +static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour ) +{ + int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); + return i16x16_mode_available[(idx&MB_TOPLEFT)?4:idx]; } -/* Max = 4 */ -static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count ) +static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour ) { - int b_top = i_neighbour & MB_TOP; - int b_left = i_neighbour & MB_LEFT; - if( b_top && b_left ) - { - /* top and left available */ - *mode++ = I_PRED_CHROMA_V; - *mode++ = I_PRED_CHROMA_H; - *mode++ = I_PRED_CHROMA_DC; - *pi_count = 3; - if( i_neighbour & MB_TOPLEFT ) - { - /* top left available */ - *mode++ = I_PRED_CHROMA_P; - *pi_count = 4; - } - } - else if( b_left ) - { - /* left available*/ - *mode++ = I_PRED_CHROMA_DC_LEFT; - *mode++ = I_PRED_CHROMA_H; - *pi_count = 2; - } - else if( b_top ) - { - /* top available*/ - *mode++ = I_PRED_CHROMA_DC_TOP; - *mode++ = I_PRED_CHROMA_V; - *pi_count = 2; - } - else - { - /* none available */ - *mode = I_PRED_CHROMA_DC_128; - *pi_count = 1; - } + int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); + return i8x8chroma_mode_available[(idx&MB_TOPLEFT)?4:idx]; } -/* MAX = 9 */ -static void predict_4x4_mode_available( unsigned int i_neighbour, - int *mode, int *pi_count ) +static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int 
i_neighbour ) { - int b_top = i_neighbour & MB_TOP; - int b_left = i_neighbour & MB_LEFT; - if( b_top && b_left ) - { - *pi_count = 6; - *mode++ = I_PRED_4x4_DC; - *mode++ = I_PRED_4x4_H; - *mode++ = I_PRED_4x4_V; - *mode++ = I_PRED_4x4_DDL; - if( i_neighbour & MB_TOPLEFT ) - { - *mode++ = I_PRED_4x4_DDR; - *mode++ = I_PRED_4x4_VR; - *mode++ = I_PRED_4x4_HD; - *pi_count += 3; - } - *mode++ = I_PRED_4x4_VL; - *mode++ = I_PRED_4x4_HU; - } - else if( b_left ) - { - *mode++ = I_PRED_4x4_DC_LEFT; - *mode++ = I_PRED_4x4_H; - *mode++ = I_PRED_4x4_HU; - *pi_count = 3; - } - else if( b_top ) - { - *mode++ = I_PRED_4x4_DC_TOP; - *mode++ = I_PRED_4x4_V; - *mode++ = I_PRED_4x4_DDL; - *mode++ = I_PRED_4x4_VL; - *pi_count = 4; - } - else - { - *mode++ = I_PRED_4x4_DC_128; - *pi_count = 1; - } + int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); + return i4x4_mode_available[(idx&MB_TOPLEFT)?4:idx]; } /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */ static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct ) { - ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] ); - ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] ); ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0}; - int i; if( do_both_dct || h->mb.b_transform_8x8 ) - { - h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], zero ); - for( i = 0; i < 4; i++ ) - h->zigzagf.scan_8x8( h->mb.pic.fenc_dct8[i], dct8x8[i] ); - } + h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero ); if( do_both_dct || !h->mb.b_transform_8x8 ) - { - h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], zero ); - for( i = 0; i < 16; i++ ) - h->zigzagf.scan_4x4( h->mb.pic.fenc_dct4[i], dct4x4[i] ); - } + h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero ); } -/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */ -static inline void x264_mb_cache_fenc_satd( x264_t *h ) +/* Reset fenc satd scores cache for psy RD */ +static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd ) { - ALIGNED_16( static uint8_t zero[16] ) = {0}; - uint8_t *fenc; - int x, y, satd_sum = 0, sa8d_sum = 0; if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis ) x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 ); if( !h->mb.i_psy_rd ) return; - for( y = 0; y < 4; y++ ) - for( x = 0; x < 4; x++ ) - { - fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE; - h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE ) - - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1); - satd_sum += h->mb.pic.fenc_satd[y][x]; - } - for( y = 0; y < 2; y++ ) - for( x = 0; x < 2; x++ ) - { - fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE; - h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE ) - - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2); - sa8d_sum += h->mb.pic.fenc_sa8d[y][x]; - } - h->mb.pic.fenc_satd_sum = satd_sum; - h->mb.pic.fenc_sa8d_sum = sa8d_sum; + /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. 
*/
+    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
+    if( b_satd )
+        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
 }
 
 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
 {
-    int i;
-
-    int i_max;
-    int predict_mode[4];
     int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
-    uint8_t *p_dstc[2], *p_srcc[2];
-
     if( a->i_satd_i8x8chroma < COST_MAX )
         return;
 
-    /* 8x8 prediction selection for chroma */
-    p_dstc[0] = h->mb.pic.p_fdec[1];
-    p_dstc[1] = h->mb.pic.p_fdec[2];
-    p_srcc[0] = h->mb.pic.p_fenc[1];
-    p_srcc[1] = h->mb.pic.p_fenc[2];
+    const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
 
-    predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
-    a->i_satd_i8x8chroma = COST_MAX;
-    if( i_max == 4 && b_merged_satd )
+    /* 8x8 prediction selection for chroma */
+    if( predict_mode[3] >= 0 && b_merged_satd )
     {
         int satdu[4], satdv[4];
-        h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
-        h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
-        h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
-        h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
-        satdu[I_PRED_CHROMA_P] =
-            h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
-        satdv[I_PRED_CHROMA_P] =
-            h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
-
-        for( i=0; i<4; i++ )
+        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
+        h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
+        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
+        h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
+        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
+        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+
+        for( ; *predict_mode >= 0; predict_mode++ )
         {
-            int i_mode = predict_mode[i];
-            int i_satd = satdu[i_mode] + satdv[i_mode]
-                       + a->i_lambda * bs_size_ue(i_mode);
+            int i_mode = *predict_mode;
+            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
 
-            a->i_satd_i8x8chroma_dir[i] = i_satd;
+            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
         }
     }
     else
     {
-        for( i=0; i<i_max; i++ )
+        for( ; *predict_mode >= 0; predict_mode++ )
        {
             int i_satd;
-            int i_mode = predict_mode[i];
+            int i_mode = *predict_mode;
 
             /* we do the prediction */
             if( h->mb.b_lossless )
                 x264_predict_lossless_8x8_chroma( h, i_mode );
             else
             {
-                h->predict_8x8c[i_mode]( p_dstc[0] );
-                h->predict_8x8c[i_mode]( p_dstc[1] );
+                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
+                h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
             }
 
             /* we calculate the cost */
-            i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
-                                               p_srcc[0], FENC_STRIDE ) +
-                     h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
-                                               p_srcc[1], FENC_STRIDE ) +
+            i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
+                     h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
                      a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
 
-            a->i_satd_i8x8chroma_dir[i] = i_satd;
+            a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
             COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
         }
     }
@@ -723,23 +633,21 @@ static void x264_mb_analyse_intra( x264_t *h,
x264_mb_analysis_t *a, int i_satd_ uint8_t *p_src = h->mb.pic.p_fenc[0]; uint8_t *p_dst = h->mb.pic.p_fdec[0]; - int i, idx; - int i_max; - int predict_mode[9]; + int idx; int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless; /*---------------- Try all mode and calculate their score ---------------*/ /* 16x16 prediction selection */ - predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); + const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra ); - if( b_merged_satd && i_max == 4 ) + if( b_merged_satd && predict_mode[3] >= 0 ) { h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir ); h->predict_16x16[I_PRED_16x16_P]( p_dst ); a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ); - for( i=0; i<4; i++ ) + for( int i = 0; i < 4; i++ ) { int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i); COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i ); @@ -747,10 +655,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ } else { - for( i = 0; i < i_max; i++ ) + for( ; *predict_mode >= 0; predict_mode++ ) { int i_satd; - int i_mode = predict_mode[i]; + int i_mode = *predict_mode; if( h->mb.b_lossless ) x264_predict_lossless_16x16( h, i_mode ); @@ -767,7 +675,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( h->sh.i_type == SLICE_TYPE_B ) /* cavlc mb type prefix */ a->i_satd_i16x16 += a->i_lambda * i_mb_b_cost_table[I_16x16]; - if( a->b_fast_intra && a->i_satd_i16x16 > 2*i_satd_inter ) + + /* Not heavily tuned */ + const uint8_t i16x16_thresh[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 }; + if( a->b_fast_intra && a->i_satd_i16x16 > (i16x16_thresh[h->mb.i_subpel_refine]*i_satd_inter)>>1 ) return; /* 8x8 prediction selection */ @@ -776,11 +687,12 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ ALIGNED_ARRAY_16( uint8_t, edge,[33] ); x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8]; int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 ); - int i_cost = 0; + + // FIXME some bias like in i4x4? + int i_cost = a->i_lambda * 4; /* base predmode costs */ h->mb.i_cbp_luma = 0; b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless; - // FIXME some bias like in i4x4? 
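/* Illustrative sketch (not part of this patch; intra_mode_cost is a
 * hypothetical helper, not an x264 function). The 4*lambda base cost and the
 * 3*lambda rebate used in the loops below mirror H.264 intra mode signalling:
 * a 4x4/8x8 luma mode costs 1 bit when it equals the predicted mode
 * (prev_intra_pred_mode_flag) and 1+3 bits otherwise (flag + rem_intra_pred_mode):
 *
 *     static inline int intra_mode_cost( int lambda, int mode, int pred )
 *     {
 *         return lambda * (mode == pred ? 1 : 4);
 *     }
 *
 * Folding the constant 4*lambda into the block's base cost and subtracting
 * 3*lambda for the predicted mode, as this patch does, is equivalent. */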
 if( h->sh.i_type == SLICE_TYPE_B )
     i_cost += a->i_lambda * i_mb_b_cost_table[I_8x8];
@@ -793,41 +705,40 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             int i_best = COST_MAX;
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
 
-            predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+            predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] );
             h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
 
-            if( b_merged_satd && i_max == 9 )
+            if( b_merged_satd && predict_mode[8] >= 0 )
             {
                 int satd[9];
                 h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                 satd[i_pred_mode] -= 3 * a->i_lambda;
-                for( i=2; i>=0; i-- )
                 {
-                    int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
+                for( int i = 2; i >= 0; i-- )
+                {
+                    int cost = a->i_satd_i8x8_dir[i][idx] = satd[i];
                     COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                 }
-                i = 3;
+                predict_mode += 3;
             }
-            else
-                i = 0;
-            for( ; i<i_max; i++ )
+            for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
             {
                 int i_satd;
-                int i_mode = predict_mode[i];
+                int i_mode = *predict_mode;
 
                 if( h->mb.b_lossless )
                     x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge );
                 else
                     h->predict_8x8[i_mode]( p_dst_by, edge );
 
-                i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
-                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
+                i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
+                if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
+                    i_satd -= 3 * a->i_lambda;
 
                 COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
-                a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
+                a->i_satd_i8x8_dir[i_mode][idx] = i_satd + 4 * a->i_lambda;
             }
-            i_cost += i_best;
+            i_cost += i_best + 3 * a->i_lambda;
 
             if( idx == 3 || i_cost > i_satd_thresh )
                 break;
@@ -845,10 +756,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
         if( h->mb.i_skip_intra )
         {
             h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
-            h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
-            h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
-            h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
-            h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+            h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+            h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+            h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+            h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
             h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
             if( h->mb.i_skip_intra == 2 )
                 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
@@ -860,21 +771,22 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             a->i_satd_i8x8 = COST_MAX;
             i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
         }
-        if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 )
+        /* Not heavily tuned */
+        const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
+        if( X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
             return;
     }
 
     /* 4x4 prediction selection */
     if( flags & X264_ANALYSE_I4x4 )
     {
-        int i_cost;
+        int i_cost = a->i_lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
        int i_satd_thresh = X264_MIN3( i_satd_inter,
a->i_satd_i16x16, a->i_satd_i8x8 );
         h->mb.i_cbp_luma = 0;
         b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
         if( a->i_mbrd )
             i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
-        i_cost = a->i_lambda * 24; /* from JVT (SATD0) */
         if( h->sh.i_type == SLICE_TYPE_B )
             i_cost += a->i_lambda * i_mb_b_cost_table[I_4x4];
@@ -885,41 +797,50 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
             int i_best = COST_MAX;
             int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
 
-            predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max );
+            predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
-                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 
-            if( b_merged_satd && i_max >= 6 )
+            if( b_merged_satd && predict_mode[5] >= 0 )
             {
                 int satd[9];
                 h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
                 satd[i_pred_mode] -= 3 * a->i_lambda;
-                for( i=2; i>=0; i-- )
-                    COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
-                                 a->i_predict4x4[idx], i );
-                i = 3;
+                for( int i = 2; i >= 0; i-- )
+                    COPY2_IF_LT( i_best, satd[i], a->i_predict4x4[idx], i );
+                predict_mode += 3;
             }
-            else
-                i = 0;
-            for( ; i<i_max; i++ )
+            if( i_best > 0 )
             {
-                int i_satd;
-                int i_mode = predict_mode[i];
-                if( h->mb.b_lossless )
-                    x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode );
-                else
-                    h->predict_4x4[i_mode]( p_dst_by );
+                for( ; *predict_mode >= 0; predict_mode++ )
+                {
+                    int i_satd;
+                    int i_mode = *predict_mode;
 
-                i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
-                                                   p_src_by, FENC_STRIDE )
-                       + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ?
1 : 4); + if( h->mb.b_lossless ) + x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode ); + else + h->predict_4x4[i_mode]( p_dst_by ); - COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode ); + i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ); + if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) + { + i_satd -= a->i_lambda * 3; + if( i_satd <= 0 ) + { + i_best = i_satd; + a->i_predict4x4[idx] = i_mode; + break; + } + } + + COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode ); + } } - i_cost += i_best; + i_cost += i_best + 3 * a->i_lambda; if( i_cost > i_satd_thresh || idx == 15 ) break; @@ -936,10 +857,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); - h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]]; - h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]]; - h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]]; - h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]]; + h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ); + h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ); + h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ); + h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ); h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma; if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) ); @@ -985,21 +906,19 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) { uint8_t *p_dst = h->mb.pic.p_fdec[0]; - int i, j, idx, x, y; - int i_max, i_mode, i_thresh; + int x, y; uint64_t i_satd, i_best; - int predict_mode[9]; h->mb.i_skip_intra = 0; if( h->mb.i_type == I_16x16 ) { int old_pred_mode = a->i_predict16x16; - i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8; + const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra ); + int i_thresh = a->i_satd_i16x16_dir[old_pred_mode] * 9/8; i_best = a->i_satd_i16x16; - predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); - for( i = 0; i < i_max; i++ ) + for( ; *predict_mode >= 0; predict_mode++ ) { - int i_mode = predict_mode[i]; + int i_mode = *predict_mode; if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh ) continue; h->mb.i_intra16x16_pred_mode = i_mode; @@ -1009,18 +928,19 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) } /* RD selection for chroma prediction */ - predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); - if( i_max > 1 ) + const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra ); + if( predict_mode[1] >= 0 ) { - i_thresh = a->i_satd_i8x8chroma * 5/4; + int8_t predict_mode_sorted[4]; + int i_max; + int i_thresh = a->i_satd_i8x8chroma * 5/4; - for( i = j = 0; i < i_max; i++ ) - if( a->i_satd_i8x8chroma_dir[i] < i_thresh && - predict_mode[i] != a->i_predict8x8chroma ) - { - predict_mode[j++] = predict_mode[i]; - } - i_max = j; + for( i_max = 0; *predict_mode >= 0; predict_mode++ ) + { + int i_mode = *predict_mode; + if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma ) + predict_mode_sorted[i_max++] = i_mode; + } if( i_max > 0 ) { @@ -1030,9 +950,9 @@ static void 
x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) * coefs for the current chroma mode are still around, so we only * have to recount the bits. */ i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 ); - for( i = 0; i < i_max; i++ ) + for( int i = 0; i < i_max; i++ ) { - i_mode = predict_mode[i]; + int i_mode = predict_mode_sorted[i]; if( h->mb.b_lossless ) x264_predict_lossless_8x8_chroma( h, i_mode ); else @@ -1055,20 +975,20 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) { uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning int i_nnz = 0; - for( idx = 0; idx < 16; idx++ ) + for( int idx = 0; idx < 16; idx++ ) { uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx]; i_best = COST_MAX64; - predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max ); + predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] ); if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP ) /* emulate missing topright samples */ - *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U; + M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U; - for( i = 0; i < i_max; i++ ) + for( ; *predict_mode >= 0; predict_mode++ ) { - i_mode = predict_mode[i]; + int i_mode = *predict_mode; if( h->mb.b_lossless ) x264_predict_lossless_4x4( h, p_dst_by, idx, i_mode ); else @@ -1079,18 +999,18 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) { a->i_predict4x4[idx] = i_mode; i_best = i_satd; - pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE); - pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE); - pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE); - pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE); + pels[0] = M32( p_dst_by+0*FDEC_STRIDE ); + pels[1] = M32( p_dst_by+1*FDEC_STRIDE ); + pels[2] = M32( p_dst_by+2*FDEC_STRIDE ); + pels[3] = M32( p_dst_by+3*FDEC_STRIDE ); i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]]; } } - *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0]; - *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1]; - *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2]; - *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3]; + M32( p_dst_by+0*FDEC_STRIDE ) = pels[0]; + M32( p_dst_by+1*FDEC_STRIDE ) = pels[1]; + M32( p_dst_by+2*FDEC_STRIDE ) = pels[2]; + M32( p_dst_by+3*FDEC_STRIDE ) = pels[3]; h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz; h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx]; @@ -1099,29 +1019,29 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) else if( h->mb.i_type == I_8x8 ) { ALIGNED_ARRAY_16( uint8_t, edge,[33] ); - for( idx = 0; idx < 4; idx++ ) + for( int idx = 0; idx < 4; idx++ ) { uint64_t pels_h = 0; uint8_t pels_v[7]; - uint16_t i_nnz[2]; + uint16_t i_nnz[2] = {0}; //shut up gcc uint8_t *p_dst_by; - int j; int cbp_luma_new = 0; - i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8; + int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8; i_best = COST_MAX64; x = idx&1; y = idx>>1; p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE; - predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max ); + predict_mode = predict_4x4_mode_available( h->mb.i_neighbour8[idx] ); h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); - for( i = 0; i < i_max; i++ ) + for( ; *predict_mode >= 0; predict_mode++ ) { - i_mode = predict_mode[i]; + int i_mode = *predict_mode; if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh ) continue; + if( h->mb.b_lossless 
) x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge ); else @@ -1135,21 +1055,21 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) cbp_luma_new = h->mb.i_cbp_luma; i_best = i_satd; - pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE); + pels_h = M64( p_dst_by+7*FDEC_STRIDE ); if( !(idx&1) ) - for( j=0; j<7; j++ ) + for( int j = 0; j < 7; j++ ) pels_v[j] = p_dst_by[7+j*FDEC_STRIDE]; - i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]]; - i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]]; + i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ); + i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ); } } a->i_cbp_i8x8_luma = cbp_luma_new; - *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h; + M64( p_dst_by+7*FDEC_STRIDE ) = pels_h; if( !(idx&1) ) - for( j=0; j<7; j++ ) + for( int j = 0; j < 7; j++ ) p_dst_by[7+j*FDEC_STRIDE] = pels_v[j]; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0]; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1]; + M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0]; + M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1]; x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] ); } @@ -1157,6 +1077,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) } #define LOAD_FENC( m, src, xoff, yoff) \ + (m)->p_cost_mv = a->p_cost_mv; \ (m)->i_stride[0] = h->mb.pic.i_stride[0]; \ (m)->i_stride[1] = h->mb.pic.i_stride[1]; \ (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \ @@ -1164,48 +1085,66 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) (m)->p_fenc[2] = &(src)[2][((xoff)>>1)+((yoff)>>1)*FENC_STRIDE]; #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \ - (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \ + (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \ - (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; + (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \ + (m)->weight = weight_none; \ + (m)->i_ref = ref; + +#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \ + (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \ + (m)->weight = h->sh.weight[i_ref]; #define REF_COST(list, ref) \ - (a->p_cost_ref##list[ref]) + (a->p_cost_ref[list][ref]) static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) { x264_me_t m; - int i_ref, i_mvc; + int i_mvc; ALIGNED_4( int16_t mvc[8][2] ); int i_halfpel_thresh = INT_MAX; int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? 
&i_halfpel_thresh : NULL; /* 16x16 Search on all ref frame */ m.i_pixel = PIXEL_16x16; - m.p_cost_mv = a->p_cost_mv; LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 ); a->l0.me16x16.cost = INT_MAX; - for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ ) + for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ ) { - const int i_ref_cost = REF_COST( 0, i_ref ); - i_halfpel_thresh -= i_ref_cost; - m.i_ref_cost = i_ref_cost; - m.i_ref = i_ref; + m.i_ref_cost = REF_COST( 0, i_ref ); + i_halfpel_thresh -= m.i_ref_cost; /* search with ref */ LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 ); + LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 ); + x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp ); - x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); - x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh ); + + if( h->mb.ref_blind_dupe == i_ref ) + { + CP32( m.mv, a->l0.mvc[0][0] ); + x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh ); + } + else + { + x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); + x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh ); + } + + /* save mv for predicting neighbors */ + CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv ); + CP32( a->l0.mvc[i_ref][0], m.mv ); /* early termination * SSD threshold would probably be better than SATD */ if( i_ref == 0 - && a->b_try_pskip + && a->b_try_skip && m.cost-m.cost_mv < 300*a->i_lambda && abs(m.mv[0]-h->mb.cache.pskip_mv[0]) + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1 @@ -1213,29 +1152,25 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) { h->mb.i_type = P_SKIP; x264_analyse_update_cache( h, a ); - assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 ); + assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 ); return; } - m.cost += i_ref_cost; - i_halfpel_thresh += i_ref_cost; + m.cost += m.i_ref_cost; + i_halfpel_thresh += m.i_ref_cost; if( m.cost < a->l0.me16x16.cost ) h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) ); - - /* save mv for predicting neighbors */ - *(uint32_t*)a->l0.mvc[i_ref][0] = - *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv; } x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref ); - assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 ); + assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 ); h->mb.i_type = P_L0; if( a->i_mbrd ) { - x264_mb_cache_fenc_satd( h ); - if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv ) + x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 ); + if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra ) { h->mb.i_partition = D_16x16; x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); @@ -1249,67 +1184,81 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a ) { x264_me_t m; - int i_ref; uint8_t **p_fenc = h->mb.pic.p_fenc; - int i_halfpel_thresh = INT_MAX; - int *p_halfpel_thresh = /*h->mb.pic.i_fref[0]>1 ? 
&i_halfpel_thresh : */NULL; - int i; int i_maxref = h->mb.pic.i_fref[0]-1; h->mb.i_partition = D_8x8; + #define CHECK_NEIGHBOUR(i)\ + {\ + int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\ + if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\ + i_maxref = ref;\ + } + /* early termination: if 16x16 chose ref 0, then evalute no refs older * than those used by the neighbors */ - if( i_maxref > 0 && a->l0.me16x16.i_ref == 0 && - h->mb.i_mb_type_top && h->mb.i_mb_type_left ) + if( i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) && + h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 ) { i_maxref = 0; - i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 - 1 ] ); - i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 0 ] ); - i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 2 ] ); - i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 - 8 + 4 ] ); - i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 0 - 1 ] ); - i_maxref = X264_MAX( i_maxref, h->mb.cache.ref[0][ X264_SCAN8_0 + 2*8 - 1 ] ); + CHECK_NEIGHBOUR( -8 - 1 ); + CHECK_NEIGHBOUR( -8 + 0 ); + CHECK_NEIGHBOUR( -8 + 2 ); + CHECK_NEIGHBOUR( -8 + 4 ); + CHECK_NEIGHBOUR( 0 - 1 ); + CHECK_NEIGHBOUR( 2*8 - 1 ); } + #undef CHECK_NEIGHBOUR - for( i_ref = 0; i_ref <= i_maxref; i_ref++ ) - *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy]; + for( int i_ref = 0; i_ref <= i_maxref; i_ref++ ) + CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] ); - for( i = 0; i < 4; i++ ) + for( int i = 0; i < 4; i++ ) { x264_me_t *l0m = &a->l0.me8x8[i]; const int x8 = i%2; const int y8 = i/2; m.i_pixel = PIXEL_8x8; - m.p_cost_mv = a->p_cost_mv; LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 ); l0m->cost = INT_MAX; - for( i_ref = 0; i_ref <= i_maxref; i_ref++ ) + for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; ) { - const int i_ref_cost = REF_COST( 0, i_ref ); - i_halfpel_thresh -= i_ref_cost; - m.i_ref_cost = i_ref_cost; - m.i_ref = i_ref; + m.i_ref_cost = REF_COST( 0, i_ref ); LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 ); + LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 ); + x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref ); x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); - x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh ); + if( h->mb.ref_blind_dupe == i_ref ) + { + CP32( m.mv, a->l0.mvc[0][i+1] ); + x264_me_refine_qpel_refdupe( h, &m, NULL ); + } + else + x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 ); + + m.cost += m.i_ref_cost; - m.cost += i_ref_cost; - i_halfpel_thresh += i_ref_cost; - *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv; + CP32( a->l0.mvc[i_ref][i+1], m.mv ); if( m.cost < l0m->cost ) h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); + if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe ) + i_ref = h->mb.ref_blind_dupe; + else + i_ref++; } x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv ); x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref ); - /* mb type cost */ - l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8]; + /* If CABAC is on and we're not doing sub-8x8 analysis, the costs + are effectively zero. 
*/ + if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) ) + l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8]; } a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost + @@ -1324,44 +1273,46 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a ) { - const int i_ref = a->l0.me16x16.i_ref; + /* Duplicate refs are rarely useful in p8x8 due to the high cost of the + * reference frame flags. Thus, if we're not doing mixedrefs, just + * don't bother analysing the dupes. */ + const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref; const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0; - uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref]; uint8_t **p_fenc = h->mb.pic.p_fenc; int i_mvc; int16_t (*mvc)[2] = a->l0.mvc[i_ref]; - int i; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; i_mvc = 1; - *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv; + CP32( mvc[0], a->l0.me16x16.mv ); - for( i = 0; i < 4; i++ ) + for( int i = 0; i < 4; i++ ) { x264_me_t *m = &a->l0.me8x8[i]; const int x8 = i%2; const int y8 = i/2; m->i_pixel = PIXEL_8x8; - m->p_cost_mv = a->p_cost_mv; m->i_ref_cost = i_ref_cost; - m->i_ref = i_ref; LOAD_FENC( m, p_fenc, 8*x8, 8*y8 ); - LOAD_HPELS( m, p_fref, 0, i_ref, 8*x8, 8*y8 ); + LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 ); + LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 ); + x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp ); x264_me_search( h, m, mvc, i_mvc ); x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv ); - *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv; + CP32( mvc[i_mvc], m->mv ); i_mvc++; /* mb type cost */ m->cost += i_ref_cost; - m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8]; + if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) ) + m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8]; } a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost + @@ -1379,40 +1330,47 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a ) x264_me_t m; uint8_t **p_fenc = h->mb.pic.p_fenc; ALIGNED_4( int16_t mvc[3][2] ); - int i, j; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_16x8; - for( i = 0; i < 2; i++ ) + for( int i = 0; i < 2; i++ ) { x264_me_t *l0m = &a->l0.me16x8[i]; - const int ref8[2] = { a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref }; + const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref ); + const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref ); + const int ref8[2] = { minref, maxref }; const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2; m.i_pixel = PIXEL_16x8; - m.p_cost_mv = a->p_cost_mv; LOAD_FENC( &m, p_fenc, 0, 8*i ); l0m->cost = INT_MAX; - for( j = 0; j < i_ref8s; j++ ) + for( int j = 0; j < i_ref8s; j++ ) { const int i_ref = ref8[j]; - const int i_ref_cost = REF_COST( 0, i_ref ); - m.i_ref_cost = i_ref_cost; - m.i_ref = i_ref; + m.i_ref_cost = REF_COST( 0, i_ref ); /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... 
*/ - *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0]; - *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1]; - *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2]; + CP32( mvc[0], a->l0.mvc[i_ref][0] ); + CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] ); + CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] ); LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i ); + LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i ); + x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref ); x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp ); - x264_me_search( h, &m, mvc, 3 ); + /* We can only take this shortcut if the first search was performed on ref0. */ + if( h->mb.ref_blind_dupe == i_ref && !ref8[0] ) + { + /* We can just leave the MV from the previous ref search. */ + x264_me_refine_qpel_refdupe( h, &m, NULL ); + } + else + x264_me_search( h, &m, mvc, 3 ); - m.cost += i_ref_cost; + m.cost += m.i_ref_cost; if( m.cost < l0m->cost ) h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); @@ -1429,39 +1387,46 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a ) x264_me_t m; uint8_t **p_fenc = h->mb.pic.p_fenc; ALIGNED_4( int16_t mvc[3][2] ); - int i, j; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x16; - for( i = 0; i < 2; i++ ) + for( int i = 0; i < 2; i++ ) { x264_me_t *l0m = &a->l0.me8x16[i]; - const int ref8[2] = { a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref }; + const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref ); + const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref ); + const int ref8[2] = { minref, maxref }; const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2; m.i_pixel = PIXEL_8x16; - m.p_cost_mv = a->p_cost_mv; LOAD_FENC( &m, p_fenc, 8*i, 0 ); l0m->cost = INT_MAX; - for( j = 0; j < i_ref8s; j++ ) + for( int j = 0; j < i_ref8s; j++ ) { const int i_ref = ref8[j]; - const int i_ref_cost = REF_COST( 0, i_ref ); - m.i_ref_cost = i_ref_cost; - m.i_ref = i_ref; + m.i_ref_cost = REF_COST( 0, i_ref ); - *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0]; - *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1]; - *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3]; + CP32( mvc[0], a->l0.mvc[i_ref][0] ); + CP32( mvc[1], a->l0.mvc[i_ref][i+1] ); + CP32( mvc[2], a->l0.mvc[i_ref][i+3] ); LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 ); + LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 ); + x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref ); x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); - x264_me_search( h, &m, mvc, 3 ); + /* We can only take this shortcut if the first search was performed on ref0. */ + if( h->mb.ref_blind_dupe == i_ref && !ref8[0] ) + { + /* We can just leave the MV from the previous ref search. */ + x264_me_refine_qpel_refdupe( h, &m, NULL ); + } + else + x264_me_search( h, &m, mvc, 3 ); - m.cost += i_ref_cost; + m.cost += m.i_ref_cost; if( m.cost < l0m->cost ) h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); @@ -1475,32 +1440,43 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a ) static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel ) { - ALIGNED_8( uint8_t pix1[16*8] ); + ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] ); uint8_t *pix2 = pix1+8; const int i_stride = h->mb.pic.i_stride[1]; const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride; const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE; + const int i_ref = a->l0.me8x8[i8x8].i_ref; + const int mvy_offset = h->mb.b_interlaced & i_ref ? 
(h->mb.i_mb_y & 1)*4 - 2 : 0; + x264_weight_t *weight = h->sh.weight[i_ref]; #define CHROMA4x4MC( width, height, me, x, y ) \ - h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \ - h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); + h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ + if( weight[1].weightfn ) \ + weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \ + h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ + if( weight[2].weightfn ) \ + weight[1].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); + if( pixel == PIXEL_4x4 ) { - CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][0], 0,0 ); - CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][1], 2,0 ); - CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][2], 0,2 ); - CHROMA4x4MC( 2,2, a->l0.me4x4[i8x8][3], 2,2 ); + x264_me_t *m = a->l0.me4x4[i8x8]; + CHROMA4x4MC( 2,2, m[0], 0,0 ); + CHROMA4x4MC( 2,2, m[1], 2,0 ); + CHROMA4x4MC( 2,2, m[2], 0,2 ); + CHROMA4x4MC( 2,2, m[3], 2,2 ); } else if( pixel == PIXEL_8x4 ) { - CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][0], 0,0 ); - CHROMA4x4MC( 4,2, a->l0.me8x4[i8x8][1], 0,2 ); + x264_me_t *m = a->l0.me8x4[i8x8]; + CHROMA4x4MC( 4,2, m[0], 0,0 ); + CHROMA4x4MC( 4,2, m[1], 0,2 ); } else { - CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][0], 0,0 ); - CHROMA4x4MC( 2,4, a->l0.me4x8[i8x8][1], 2,0 ); + x264_me_t *m = a->l0.me4x8[i8x8]; + CHROMA4x4MC( 2,4, m[0], 0,0 ); + CHROMA4x4MC( 2,4, m[1], 2,0 ); } return h->pixf.mbcmp[PIXEL_4x4]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 ) @@ -1512,12 +1488,11 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref]; uint8_t **p_fenc = h->mb.pic.p_fenc; const int i_ref = a->l0.me8x8[i8x8].i_ref; - int i4x4; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; - for( i4x4 = 0; i4x4 < 4; i4x4++ ) + for( int i4x4 = 0; i4x4 < 4; i4x4++ ) { const int idx = 4*i8x8 + i4x4; const int x4 = block_idx_x[idx]; @@ -1527,10 +1502,10 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8 x264_me_t *m = &a->l0.me4x4[i8x8][i4x4]; m->i_pixel = PIXEL_4x4; - m->p_cost_mv = a->p_cost_mv; LOAD_FENC( m, p_fenc, 4*x4, 4*y4 ); LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 ); + LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 ); x264_mb_predict_mv( h, 0, idx, 1, m->mvp ); x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc ); @@ -1552,12 +1527,11 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref]; uint8_t **p_fenc = h->mb.pic.p_fenc; const int i_ref = a->l0.me8x8[i8x8].i_ref; - int i8x4; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; - for( i8x4 = 0; i8x4 < 2; i8x4++ ) + for( int i8x4 = 0; i8x4 < 2; i8x4++ ) { const int idx = 4*i8x8 + 2*i8x4; const int x4 = block_idx_x[idx]; @@ -1567,10 +1541,10 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8 x264_me_t *m = &a->l0.me8x4[i8x8][i8x4]; m->i_pixel = PIXEL_8x4; - m->p_cost_mv = a->p_cost_mv; LOAD_FENC( m, p_fenc, 4*x4, 4*y4 ); LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 ); + LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 ); 
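/* Illustrative sketch (not part of this patch; weight_sample is a
 * hypothetical helper, not an x264 function). The weighted plane selected by
 * LOAD_WPELS holds reference samples pre-scaled by H.264 explicit weighted
 * prediction, conceptually per sample (assuming denom >= 1):
 *
 *     static inline int weight_sample( int pix, int scale, int denom, int offset )
 *     {
 *         int v = ((pix * scale + (1 << (denom - 1))) >> denom) + offset;
 *         return x264_clip3( v, 0, 255 );  // clip to the 8-bit pixel range
 *     }
 *
 * so motion search can run SAD/SATD against m->p_fref_w directly instead of
 * applying the weight inside the inner loop. */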
x264_mb_predict_mv( h, 0, idx, 2, m->mvp ); x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc ); @@ -1589,12 +1563,11 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8 uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref]; uint8_t **p_fenc = h->mb.pic.p_fenc; const int i_ref = a->l0.me8x8[i8x8].i_ref; - int i4x8; /* XXX Needed for x264_mb_predict_mv */ h->mb.i_partition = D_8x8; - for( i4x8 = 0; i4x8 < 2; i4x8++ ) + for( int i4x8 = 0; i4x8 < 2; i4x8++ ) { const int idx = 4*i8x8 + i4x8; const int x4 = block_idx_x[idx]; @@ -1604,10 +1577,10 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8 x264_me_t *m = &a->l0.me4x8[i8x8][i4x8]; m->i_pixel = PIXEL_4x8; - m->p_cost_mv = a->p_cost_mv; LOAD_FENC( m, p_fenc, 4*x4, 4*y4 ); LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 ); + LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 ); x264_mb_predict_mv( h, 0, idx, 1, m->mvp ); x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc ); @@ -1626,27 +1599,24 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) /* Assumes that fdec still contains the results of * x264_mb_predict_mv_direct16x16 and x264_mb_mc */ - uint8_t **p_fenc = h->mb.pic.p_fenc; - uint8_t **p_fdec = h->mb.pic.p_fdec; - int i; + uint8_t *p_fenc = h->mb.pic.p_fenc[0]; + uint8_t *p_fdec = h->mb.pic.p_fdec[0]; a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT]; - for( i = 0; i < 4; i++ ) - { - const int x = (i&1)*8; - const int y = (i>>1)*8; - a->i_cost16x16direct += - a->i_cost8x8direct[i] = - h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[0][x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[0][x+y*FDEC_STRIDE], FDEC_STRIDE ); - - /* mb type cost */ - a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8]; - } -} - -#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \ -{ \ - h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \ + if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 ) + for( int i = 0; i < 4; i++ ) + { + const int x = (i&1)*8; + const int y = (i>>1)*8; + a->i_cost16x16direct += + a->i_cost8x8direct[i] = + h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE, &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE ); + + /* mb type cost */ + a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8]; + } + else + a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE ); } static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) @@ -1655,89 +1625,124 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] ); uint8_t *src0, *src1; int stride0 = 16, stride1 = 16; - - x264_me_t m; int i_ref, i_mvc; ALIGNED_4( int16_t mvc[9][2] ); - int i_halfpel_thresh = INT_MAX; - int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL; + int try_skip = a->b_try_skip; + int list1_skipped = 0; + int i_halfpel_thresh[2] = {INT_MAX, INT_MAX}; + int *p_halfpel_thresh[2] = {h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh[0] : NULL, + h->mb.pic.i_fref[1]>1 ? 
&i_halfpel_thresh[1] : NULL}; - /* 16x16 Search on all ref frame */ + x264_me_t m; m.i_pixel = PIXEL_16x16; - m.p_cost_mv = a->p_cost_mv; + LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 ); - /* ME for List 0 */ + /* 16x16 Search on list 0 and list 1 */ a->l0.me16x16.cost = INT_MAX; - for( i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ ) - { - /* search with ref */ - LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 ); - x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp ); - x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc ); - x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh ); + a->l1.me16x16.cost = INT_MAX; + for( int l = 1; l >= 0; ) + { + x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0; + + /* This loop is extremely munged in order to facilitate the following order of operations, + * necessary for an efficient fast skip. + * 1. Search list1 ref0. + * 2. Search list0 ref0. + * 3. Try skip. + * 4. Search the rest of list0. + * 5. Go back and finish list1. + */ + for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ ) + { + if( try_skip && l == 1 && i_ref > 0 ) + { + list1_skipped = 1; + break; + } - /* add ref cost */ - m.cost += REF_COST( 0, i_ref ); + m.i_ref_cost = REF_COST( l, i_ref ); - if( m.cost < a->l0.me16x16.cost ) - { - a->l0.i_ref = i_ref; - h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) ); - } + /* search with ref */ + LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 ); + x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp ); + x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc ); + x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] ); - /* save mv for predicting neighbors */ - *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv; - } - /* subtract ref cost, so we don't have to add it for the other MB types */ - a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref ); + /* add ref cost */ + m.cost += m.i_ref_cost; - /* ME for list 1 */ - i_halfpel_thresh = INT_MAX; - p_halfpel_thresh = h->mb.pic.i_fref[1]>1 ? &i_halfpel_thresh : NULL; - a->l1.me16x16.cost = INT_MAX; - for( i_ref = 0; i_ref < h->mb.pic.i_fref[1]; i_ref++ ) - { - /* search with ref */ - LOAD_HPELS( &m, h->mb.pic.p_fref[1][i_ref], 1, i_ref, 0, 0 ); - x264_mb_predict_mv_16x16( h, 1, i_ref, m.mvp ); - x264_mb_predict_mv_ref16x16( h, 1, i_ref, mvc, &i_mvc ); - x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh ); + if( m.cost < lX->me16x16.cost ) + h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) ); - /* add ref cost */ - m.cost += REF_COST( 1, i_ref ); + /* save mv for predicting neighbors */ + CP32( lX->mvc[i_ref][0], m.mv ); + CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv ); - if( m.cost < a->l1.me16x16.cost ) - { - a->l1.i_ref = i_ref; - h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) ); + /* Fast skip detection. 
*/
+ if( i_ref == 0 && try_skip )
+ {
+ if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
+ abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
+ {
+ try_skip = 0;
+ }
+ else if( !l )
+ {
+ /* We already tested skip */
+ h->mb.i_type = B_SKIP;
+ x264_analyse_update_cache( h, a );
+ return;
+ }
+ }
+ }
}
-
- /* save mv for predicting neighbors */
- *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+ if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
+ break;
+ if( list1_skipped && l == 0 )
+ l = 1;
+ else
+ l--;
}
- /* subtract ref cost, so we don't have to add it for the other MB types */
- a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
-
- /* Set global ref, needed for other modes? */
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
- x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
/* get cost of BI mode */
+ h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
+ h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
+ int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
src0 = h->mc.get_ref( pix0, &stride0,
- h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
+ h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
+ a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, weight_none );
src1 = h->mc.get_ref( pix1, &stride1,
- h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
+ h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
+ a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, weight_none );
- h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
+ h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
- + REF_COST( 0, a->l0.i_ref )
- + REF_COST( 1, a->l1.i_ref )
- + a->l0.me16x16.cost_mv
- + a->l1.me16x16.cost_mv;
+ + ref_costs
+ + a->l0.bi16x16.cost_mv
+ + a->l1.bi16x16.cost_mv;
+
+ /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
+ if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
+ {
+ int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
+ + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
+ int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
+ + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
+ h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
+ h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
+ h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
+ int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
+ + ref_costs + l0_mv_cost + l1_mv_cost;
+ if( cost00 < a->i_cost16x16bi )
+ {
+ M32( a->l0.bi16x16.mv ) = 0;
+ M32( a->l1.bi16x16.mv ) = 0;
+ a->l0.bi16x16.cost_mv = l0_mv_cost;
+ a->l1.bi16x16.cost_mv = l1_mv_cost;
+ a->i_cost16x16bi = cost00;
+ }
+ }
/* mb type cost */
a->i_cost16x16bi += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
@@ -1775,10 +1780,20 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
}
}
+static void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
+{
+ const int x = 2*(idx&1);
+ const int y = 2*(idx>>1);
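+ /* The refs and MVs loaded here were computed by x264_mb_predict_mv_direct16x16
+ and cached in h->mb.cache.direct_ref/direct_mv, so direct partitions can be
+ reloaded without recomputing direct prediction. */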
+ x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
+ x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
+ x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
+ x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
+}
+
#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
if( x264_mb_partition_listX_table[0][part] ) \
{ \
- x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
+ x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
} \
else \
@@ -1790,7 +1805,7 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
} \
if( x264_mb_partition_listX_table[1][part] ) \
{ \
- x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
+ x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
} \
else \
@@ -1830,20 +1845,120 @@ static inline void x264_mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int
}
#undef CACHE_MV_BI
+static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
+{
+ ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
+ int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
+
+ /* early termination: if 16x16 chose ref 0, then evaluate no refs older
+ * than those used by the neighbors */
+ #define CHECK_NEIGHBOUR(i)\
+ {\
+ int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
+ if( ref > i_maxref[l] )\
+ i_maxref[l] = ref;\
+ }
+
+ for( int l = 0; l < 2; l++ )
+ {
+ x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
+ if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
+ h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left > 0 )
+ {
+ i_maxref[l] = 0;
+ CHECK_NEIGHBOUR( -8 - 1 );
+ CHECK_NEIGHBOUR( -8 + 0 );
+ CHECK_NEIGHBOUR( -8 + 2 );
+ CHECK_NEIGHBOUR( -8 + 4 );
+ CHECK_NEIGHBOUR( 0 - 1 );
+ CHECK_NEIGHBOUR( 2*8 - 1 );
+ }
+ }
+
+ /* XXX Needed for x264_mb_predict_mv */
+ h->mb.i_partition = D_8x8;
+
+ a->i_cost8x8bi = 0;
+
+ for( int i = 0; i < 4; i++ )
+ {
+ int x8 = i%2;
+ int y8 = i/2;
+ int i_part_cost;
+ int i_part_cost_bi;
+ int stride[2] = {8,8};
+ uint8_t *src[2];
+ x264_me_t m;
+ m.i_pixel = PIXEL_8x8;
+ LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
+
+ for( int l = 0; l < 2; l++ )
+ {
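+ /* Search this 8x8 partition in both lists; refs above i_maxref[l] were
+ ruled out by the neighbour check above. */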
+ x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
+
+ lX->me8x8[i].cost = INT_MAX;
+ for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
+ {
+ m.i_ref_cost = REF_COST( l, i_ref );
+
+ LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );
+
+ x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
+ x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
+ x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
+ m.cost += m.i_ref_cost;
+
+ if( m.cost < lX->me8x8[i].cost )
+ h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
+
+ /* save mv for predicting other partitions within this MB */
+ CP32( lX->mvc[i_ref][i+1], m.mv );
+ }
+ }
+
+ /* BI mode */
+ src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
+ a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, weight_none );
+ src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
+ a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, weight_none );
+ h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
+ h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );
+
+ i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
+ + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv + a->l0.me8x8[i].i_ref_cost
+ + a->l1.me8x8[i].i_ref_cost + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
+
+ a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
+ a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
+
+ i_part_cost = a->l0.me8x8[i].cost;
+ h->mb.i_sub_partition[i] = D_L0_8x8;
+ COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
+ COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
+ COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
+ a->i_cost8x8bi += i_part_cost;
+
+ /* XXX Needed for x264_mb_predict_mv */
+ x264_mb_cache_mv_b8x8( h, a, i, 0 );
+ }
+
+ /* mb type cost */
+ a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
+}
+
static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
uint8_t **p_fref[2] =
- { h->mb.pic.p_fref[0][a->l0.i_ref],
- h->mb.pic.p_fref[1][a->l1.i_ref] };
- ALIGNED_8( uint8_t pix[2][8*8] );
- int i, l;
+ { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
+ h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
+ ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
/* XXX Needed for x264_mb_predict_mv */
h->mb.i_partition = D_8x8;
a->i_cost8x8bi = 0;
- for( i = 0; i < 4; i++ )
+ for( int i = 0; i < 4; i++ )
{
const int x8 = i%2;
const int y8 = i/2;
@@ -1852,29 +1967,34 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
int stride[2] = {8,8};
uint8_t *src[2];
- for( l = 0; l < 2; l++ )
+ for( int l = 0; l < 2; l++ )
{
x264_mb_analysis_list_t *lX = l ? 
&a->l1 : &a->l0; x264_me_t *m = &lX->me8x8[i]; - m->i_pixel = PIXEL_8x8; - m->p_cost_mv = a->p_cost_mv; - LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 ); - LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*x8, 8*y8 ); + m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref ); + m->i_ref = lX->me16x16.i_ref; + + LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 ); + + x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref ); x264_mb_predict_mv( h, l, 4*i, 2, m->mvp ); x264_me_search( h, m, &lX->me16x16.mv, 1 ); + m->cost += m->i_ref_cost; x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv ); + /* save mv for predicting other partitions within this MB */ + CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv ); + /* BI mode */ src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0], - m->mv[0], m->mv[1], 8, 8 ); - i_part_cost_bi += m->cost_mv; - /* FIXME: ref cost */ + m->mv[0], m->mv[1], 8, 8, weight_none ); + i_part_cost_bi += m->cost_mv + m->i_ref_cost; } - h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); + h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] ); i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 ) + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8]; a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8]; @@ -1897,52 +2017,64 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a ) static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a ) { - uint8_t **p_fref[2] = - { h->mb.pic.p_fref[0][a->l0.i_ref], - h->mb.pic.p_fref[1][a->l1.i_ref] }; ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] ); - ALIGNED_4( int16_t mvc[2][2] ); - int i, l; + ALIGNED_4( int16_t mvc[3][2] ); h->mb.i_partition = D_16x8; a->i_cost16x8bi = 0; - for( i = 0; i < 2; i++ ) + for( int i = 0; i < 2; i++ ) { int i_part_cost; int i_part_cost_bi = 0; int stride[2] = {16,16}; uint8_t *src[2]; + x264_me_t m; + m.i_pixel = PIXEL_16x8; + LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i ); - /* TODO: check only the list(s) that were used in b8x8? */ - for( l = 0; l < 2; l++ ) + for( int l = 0; l < 2; l++ ) { x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0; - x264_me_t *m = &lX->me16x8[i]; - - m->i_pixel = PIXEL_16x8; - m->p_cost_mv = a->p_cost_mv; + int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref }; + int i_ref8s = ( ref8[0] == ref8[1] ) ? 
1 : 2;
+ lX->me16x8[i].cost = INT_MAX;
+ for( int j = 0; j < i_ref8s; j++ )
+ {
+ int i_ref = ref8[j];
+ m.i_ref_cost = REF_COST( l, i_ref );
- LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
- LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
+ LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );
- *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
- *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
+ CP32( mvc[0], lX->mvc[i_ref][0] );
+ CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
+ CP32( mvc[2], lX->mvc[i_ref][2*i+2] );
- x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
- x264_me_search( h, m, mvc, 2 );
+ x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
+ x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
+ x264_me_search( h, &m, mvc, 3 );
+ m.cost += m.i_ref_cost;
- /* BI mode */
- src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
- m->mv[0], m->mv[1], 16, 8 );
- /* FIXME: ref cost */
- i_part_cost_bi += m->cost_mv;
+ if( m.cost < lX->me16x8[i].cost )
+ h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
+ }
}
- h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
- i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
+
+ /* BI mode */
+ src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
+ a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, weight_none );
+ src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
+ a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, weight_none );
+ h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
+ h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );
+
+ i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
+ + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
+ + a->l1.me16x8[i].i_ref_cost;
i_part_cost = a->l0.me16x8[i].cost;
a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */
+
if( a->l1.me16x8[i].cost < i_part_cost )
{
i_part_cost = a->l1.me16x8[i].cost;
@@ -1967,52 +2099,63 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
{
- uint8_t **p_fref[2] =
- { h->mb.pic.p_fref[0][a->l0.i_ref],
- h->mb.pic.p_fref[1][a->l1.i_ref] };
- ALIGNED_8( uint8_t pix[2][8*16] );
- ALIGNED_4( int16_t mvc[2][2] );
- int i, l;
+ ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
+ ALIGNED_4( int16_t mvc[3][2] );
h->mb.i_partition = D_8x16;
a->i_cost8x16bi = 0;
- for( i = 0; i < 2; i++ )
+ for( int i = 0; i < 2; i++ )
{
int i_part_cost;
int i_part_cost_bi = 0;
int stride[2] = {8,8};
uint8_t *src[2];
+ x264_me_t m;
+ m.i_pixel = PIXEL_8x16;
+ LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
- for( l = 0; l < 2; l++ )
+ for( int l = 0; l < 2; l++ )
{
x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
- x264_me_t *m = &lX->me8x16[i];
-
- m->i_pixel = PIXEL_8x16;
- m->p_cost_mv = a->p_cost_mv;
+ int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
+ int i_ref8s = ( ref8[0] == ref8[1] ) ? 
1 : 2; + lX->me8x16[i].cost = INT_MAX; + for( int j = 0; j < i_ref8s; j++ ) + { + int i_ref = ref8[j]; + m.i_ref_cost = REF_COST( l, i_ref ); - LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 ); - LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 ); + LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 ); - *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv; - *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv; + CP32( mvc[0], lX->mvc[i_ref][0] ); + CP32( mvc[1], lX->mvc[i_ref][i+1] ); + CP32( mvc[2], lX->mvc[i_ref][i+3] ); - x264_mb_predict_mv( h, l, 4*i, 2, m->mvp ); - x264_me_search( h, m, mvc, 2 ); + x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref ); + x264_mb_predict_mv( h, l, 4*i, 2, m.mvp ); + x264_me_search( h, &m, mvc, 3 ); + m.cost += m.i_ref_cost; - /* BI mode */ - src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0], - m->mv[0], m->mv[1], 8, 16 ); - /* FIXME: ref cost */ - i_part_cost_bi += m->cost_mv; + if( m.cost < lX->me8x16[i].cost ) + h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) ); + } } - h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); - i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 ); + /* BI mode */ + src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0], + a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, weight_none ); + src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0], + a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, weight_none ); + h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] ); + + i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 ) + + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost + + a->l1.me8x16[i].i_ref_cost; i_part_cost = a->l0.me8x16[i].cost; a->i_mb_partition8x16[i] = D_L0_8x8; + if( a->l1.me8x16[i].cost < i_part_cost ) { i_part_cost = a->l1.me8x16[i].cost; @@ -2071,31 +2214,33 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd ) h->mb.i_partition = D_8x8; if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 ) { - int i; x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref ); x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref ); x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref ); x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref ); /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection * for future blocks are those left over from previous RDO calls. 
*/ - for( i = 0; i < 4; i++ ) + for( int i = 0; i < 4; i++ ) { int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost}; - int thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4; + int sub8x8_thresh = X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4; int subtype, btype = D_L0_8x8; uint64_t bcost = COST_MAX64; for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ ) { uint64_t cost; - if( costs[subtype] > thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) ) + if( costs[subtype] > sub8x8_thresh || (subtype == D_L0_8x8 && bcost == COST_MAX64) ) continue; h->mb.i_sub_partition[i] = subtype; x264_mb_cache_mv_p8x8( h, a, i ); cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 ); COPY2_IF_LT( bcost, cost, btype, subtype ); } - h->mb.i_sub_partition[i] = btype; - x264_mb_cache_mv_p8x8( h, a, i ); + if( h->mb.i_sub_partition[i] != btype ) + { + h->mb.i_sub_partition[i] = btype; + x264_mb_cache_mv_p8x8( h, a, i ); + } } } else @@ -2178,8 +2323,7 @@ static void x264_mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_i static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a ) { - const int i_biweight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref]; - int i; + int i_biweight; if( IS_INTRA(h->mb.i_type) ) return; @@ -2188,22 +2332,34 @@ static void x264_refine_bidir( x264_t *h, x264_mb_analysis_t *a ) { case D_16x16: if( h->mb.i_type == B_BI_BI ) - x264_me_refine_bidir_satd( h, &a->l0.me16x16, &a->l1.me16x16, i_biweight ); + { + i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref]; + x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight ); + } break; case D_16x8: - for( i=0; i<2; i++ ) + for( int i = 0; i < 2; i++ ) if( a->i_mb_partition16x8[i] == D_BI_8x8 ) + { + i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref]; x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight ); + } break; case D_8x16: - for( i=0; i<2; i++ ) + for( int i = 0; i < 2; i++ ) if( a->i_mb_partition8x16[i] == D_BI_8x8 ) + { + i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref]; x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight ); + } break; case D_8x8: - for( i=0; i<4; i++ ) + for( int i = 0; i < 4; i++ ) if( h->mb.i_sub_partition[i] == D_BI_8x8 ) + { + i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref]; x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight ); + } break; } } @@ -2212,13 +2368,12 @@ static inline void x264_mb_analyse_transform( x264_t *h ) { if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless ) { - int i_cost4, i_cost8; /* Only luma MC is really needed, but the full MC is re-used in macroblock_encode. 
*/ x264_mb_mc( h ); - i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, + int i_cost8 = h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE ); - i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, + int i_cost4 = h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE ); h->mb.b_transform_8x8 = i_cost8 < i_cost4; @@ -2230,11 +2385,10 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t * { if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 ) { - int i_rd8; x264_analyse_update_cache( h, a ); h->mb.b_transform_8x8 ^= 1; /* FIXME only luma is needed, but the score for comparison already includes chroma */ - i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 ); + int i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 ); if( *i_rd >= i_rd8 ) { @@ -2256,13 +2410,14 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t * * trick. */ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a ) { - int bcost, cost, direction, failures, prevcost, origcost; + int bcost, cost, failures, prevcost, origcost; int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp; int last_qp_tried = 0; origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 ); + int origcbp = h->mb.cbp[h->mb.i_mb_xy]; /* If CBP is already zero, don't raise the quantizer any higher. */ - for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 ) + for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 ) { /* Without psy-RD, require monotonicity when moving quant away from previous * macroblock's quant; allow 1 failure when moving quant towards previous quant. @@ -2277,14 +2432,47 @@ static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a ) h->mb.i_qp = orig_qp; failures = 0; prevcost = origcost; + + /* If the current QP results in an empty CBP, it's highly likely that lower QPs + * (up to a point) will too. So, jump down to where the threshold will kick in + * and check the QP there. If the CBP is still empty, skip the main loop. + * If it isn't empty, we would have ended up having to check this QP anyways, + * so as long as we store it for later lookup, we lose nothing. */ + int already_checked_qp = -1; + int already_checked_cost = COST_MAX; + if( direction == -1 ) + { + if( !origcbp ) + { + h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, h->param.rc.i_qp_min ); + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; + already_checked_cost = x264_rd_cost_mb( h, a->i_lambda2 ); + if( !h->mb.cbp[h->mb.i_mb_xy] ) + { + /* If our empty-CBP block is lower QP than the last QP, + * the last QP almost surely doesn't have a CBP either. */ + if( h->mb.i_last_qp > h->mb.i_qp ) + last_qp_tried = 1; + break; + } + already_checked_qp = h->mb.i_qp; + h->mb.i_qp = orig_qp; + } + } + h->mb.i_qp += direction; while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= h->param.rc.i_qp_max ) { if( h->mb.i_last_qp == h->mb.i_qp ) last_qp_tried = 1; - h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; - cost = x264_rd_cost_mb( h, a->i_lambda2 ); - COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp ); + if( h->mb.i_qp == already_checked_qp ) + cost = already_checked_cost; + else + { + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; + cost = x264_rd_cost_mb( h, a->i_lambda2 ); + COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp ); + } /* We can't assume that the costs are monotonic over QPs. 
* Tie case-as-failure seems to give better results. */ @@ -2332,7 +2520,6 @@ void x264_macroblock_analyse( x264_t *h ) { x264_mb_analysis_t analysis; int i_cost = COST_MAX; - int i; h->mb.i_qp = x264_ratecontrol_qp( h ); if( h->param.rc.i_aq_mode ) @@ -2349,8 +2536,9 @@ void x264_macroblock_analyse( x264_t *h ) /*--------------------------- Do the analysis ---------------------------*/ if( h->sh.i_type == SLICE_TYPE_I ) { +intra_analysis: if( analysis.i_mbrd ) - x264_mb_cache_fenc_satd( h ); + x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 ); x264_mb_analyse_intra( h, &analysis, COST_MAX ); if( analysis.i_mbrd ) x264_intra_rd( h, &analysis, COST_MAX ); @@ -2371,20 +2559,31 @@ void x264_macroblock_analyse( x264_t *h ) h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 ); - /* Fast P_SKIP detection */ - analysis.b_try_pskip = 0; - if( h->param.analyse.b_fast_pskip ) + analysis.b_try_skip = 0; + if( analysis.b_force_intra ) { - if( h->param.i_threads > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] ) - // FIXME don't need to check this if the reference frame is done - {} - else if( h->param.analyse.i_subpel_refine >= 3 ) - analysis.b_try_pskip = 1; - else if( h->mb.i_mb_type_left == P_SKIP || - h->mb.i_mb_type_top == P_SKIP || - h->mb.i_mb_type_topleft == P_SKIP || - h->mb.i_mb_type_topright == P_SKIP ) - b_skip = x264_macroblock_probe_pskip( h ); + if( !h->param.analyse.b_psy ) + { + x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) ); + goto intra_analysis; + } + } + else + { + /* Fast P_SKIP detection */ + if( h->param.analyse.b_fast_pskip ) + { + if( h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1] ) + // FIXME don't need to check this if the reference frame is done + {} + else if( h->param.analyse.i_subpel_refine >= 3 ) + analysis.b_try_skip = 1; + else if( h->mb.i_mb_type_left == P_SKIP || + h->mb.i_mb_type_top == P_SKIP || + h->mb.i_mb_type_topleft == P_SKIP || + h->mb.i_mb_type_topright == P_SKIP ) + b_skip = x264_macroblock_probe_pskip( h ); + } } h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 ); @@ -2393,7 +2592,11 @@ void x264_macroblock_analyse( x264_t *h ) { h->mb.i_type = P_SKIP; h->mb.i_partition = D_16x16; - assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 ); + assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 ); + /* Set up MVs for future predictors */ + if( b_skip ) + for( int i = 0; i < h->mb.pic.i_fref[0]; i++ ) + M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; } else { @@ -2408,7 +2611,11 @@ void x264_macroblock_analyse( x264_t *h ) x264_mb_analyse_inter_p16x16( h, &analysis ); if( h->mb.i_type == P_SKIP ) + { + for( int i = 1; i < h->mb.pic.i_fref[0]; i++ ) + M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0; return; + } if( flags & X264_ANALYSE_PSUB16x16 ) { @@ -2433,7 +2640,7 @@ void x264_macroblock_analyse( x264_t *h ) /* Do sub 8x8 */ if( flags & X264_ANALYSE_PSUB8x8 ) { - for( i = 0; i < 4; i++ ) + for( int i = 0; i < 4; i++ ) { x264_mb_analyse_inter_p4x4( h, &analysis, i ); if( analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost ) @@ -2473,7 +2680,7 @@ void x264_macroblock_analyse( x264_t *h ) /* refine qpel */ //FIXME mb_type costs? 
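+ /* If subpel refinement is disabled (subme 0), there is nothing to refine here,
+ so take the same early-out as the RD modes, which defer refinement. */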
- if( analysis.i_mbrd )
+ if( analysis.i_mbrd || !h->mb.i_subpel_refine )
{
/* refine later */
}
else if( i_partition == D_16x16 )
@@ -2496,9 +2703,8 @@ void x264_macroblock_analyse( x264_t *h )
}
else if( i_partition == D_8x8 )
{
- int i8x8;
i_cost = 0;
- for( i8x8 = 0; i8x8 < 4; i8x8++ )
+ for( int i8x8 = 0; i8x8 < 4; i8x8++ )
{
switch( h->mb.i_sub_partition[i8x8] )
{
@@ -2575,6 +2781,19 @@ void x264_macroblock_analyse( x264_t *h )
h->mb.i_type = i_type;
+ if( analysis.b_force_intra && !IS_INTRA(i_type) )
+ {
+ /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to make it appear as if
+ * it were an inter block. */
+ x264_analyse_update_cache( h, &analysis );
+ x264_macroblock_encode( h );
+ h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
+ h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
+ h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
+ x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
+ goto intra_analysis;
+ }
+
if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
{
if( IS_INTRA( h->mb.i_type ) )
@@ -2607,9 +2826,8 @@ void x264_macroblock_analyse( x264_t *h )
}
else if( i_partition == D_8x8 )
{
- int i8x8;
x264_analyse_update_cache( h, &analysis );
- for( i8x8 = 0; i8x8 < 4; i8x8++ )
+ for( int i8x8 = 0; i8x8 < 4; i8x8++ )
{
if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
{
@@ -2643,13 +2861,13 @@ void x264_macroblock_analyse( x264_t *h )
int b_skip = 0;
if( analysis.i_mbrd )
- x264_mb_cache_fenc_satd( h );
+ x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
h->mb.i_type = B_SKIP;
if( h->mb.b_direct_auto_write )
{
/* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
- for( i = 0; i < 2; i++ )
+ for( int i = 0; i < 2; i++ )
{
int b_changed = 1;
h->sh.b_direct_spatial_mv_pred ^= 1;
@@ -2670,6 +2888,7 @@ void x264_macroblock_analyse( x264_t *h )
else
analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );
+ analysis.b_try_skip = 0;
if( analysis.b_direct_available )
{
if( !h->mb.b_direct_auto_write )
@@ -2684,7 +2903,17 @@ void x264_macroblock_analyse( x264_t *h )
{
/* Conditioning the probe on neighboring block types
* doesn't seem to help speed or quality. */
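+ /* At subme >= 3 the probe result is only recorded as a skip candidate;
+ the final decision is deferred to the 16x16 search, which accepts the
+ skip only if the best ref0 MVs land within one quarter-pel of the
+ direct MVs. */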
- b_skip = x264_macroblock_probe_bskip( h );
+ analysis.b_try_skip = x264_macroblock_probe_bskip( h );
+ if( h->param.analyse.i_subpel_refine < 3 )
+ b_skip = analysis.b_try_skip;
+ }
+ /* Set up MVs for future predictors */
+ if( b_skip )
+ {
+ for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
+ M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
+ for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
+ M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
}
}
@@ -2695,6 +2924,7 @@ void x264_macroblock_analyse( x264_t *h )
int i_partition;
int i_satd_inter;
h->mb.b_skip_mc = 0;
+ h->mb.i_type = B_DIRECT;
x264_mb_analyse_load_costs( h, &analysis );
@@ -2705,6 +2935,15 @@
x264_mb_analyse_inter_b16x16( h, &analysis );
+ if( h->mb.i_type == B_SKIP )
+ {
+ for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
+ M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
+ for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
+ M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
+ return;
+ }
+
i_type = B_L0_L0;
i_partition = D_16x16;
i_cost = analysis.l0.me16x16.cost;
@@ -2728,7 +2967,11 @@
if( flags & X264_ANALYSE_BSUB16x16 )
{
- x264_mb_analyse_inter_b8x8( h, &analysis );
+ if( h->param.analyse.b_mixed_references )
+ x264_mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
+ else
+ x264_mb_analyse_inter_b8x8( h, &analysis );
+
if( analysis.i_cost8x8bi < i_cost )
{
i_type = B_8x8;
@@ -2754,7 +2997,7 @@
}
}
- if( analysis.i_mbrd )
+ if( analysis.i_mbrd || !h->mb.i_subpel_refine )
{
/* refine later */
}
@@ -2777,13 +3020,13 @@
}
else if( i_type == B_BI_BI )
{
- x264_me_refine_qpel( h, &analysis.l0.me16x16 );
- x264_me_refine_qpel( h, &analysis.l1.me16x16 );
+ x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
+ x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
}
}
else if( i_partition == D_16x8 )
{
- for( i=0; i<2; i++ )
+ for( int i = 0; i < 2; i++ )
{
if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
@@ -2793,7 +3036,7 @@
}
else if( i_partition == D_8x16 )
{
- for( i=0; i<2; i++ )
+ for( int i = 0; i < 2; i++ )
{
if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
@@ -2803,7 +3046,7 @@
}
else if( i_partition == D_8x8 )
{
- for( i=0; i<4; i++ )
+ for( int i = 0; i < 4; i++ )
{
x264_me_t *m;
int i_part_cost_old;
@@ -2880,7 +3123,7 @@
if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
{
- const int i_biweight = h->mb.bipred_weight[analysis.l0.i_ref][analysis.l1.i_ref];
+ int i_biweight;
x264_analyse_update_cache( h, &analysis );
if( i_partition == D_16x16 )
@@ -2896,11 +3139,14 @@
x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
}
else if( i_type == B_BI_BI )
- x264_me_refine_bidir_rd( h, &analysis.l0.me16x16, &analysis.l1.me16x16, i_biweight, 0, analysis.i_lambda2 );
+ {
+ i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
+ x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
+ }
}
else if( i_partition == D_16x8 )
{
- for( i = 0; i < 2; i++ )
+ for( int i = 0; i < 2; i++ )
{
h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
if( analysis.i_mb_partition16x8[i] == 
D_L0_8x8 ) @@ -2908,12 +3154,15 @@ void x264_macroblock_analyse( x264_t *h ) else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 ); else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 ) + { + i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref]; x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 ); + } } } else if( i_partition == D_8x16 ) { - for( i = 0; i < 2; i++ ) + for( int i = 0; i < 2; i++ ) { h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i]; if( analysis.i_mb_partition8x16[i] == D_L0_8x8 ) @@ -2921,19 +3170,25 @@ void x264_macroblock_analyse( x264_t *h ) else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 ); else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 ) + { + i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref]; x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 ); + } } } else if( i_partition == D_8x8 ) { - for( i = 0; i < 4; i++ ) + for( int i = 0; i < 4; i++ ) { if( h->mb.i_sub_partition[i] == D_L0_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 ); else if( h->mb.i_sub_partition[i] == D_L1_8x8 ) x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 ); else if( h->mb.i_sub_partition[i] == D_BI_8x8 ) + { + i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref]; x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 ); + } } } } @@ -2950,7 +3205,7 @@ void x264_macroblock_analyse( x264_t *h ) static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2}; int list = check_mv_lists[h->mb.i_type] - 1; if( list >= 0 && h->mb.i_partition != D_16x16 && - *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[0]] == *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[12]] && + M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) && h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] ) h->mb.i_partition = D_16x16; } @@ -2972,18 +3227,16 @@ void x264_macroblock_analyse( x264_t *h ) /*-------------------- Update MB from the analysis ----------------------*/ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) { - int i; - switch( h->mb.i_type ) { case I_4x4: - for( i = 0; i < 16; i++ ) + for( int i = 0; i < 16; i++ ) h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i]; x264_mb_analyse_intra_chroma( h, a ); break; case I_8x8: - for( i = 0; i < 4; i++ ) + for( int i = 0; i < 4; i++ ) x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] ); x264_mb_analyse_intra_chroma( h, a ); @@ -3029,7 +3282,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref ); x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref ); x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref ); - for( i = 0; i < 4; i++ ) + for( int i = 0; i < 4; i++ ) x264_mb_cache_mv_p8x8( h, a, i ); break; @@ -3043,6 +3296,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) case B_SKIP: case B_DIRECT: + h->mb.i_partition = 
h->mb.cache.direct_partition; x264_mb_load_mv_direct8x8( h, 0 ); x264_mb_load_mv_direct8x8( h, 1 ); x264_mb_load_mv_direct8x8( h, 2 ); @@ -3051,7 +3305,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) case B_8x8: /* optimize: cache might not need to be rewritten */ - for( i = 0; i < 4; i++ ) + for( int i = 0; i < 4; i++ ) x264_mb_cache_mv_b8x8( h, a, i, 1 ); break; @@ -3062,7 +3316,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) switch( h->mb.i_type ) { case B_L0_L0: - x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref ); + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 ); @@ -3074,15 +3328,15 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 ); x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 ); - x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref ); + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref ); x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv ); break; case B_BI_BI: - x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref ); - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref ); + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv ); - x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref ); - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv ); + x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref ); + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv ); break; } break; @@ -3101,26 +3355,25 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ) } #ifndef NDEBUG - if( h->param.i_threads > 1 && !IS_INTRA(h->mb.i_type) ) + if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) ) { - int l; - for( l=0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ ) + for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ ) { int completed; int ref = h->mb.cache.ref[l][x264_scan8[0]]; if( ref < 0 ) continue; - completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->i_lines_completed; + completed = (l ? h->fref1 : h->fref0)[ ref >> h->mb.b_interlaced ]->orig->i_lines_completed; if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - h->mb.b_interlaced)) + h->mb.i_mb_y*16 > completed ) { x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n"); - fprintf(stderr, "mb type: %d \n", h->mb.i_type); - fprintf(stderr, "mv: l%dr%d (%d,%d) \n", l, ref, + x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type); + x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref, h->mb.cache.mv[l][x264_scan8[15]][0], h->mb.cache.mv[l][x264_scan8[15]][1] ); - fprintf(stderr, "limit: %d \n", h->mb.mv_max_spel[1]); - fprintf(stderr, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y); - fprintf(stderr, "completed: %d \n", completed ); + x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]); + x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y); + x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed ); x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n"); x264_mb_analyse_intra( h, a, COST_MAX ); h->mb.i_type = I_16x16;