X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;ds=sidebyside;f=encoder%2Fanalyse.c;h=c052fd8135f79b0d73b68847377c6a7a7fdfaa85;hb=9ea7b69df504b8990f339e2c8578a516f9df00c7;hp=6d75fe5a8a68bbabaf804188b98906a004352fd0;hpb=d52d44b319c30142903fceb09c52c9c8b64f22da;p=x264 diff --git a/encoder/analyse.c b/encoder/analyse.c index 6d75fe5a..c052fd81 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -22,6 +22,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. *****************************************************************************/ +#define _ISOC99_SOURCE #include #include #ifndef _MSC_VER @@ -29,6 +30,7 @@ #endif #include "common/common.h" +#include "common/cpu.h" #include "macroblock.h" #include "me.h" #include "ratecontrol.h" @@ -77,6 +79,8 @@ typedef struct int i_lambda2; int i_qp; int16_t *p_cost_mv; + uint16_t *p_cost_ref0; + uint16_t *p_cost_ref1; int i_mbrd; @@ -91,6 +95,7 @@ typedef struct int i_predict16x16; int i_satd_i8x8; + int i_cbp_i8x8_luma; int i_satd_i8x8_dir[12][4]; int i_predict8x8[4]; @@ -151,6 +156,41 @@ const int x264_lambda2_tab[52] = { 943718, 1189010, 1498059, 1887436 /* 48 - 51 */ }; +// should the intra and inter lambdas be different? +// I'm just matching the behaviour of deadzone quant. +static const int x264_trellis_lambda2_tab[2][52] = { + // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS) + { 46, 58, 73, 92, 117, 147, + 185, 233, 294, 370, 466, 587, + 740, 932, 1174, 1480, 1864, 2349, + 2959, 3728, 4697, 5918, 7457, 9395, + 11837, 14914, 18790, 23674, 29828, 37581, + 47349, 59656, 75163, 94699, 119313, 150326, + 189399, 238627, 300652, 378798, 477255, 601304, + 757596, 954511, 1202608, 1515192, 1909022, 2405217, + 3030384, 3818045, 4810435, 6060769 }, + // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS) + { 27, 34, 43, 54, 68, 86, + 108, 136, 172, 216, 273, 343, + 433, 545, 687, 865, 1090, 1374, + 1731, 2180, 2747, 3461, 4361, 5494, + 6922, 8721, 10988, 13844, 17442, 21976, + 27688, 34885, 43953, 55377, 69771, 87906, + 110755, 139543, 175813, 221511, 279087, 351627, + 443023, 558174, 703255, 886046, 1116348, 1406511, + 1772093, 2232697, 2813022, 3544186 } +}; + +static const uint16_t x264_chroma_lambda2_offset_tab[] = { + 16, 20, 25, 32, 40, 50, + 64, 80, 101, 128, 161, 203, + 256, 322, 406, 512, 645, 812, + 1024, 1290, 1625, 2048, 2580, 3250, + 4096, 5160, 6501, 8192, 10321, 13003, + 16384, 20642, 26007, 32768, 41285, 52015, + 65535 +}; + /* TODO: calculate CABAC costs */ static const int i_mb_b_cost_table[X264_MBTYPE_MAX] = { 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0 @@ -167,37 +207,46 @@ static const int i_sub_mb_p_cost_table[4] = { static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ); -uint16_t *x264_cost_mv_fpel[52][4]; +/* Indexed by lambda instead of qp because, due to rounding, + * some quantizers share lambdas. This saves memory. */ +uint16_t *x264_cost_mv_fpel[92][4]; +uint16_t x264_cost_ref[92][3][33]; /* initialize an array of lambda*nbits for all possible mvs */ static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) { - static int16_t *p_cost_mv[52]; + static int16_t *p_cost_mv[92]; int i, j; - if( !p_cost_mv[a->i_qp] ) + if( !p_cost_mv[a->i_lambda] ) { + x264_emms(); /* could be faster, but isn't called many times */ /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */ - p_cost_mv[a->i_qp] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) ); - p_cost_mv[a->i_qp] += 2*4*2048; + p_cost_mv[a->i_lambda] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) ); + p_cost_mv[a->i_lambda] += 2*4*2048; for( i = 0; i <= 2*4*2048; i++ ) { - p_cost_mv[a->i_qp][-i] = - p_cost_mv[a->i_qp][i] = a->i_lambda * bs_size_se( i ); + p_cost_mv[a->i_lambda][-i] = + p_cost_mv[a->i_lambda][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f; } + for( i = 0; i < 3; i++ ) + for( j = 0; j < 33; j++ ) + x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0; } - a->p_cost_mv = p_cost_mv[a->i_qp]; + a->p_cost_mv = p_cost_mv[a->i_lambda]; + a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)]; + a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)]; /* FIXME is this useful for all me methods? */ - if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] ) + if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] ) { for( j=0; j<4; j++ ) { - x264_cost_mv_fpel[a->i_qp][j] = x264_malloc( (4*2048 + 1) * sizeof(int16_t) ); - x264_cost_mv_fpel[a->i_qp][j] += 2*2048; + x264_cost_mv_fpel[a->i_lambda][j] = x264_malloc( (4*2048 + 1) * sizeof(int16_t) ); + x264_cost_mv_fpel[a->i_lambda][j] += 2*2048; for( i = -2*2048; i < 2*2048; i++ ) - x264_cost_mv_fpel[a->i_qp][j][i] = p_cost_mv[a->i_qp][i*4+j]; + x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j]; } } } @@ -205,19 +254,36 @@ static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a ) static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) { int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B); + /* mbrd == 1 -> RD mode decision */ /* mbrd == 2 -> RD refinement */ - a->i_mbrd = (i>=6) + (i>=8); + /* mbrd == 3 -> QPRD */ + a->i_mbrd = (i>=6) + (i>=8) + (h->param.analyse.i_subpel_refine>=10); + /* conduct the analysis using this lamda and QP */ a->i_qp = h->mb.i_qp = i_qp; h->mb.i_chroma_qp = h->chroma_qp_table[i_qp]; + a->i_lambda = x264_lambda_tab[i_qp]; a->i_lambda2 = x264_lambda2_tab[i_qp]; + + h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd; + if( h->param.analyse.i_trellis ) + { + h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][h->mb.i_qp]; + h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][h->mb.i_qp]; + h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][h->mb.i_chroma_qp]; + h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][h->mb.i_chroma_qp]; + } + h->mb.i_psy_rd_lambda = a->i_lambda; + /* Adjusting chroma lambda based on QP offset hurts PSNR, so we'll leave it as part of psy-RD. */ + h->mb.i_chroma_lambda2_offset = h->mb.i_psy_rd ? x264_chroma_lambda2_offset_tab[h->mb.i_qp-h->mb.i_chroma_qp+12] : 256; + h->mb.i_me_method = h->param.analyse.i_me_method; h->mb.i_subpel_refine = h->param.analyse.i_subpel_refine; h->mb.b_chroma_me = h->param.analyse.b_chroma_me && h->sh.i_type == SLICE_TYPE_P && h->mb.i_subpel_refine >= 5; - h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd; + h->mb.b_transform_8x8 = 0; h->mb.b_noise_reduction = 0; @@ -243,8 +309,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) int i_fmv_range = 4 * h->param.analyse.i_mv_range; // limit motion search to a slightly smaller range than the theoretical limit, // since the search may go a few iterations past its given range - int i_fpel_border = 5; // umh unconditional radius - int i_spel_border = 8; // 1.5 for subpel_satd, 1.5 for subpel_rd, 2 for bime, round up + int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel /* Calculate max allowed MV range */ #define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 ) @@ -282,7 +347,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) h->mb.mv_min[1] = 4*( -16*mb_y - 24 ); h->mb.mv_max[1] = 4*( 16*( mb_height - mb_y - 1 ) + 24 ); - h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], X264_MAX(4*(-512+i_spel_border), -i_fmv_range), i_fmv_range ); + h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); h->mb.mv_min_fpel[1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; @@ -474,7 +539,7 @@ static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct ) { DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] ); DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] ); - DECLARE_ALIGNED_16( uint8_t zero[16*FDEC_STRIDE] ) = {0}; + DECLARE_ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0}; int i; if( do_both_dct || h->mb.b_transform_8x8 ) @@ -494,7 +559,7 @@ static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct ) /* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */ static inline void x264_mb_cache_fenc_satd( x264_t *h ) { - DECLARE_ALIGNED_16(uint8_t zero[16]) = {0}; + DECLARE_ALIGNED_16( static uint8_t zero[16] ) = {0}; uint8_t *fenc; int x, y, satd_sum = 0, sa8d_sum = 0; if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis ) @@ -527,6 +592,7 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) int i_max; int predict_mode[4]; + int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless; uint8_t *p_dstc[2], *p_srcc[2]; @@ -541,11 +607,11 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); a->i_satd_i8x8chroma = COST_MAX; - if( i_max == 4 && h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] ) + if( i_max == 4 && b_merged_satd ) { int satdu[4], satdv[4]; - h->pixf.intra_satd_x3_8x8c( p_srcc[0], p_dstc[0], satdu ); - h->pixf.intra_satd_x3_8x8c( p_srcc[1], p_dstc[1], satdv ); + h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu ); + h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv ); h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] ); h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] ); satdu[I_PRED_CHROMA_P] = @@ -654,7 +720,8 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8]; int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 ); int i_cost = 0; - b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0]; + h->mb.i_cbp_luma = 0; + b_merged_satd = h->pixf.intra_mbcmp_x3_8x8 && !h->mb.b_lossless; // FIXME some bias like in i4x4? if( h->sh.i_type == SLICE_TYPE_B ) @@ -670,12 +737,12 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx ); predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max ); - x264_predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); + h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); if( b_merged_satd && i_max == 9 ) { int satd[9]; - h->pixf.intra_sa8d_x3_8x8( p_src_by, edge, satd ); + h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd ); satd[i_pred_mode] -= 3 * a->i_lambda; for( i=2; i>=0; i-- ) { @@ -721,14 +788,20 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); + h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]]; + h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]]; + h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]]; + h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]]; + h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma; if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) ); } } else { + static const uint16_t cost_div_fix8[3] = {1024,512,341}; a->i_satd_i8x8 = COST_MAX; - i_cost = i_cost * 4/(idx+1); + i_cost = (i_cost * cost_div_fix8[idx]) >> 8; } if( X264_MIN(i_cost, a->i_satd_i16x16) > i_satd_inter*(5+!!a->i_mbrd)/4 ) return; @@ -739,7 +812,8 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ { int i_cost; int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ); - b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0]; + h->mb.i_cbp_luma = 0; + b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless; if( a->i_mbrd ) i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8; @@ -763,7 +837,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( b_merged_satd && i_max >= 6 ) { int satd[9]; - h->pixf.intra_satd_x3_4x4( p_src_by, p_dst_by, satd ); + h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd ); satd[i_pred_mode] -= 3 * a->i_lambda; for( i=2; i>=0; i-- ) COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda, @@ -805,6 +879,11 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); + h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]]; + h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]]; + h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]]; + h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]]; + h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma; if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) ); } @@ -839,6 +918,7 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh ) h->mb.i_type = I_8x8; x264_analyse_update_cache( h, a ); a->i_satd_i8x8 = x264_rd_cost_mb( h, a->i_lambda2 ); + a->i_cbp_i8x8_luma = h->mb.i_cbp_luma; } else a->i_satd_i8x8 = COST_MAX; @@ -846,13 +926,11 @@ static void x264_intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh ) static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) { - uint8_t *p_src = h->mb.pic.p_fenc[0]; uint8_t *p_dst = h->mb.pic.p_fdec[0]; int i, j, idx, x, y; int i_max, i_mode, i_thresh; uint64_t i_satd, i_best; - int i_pred_mode; int predict_mode[9]; h->mb.i_skip_intra = 0; @@ -872,7 +950,51 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode ); } } - else if( h->mb.i_type == I_4x4 ) + + /* RD selection for chroma prediction */ + predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); + if( i_max > 1 ) + { + i_thresh = a->i_satd_i8x8chroma * 5/4; + + for( i = j = 0; i < i_max; i++ ) + if( a->i_satd_i8x8chroma_dir[i] < i_thresh && + predict_mode[i] != a->i_predict8x8chroma ) + { + predict_mode[j++] = predict_mode[i]; + } + i_max = j; + + if( i_max > 0 ) + { + int i_cbp_chroma_best = h->mb.i_cbp_chroma; + int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp]; + /* the previous thing encoded was x264_intra_rd(), so the pixels and + * coefs for the current chroma mode are still around, so we only + * have to recount the bits. */ + i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 ); + for( i = 0; i < i_max; i++ ) + { + i_mode = predict_mode[i]; + if( h->mb.b_lossless ) + x264_predict_lossless_8x8_chroma( h, i_mode ); + else + { + h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); + h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); + } + /* if we've already found a mode that needs no residual, then + * probably any mode with a residual will be worse. + * so avoid dct on the remaining modes to improve speed. */ + i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 ); + COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma ); + } + h->mb.i_chroma_pred_mode = a->i_predict8x8chroma; + h->mb.i_cbp_chroma = i_cbp_chroma_best; + } + } + + if( h->mb.i_type == I_4x4 ) { uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning int i_nnz = 0; @@ -881,8 +1003,6 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx]; i_best = COST_MAX64; - i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx ); - predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max ); if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP ) @@ -926,21 +1046,19 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) { uint64_t pels_h = 0; uint8_t pels_v[7]; - int i_nnz[3]; - uint8_t *p_src_by; + uint16_t i_nnz[2]; uint8_t *p_dst_by; int j; + int cbp_luma_new = 0; i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8; i_best = COST_MAX64; - i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx ); x = idx&1; y = idx>>1; - p_src_by = p_src + 8*x + 8*y*FENC_STRIDE; p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE; predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max ); - x264_predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); + h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS ); for( i = 0; i < i_max; i++ ) { @@ -951,73 +1069,34 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) x264_predict_lossless_8x8( h, p_dst_by, idx, i_mode, edge ); else h->predict_8x8[i_mode]( p_dst_by, edge ); + h->mb.i_cbp_luma = a->i_cbp_i8x8_luma; i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode ); if( i_best > i_satd ) { a->i_predict8x8[idx] = i_mode; + cbp_luma_new = h->mb.i_cbp_luma; i_best = i_satd; pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE); if( !(idx&1) ) for( j=0; j<7; j++ ) pels_v[j] = p_dst_by[7+j*FDEC_STRIDE]; - for( j=0; j<3; j++ ) - i_nnz[j] = h->mb.cache.non_zero_count[x264_scan8[4*idx+j+1]]; + i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]]; + i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]]; } } - + a->i_cbp_i8x8_luma = cbp_luma_new; *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h; if( !(idx&1) ) for( j=0; j<7; j++ ) p_dst_by[7+j*FDEC_STRIDE] = pels_v[j]; - for( j=0; j<3; j++ ) - h->mb.cache.non_zero_count[x264_scan8[4*idx+j+1]] = i_nnz[j]; + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0]; + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1]; x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] ); } } - - /* RD selection for chroma prediction */ - predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); - if( i_max > 1 ) - { - i_thresh = a->i_satd_i8x8chroma * 5/4; - - for( i = j = 0; i < i_max; i++ ) - if( a->i_satd_i8x8chroma_dir[i] < i_thresh && - predict_mode[i] != a->i_predict8x8chroma ) - { - predict_mode[j++] = predict_mode[i]; - } - i_max = j; - - if( i_max > 0 ) - { - int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp]; - /* the previous thing encoded was x264_intra_rd(), so the pixels and - * coefs for the current chroma mode are still around, so we only - * have to recount the bits. */ - i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 ); - for( i = 0; i < i_max; i++ ) - { - i_mode = predict_mode[i]; - if( h->mb.b_lossless ) - x264_predict_lossless_8x8_chroma( h, i_mode ); - else - { - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); - } - /* if we've already found a mode that needs no residual, then - * probably any mode with a residual will be worse. - * so avoid dct on the remaining modes to improve speed. */ - i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 ); - COPY2_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode ); - } - h->mb.i_chroma_pred_mode = a->i_predict8x8chroma; - } - } } #define LOAD_FENC( m, src, xoff, yoff) \ @@ -1037,7 +1116,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; #define REF_COST(list, ref) \ - (a->i_lambda * bs_size_te( h->sh.i_num_ref_idx_l##list##_active - 1, ref )) + (a->p_cost_ref##list[ref]) static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) { @@ -1939,6 +2018,8 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd ) x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref ); x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref ); x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref ); + /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection + * for future blocks are those left over from previous RDO calls. */ for( i = 0; i < 4; i++ ) { int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost}; @@ -2093,7 +2174,7 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t * { int i_rd8; x264_analyse_update_cache( h, a ); - h->mb.b_transform_8x8 = !h->mb.b_transform_8x8; + h->mb.b_transform_8x8 ^= 1; /* FIXME only luma is needed, but the score for comparison already includes chroma */ i_rd8 = x264_rd_cost_mb( h, a->i_lambda2 ); @@ -2101,17 +2182,73 @@ static inline void x264_mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t * { if( *i_rd > 0 ) *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd; - /* prevent a rare division by zero in estimated intra cost */ - if( *i_satd == 0 ) - *i_satd = 1; - *i_rd = i_rd8; } else - h->mb.b_transform_8x8 = !h->mb.b_transform_8x8; + h->mb.b_transform_8x8 ^= 1; } } +/* Rate-distortion optimal QP selection. + * FIXME: More than half of the benefit of this function seems to be + * in the way it improves the coding of chroma DC (by decimating or + * finding a better way to code a single DC coefficient.) + * There must be a more efficient way to get that portion of the benefit + * without doing full QP-RD, but RD-decimation doesn't seem to do the + * trick. */ +static inline void x264_mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a ) +{ + int bcost, cost, direction, failures, prevcost, origcost; + int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp; + origcost = bcost = x264_rd_cost_mb( h, a->i_lambda2 ); + + /* If CBP is already zero, don't raise the quantizer any higher. */ + for( direction = h->mb.cbp[h->mb.i_mb_xy] ? 1 : -1; direction >= -1; direction-=2 ) + { + h->mb.i_qp = orig_qp; + failures = 0; + prevcost = origcost; + while( h->mb.i_qp > 0 && h->mb.i_qp < 51 ) + { + h->mb.i_qp += direction; + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; + cost = x264_rd_cost_mb( h, a->i_lambda2 ); + COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp ); + + /* We can't assume that the costs are monotonic over QPs. + * Tie case-as-failure seems to give better results. */ + if( cost < prevcost ) + failures = 0; + else + failures++; + prevcost = cost; + + /* Without psy-RD, require monotonicity when lowering + * quant, allow 1 failure when raising quant. + * With psy-RD, allow 1 failure when lowering quant, + * allow 2 failures when raising quant. + * Psy-RD generally seems to result in more chaotic + * RD score-vs-quantizer curves. */ + if( failures > ((direction + 1)>>1)+(!!h->mb.i_psy_rd) ) + break; + if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] ) + break; + } + } + + h->mb.i_qp = bqp; + h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp]; + + /* Check transform again; decision from before may no longer be optimal. */ + if( h->mb.i_qp != orig_qp && x264_mb_transform_8x8_allowed( h ) && + h->param.analyse.b_transform_8x8 ) + { + h->mb.b_transform_8x8 ^= 1; + cost = x264_rd_cost_mb( h, a->i_lambda2 ); + if( cost > bcost ) + h->mb.b_transform_8x8 ^= 1; + } +} /***************************************************************************** * x264_macroblock_analyse: @@ -2124,7 +2261,13 @@ void x264_macroblock_analyse( x264_t *h ) h->mb.i_qp = x264_ratecontrol_qp( h ); if( h->param.rc.i_aq_mode ) + { x264_adaptive_quant( h ); + /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB, + * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */ + if( h->param.analyse.i_subpel_refine < 10 && abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ) + h->mb.i_qp = h->mb.i_last_qp; + } x264_mb_analyse_init( h, &analysis, h->mb.i_qp ); @@ -2150,7 +2293,6 @@ void x264_macroblock_analyse( x264_t *h ) else if( h->sh.i_type == SLICE_TYPE_P ) { int b_skip = 0; - int i_intra_cost, i_intra_type; h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 ); @@ -2351,20 +2493,12 @@ void x264_macroblock_analyse( x264_t *h ) x264_intra_rd( h, &analysis, i_satd_inter * 5/4 ); } - i_intra_type = I_16x16; - i_intra_cost = analysis.i_satd_i16x16; - COPY2_IF_LT( i_intra_cost, analysis.i_satd_i8x8, i_intra_type, I_8x8 ); - COPY2_IF_LT( i_intra_cost, analysis.i_satd_i4x4, i_intra_type, I_4x4 ); - COPY2_IF_LT( i_intra_cost, analysis.i_satd_pcm, i_intra_type, I_PCM ); - COPY2_IF_LT( i_cost, i_intra_cost, i_type, i_intra_type ); - - if( i_intra_cost == COST_MAX ) - i_intra_cost = i_cost * i_satd_intra / i_satd_inter + 1; + COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 ); + COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 ); + COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 ); + COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM ); h->mb.i_type = i_type; - h->stat.frame.i_intra_cost += i_intra_cost; - h->stat.frame.i_inter_cost += i_cost; - h->stat.frame.i_mbs_analysed++; if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM ) { @@ -2728,6 +2862,9 @@ void x264_macroblock_analyse( x264_t *h ) if( !analysis.i_mbrd ) x264_mb_analyse_transform( h ); + if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) ) + x264_mb_analyse_qp_rd( h, &analysis ); + h->mb.b_trellis = h->param.analyse.i_trellis; h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction; if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )