From: Oskar Arvidsson Date: Fri, 2 Jul 2010 02:06:08 +0000 (+0200) Subject: Support for 9 and 10-bit encoding X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=c91f43a4b09dab84953f417e6d6662ec0fa7acb1;p=x264 Support for 9 and 10-bit encoding Output bit depth is specified at compile time via --bit-depth. There is currently almost no assembly code available for high-bit-depth modes, so encoding will be very slow. Input is still 8-bit only; this will change in the future. Note that very few H.264 decoders support >8 bit depth currently. Also note that the quantizer scale differs for higher bit depth. For example, for 10-bit, the quantizer (and crf) ranges from 0 to 63 instead of 0 to 51. --- diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c index d294eff4..b1106dd2 100644 --- a/common/arm/mc-c.c +++ b/common/arm/mc-c.c @@ -64,6 +64,19 @@ MC_WEIGHT(_nodenom) MC_WEIGHT(_offsetadd) MC_WEIGHT(_offsetsub) +void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int ); +void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int ); +void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int ); +void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int ); + +void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int ); +void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int); + +void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int ); +void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int ); +void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int ); + +#if !X264_HIGH_BIT_DEPTH static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) { if( w->i_scale == 1<<w->i_denom ) @@ -85,14 +98,6 @@ static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) w->weightfn = x264_mc_wtab_neon; } -void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int ); -void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int ); -void 
x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int ); -void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int ); - -void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int ); -void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int); - static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) = { NULL, @@ -174,10 +179,6 @@ static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride, } } -void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int ); -void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int ); -void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int ); - static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf ) { @@ -198,18 +199,22 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8 src += stride; } } +#endif // !X264_HIGH_BIT_DEPTH void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_ARMV6) ) return; +#if !X264_HIGH_BIT_DEPTH pf->prefetch_fenc = x264_prefetch_fenc_arm; pf->prefetch_ref = x264_prefetch_ref_arm; +#endif // !X264_HIGH_BIT_DEPTH if( !(cpu&X264_CPU_NEON) ) return; +#if !X264_HIGH_BIT_DEPTH pf->copy_16x16_unaligned = x264_mc_copy_w16_neon; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; @@ -229,15 +234,16 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf ) pf->offsetsub = x264_mc_offsetsub_wtab_neon; pf->weight_cache = x264_weight_cache_neon; -// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs -#ifndef SYS_MACOSX - pf->memcpy_aligned = x264_memcpy_aligned_neon; -#endif - pf->memzero_aligned = x264_memzero_aligned_neon; - pf->mc_chroma = x264_mc_chroma_neon; pf->mc_luma = mc_luma_neon; pf->get_ref = get_ref_neon; pf->hpel_filter = 
hpel_filter_neon; pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; +#endif // !X264_HIGH_BIT_DEPTH + +// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs +#ifndef SYS_MACOSX + pf->memcpy_aligned = x264_memcpy_aligned_neon; +#endif + pf->memzero_aligned = x264_memzero_aligned_neon; } diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c index fa7b9f7b..b40dc9a1 100644 --- a/common/arm/predict-c.c +++ b/common/arm/predict-c.c @@ -51,6 +51,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ) if (!(cpu&X264_CPU_ARMV6)) return; +#if !X264_HIGH_BIT_DEPTH pf[I_PRED_4x4_H] = x264_predict_4x4_h_armv6; pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_armv6; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6; @@ -59,6 +60,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ) return; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon; +#endif // !X264_HIGH_BIT_DEPTH } void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] ) @@ -66,12 +68,14 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] ) if (!(cpu&X264_CPU_NEON)) return; +#if !X264_HIGH_BIT_DEPTH pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon; pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon; pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon; pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon; +#endif // !X264_HIGH_BIT_DEPTH } void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) @@ -79,8 +83,10 @@ void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_ if (!(cpu&X264_CPU_NEON)) return; +#if !X264_HIGH_BIT_DEPTH pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon; pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon; +#endif // !X264_HIGH_BIT_DEPTH } void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] ) @@ -88,10 +94,12 @@ void 
x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] ) if (!(cpu&X264_CPU_NEON)) return; +#if !X264_HIGH_BIT_DEPTH pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon; pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon; pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon; pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon; pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon; +#endif // !X264_HIGH_BIT_DEPTH } diff --git a/common/bitstream.h b/common/bitstream.h index d10f3a20..b2aa8b89 100644 --- a/common/bitstream.h +++ b/common/bitstream.h @@ -53,7 +53,7 @@ typedef struct bs_s typedef struct { int last; - int16_t level[16]; + dctcoef level[16]; uint8_t run[16]; } x264_run_level_t; diff --git a/common/common.c b/common/common.c index 14dd7167..728dfab0 100644 --- a/common/common.c +++ b/common/common.c @@ -91,10 +91,10 @@ void x264_param_default( x264_param_t *param ) param->rc.i_vbv_max_bitrate = 0; param->rc.i_vbv_buffer_size = 0; param->rc.f_vbv_buffer_init = 0.9; - param->rc.i_qp_constant = 23; - param->rc.f_rf_constant = 23; + param->rc.i_qp_constant = 23 + QP_BD_OFFSET; + param->rc.f_rf_constant = 23 + QP_BD_OFFSET; param->rc.i_qp_min = 10; - param->rc.i_qp_max = 51; + param->rc.i_qp_max = QP_MAX; param->rc.i_qp_step = 4; param->rc.f_ip_factor = 1.4; param->rc.f_pb_factor = 1.3; @@ -418,6 +418,15 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile ) if( !profile ) return 0; +#if BIT_DEPTH > 8 + if( !strcasecmp( profile, "baseline" ) || !strcasecmp( profile, "main" ) || + !strcasecmp( profile, "high" ) ) + { + x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d.\n", profile, BIT_DEPTH ); + return -1; + } +#endif + if( !strcasecmp( profile, "baseline" ) ) { param->analyse.b_transform_8x8 = 0; @@ -441,7 +450,7 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile ) param->analyse.b_transform_8x8 = 0; param->i_cqm_preset = X264_CQM_FLAT; } 
- else if( !strcasecmp( profile, "high" ) ) + else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) ) { /* Default */ } diff --git a/common/common.h b/common/common.h index 7b60811f..ca279683 100644 --- a/common/common.h +++ b/common/common.h @@ -52,10 +52,15 @@ do {\ #define X264_BFRAME_MAX 16 #define X264_THREAD_MAX 128 -#define X264_PCM_COST (386*8) +#define X264_PCM_COST (384*BIT_DEPTH+16) #define X264_LOOKAHEAD_MAX 250 +#define QP_BD_OFFSET (6*(BIT_DEPTH-8)) +#define QP_MAX (51+QP_BD_OFFSET) +#define QP_MAX_MAX (51+2*6) +#define LAMBDA_MAX (91 << (BIT_DEPTH-8)) +#define PIXEL_MAX ((1 << BIT_DEPTH)-1) // arbitrary, but low because SATD scores are 1/4 normal -#define X264_LOOKAHEAD_QP 12 +#define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET) // number of pixels (per thread) in progress at any given time. // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety @@ -101,17 +106,23 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u #define CP64(dst,src) M64(dst) = M64(src) #define CP128(dst,src) M128(dst) = M128(src) -typedef uint8_t pixel; -typedef uint32_t pixel4; -typedef int16_t dctcoef; +#if X264_HIGH_BIT_DEPTH + typedef uint16_t pixel; + typedef uint64_t pixel4; + typedef int32_t dctcoef; -#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U) -#define MPIXEL_X4(src) M32(src) -#define CPPIXEL_X4(dst,src) CP32(dst,src) -#define CPPIXEL_X8(dst,src) CP64(dst,src) -#define MDCT_X2(dct) M32(dct) -#define CPDCT_X2(dst,src) CP32(dst,src) -#define CPDCT_X4(dst,src) CP64(dst,src) +# define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL) +# define MPIXEL_X4(src) M64(src) +#else + typedef uint8_t pixel; + typedef uint32_t pixel4; + typedef int16_t dctcoef; + +# define PIXEL_SPLAT_X4(x) ((x)*0x01010101U) +# define MPIXEL_X4(src) M32(src) +#endif + +#define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src) #define X264_SCAN8_SIZE (6*8) #define X264_SCAN8_LUMA_SIZE (5*8) @@ -189,7 
+200,7 @@ void x264_init_vlc_tables(); static ALWAYS_INLINE pixel x264_clip_pixel( int x ) { - return x&(~255) ? (-x)>>31 : x; + return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x ); } static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max ) @@ -449,8 +460,8 @@ struct x264_t /* mv/ref cost arrays. Indexed by lambda instead of * qp because, due to rounding, some quantizers share * lambdas. This saves memory. */ - uint16_t *cost_mv[92]; - uint16_t *cost_mv_fpel[92][4]; + uint16_t *cost_mv[LAMBDA_MAX+1]; + uint16_t *cost_mv_fpel[LAMBDA_MAX+1][4]; const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */ diff --git a/common/dct.c b/common/dct.c index 60dbd551..cd273636 100644 --- a/common/dct.c +++ b/common/dct.c @@ -418,6 +418,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->dct4x4dc = dct4x4dc; dctf->idct4x4dc = idct4x4dc; +#if !X264_HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) { @@ -515,6 +516,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add16x16_idct8= x264_add16x16_idct8_neon; } #endif +#endif // !X264_HIGH_BIT_DEPTH } void x264_dct_init_weights( void ) @@ -599,11 +601,9 @@ static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] ) static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] ) { - CPDCT_X2( level, dct ); + memcpy( level, dct, 2 * sizeof(dctcoef) ); ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1) - CPDCT_X2( level+6, dct+6 ); - CPDCT_X4( level+8, dct+8 ); - CPDCT_X4( level+12, dct+12 ); + memcpy( level+6, dct+6, 10 * sizeof(dctcoef) ); } #undef ZIG @@ -618,6 +618,7 @@ static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] ) CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\ CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\ CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE ); +#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) ) #define COPY8x8\ CPPIXEL_X8( 
p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\ CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\ @@ -709,6 +710,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) pf->sub_8x8 = zigzag_sub_8x8_field; pf->sub_4x4 = zigzag_sub_4x4_field; pf->sub_4x4ac = zigzag_sub_4x4ac_field; +#if !X264_HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMXEXT ) { @@ -726,6 +728,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) if( cpu&X264_CPU_ALTIVEC ) pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec; #endif +#endif // !X264_HIGH_BIT_DEPTH } else { @@ -734,6 +737,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) pf->sub_8x8 = zigzag_sub_8x8_frame; pf->sub_4x4 = zigzag_sub_4x4_frame; pf->sub_4x4ac = zigzag_sub_4x4ac_frame; +#if !X264_HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx; @@ -759,13 +763,16 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) if( cpu&X264_CPU_NEON ) pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; #endif +#endif // !X264_HIGH_BIT_DEPTH } pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc; +#if !X264_HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx; if( cpu&X264_CPU_SHUFFLE_IS_FAST ) pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; #endif +#endif // !X264_HIGH_BIT_DEPTH } diff --git a/common/deblock.c b/common/deblock.c index 9e42d43e..c7298747 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -25,8 +25,9 @@ #include "common.h" /* Deblocking filter */ -static const uint8_t i_alpha_table[52+12*2] = +static const uint8_t i_alpha_table[52+12*3] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 5, 6, @@ -36,8 +37,9 @@ static const uint8_t i_alpha_table[52+12*2] = 255,255, 
255,255,255,255,255,255,255,255,255,255,255,255, }; -static const uint8_t i_beta_table[52+12*2] = +static const uint8_t i_beta_table[52+12*3] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3, @@ -47,12 +49,14 @@ static const uint8_t i_beta_table[52+12*2] = 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, }; -static const int8_t i_tc0_table[52+12*2][4] = +static const int8_t i_tc0_table[52+12*3][4] = { {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, + {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, @@ -63,9 +67,9 @@ static const int8_t i_tc0_table[52+12*2][4] = {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, }; -#define alpha_table(x) i_alpha_table[(x)+12] -#define beta_table(x) i_beta_table[(x)+12] -#define tc0_table(x) i_tc0_table[(x)+12] +#define alpha_table(x) i_alpha_table[(x)+24] +#define beta_table(x) i_beta_table[(x)+24] +#define tc0_table(x) i_tc0_table[(x)+24] /* From ffmpeg */ static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int 
beta, int8_t *tc0 ) @@ -265,18 +269,19 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264 static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) { - int index_a = i_qp + h->sh.i_alpha_c0_offset; - int alpha = alpha_table(index_a); - int beta = beta_table(i_qp + h->sh.i_beta_offset); + int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset; + int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset; + int alpha = alpha_table(index_a) << (BIT_DEPTH-8); + int beta = beta_table(index_b) << (BIT_DEPTH-8); int8_t tc[4]; if( !M32(bS) || !alpha || !beta ) return; - tc[0] = tc0_table(index_a)[bS[0]] + b_chroma; - tc[1] = tc0_table(index_a)[bS[1]] + b_chroma; - tc[2] = tc0_table(index_a)[bS[2]] + b_chroma; - tc[3] = tc0_table(index_a)[bS[3]] + b_chroma; + tc[0] = (tc0_table(index_a)[bS[0]] << (BIT_DEPTH-8)) + b_chroma; + tc[1] = (tc0_table(index_a)[bS[1]] << (BIT_DEPTH-8)) + b_chroma; + tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma; + tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma; pf_inter( pix1, i_stride, alpha, beta, tc ); if( b_chroma ) @@ -285,8 +290,10 @@ static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stri static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) { - int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset); - int beta = beta_table(i_qp + h->sh.i_beta_offset); + int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset; + int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset; + int alpha = alpha_table(index_a) << (BIT_DEPTH-8); + int beta = beta_table(index_b) << (BIT_DEPTH-8); if( !alpha || !beta ) return; @@ -450,6 +457,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) #if HAVE_MMX if( cpu&X264_CPU_MMXEXT ) { +#if !X264_HIGH_BIT_DEPTH 
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext; pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext; @@ -460,10 +468,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext; #endif +#endif // !X264_HIGH_BIT_DEPTH pf->deblock_strength = x264_deblock_strength_mmxext; if( cpu&X264_CPU_SSE2 ) { pf->deblock_strength = x264_deblock_strength_sse2; +#if !X264_HIGH_BIT_DEPTH if( !(cpu&X264_CPU_STACK_MOD4) ) { pf->deblock_luma[1] = x264_deblock_v_luma_sse2; @@ -471,12 +481,14 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; } +#endif // !X264_HIGH_BIT_DEPTH } if( cpu&X264_CPU_SSSE3 ) pf->deblock_strength = x264_deblock_strength_ssse3; } #endif +#if !X264_HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) { @@ -494,4 +506,5 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) pf->deblock_chroma[0] = x264_deblock_h_chroma_neon; } #endif +#endif // !X264_HIGH_BIT_DEPTH } diff --git a/common/macroblock.c b/common/macroblock.c index 94df8f6e..386063ac 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -337,7 +337,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) int scratch_size = 0; if( !b_lookahead ) { - int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t); + int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(dctcoef); int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int); int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range); int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) * diff --git a/common/macroblock.h b/common/macroblock.h index b2723da2..68844cc3 100644 --- a/common/macroblock.h +++ 
b/common/macroblock.h @@ -238,17 +238,30 @@ static const uint16_t block_idx_xy_fdec[16] = 2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE }; -static const uint8_t i_chroma_qp_table[52+12*2] = +#define QP(qP) ( (qP)+QP_BD_OFFSET ) +static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 29, 30, 31, 32, 32, 33, 34, 34, 35, 35, - 36, 36, 37, 37, 37, 38, 38, 38, 39, 39, - 39, 39, - 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, +#if BIT_DEPTH > 9 + QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7), +#endif +#if BIT_DEPTH > 8 + QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1), +#endif + QP(0), QP(1), QP(2), QP(3), QP(4), QP(5), + QP(6), QP(7), QP(8), QP(9), QP(10), QP(11), + QP(12), QP(13), QP(14), QP(15), QP(16), QP(17), + QP(18), QP(19), QP(20), QP(21), QP(22), QP(23), + QP(24), QP(25), QP(26), QP(27), QP(28), QP(29), + QP(29), QP(30), QP(31), QP(32), QP(32), QP(33), + QP(34), QP(34), QP(35), QP(35), QP(36), QP(36), + QP(37), QP(37), QP(37), QP(38), QP(38), QP(38), + QP(39), QP(39), QP(39), QP(39), + QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), + QP(39), QP(39), QP(39), QP(39), QP(39), QP(39), }; +#undef QP enum cabac_ctx_block_cat_e { @@ -340,26 +353,31 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b ) return (a&0xFFFF) + (b<<16); #endif } +static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b ) +{ +#ifdef WORDS_BIGENDIAN + return b + ((uint64_t)a<<32); +#else + return a + ((uint64_t)b<<32); +#endif +} -#define pack_pixel_1to2 pack8to16 -#define pack_pixel_2to4 pack16to32 +#if X264_HIGH_BIT_DEPTH +# define pack_pixel_1to2 pack16to32 +# define pack_pixel_2to4 pack32to64 +#else +# define pack_pixel_1to2 pack8to16 +# define pack_pixel_2to4 pack16to32 +#endif -#define array_non_zero(a) array_non_zero_int(a, sizeof(a)) +#define array_non_zero(a) 
array_non_zero_int(a, sizeof(a)/sizeof(dctcoef)) #define array_non_zero_int array_non_zero_int static ALWAYS_INLINE int array_non_zero_int( dctcoef *v, int i_count ) { - if(i_count == 8) - return !!M64( &v[0] ); - else if(i_count == 16) - return !!(M64( &v[0] ) | M64( &v[4] )); - else if(i_count == 32) - return !!(M64( &v[0] ) | M64( &v[4] ) | M64( &v[8] ) | M64( &v[12] )); - else - { - for( int i = 0; i < i_count; i+=4 ) - if( M64( &v[i] ) ) return 1; - return 0; - } + for( int i = 0; i < i_count; i++ ) + if( v[i] ) + return 1; + return 0; } static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx ) { diff --git a/common/mc.c b/common/mc.c index 9776becf..5ef0682e 100644 --- a/common/mc.c +++ b/common/mc.c @@ -117,11 +117,14 @@ static void x264_weight_cache( x264_t *h, x264_weight_t *w ) { w->weightfn = h->mc.weight; } -#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * weight->i_scale + (1<<(weight->i_denom - 1))) >> weight->i_denom) + weight->i_offset ) -#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * weight->i_scale + weight->i_offset ) -static inline void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height ) +#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset ) +#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset ) +static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height ) { - if( weight->i_denom >= 1 ) + int offset = weight->i_offset << (BIT_DEPTH-8); + int scale = weight->i_scale; + int denom = weight->i_denom; + if( denom >= 1 ) { for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride ) for( int x = 0; x < i_width; x++ ) @@ -135,21 +138,10 @@ static inline void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_sr } } -#define MC_WEIGHT_C( name, lx ) \ +#define 
MC_WEIGHT_C( name, width ) \ static void name( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int height ) \ { \ - if( weight->i_denom >= 1 ) \ - { \ - for( int y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \ - for( int x = 0; x < lx; x++ ) \ - opscale( x ); \ - } \ - else \ - { \ - for( int y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \ - for( int x = 0; x < lx; x++ ) \ - opscale_noden( x ); \ - } \ + mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\ } MC_WEIGHT_C( mc_weight_w20, 20 ) @@ -182,7 +174,7 @@ static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, #define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d])) static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, - int stride, int width, int height, int16_t *buf ) + int stride, int width, int height, dctcoef *buf ) { for( int y = 0; y < height; y++ ) { @@ -301,7 +293,12 @@ void x264_plane_copy_c( pixel *dst, int i_dst, { while( h-- ) { +#if X264_HIGH_BIT_DEPTH + for( int i = 0; i < w; i++ ) + dst[i] = src[i] << (BIT_DEPTH-8); +#else memcpy( dst, src, w ); +#endif dst += i_dst; src += i_src; } diff --git a/common/mc.h b/common/mc.h index bb16d13e..cbdf1a63 100644 --- a/common/mc.h +++ b/common/mc.h @@ -82,7 +82,7 @@ typedef struct uint8_t *src, int i_src, int w, int h); void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, - int i_stride, int i_width, int i_height, int16_t *buf ); + int i_stride, int i_width, int i_height, dctcoef *buf ); /* prefetch the next few macroblocks of fenc or fdec */ void (*prefetch_fenc)( pixel *pix_y, int stride_y, diff --git a/common/pixel.c b/common/pixel.c index 8441c7ae..069589f6 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -177,7 +177,7 @@ static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride pix2 += i_stride2; } 
sum = abs(sum); - var = sqr - (sum * sum >> 6); + var = sqr - ((uint64_t)sum * sum >> 6); *ssd = sqr; return var; } @@ -406,12 +406,14 @@ SAD_X( 8x4 ) SAD_X( 4x8 ) SAD_X( 4x4 ) +#if !X264_HIGH_BIT_DEPTH #if ARCH_UltraSparc SAD_X( 16x16_vis ) SAD_X( 16x8_vis ) SAD_X( 8x16_vis ) SAD_X( 8x8_vis ) #endif +#endif // !X264_HIGH_BIT_DEPTH /**************************************************************************** * pixel_satd_x4 @@ -444,6 +446,7 @@ SATD_X_DECL6( cpu )\ SATD_X( 4x4, cpu ) SATD_X_DECL7() +#if !X264_HIGH_BIT_DEPTH #if HAVE_MMX SATD_X_DECL7( _mmxext ) SATD_X_DECL6( _sse2 ) @@ -454,6 +457,7 @@ SATD_X_DECL7( _sse4 ) #if HAVE_ARMV6 SATD_X_DECL7( _neon ) #endif +#endif // !X264_HIGH_BIT_DEPTH #define INTRA_MBCMP_8x8( mbcmp )\ void x264_intra_##mbcmp##_x3_8x8( pixel *fenc, pixel edge[33], int res[3] )\ @@ -520,8 +524,8 @@ static void ssim_4x4x2_core( const pixel *pix1, int stride1, static float ssim_end1( int s1, int s2, int ss, int s12 ) { - static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5); - static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5); + static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5); + static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5); int vars = ss*64 - s1*s1 - s2*s2; int covar = s12*64 - s1*s2; return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2) @@ -678,6 +682,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16; +#if !X264_HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) { @@ -903,17 +908,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) } } #endif +#endif // !X264_HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) { x264_pixel_altivec_init( pixf ); } #endif +#if !X264_HIGH_BIT_DEPTH #if ARCH_UltraSparc INIT4( sad, _vis ); INIT4( sad_x3, _vis ); INIT4( sad_x4, _vis ); #endif +#endif // !X264_HIGH_BIT_DEPTH pixf->ads[PIXEL_8x16] = 
pixf->ads[PIXEL_8x4] = diff --git a/common/ppc/dct.c b/common/ppc/dct.c index eb223ae2..85d5ce7f 100644 --- a/common/ppc/dct.c +++ b/common/ppc/dct.c @@ -24,6 +24,7 @@ #include "common/common.h" #include "ppccommon.h" +#if !X264_HIGH_BIT_DEPTH #define VEC_DCT(a0,a1,a2,a3,b0,b1,b2,b3) \ b1 = vec_add( a0, a3 ); \ b3 = vec_add( a1, a2 ); \ @@ -482,4 +483,5 @@ void x264_zigzag_scan_4x4_field_altivec( int16_t level[16], int16_t dct[4][4] ) vec_st( tmp0v, 0x00, level ); vec_st( tmp1v, 0x10, level ); } +#endif // !X264_HIGH_BIT_DEPTH diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c index 0c8d2d43..986710d9 100644 --- a/common/ppc/deblock.c +++ b/common/ppc/deblock.c @@ -21,6 +21,7 @@ #include "common/common.h" #include "ppccommon.h" +#if !X264_HIGH_BIT_DEPTH #define transpose4x16(r0, r1, r2, r3) \ { \ register vec_u8_t r4; \ @@ -292,3 +293,4 @@ void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, transpose4x16(line1, line2, line3, line4); write16x4(pix-2, stride, line1, line2, line3, line4); } +#endif // !X264_HIGH_BIT_DEPTH diff --git a/common/ppc/mc.c b/common/ppc/mc.c index 7ad80505..744a8043 100644 --- a/common/ppc/mc.c +++ b/common/ppc/mc.c @@ -33,6 +33,7 @@ #include "mc.h" #include "ppccommon.h" +#if !X264_HIGH_BIT_DEPTH typedef void (*pf_mc_t)( uint8_t *src, int i_src, uint8_t *dst, int i_dst, int i_height ); @@ -792,9 +793,11 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_ dstc += dst_stride; } } +#endif // !X264_HIGH_BIT_DEPTH void x264_mc_altivec_init( x264_mc_functions_t *pf ) { +#if !X264_HIGH_BIT_DEPTH pf->mc_luma = mc_luma_altivec; pf->get_ref = get_ref_altivec; pf->mc_chroma = mc_chroma_altivec; @@ -804,4 +807,5 @@ void x264_mc_altivec_init( x264_mc_functions_t *pf ) pf->hpel_filter = x264_hpel_filter_altivec; pf->frame_init_lowres_core = frame_init_lowres_core_altivec; +#endif // !X264_HIGH_BIT_DEPTH } diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c index 3f996065..bd5f547f 100644 
--- a/common/ppc/pixel.c +++ b/common/ppc/pixel.c @@ -24,6 +24,7 @@ #include "common/common.h" #include "ppccommon.h" +#if !X264_HIGH_BIT_DEPTH /*********************************************************************** * SAD routines **********************************************************************/ @@ -1979,12 +1980,14 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1, sums[0][3] = temp[0]; sums[1][3] = temp[1]; } +#endif // !X264_HIGH_BIT_DEPTH /**************************************************************************** * x264_pixel_init: ****************************************************************************/ void x264_pixel_altivec_init( x264_pixel_function_t *pixf ) { +#if !X264_HIGH_BIT_DEPTH pixf->sad[PIXEL_16x16] = pixel_sad_16x16_altivec; pixf->sad[PIXEL_8x16] = pixel_sad_8x16_altivec; pixf->sad[PIXEL_16x8] = pixel_sad_16x8_altivec; @@ -2023,4 +2026,5 @@ void x264_pixel_altivec_init( x264_pixel_function_t *pixf ) pixf->hadamard_ac[PIXEL_8x8] = x264_pixel_hadamard_ac_8x8_altivec; pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec; +#endif // !X264_HIGH_BIT_DEPTH } diff --git a/common/ppc/predict.c b/common/ppc/predict.c index 3fb1a2b7..c71dbb56 100644 --- a/common/ppc/predict.c +++ b/common/ppc/predict.c @@ -23,6 +23,7 @@ #include "pixel.h" #include "ppccommon.h" +#if !X264_HIGH_BIT_DEPTH static void predict_8x8c_p_altivec( uint8_t *src ) { int H = 0, V = 0; @@ -194,6 +195,7 @@ static void predict_16x16_v_altivec( uint8_t *src ) src += FDEC_STRIDE; } } +#endif // !X264_HIGH_BIT_DEPTH /**************************************************************************** @@ -201,6 +203,7 @@ static void predict_16x16_v_altivec( uint8_t *src ) ****************************************************************************/ void x264_predict_16x16_init_altivec( x264_predict_t pf[7] ) { +#if !X264_HIGH_BIT_DEPTH pf[I_PRED_16x16_V ] = predict_16x16_v_altivec; pf[I_PRED_16x16_H ] = predict_16x16_h_altivec; pf[I_PRED_16x16_DC] = 
predict_16x16_dc_altivec; @@ -208,9 +211,12 @@ void x264_predict_16x16_init_altivec( x264_predict_t pf[7] ) pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_altivec; pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_altivec; pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_altivec; +#endif // !X264_HIGH_BIT_DEPTH } void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] ) { +#if !X264_HIGH_BIT_DEPTH pf[I_PRED_CHROMA_P] = predict_8x8c_p_altivec; +#endif // !X264_HIGH_BIT_DEPTH } diff --git a/common/ppc/quant.c b/common/ppc/quant.c index 6f41a06f..ffd6a1ba 100644 --- a/common/ppc/quant.c +++ b/common/ppc/quant.c @@ -22,6 +22,7 @@ #include "ppccommon.h" #include "quant.h" +#if !X264_HIGH_BIT_DEPTH // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled" #define QUANT_16_U( idx0, idx1 ) \ { \ @@ -360,4 +361,5 @@ void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i DEQUANT_SHR(); } } +#endif // !X264_HIGH_BIT_DEPTH diff --git a/common/predict.c b/common/predict.c index 782dfa32..fa71b6e6 100644 --- a/common/predict.c +++ b/common/predict.c @@ -53,40 +53,40 @@ void x264_predict_16x16_dc_c( pixel *src ) { - pixel4 dc = 0; + int dc = 0; for( int i = 0; i < 16; i++ ) { dc += src[-1 + i * FDEC_STRIDE]; dc += src[i - FDEC_STRIDE]; } - dc = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 ); + pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 ); - PREDICT_16x16_DC( dc ); + PREDICT_16x16_DC( dcsplat ); } static void x264_predict_16x16_dc_left_c( pixel *src ) { - pixel4 dc = 0; + int dc = 0; for( int i = 0; i < 16; i++ ) dc += src[-1 + i * FDEC_STRIDE]; - dc = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 ); + pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 ); - PREDICT_16x16_DC( dc ); + PREDICT_16x16_DC( dcsplat ); } static void x264_predict_16x16_dc_top_c( pixel *src ) { - pixel4 dc = 0; + int dc = 0; for( int i = 0; i < 16; i++ ) dc += src[i - FDEC_STRIDE]; - dc = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 ); + pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 ); - PREDICT_16x16_DC( 
dc ); + PREDICT_16x16_DC( dcsplat ); } static void x264_predict_16x16_dc_128_c( pixel *src ) { - PREDICT_16x16_DC( PIXEL_SPLAT_X4( 0x80 ) ); + PREDICT_16x16_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) ); } void x264_predict_16x16_h_c( pixel *src ) { @@ -155,53 +155,53 @@ static void x264_predict_8x8c_dc_128_c( pixel *src ) { for( int y = 0; y < 8; y++ ) { - MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 0x80 ); - MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 0x80 ); + MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); + MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); src += FDEC_STRIDE; } } static void x264_predict_8x8c_dc_left_c( pixel *src ) { - pixel4 dc0 = 0, dc1 = 0; + int dc0 = 0, dc1 = 0; for( int y = 0; y < 4; y++ ) { dc0 += src[y * FDEC_STRIDE - 1]; dc1 += src[(y+4) * FDEC_STRIDE - 1]; } - dc0 = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 ); - dc1 = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 ); + pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 ); + pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 ); for( int y = 0; y < 4; y++ ) { - MPIXEL_X4( src+0 ) = dc0; - MPIXEL_X4( src+4 ) = dc0; + MPIXEL_X4( src+0 ) = dc0splat; + MPIXEL_X4( src+4 ) = dc0splat; src += FDEC_STRIDE; } for( int y = 0; y < 4; y++ ) { - MPIXEL_X4( src+0 ) = dc1; - MPIXEL_X4( src+4 ) = dc1; + MPIXEL_X4( src+0 ) = dc1splat; + MPIXEL_X4( src+4 ) = dc1splat; src += FDEC_STRIDE; } } static void x264_predict_8x8c_dc_top_c( pixel *src ) { - pixel4 dc0 = 0, dc1 = 0; + int dc0 = 0, dc1 = 0; for( int x = 0; x < 4; x++ ) { dc0 += src[x - FDEC_STRIDE]; dc1 += src[x + 4 - FDEC_STRIDE]; } - dc0 = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 ); - dc1 = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 ); + pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 ); + pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 ); for( int y = 0; y < 8; y++ ) { - MPIXEL_X4( src+0 ) = dc0; - MPIXEL_X4( src+4 ) = dc1; + MPIXEL_X4( src+0 ) = dc0splat; + MPIXEL_X4( src+4 ) = dc1splat; src += FDEC_STRIDE; } } @@ -306,7 +306,7 @@ static void x264_predict_8x8c_p_c( pixel 
*src ) static void x264_predict_4x4_dc_128_c( pixel *src ) { - PREDICT_4x4_DC( PIXEL_SPLAT_X4( 0x80 ) ); + PREDICT_4x4_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) ); } static void x264_predict_4x4_dc_left_c( pixel *src ) { @@ -491,7 +491,8 @@ static void x264_predict_8x8_filter_c( pixel *src, pixel edge[33], int i_neighbo } else { - M64( edge+24 ) = SRC(7,-1) * 0x0101010101010101ULL; + MPIXEL_X4( edge+24 ) = PIXEL_SPLAT_X4( SRC(7,-1) ); + MPIXEL_X4( edge+28 ) = PIXEL_SPLAT_X4( SRC(7,-1) ); edge[32] = SRC(7,-1); } } @@ -523,7 +524,7 @@ static void x264_predict_8x8_filter_c( pixel *src, pixel edge[33], int i_neighbo static void x264_predict_8x8_dc_128_c( pixel *src, pixel edge[33] ) { - PREDICT_8x8_DC( PIXEL_SPLAT_X4( 0x80 ) ); + PREDICT_8x8_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) ); } static void x264_predict_8x8_dc_left_c( pixel *src, pixel edge[33] ) { @@ -554,9 +555,13 @@ void x264_predict_8x8_h_c( pixel *src, pixel edge[33] ) } void x264_predict_8x8_v_c( pixel *src, pixel edge[33] ) { - uint64_t top = M64( edge+16 ); + pixel4 top[2] = { MPIXEL_X4( edge+16 ), + MPIXEL_X4( edge+20 ) }; for( int y = 0; y < 8; y++ ) - M64( src+y*FDEC_STRIDE ) = top; + { + MPIXEL_X4( src+y*FDEC_STRIDE+0 ) = top[0]; + MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = top[1]; + } } static void x264_predict_8x8_ddl_c( pixel *src, pixel edge[33] ) { diff --git a/common/quant.c b/common/quant.c index ece52f9d..a7b72cfb 100644 --- a/common/quant.c +++ b/common/quant.c @@ -142,7 +142,7 @@ static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, uint16_t *offset, int for( int i = 1; i < size; i++ ) { int level = dct[i]; - int sign = level>>15; + int sign = level>>31; level = (level+sign)^sign; sum[i] += level; level -= offset[i]; @@ -177,10 +177,7 @@ static int ALWAYS_INLINE x264_decimate_score_internal( dctcoef *dct, int i_max ) int i_score = 0; int idx = i_max - 1; - /* Yes, dct[idx-1] is guaranteed to be 32-bit aligned. 
idx>=0 instead of 1 works correctly for the same reason */ - while( idx >= 0 && MDCT_X2( &dct[idx-1] ) == 0 ) - idx -= 2; - if( idx >= 0 && dct[idx] == 0 ) + while( idx >= 0 && dct[idx] == 0 ) idx--; while( idx >= 0 ) { @@ -216,10 +213,7 @@ static int x264_decimate_score64( dctcoef *dct ) static int ALWAYS_INLINE x264_coeff_last_internal( dctcoef *l, int i_count ) { - int i_last; - for( i_last = i_count-1; i_last >= 3; i_last -= 4 ) - if( M64( l+i_last-3 ) ) - break; + int i_last = i_count-1; while( i_last >= 0 && l[i_last] == 0 ) i_last--; return i_last; @@ -287,6 +281,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16; +#if !X264_HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) { @@ -425,6 +420,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon; } #endif +#endif // !X264_HIGH_BIT_DEPTH pf->coeff_last[ DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4]; pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC]; pf->coeff_level_run[ DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4]; diff --git a/common/set.c b/common/set.c index 16cff8ef..86f38542 100644 --- a/common/set.c +++ b/common/set.c @@ -78,6 +78,7 @@ int x264_cqm_init( x264_t *h ) 32 - 11, 32 - 21 }; int max_qp_err = -1; int max_chroma_qp_err = -1; + int min_qp_err = QP_MAX+1; for( int i = 0; i < 6; i++ ) { @@ -94,9 +95,9 @@ int x264_cqm_init( x264_t *h ) } else { - CHECKED_MALLOC( h-> quant4_mf[i], 52*size*sizeof(uint16_t) ); + CHECKED_MALLOC( h-> quant4_mf[i], (QP_MAX+1)*size*sizeof(uint16_t) ); CHECKED_MALLOC( h->dequant4_mf[i], 6*size*sizeof(int) ); - CHECKED_MALLOC( h->unquant4_mf[i], 52*size*sizeof(int) ); + CHECKED_MALLOC( h->unquant4_mf[i], (QP_MAX+1)*size*sizeof(int) ); } for( j = (i<4 ? 
0 : 4); j < i; j++ ) @@ -106,7 +107,7 @@ int x264_cqm_init( x264_t *h ) if( j < i ) h->quant4_bias[i] = h->quant4_bias[j]; else - CHECKED_MALLOC( h->quant4_bias[i], 52*size*sizeof(uint16_t) ); + CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(uint16_t) ); } for( int q = 0; q < 6; q++ ) @@ -140,7 +141,7 @@ int x264_cqm_init( x264_t *h ) quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]); } } - for( int q = 0; q < 52; q++ ) + for( int q = 0; q < QP_MAX+1; q++ ) { int j; for( int i_list = 0; i_list < 4; i_list++ ) @@ -148,6 +149,11 @@ int x264_cqm_init( x264_t *h ) { h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i]; h->quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1); + if( !j ) + { + min_qp_err = X264_MIN( min_qp_err, q ); + continue; + } // round to nearest, unless that would cause the deadzone to be negative h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j ); if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) ) @@ -161,6 +167,11 @@ int x264_cqm_init( x264_t *h ) { h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i]; h->quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][i], q/6); + if( !j ) + { + min_qp_err = X264_MIN( min_qp_err, q ); + continue; + } h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j ); if( j > 0xffff && q > max_qp_err ) max_qp_err = q; @@ -179,6 +190,12 @@ int x264_cqm_init( x264_t *h ) x264_log( h, X264_LOG_ERROR, "but min chroma QP is implied to be %d.\n", h->chroma_qp_table[h->param.rc.i_qp_min] ); return -1; } + if( !h->mb.b_lossless && min_qp_err <= h->param.rc.i_qp_max ) + { + x264_log( h, X264_LOG_ERROR, "Quantization underflow. 
Your CQM is incompatible with QP > %d,\n", min_qp_err-1 ); + x264_log( h, X264_LOG_ERROR, "but max QP is implied to be %d.\n", h->param.rc.i_qp_max ); + return -1; + } return 0; fail: x264_cqm_delete( h ); diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 4ddf2e5b..8a12f833 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -125,6 +125,7 @@ PIXEL_AVG_WALL(sse2) PIXEL_AVG_WALL(sse2_misalign) PIXEL_AVG_WALL(cache64_ssse3) +#if !X264_HIGH_BIT_DEPTH #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\ static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\ {\ @@ -355,24 +356,28 @@ static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+15)&~15, h-1 ); } } +#endif // !X264_HIGH_BIT_DEPTH void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_MMX) ) return; + pf->memcpy_aligned = x264_memcpy_aligned_mmx; + pf->memzero_aligned = x264_memzero_aligned_mmx; +#if !X264_HIGH_BIT_DEPTH pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx; - pf->memcpy_aligned = x264_memcpy_aligned_mmx; - pf->memzero_aligned = x264_memzero_aligned_mmx; pf->integral_init4v = x264_integral_init4v_mmx; pf->integral_init8v = x264_integral_init8v_mmx; +#endif // !X264_HIGH_BIT_DEPTH if( !(cpu&X264_CPU_MMXEXT) ) return; +#if !X264_HIGH_BIT_DEPTH pf->mc_luma = mc_luma_mmxext; pf->get_ref = get_ref_mmxext; pf->mc_chroma = x264_mc_chroma_mmxext; @@ -412,12 +417,14 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext; } #endif +#endif // !X264_HIGH_BIT_DEPTH if( !(cpu&X264_CPU_SSE2) ) return; pf->memcpy_aligned = x264_memcpy_aligned_sse2; pf->memzero_aligned = x264_memzero_aligned_sse2; +#if !X264_HIGH_BIT_DEPTH 
pf->integral_init4v = x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->hpel_filter = x264_hpel_filter_sse2_amd; @@ -492,4 +499,5 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->integral_init4h = x264_integral_init4h_sse4; pf->integral_init8h = x264_integral_init8h_sse4; +#endif // !X264_HIGH_BIT_DEPTH } diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index e771431e..4004265f 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -75,6 +75,7 @@ void x264_predict_16x16_v_sse2( uint8_t *src ); void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c ); +#if !X264_HIGH_BIT_DEPTH ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8}; ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1}; ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4}; @@ -364,6 +365,7 @@ INTRA_SA8D_X3(ssse3) #else INTRA_SA8D_X3(mmxext) #endif +#endif // !X264_HIGH_BIT_DEPTH /**************************************************************************** * Exported functions: @@ -372,6 +374,7 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_MMX) ) return; +#if !X264_HIGH_BIT_DEPTH pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx; if( !(cpu&X264_CPU_MMXEXT) ) return; @@ -397,12 +400,14 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] ) #ifdef __GNUC__ pf[I_PRED_16x16_P] = x264_predict_16x16_p_ssse3; #endif +#endif // !X264_HIGH_BIT_DEPTH } void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] ) { if( !(cpu&X264_CPU_MMX) ) return; +#if !X264_HIGH_BIT_DEPTH #if ARCH_X86_64 pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left; #endif @@ -424,12 +429,14 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] ) #ifdef __GNUC__ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_ssse3; #endif +#endif // !X264_HIGH_BIT_DEPTH } void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], 
x264_predict_8x8_filter_t *predict_8x8_filter ) { if( !(cpu&X264_CPU_MMXEXT) ) return; +#if !X264_HIGH_BIT_DEPTH pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmxext; pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmxext; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_mmxext; @@ -456,12 +463,14 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3; pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3; *predict_8x8_filter = x264_predict_8x8_filter_ssse3; +#endif // !X264_HIGH_BIT_DEPTH } void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] ) { if( !(cpu&X264_CPU_MMXEXT) ) return; +#if !X264_HIGH_BIT_DEPTH pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmxext; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext; pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_mmxext; @@ -474,4 +483,5 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] ) pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3; pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3; pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3; +#endif // !X264_HIGH_BIT_DEPTH } diff --git a/configure b/configure index 24d15adf..43fbe393 100755 --- a/configure +++ b/configure @@ -18,6 +18,7 @@ echo " --enable-gprof adds -pg, doesn't strip" echo " --enable-visualize enables visualization (X11 only)" echo " --enable-pic build position-independent code" echo " --enable-shared build libx264.so" +echo " --bit-depth=BIT_DEPTH sets output bit depth (8-10), default 8" echo " --extra-asflags=EASFLAGS add EASFLAGS to ASFLAGS" echo " --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS" echo " --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS" @@ -124,6 +125,7 @@ gprof="no" pic="no" vis="no" shared="no" +bit_depth="8" CFLAGS="$CFLAGS -Wall -I." 
LDFLAGS="$LDFLAGS" @@ -208,6 +210,14 @@ for opt do CFLAGS="$CFLAGS --sysroot=${opt#--sysroot=}" LDFLAGS="$LDFLAGS --sysroot=${opt#--sysroot=}" ;; + --bit-depth=*) + bit_depth="${opt#--bit-depth=}" + if [ "$bit_depth" -lt "8" -o "$bit_depth" -gt "10" ]; then + echo "Supplied bit depth must be in range [8,10]." + exit 1 + fi + bit_depth=`expr $bit_depth + 0` + ;; *) echo "Unknown option $opt, ignored" ;; @@ -644,6 +654,12 @@ if cc_check '' -Wshadow ; then CFLAGS="-Wshadow $CFLAGS" fi +if [ "$bit_depth" -gt "8" ]; then + define X264_HIGH_BIT_DEPTH +fi + +define BIT_DEPTH $bit_depth + rm -f conftest* # generate config files @@ -724,6 +740,7 @@ gprof: $gprof PIC: $pic shared: $shared visualize: $vis +bit depth: $bit_depth EOF echo >> config.log diff --git a/encoder/analyse.c b/encoder/analyse.c index 48a6f394..44543731 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -134,25 +134,27 @@ typedef struct } x264_mb_analysis_t; /* lambda = pow(2,qp/6-2) */ -const uint8_t x264_lambda_tab[52] = { - 1, 1, 1, 1, 1, 1, 1, 1, /* 0-7 */ - 1, 1, 1, 1, /* 8-11 */ - 1, 1, 1, 1, 2, 2, 2, 2, /* 12-19 */ - 3, 3, 3, 4, 4, 4, 5, 6, /* 20-27 */ - 6, 7, 8, 9,10,11,13,14, /* 28-35 */ - 16,18,20,23,25,29,32,36, /* 36-43 */ - 40,45,51,57,64,72,81,91 /* 44-51 */ +const uint16_t x264_lambda_tab[QP_MAX_MAX+1] = { + 1, 1, 1, 1, 1, 1, 1, 1, /* 0- 7 */ + 1, 1, 1, 1, 1, 1, 1, 1, /* 8-15 */ + 2, 2, 2, 2, 3, 3, 3, 4, /* 16-23 */ + 4, 4, 5, 6, 6, 7, 8, 9, /* 24-31 */ + 10, 11, 13, 14, 16, 18, 20, 23, /* 32-39 */ + 25, 29, 32, 36, 40, 45, 51, 57, /* 40-47 */ + 64, 72, 81, 91, 102, 114, 128, 144, /* 48-55 */ + 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */ }; /* lambda2 = pow(lambda,2) * .9 * 256 */ -const int x264_lambda2_tab[52] = { - 14, 18, 22, 28, 36, 45, 57, 72, /* 0 - 7 */ - 91, 115, 145, 182, 230, 290, 365, 460, /* 8 - 15 */ - 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16 - 23 */ - 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24 - 31 */ - 23407, 29491, 37156, 46814, 
58982, 74313, 93628, 117964, /* 32 - 39 */ -148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40 - 47 */ -943718, 1189010, 1498059, 1887436 /* 48 - 51 */ +const int x264_lambda2_tab[QP_MAX_MAX+1] = { + 14, 18, 22, 28, 36, 45, 57, 72, /* 0- 7 */ + 91, 115, 145, 182, 230, 290, 365, 460, /* 8-15 */ + 580, 731, 921, 1161, 1462, 1843, 2322, 2925, /* 16-23 */ + 3686, 4644, 5851, 7372, 9289, 11703, 14745, 18578, /* 24-31 */ + 23407, 29491, 37156, 46814, 58982, 74313, 93628, 117964, /* 32-39 */ + 148626, 187257, 235929, 297252, 374514, 471859, 594505, 749029, /* 40-47 */ + 943718,1189010,1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */ +5992238,7549747,9512085,11984476,15099494,19024170,23968953,30198988, /* 56-63 */ }; const uint8_t x264_exp2_lut[64] = { @@ -188,27 +190,31 @@ const float x264_log2_lz_lut[32] = { // should the intra and inter lambdas be different? // I'm just matching the behaviour of deadzone quant. -static const int x264_trellis_lambda2_tab[2][52] = { +static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] = { // inter lambda = .85 * .85 * 2**(qp/3. 
+ 10 - LAMBDA_BITS) - { 46, 58, 73, 92, 117, 147, - 185, 233, 294, 370, 466, 587, - 740, 932, 1174, 1480, 1864, 2349, - 2959, 3728, 4697, 5918, 7457, 9395, - 11837, 14914, 18790, 23674, 29828, 37581, - 47349, 59656, 75163, 94699, 119313, 150326, - 189399, 238627, 300652, 378798, 477255, 601304, - 757596, 954511, 1202608, 1515192, 1909022, 2405217, - 3030384, 3818045, 4810435, 6060769 }, + { 46, 58, 73, 92, 117, 147, + 185, 233, 294, 370, 466, 587, + 740, 932, 1174, 1480, 1864, 2349, + 2959, 3728, 4697, 5918, 7457, 9395, + 11837, 14914, 18790, 23674, 29828, 37581, + 47349, 59656, 75163, 94699, 119313, 150326, + 189399, 238627, 300652, 378798, 477255, 601304, + 757596, 954511, 1202608, 1515192, 1909022, 2405217, + 3030384, 3818045, 4810435, 6060769, 7636091, 9620872, + 12121539,15272182,19241743,24243077,30544363,38483486, + 48486154,61088726,76966972,96972308 }, // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS) - { 27, 34, 43, 54, 68, 86, - 108, 136, 172, 216, 273, 343, - 433, 545, 687, 865, 1090, 1374, - 1731, 2180, 2747, 3461, 4361, 5494, - 6922, 8721, 10988, 13844, 17442, 21976, - 27688, 34885, 43953, 55377, 69771, 87906, - 110755, 139543, 175813, 221511, 279087, 351627, - 443023, 558174, 703255, 886046, 1116348, 1406511, - 1772093, 2232697, 2813022, 3544186 } + { 27, 34, 43, 54, 68, 86, + 108, 136, 172, 216, 273, 343, + 433, 545, 687, 865, 1090, 1374, + 1731, 2180, 2747, 3461, 4361, 5494, + 6922, 8721, 10988, 13844, 17442, 21976, + 27688, 34885, 43953, 55377, 69771, 87906, + 110755, 139543, 175813, 221511, 279087, 351627, + 443023, 558174, 703255, 886046, 1116348, 1406511, + 1772093, 2232697, 2813022, 3544186, 4465396, 5626046, + 7088374, 8930791,11252092,14176748,17861583,22504184, + 28353495,35723165,45008368,56706990 } }; static const uint16_t x264_chroma_lambda2_offset_tab[] = { @@ -237,7 +243,7 @@ static const uint8_t i_sub_mb_p_cost_table[4] = { static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a ); -static uint16_t 
x264_cost_ref[92][3][33]; +static uint16_t x264_cost_ref[LAMBDA_MAX+1][3][33]; static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER; int x264_analyse_init_costs( x264_t *h, int qp ) @@ -275,7 +281,7 @@ fail: void x264_analyse_free_costs( x264_t *h ) { - for( int i = 0; i < 92; i++ ) + for( int i = 0; i < LAMBDA_MAX+1; i++ ) { if( h->cost_mv[i] ) x264_free( h->cost_mv[i] - 2*4*2048 ); diff --git a/encoder/cabac.c b/encoder/cabac.c index b99a32d1..be18f534 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -262,9 +262,9 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb ) if( i_dqp != 0 ) { int val = i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp - 1); - /* dqp is interpreted modulo 52 */ - if( val >= 51 && val != 52 ) - val = 103 - val; + /* dqp is interpreted modulo (QP_MAX+1) */ + if( val >= QP_MAX && val != QP_MAX+1 ) + val = 2*QP_MAX+1 - val; do { x264_cabac_encode_decision( cb, 60 + ctx, 1 ); @@ -767,15 +767,18 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb ) i_mb_pos_tex = x264_cabac_pos( cb ); h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start; - memcpy( cb->p, h->mb.pic.p_fenc[0], 256 ); - cb->p += 256; - for( int i = 0; i < 8; i++ ) - memcpy( cb->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 ); - cb->p += 64; - for( int i = 0; i < 8; i++ ) - memcpy( cb->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 ); - cb->p += 64; + bs_t s; + bs_init( &s, cb->p, cb->p_end - cb->p ); + for( int i = 0; i < 256; i++ ) + bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[0][i] ); + for( int ch = 0; ch < 2; ch++ ) + for( int i = 0; i < 8; i++ ) + for( int j = 0; j < 8; j++ ) + bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] ); + + bs_flush( &s ); + cb->p = s.p; x264_cabac_encode_init_core( cb ); h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex; diff --git a/encoder/cavlc.c b/encoder/cavlc.c index b2544652..0b58ada6 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -66,7 +66,7 @@ static 
inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_len bs_t *s = &h->out.bs; static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff }; int i_level_prefix = 15; - int mask = level >> 15; + int mask = level >> 31; int abs_level = (level^mask)-mask; int i_level_code = abs_level*2-mask-2; if( ( i_level_code >> i_suffix_length ) < 15 ) @@ -219,10 +219,10 @@ static void cavlc_qp_delta( x264_t *h ) if( i_dqp ) { - if( i_dqp < -26 ) - i_dqp += 52; - else if( i_dqp > 25 ) - i_dqp -= 52; + if( i_dqp < -(QP_MAX+1)/2 ) + i_dqp += QP_MAX+1; + else if( i_dqp > QP_MAX/2 ) + i_dqp -= QP_MAX+1; } bs_write_se( s, i_dqp ); } @@ -309,14 +309,12 @@ void x264_macroblock_write_cavlc( x264_t *h ) bs_align_0( s ); - memcpy( s->p, h->mb.pic.p_fenc[0], 256 ); - s->p += 256; - for( int i = 0; i < 8; i++ ) - memcpy( s->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 ); - s->p += 64; - for( int i = 0; i < 8; i++ ) - memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 ); - s->p += 64; + for( int i = 0; i < 256; i++ ) + bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[0][i] ); + for( int ch = 0; ch < 2; ch++ ) + for( int i = 0; i < 8; i++ ) + for( int j = 0; j < 8; j++ ) + bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] ); bs_init( s, s->p, s->p_end - s->p ); s->p_start = p_start; diff --git a/encoder/encoder.c b/encoder/encoder.c index 6a2aacb7..a2369bd5 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -51,7 +51,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current, ****************************************************************************/ static float x264_psnr( int64_t i_sqe, int64_t i_size ) { - double f_mse = (double)i_sqe / ((double)65025.0 * (double)i_size); + double f_mse = (double)i_sqe / (PIXEL_MAX*PIXEL_MAX * (double)i_size); if( f_mse <= 0.0000000001 ) /* Max 100dB */ return 100; @@ -68,11 +68,13 @@ static void x264_frame_dump( x264_t *h ) FILE *f = fopen( h->param.psz_dump_yuv, "r+b" ); if( !f ) return; + 
int bytes_per_pixel = (BIT_DEPTH+7)/8; /* Write the frame in display order */ - fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET ); + fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2 * bytes_per_pixel, SEEK_SET ); for( int i = 0; i < h->fdec->i_plane; i++ ) for( int y = 0; y < h->param.i_height >> !!i; y++ ) - fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f ); + for( int j = 0; j < h->param.i_width >> !!i; j++ ) + fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]]+j, bytes_per_pixel, 1, f ); fclose( f ); } @@ -469,8 +471,8 @@ static int x264_validate_parameters( x264_t *h ) x264_log( h, X264_LOG_ERROR, "no ratecontrol method specified\n" ); return -1; } - h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, 0, 51 ); - h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, 51 ); + h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, 0, QP_MAX ); + h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX ); if( h->param.rc.i_rc_method == X264_RC_CRF ) { h->param.rc.i_qp_constant = h->param.rc.f_rf_constant; @@ -502,12 +504,12 @@ static int x264_validate_parameters( x264_t *h ) float qp_p = h->param.rc.i_qp_constant; float qp_i = qp_p - 6*log2f( h->param.rc.f_ip_factor ); float qp_b = qp_p + 6*log2f( h->param.rc.f_pb_factor ); - h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 ); - h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 ); + h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, QP_MAX ); + h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, QP_MAX ); h->param.rc.i_aq_mode = 0; h->param.rc.b_mb_tree = 0; } - h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 ); + h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, QP_MAX ); h->param.rc.i_qp_min = 
x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max ); if( h->param.rc.i_vbv_buffer_size ) { @@ -1054,8 +1056,9 @@ x264_t *x264_encoder_open( x264_param_t *param ) if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) ) goto fail; + static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 }; /* Checks for known miscompilation issues. */ - if( h->cost_mv[1][2013] != 24 ) + if( h->cost_mv[x264_lambda_tab[X264_LOOKAHEAD_QP]][2013] != cost_mv_correct[BIT_DEPTH-8] ) { x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" ); goto fail; @@ -1147,11 +1150,22 @@ x264_t *x264_encoder_open( x264_param_t *param ) fclose( f ); } - x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n", - h->sps->i_profile_idc == PROFILE_BASELINE ? "Baseline" : - h->sps->i_profile_idc == PROFILE_MAIN ? "Main" : - h->sps->i_profile_idc == PROFILE_HIGH ? "High" : - "High 4:4:4 Predictive", h->sps->i_level_idc/10, h->sps->i_level_idc%10 ); + const char *profile = h->sps->i_profile_idc == PROFILE_BASELINE ? "Baseline" : + h->sps->i_profile_idc == PROFILE_MAIN ? "Main" : + h->sps->i_profile_idc == PROFILE_HIGH ? "High" : + h->sps->i_profile_idc == PROFILE_HIGH10 ? 
"High 10" : + "High 4:4:4 Predictive"; + + if( h->sps->i_profile_idc < PROFILE_HIGH10 ) + { + x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n", + profile, h->sps->i_level_idc/10, h->sps->i_level_idc%10 ); + } + else + { + x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d, bit depth %d\n", + profile, h->sps->i_level_idc/10, h->sps->i_level_idc%10, BIT_DEPTH ); + } return h; fail: @@ -1836,7 +1850,7 @@ static int x264_slice_write( x264_t *h ) bs_align_1( &h->out.bs ); /* init cabac */ - x264_cabac_context_init( &h->cabac, h->sh.i_type, h->sh.i_qp, h->sh.i_cabac_init_idc ); + x264_cabac_context_init( &h->cabac, h->sh.i_type, x264_clip3( h->sh.i_qp-QP_BD_OFFSET, 0, 51 ), h->sh.i_cabac_init_idc ); x264_cabac_encode_init ( &h->cabac, h->out.bs.p, h->out.bs.p_end ); } h->mb.i_last_qp = h->sh.i_qp; @@ -2705,6 +2719,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current, for( int i = 0; i < 3; i++ ) { pic_out->img.i_stride[i] = h->fdec->i_stride[i]; + // FIXME This breaks the API when pixel != uint8_t. 
pic_out->img.plane[i] = h->fdec->plane[i]; } diff --git a/encoder/macroblock.h b/encoder/macroblock.h index b1b02fa5..7c833448 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -26,8 +26,8 @@ #include "common/macroblock.h" -extern const int x264_lambda2_tab[52]; -extern const uint8_t x264_lambda_tab[52]; +extern const int x264_lambda2_tab[QP_MAX_MAX+1]; +extern const uint16_t x264_lambda_tab[QP_MAX_MAX+1]; void x264_rdo_init( void ); diff --git a/encoder/me.h b/encoder/me.h index 912b05d1..b125f3d0 100644 --- a/encoder/me.h +++ b/encoder/me.h @@ -68,7 +68,7 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight ); uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ); -extern uint16_t *x264_cost_mv_fpel[92][4]; +extern uint16_t *x264_cost_mv_fpel[LAMBDA_MAX+1][4]; #define COPY1_IF_LT(x,y)\ if((y)<(x))\ diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index 7f5ba962..a2c58252 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -219,7 +219,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2 uint32_t ssd = res >> 32; frame->i_pixel_sum[i] += sum; frame->i_pixel_ssd[i] += ssd; - return ssd - (sum * sum >> shift); + return ssd - ((uint64_t)sum * sum >> shift); } // Find the total AC energy of the block in all planes. 
@@ -287,6 +287,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off { if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) { + float bit_depth_correction = powf(1 << (BIT_DEPTH-8), 0.5f); float avg_adj_pow2 = 0.f; for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ ) for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ ) @@ -299,8 +300,8 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off } avg_adj /= h->mb.i_mb_count; avg_adj_pow2 /= h->mb.i_mb_count; - strength = h->param.rc.f_aq_strength * avg_adj; - avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj; + strength = h->param.rc.f_aq_strength * avg_adj / bit_depth_correction; + avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (14.f * bit_depth_correction)) / avg_adj; } else strength = h->param.rc.f_aq_strength * 1.0397f; @@ -318,7 +319,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off else { uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame ); - qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f); + qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - (14.427f + 2*(BIT_DEPTH-8))); } if( quant_offsets ) qp_adj += quant_offsets[mb_xy]; @@ -620,8 +621,8 @@ int x264_ratecontrol_new( x264_t *h ) rc->ip_offset = 6.0 * log2f( h->param.rc.f_ip_factor ); rc->pb_offset = 6.0 * log2f( h->param.rc.f_pb_factor ); rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant; - rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, 51 ); - rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, 51 ); + rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, QP_MAX ); + rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, QP_MAX ); h->mb.ip_offset = rc->ip_offset + 0.5; rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 ); @@ -1180,18 +1181,24 @@ void 
x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead ) if( l->level_idc == 41 && h->param.i_nal_hrd ) mincr = 4; - /* The spec has a bizarre special case for the first frame. */ - if( h->i_frame == 0 ) - { - //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR - double fr = 1. / 172; - int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height; - rc->frame_size_maximum = 384 * 8 * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr; - } + /* High 10 doesn't require minCR, so just set the maximum to a large value. */ + if( h->sps->i_profile_idc == PROFILE_HIGH10 ) + rc->frame_size_maximum = 1e9; else { - //384 * MaxMBPS * ( tr( n ) - tr( n - 1 ) ) / MinCR - rc->frame_size_maximum = 384 * 8 * ((double)h->fenc->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale) * l->mbps / mincr; + /* The spec has a bizarre special case for the first frame. */ + if( h->i_frame == 0 ) + { + //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR + double fr = 1. / 172; + int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height; + rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr; + } + else + { + //384 * MaxMBPS * ( tr( n ) - tr( n - 1 ) ) / MinCR + rc->frame_size_maximum = 384 * BIT_DEPTH * ((double)h->fenc->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale) * l->mbps / mincr; + } } } @@ -1231,7 +1238,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead ) rc->qpa_rc = rc->qpa_aq = 0; - rc->qp = x264_clip3( (int)(q + 0.5), 0, 51 ); + rc->qp = x264_clip3( (int)(q + 0.5), 0, QP_MAX ); h->fdec->f_qp_avg_rc = h->fdec->f_qp_avg_aq = rc->qpm = q; @@ -1416,9 +1423,9 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num ) * So just calculate the average QP used so far. */ h->param.rc.i_qp_constant = (h->stat.i_frame_count[SLICE_TYPE_P] == 0) ? 
24 : 1 + h->stat.f_frame_qp[SLICE_TYPE_P] / h->stat.i_frame_count[SLICE_TYPE_P]; - rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, 51 ); - rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, 51 ); - rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, 51 ); + rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX ); + rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, QP_MAX ); + rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, QP_MAX ); x264_log(h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", rc->num_entries); x264_log(h, X264_LOG_ERROR, "continuing anyway, at constant QP=%d\n", h->param.rc.i_qp_constant); @@ -2652,7 +2659,7 @@ static int init_pass2( x264_t *h ) } else if( expected_bits > all_available_bits && avgq > h->param.rc.i_qp_max - 2 ) { - if( h->param.rc.i_qp_max < 51 ) + if( h->param.rc.i_qp_max < QP_MAX ) x264_log( h, X264_LOG_WARNING, "try increasing target bitrate or increasing qp_max (currently %d)\n", h->param.rc.i_qp_max ); else x264_log( h, X264_LOG_WARNING, "try increasing target bitrate\n"); diff --git a/encoder/rdo.c b/encoder/rdo.c index 5dddd03f..863add79 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -443,10 +443,7 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct, /* We only need to zero an empty 4x4 block. 8x8 can be implicitly emptied via zero nnz, as can dc. 
*/ if( i_coefs == 16 && !dc ) - { - M128( &dct[0] ) = M128_ZERO; - M128( &dct[8] ) = M128_ZERO; - } + memset( dct, 0, 16 * sizeof(dctcoef) ); return 0; } @@ -613,10 +610,7 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct, if( bnode == &nodes_cur[0] ) { if( i_coefs == 16 && !dc ) - { - M128( &dct[0] ) = M128_ZERO; - M128( &dct[8] ) = M128_ZERO; - } + memset( dct, 0, 16 * sizeof(dctcoef) ); return 0; } diff --git a/encoder/set.c b/encoder/set.c index 9e6e736b..a520b8a6 100644 --- a/encoder/set.c +++ b/encoder/set.c @@ -104,6 +104,8 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param ) sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0; if( sps->b_qpprime_y_zero_transform_bypass ) sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE; + else if( BIT_DEPTH > 8 ) + sps->i_profile_idc = PROFILE_HIGH10; else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT ) sps->i_profile_idc = PROFILE_HIGH; else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->b_fake_interlaced || param->analyse.i_weighted_pred > 0 ) @@ -260,8 +262,8 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps ) if( sps->i_profile_idc >= PROFILE_HIGH ) { bs_write_ue( s, 1 ); // chroma_format_idc = 4:2:0 - bs_write_ue( s, 0 ); // bit_depth_luma_minus8 - bs_write_ue( s, 0 ); // bit_depth_chroma_minus8 + bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8 + bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8 bs_write( s, 1, sps->b_qpprime_y_zero_transform_bypass ); bs_write( s, 1, 0 ); // seq_scaling_matrix_present_flag } @@ -488,7 +490,7 @@ void x264_pps_write( bs_t *s, x264_pps_t *pps ) bs_write( s, 1, pps->b_weighted_pred ); bs_write( s, 2, pps->b_weighted_bipred ); - bs_write_se( s, pps->i_pic_init_qp - 26 ); + bs_write_se( s, pps->i_pic_init_qp - 26 - QP_BD_OFFSET ); bs_write_se( s, pps->i_pic_init_qs - 26 ); bs_write_se( s, pps->i_chroma_qp_index_offset ); 
@@ -668,7 +670,8 @@ int x264_validate_levels( x264_t *h, int verbose ) int ret = 0; int mbs = h->sps->i_mb_width * h->sps->i_mb_height; int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering; - int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4; + int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 : + h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4; const x264_level_t *l = x264_levels; while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc ) diff --git a/encoder/slicetype.c b/encoder/slicetype.c index 84a82de1..c7a891da 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -303,7 +303,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, (mv1)[0], (mv1)[1], 8, 8, w ); \ h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \ } \ - i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \ + i_cost = penalty * a->i_lambda + h->pixf.mbcmp[PIXEL_8x8]( \ m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \ COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \ } @@ -393,9 +393,9 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, } x264_me_search( h, &m[l], mvc, i_mvc ); - m[l].cost -= 2; // remove mvcost from skip mbs + m[l].cost -= 2 * a->i_lambda; // remove mvcost from skip mbs if( M32( m[l].mv ) ) - m[l].cost += 5; + m[l].cost += 5 * a->i_lambda; skip_motionest: CP32( fenc_mvs[l], m[l].mv ); @@ -418,7 +418,7 @@ lowres_intra_mb: ALIGNED_ARRAY_16( pixel, edge,[33] ); pixel *pix = &pix1[8+FDEC_STRIDE - 1]; pixel *src = &fenc->lowres[0][i_pel_offset - 1]; - const int intra_penalty = 5; + const int intra_penalty = 5 * a->i_lambda; int satds[3]; memcpy( pix-FDEC_STRIDE, src-i_stride, 17 * sizeof(pixel) ); @@ -496,7 +496,7 @@ lowres_intra_mb: } } - fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost + (list_used << LOWRES_COST_SHIFT); + fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = X264_MIN( i_bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); } #undef TRY_BIDIR diff --git 
a/tools/checkasm.c b/tools/checkasm.c index ddbf8bfd..6a6aeec2 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -40,8 +40,10 @@ uint8_t *buf1, *buf2; /* buf3, buf4: used to store output */ uint8_t *buf3, *buf4; -/* pbuf*: point to the same memory as above, just for type convenience */ -pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4; +/* pbuf1, pbuf2: initialised to random pixel data and shouldn't write into them. */ +pixel *pbuf1, *pbuf2; +/* pbuf3, pbuf4: point to buf3, buf4, just for type convenience */ +pixel *pbuf3, *pbuf4; int quiet = 0; @@ -256,11 +258,15 @@ static int check_pixel( int cpu_ref, int cpu_new ) int z = i|(i>>4); z ^= z>>2; z ^= z>>1; - buf3[i] = ~(buf4[i] = -(z&1)); + pbuf4[i] = -(z&1) & PIXEL_MAX; + pbuf3[i] = ~pbuf4[i] & PIXEL_MAX; } // random pattern made of maxed pixel differences, in case an intermediate value overflows for( int i = 256; i < 0x1000; i++ ) - buf3[i] = ~(buf4[i] = -(buf1[i&~0x88]&1)); + { + pbuf4[i] = -(pbuf1[i&~0x88]&1) & PIXEL_MAX; + pbuf3[i] = ~(pbuf4[i]) & PIXEL_MAX; + } #define TEST_PIXEL( name, align ) \ ok = 1, used_asm = 0; \ @@ -535,22 +541,22 @@ static int check_dct( int cpu_ref, int cpu_new ) used_asm = 1; \ call_c( dct_c.name, t1, pbuf1, pbuf2 ); \ call_a( dct_asm.name, t2, pbuf1, pbuf2 ); \ - if( memcmp( t1, t2, size ) ) \ + if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ } \ } ok = 1; used_asm = 0; - TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 ); - TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 ); - TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4*2 ); - TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 ); + TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 ); + TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 ); + TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 ); + TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 ); report( "sub_dct4 :" ); ok = 1; used_asm = 0; - TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64*2 ); - TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*2*4 ); + 
TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64 ); + TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*4 ); report( "sub_dct8 :" ); #undef TEST_DCT @@ -574,13 +580,13 @@ static int check_dct( int cpu_ref, int cpu_new ) { \ set_func_name( #name ); \ used_asm = 1; \ - memcpy( buf3, buf1, 32*32 * sizeof(pixel) ); \ - memcpy( buf4, buf1, 32*32 * sizeof(pixel) ); \ - memcpy( dct1, src, 512 * sizeof(pixel) ); \ - memcpy( dct2, src, 512 * sizeof(pixel) ); \ + memcpy( pbuf3, pbuf1, 32*32 * sizeof(pixel) ); \ + memcpy( pbuf4, pbuf1, 32*32 * sizeof(pixel) ); \ + memcpy( dct1, src, 256 * sizeof(dctcoef) ); \ + memcpy( dct2, src, 256 * sizeof(dctcoef) ); \ call_c1( dct_c.name, pbuf3, (void*)dct1 ); \ call_a1( dct_asm.name, pbuf4, (void*)dct2 ); \ - if( memcmp( buf3, buf4, 32*32 * sizeof(pixel) ) ) \ + if( memcmp( pbuf3, pbuf4, 32*32 * sizeof(pixel) ) ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ @@ -615,10 +621,10 @@ static int check_dct( int cpu_ref, int cpu_new ) dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\ : i<8 ? (*p++)&1 ? 
4080 : -4080 /* max elements */\ : ((*p++)&0x1fff)-0x1000; /* general case */\ - memcpy( dct2, dct1, 32 );\ + memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\ call_c1( dct_c.name, dct1[0] );\ call_a1( dct_asm.name, dct2[0] );\ - if( memcmp( dct1, dct2, 32 ) )\ + if( memcmp( dct1, dct2, 16 * sizeof(dctcoef) ) )\ ok = 0;\ }\ call_c2( dct_c.name, dct1[0] );\ @@ -658,11 +664,11 @@ static int check_dct( int cpu_ref, int cpu_new ) int nz_a, nz_c; \ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \ used_asm = 1; \ - memcpy( buf3, buf1, 16*FDEC_STRIDE * sizeof(pixel) ); \ - memcpy( buf4, buf1, 16*FDEC_STRIDE * sizeof(pixel) ); \ + memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \ + memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \ nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3 ); \ nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4 ); \ - if( memcmp( t1, t2, size*sizeof(dctcoef) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \ + if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*sizeof(pixel) ) || nz_c != nz_a ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ @@ -680,8 +686,8 @@ static int check_dct( int cpu_ref, int cpu_new ) used_asm = 1; \ for( int i = 0; i < 2; i++ ) \ { \ - memcpy( buf3, buf2, 16*FDEC_STRIDE * sizeof(pixel) ); \ - memcpy( buf4, buf2, 16*FDEC_STRIDE * sizeof(pixel) ); \ + memcpy( pbuf3, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \ + memcpy( pbuf4, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \ for( int j = 0; j < 4; j++ ) \ { \ memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \ @@ -689,7 +695,7 @@ static int check_dct( int cpu_ref, int cpu_new ) } \ nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3, &dc_c ); \ nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4, &dc_a ); \ - if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \ + if( memcmp( t1+1, t2+1, 
15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \ { \ ok = 0; \ fprintf( stderr, #name " [FAILED]\n" ); \ @@ -779,11 +785,11 @@ static int check_mc( int cpu_ref, int cpu_new ) const x264_weight_t *weight = weight_none; \ set_func_name( "mc_luma_%dx%d", w, h ); \ used_asm = 1; \ - memset( buf3, 0xCD, 1024 ); \ - memset( buf4, 0xCD, 1024 ); \ + for( int i = 0; i < 1024; i++ ) \ + pbuf3[i] = pbuf4[i] = 0xCD; \ call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \ call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \ - if( memcmp( buf3, buf4, 1024 ) ) \ + if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \ { \ fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ ok = 0; \ @@ -796,8 +802,8 @@ static int check_mc( int cpu_ref, int cpu_new ) const x264_weight_t *weight = weight_none; \ set_func_name( "get_ref_%dx%d", w, h ); \ used_asm = 1; \ - memset( buf3, 0xCD, 1024 ); \ - memset( buf4, 0xCD, 1024 ); \ + for( int i = 0; i < 1024; i++ ) \ + pbuf3[i] = pbuf4[i] = 0xCD; \ call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \ ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \ for( int i = 0; i < h; i++ ) \ @@ -814,15 +820,15 @@ static int check_mc( int cpu_ref, int cpu_new ) { \ set_func_name( "mc_chroma_%dx%d", w, h ); \ used_asm = 1; \ - memset( buf3, 0xCD, 1024 ); \ - memset( buf4, 0xCD, 1024 ); \ + for( int i = 0; i < 1024; i++ ) \ + pbuf3[i] = pbuf4[i] = 0xCD; \ call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \ call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \ /* mc_chroma width=2 may write garbage to the right of dst. ignore that. 
*/ \ for( int j = 0; j < h; j++ ) \ for( int i = w; i < 4; i++ ) \ dst2[i+j*16] = dst1[i+j*16]; \ - if( memcmp( buf3, buf4, 1024 ) ) \ + if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \ { \ fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \ ok = 0; \ @@ -867,15 +873,15 @@ static int check_mc( int cpu_ref, int cpu_new ) ok = 1, used_asm = 0; \ for( int i = 0; i < 10; i++ ) \ { \ - memcpy( buf3, pbuf1+320, 320 * sizeof(pixel) ); \ - memcpy( buf4, pbuf1+320, 320 * sizeof(pixel) ); \ + memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \ + memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \ if( mc_a.name[i] != mc_ref.name[i] ) \ { \ set_func_name( "%s_%s", #name, pixel_names[i] ); \ used_asm = 1; \ call_c1( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \ call_a1( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \ - if( memcmp( buf3, buf4, 320 * sizeof(pixel) ) ) \ + if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \ { \ ok = 0; \ fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \ @@ -971,8 +977,8 @@ static int check_mc( int cpu_ref, int cpu_new ) void *tmp = pbuf3+49*64; set_func_name( "hpel_filter" ); ok = 1; used_asm = 1; - memset( buf3, 0, 4096 * sizeof(pixel) ); - memset( buf4, 0, 4096 * sizeof(pixel) ); + memset( pbuf3, 0, 4096 * sizeof(pixel) ); + memset( pbuf4, 0, 4096 * sizeof(pixel) ); call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, 64, 48, 10, tmp ); call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, 64, 48, 10, tmp ); for( int i = 0; i < 3; i++ ) @@ -1030,13 +1036,13 @@ static int check_mc( int cpu_ref, int cpu_new ) int stride = 80;\ set_func_name( #name );\ used_asm = 1;\ - memcpy( buf3, buf1, size*2*stride * sizeof(pixel) );\ - memcpy( buf4, buf1, size*2*stride * sizeof(pixel) );\ - uint16_t *sum = (uint16_t*)buf3;\ + memcpy( pbuf3, pbuf1, size*2*stride * sizeof(pixel) );\ + memcpy( pbuf4, pbuf1, size*2*stride * sizeof(pixel) );\ + uint16_t *sum = (uint16_t*)pbuf3;\ 
call_c1( mc_c.name, __VA_ARGS__ );\ - sum = (uint16_t*)buf4;\ + sum = (uint16_t*)pbuf4;\ call_a1( mc_a.name, __VA_ARGS__ );\ - if( memcmp( buf3, buf4, (stride-8)*2 * sizeof(pixel) )\ + if( memcmp( pbuf3, pbuf4, (stride-8)*2 * sizeof(pixel) )\ || (size>9 && memcmp( pbuf3+18*stride, pbuf4+18*stride, (stride-8)*2 * sizeof(pixel) )))\ ok = 0;\ call_c2( mc_c.name, __VA_ARGS__ );\ @@ -1096,11 +1102,11 @@ static int check_deblock( int cpu_ref, int cpu_new ) /* not exactly the real values of a,b,tc but close enough */ for( int i = 35, a = 255, c = 250; i >= 0; i-- ) { - alphas[i] = a; - betas[i] = (i+1)/2; - tcs[i][0] = tcs[i][3] = (c+6)/10; - tcs[i][1] = (c+7)/15; - tcs[i][2] = (c+9)/20; + alphas[i] = a << (BIT_DEPTH-8); + betas[i] = (i+1)/2 << (BIT_DEPTH-8); + tcs[i][0] = tcs[i][3] = (c+6)/10 << (BIT_DEPTH-8); + tcs[i][1] = (c+7)/15 << (BIT_DEPTH-8); + tcs[i][2] = (c+9)/20 << (BIT_DEPTH-8); a = a*9/10; c = c*9/10; } @@ -1111,15 +1117,15 @@ static int check_deblock( int cpu_ref, int cpu_new ) int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \ for( int j = 0; j < 1024; j++ ) \ /* two distributions of random to excersize different failure modes */ \ - buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \ - memcpy( buf4, buf3, 1024 * sizeof(pixel) ); \ + pbuf3[j] = rand() & (i&1 ? 
0xf : PIXEL_MAX ); \ + memcpy( pbuf4, pbuf3, 1024 * sizeof(pixel) ); \ if( db_a.name != db_ref.name ) \ { \ set_func_name( #name ); \ used_asm = 1; \ call_c1( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ call_a1( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \ - if( memcmp( buf3, buf4, 1024 * sizeof(pixel) ) ) \ + if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \ { \ ok = 0; \ fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \ @@ -1200,7 +1206,7 @@ static int check_quant( int cpu_ref, int cpu_new ) h->pps = h->pps_array; x264_param_default( &h->param ); h->chroma_qp_table = i_chroma_qp_table + 12; - h->param.rc.i_qp_min = 26; + h->param.rc.i_qp_min = 26 + QP_BD_OFFSET; h->param.analyse.b_transform_8x8 = 1; for( int i_cqm = 0; i_cqm < 4; i_cqm++ ) @@ -1219,9 +1225,10 @@ static int check_quant( int cpu_ref, int cpu_new ) } else { + int max_scale = BIT_DEPTH < 10 ? 255 : 228; if( i_cqm == 2 ) for( int i = 0; i < 64; i++ ) - cqm_buf[i] = 10 + rand() % 246; + cqm_buf[i] = 10 + rand() % (max_scale - 9); else for( int i = 0; i < 64; i++ ) cqm_buf[i] = 1; @@ -1260,7 +1267,7 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ set_func_name( #name ); \ used_asms[0] = 1; \ - for( int qp = 51; qp > 0; qp-- ) \ + for( int qp = QP_MAX; qp > 0; qp-- ) \ { \ for( int j = 0; j < 2; j++ ) \ { \ @@ -1269,7 +1276,7 @@ static int check_quant( int cpu_ref, int cpu_new ) dct1[i] = dct2[i] = j ? 
(rand() & 0x1fff) - 0xfff : 0; \ result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ - if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \ + if( memcmp( dct1, dct2, 16*sizeof(dctcoef) ) || result_c != result_a ) \ { \ oks[0] = 0; \ fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \ @@ -1286,14 +1293,14 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ set_func_name( #qname ); \ used_asms[0] = 1; \ - for( int qp = 51; qp > 0; qp-- ) \ + for( int qp = QP_MAX; qp > 0; qp-- ) \ { \ for( int j = 0; j < 2; j++ ) \ { \ INIT_QUANT##w(j) \ int result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ int result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \ + if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \ { \ oks[0] = 0; \ fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ @@ -1317,14 +1324,14 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \ used_asms[1] = 1; \ - for( int qp = 51; qp > 0; qp-- ) \ + for( int qp = QP_MAX; qp > 0; qp-- ) \ { \ INIT_QUANT##w(1) \ call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - memcpy( dct2, dct1, w*w*2 ); \ + memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ - if( memcmp( dct1, dct2, w*w*2 ) ) \ + if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \ { \ oks[1] = 0; \ fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ @@ -1345,15 +1352,15 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ 
set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \ used_asms[1] = 1; \ - for( int qp = 51; qp > 0; qp-- ) \ + for( int qp = QP_MAX; qp > 0; qp-- ) \ { \ for( int i = 0; i < 16; i++ ) \ dct1[i] = rand(); \ call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \ - memcpy( dct2, dct1, w*w*2 ); \ + memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ - if( memcmp( dct1, dct2, w*w*2 ) ) \ + if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \ { \ oks[1] = 0; \ fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ @@ -1381,12 +1388,12 @@ static int check_quant( int cpu_ref, int cpu_new ) for( int size = 16; size <= 64; size += 48 ) { set_func_name( "denoise_dct" ); - memcpy( dct1, buf1, size*2 ); - memcpy( dct2, buf1, size*2 ); + memcpy( dct1, buf1, size*sizeof(dctcoef) ); + memcpy( dct2, buf1, size*sizeof(dctcoef) ); memcpy( buf3+256, buf3, 256 ); call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size ); call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size ); - if( memcmp( dct1, dct2, size*2 ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) ) + if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) ) ok = 0; call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size ); call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size ); @@ -1431,7 +1438,7 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ int nnz = 0; \ int max = rand() & (w*w-1); \ - memset( dct1, 0, w*w*2 ); \ + memset( dct1, 0, w*w*sizeof(dctcoef) ); \ for( int idx = ac; idx < max; idx++ ) \ nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \ if( !nnz ) \ @@ -1464,7 +1471,7 @@ static int check_quant( int cpu_ref, int cpu_new ) x264_run_level_t runlevel_c, 
runlevel_a; \ int nnz = 0; \ int max = rand() & (w*w-1); \ - memset( dct1, 0, w*w*2 ); \ + memset( dct1, 0, w*w*sizeof(dctcoef) ); \ memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \ memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \ for( int idx = ac; idx < max; idx++ ) \ @@ -1474,7 +1481,7 @@ static int check_quant( int cpu_ref, int cpu_new ) int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \ int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \ if( result_c != result_a || runlevel_c.last != runlevel_a.last || \ - memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \ + memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \ memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \ { \ ok = 0; \ @@ -1529,11 +1536,11 @@ static int check_intra( int cpu_ref, int cpu_new ) {\ set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\ used_asm = 1;\ - memcpy( buf3, buf1, 32*20 * sizeof(pixel) );\ - memcpy( buf4, buf1, 32*20 * sizeof(pixel) );\ + memcpy( pbuf3, pbuf1, 32*20 * sizeof(pixel) );\ + memcpy( pbuf4, pbuf1, 32*20 * sizeof(pixel) );\ call_c( ip_c.name[dir], pbuf3+48, ##__VA_ARGS__ );\ call_a( ip_a.name[dir], pbuf4+48, ##__VA_ARGS__ );\ - if( memcmp( buf3, buf4, 32*20 * sizeof(pixel) ) )\ + if( memcmp( pbuf3, pbuf4, 32*20 * sizeof(pixel) ) )\ {\ fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\ ok = 0;\ @@ -1544,7 +1551,7 @@ static int check_intra( int cpu_ref, int cpu_new ) {\ printf( "%2x ", edge[14-j] );\ for( int k = 0; k < w; k++ )\ - printf( "%2x ", buf4[48+k+j*32] );\ + printf( "%2x ", pbuf4[48+k+j*32] );\ printf( "\n" );\ }\ printf( "\n" );\ @@ -1552,7 +1559,7 @@ static int check_intra( int cpu_ref, int cpu_new ) {\ printf( " " );\ for( int k = 0; k < w; k++ )\ - printf( "%2x ", buf3[48+k+j*32] );\ + printf( "%2x ", pbuf3[48+k+j*32] );\ printf( "\n" );\ }\ }\ @@ -1831,8 +1838,9 @@ int main(int argc, char *argv[]) fprintf( stderr, "x264: using random 
seed %u\n", seed ); srand( seed ); - buf1 = x264_malloc( 0x3e00 + 16*BENCH_ALIGNS ); - if( !buf1 ) + buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 16*BENCH_ALIGNS ); + pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 16*BENCH_ALIGNS ); + if( !buf1 || !pbuf1 ) { fprintf( stderr, "malloc failed, unable to initiate tests!\n" ); return -1; @@ -1840,15 +1848,17 @@ int main(int argc, char *argv[]) #define INIT_POINTER_OFFSETS\ buf2 = buf1 + 0xf00;\ buf3 = buf2 + 0xf00;\ - buf4 = buf3 + 0x1000;\ - pbuf1 = (pixel*)buf1;\ - pbuf2 = (pixel*)buf2;\ + buf4 = buf3 + 0x1000*sizeof(pixel);\ + pbuf2 = pbuf1 + 0xf00;\ pbuf3 = (pixel*)buf3;\ pbuf4 = (pixel*)buf4; INIT_POINTER_OFFSETS; for( int i = 0; i < 0x1e00; i++ ) + { buf1[i] = rand() & 0xFF; - memset( buf1+0x1e00, 0, 0x2000 ); + pbuf1[i] = rand() & PIXEL_MAX; + } + memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) ); /* 16-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */ if( do_bench ) @@ -1857,6 +1867,7 @@ int main(int argc, char *argv[]) INIT_POINTER_OFFSETS; ret |= x264_stack_pagealign( check_all_flags, i*16 ); buf1 += 16; + pbuf1 += 16; quiet = 1; fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS ); } diff --git a/x264.c b/x264.c index 0bede93b..e6d27d17 100644 --- a/x264.c +++ b/x264.c @@ -262,6 +262,7 @@ static void Help( x264_param_t *defaults, int longhelp ) " .mkv -> Matroska\n" " .flv -> Flash Video\n" " .mp4 -> MP4 if compiled with GPAC support (%s)\n" + "Output bit depth: %d (configured at compile time)\n" "\n" "Options:\n" "\n" @@ -286,10 +287,11 @@ static void Help( x264_param_t *defaults, int longhelp ) "no", #endif #if HAVE_GPAC - "yes" + "yes", #else - "no" + "no", #endif + BIT_DEPTH ); H0( "Example usage:\n" ); H0( "\n" ); @@ -311,7 +313,7 @@ static void Help( x264_param_t *defaults, int longhelp ) H0( "\n" ); H0( "Presets:\n" ); H0( "\n" ); - H0( " --profile Force the limits of an H.264 profile [high]\n" + H0( " --profile Force the limits of an H.264 
profile\n" " Overrides all settings.\n" ); H2( " - baseline:\n" " --no-8x8dct --bframes 0 --no-cabac\n" @@ -322,8 +324,11 @@ static void Help( x264_param_t *defaults, int longhelp ) " --no-8x8dct --cqm flat\n" " No lossless.\n" " - high:\n" - " No lossless.\n" ); - else H0( " - baseline,main,high\n" ); + " No lossless.\n" + " - high10:\n" + " No lossless.\n" + " Support for bit depth 8-10.\n" ); + else H0( " - baseline,main,high,high10\n" ); H0( " --preset Use a preset to select encoding settings [medium]\n" " Overridden by user settings.\n" ); H2( " - ultrafast:\n" @@ -453,9 +458,9 @@ static void Help( x264_param_t *defaults, int longhelp ) H0( "\n" ); H0( "Ratecontrol:\n" ); H0( "\n" ); - H1( " -q, --qp Force constant QP (0-51, 0=lossless)\n" ); + H1( " -q, --qp Force constant QP (0-%d, 0=lossless)\n", QP_MAX ); H0( " -B, --bitrate Set bitrate (kbit/s)\n" ); - H0( " --crf Quality-based VBR (0-51, 0=lossless) [%.1f]\n", defaults->rc.f_rf_constant ); + H0( " --crf Quality-based VBR (0-%d, 0=lossless) [%.1f]\n", QP_MAX, defaults->rc.f_rf_constant ); H1( " --rc-lookahead Number of frames for frametype lookahead [%d]\n", defaults->rc.i_lookahead ); H0( " --vbv-maxrate Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate ); H0( " --vbv-bufsize Set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size ); @@ -1040,6 +1045,7 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt ) #else printf( "using a non-gcc compiler\n" ); #endif + printf( "configuration: --bit-depth=%d\n", BIT_DEPTH ); exit(0); case OPT_FRAMES: param->i_frame_total = X264_MAX( atoi( optarg ), 0 ); @@ -1318,7 +1324,7 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame ) else if( type == 'B' ) pic->i_type = X264_TYPE_BREF; else if( type == 'b' ) pic->i_type = X264_TYPE_B; else ret = 0; - if( ret != 3 || qp < -1 || qp > 51 ) + if( ret != 3 || qp < -1 || qp > QP_MAX ) { x264_cli_log( "x264", X264_LOG_ERROR, "can't parse 
qpfile for frame %d\n", i_frame ); fclose( opt->qpfile ); diff --git a/x264.h b/x264.h index 097365a4..4d9b9ca6 100644 --- a/x264.h +++ b/x264.h @@ -344,7 +344,7 @@ typedef struct x264_param_t { int i_rc_method; /* X264_RC_* */ - int i_qp_constant; /* 0-51 */ + int i_qp_constant; /* 0 to (51 + 6*(BIT_DEPTH-8)) */ int i_qp_min; /* min allowed QP value */ int i_qp_max; /* max allowed QP value */ int i_qp_step; /* max QP step between frames */ @@ -550,7 +550,7 @@ void x264_param_apply_fastfirstpass( x264_param_t * ); /* x264_param_apply_profile: * Applies the restrictions of the given profile. * Currently available profiles are, from most to least restrictive: */ -static const char * const x264_profile_names[] = { "baseline", "main", "high", 0 }; +static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", 0 }; /* (can be NULL, in which case the function will do nothing) *