From: Henrik Gramner
Date: Fri, 4 Mar 2016 16:53:08 +0000 (+0100)
Subject: x86: Add asm for mbtree fixed point conversion
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=c82c7374938f4342971adf8b2495c3a1bbe621c4;p=x264

x86: Add asm for mbtree fixed point conversion

The QP offsets of each macroblock are stored as floats internally and converted
to big-endian Q8.8 fixed point numbers when written to the 2-pass stats file,
and converted back to floats when read from the stats file.

Add SSSE3 and AVX2 implementations for conversions in both directions.
About 8x faster than C on Haswell.
---

diff --git a/common/mc.c b/common/mc.c
index a460541f..dc39c5eb 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -589,6 +589,19 @@ static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs
     }
 }
 
+/* Conversion between float and Q8.8 fixed point (big-endian) for storage */
+static void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
+{
+    for( int i = 0; i < count; i++ )
+        dst[i] = endian_fix16( (int16_t)(src[i] * 256.0f) );
+}
+
+static void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
+{
+    for( int i = 0; i < count; i++ )
+        dst[i] = (int16_t)endian_fix16( src[i] ) * (1.0f/256.0f);
+}
+
 void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 {
     pf->mc_luma = mc_luma;
@@ -646,6 +659,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 
     pf->mbtree_propagate_cost = mbtree_propagate_cost;
     pf->mbtree_propagate_list = mbtree_propagate_list;
+    pf->mbtree_fix8_pack = mbtree_fix8_pack;
+    pf->mbtree_fix8_unpack = mbtree_fix8_unpack;
 
 #if HAVE_MMX
     x264_mc_init_mmx( cpu, pf );
diff --git a/common/mc.h b/common/mc.h
index f0601b4b..cebdb557 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -201,10 +201,11 @@ typedef struct
 
     void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                    uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-
     void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                    int16_t *propagate_amount, uint16_t *lowres_costs,
                                    int bipred_weight, int mb_y, int len, int list );
+    void (*mbtree_fix8_pack)( uint16_t *dst, float *src, int count );
+    void (*mbtree_fix8_unpack)( float *dst, uint16_t *src, int count );
 } x264_mc_functions_t;
 
 void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index d9e6099f..c58aba4f 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -59,6 +59,13 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif ; !HIGH_BIT_DEPTH
 
+mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
+                         db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
+mbtree_fix8_pack_shuf:   db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
+
+pf_256:    times 4 dd 256.0
+pf_inv256: times 4 dd 0.00390625
+
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
@@ -2260,3 +2267,109 @@ INIT_XMM ssse3
 MBTREE_PROPAGATE_LIST
 INIT_XMM avx
 MBTREE_PROPAGATE_LIST
+
+%macro MBTREE_FIX8 0
+;-----------------------------------------------------------------------------
+; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
+;-----------------------------------------------------------------------------
+cglobal mbtree_fix8_pack, 3,4
+%if mmsize == 32
+    vbroadcastf128 m2, [pf_256]
+    vbroadcasti128 m3, [mbtree_fix8_pack_shuf]
+%else
+    movaps       m2, [pf_256]
+    mova         m3, [mbtree_fix8_pack_shuf]
+%endif
+    sub          r2d, mmsize/2
+    movsxdifnidn r2, r2d
+    lea          r1, [r1+4*r2]
+    lea          r0, [r0+2*r2]
+    neg          r2
+    jg .skip_loop
+.loop:
+    mulps        m0, m2, [r1+4*r2]
+    mulps        m1, m2, [r1+4*r2+mmsize]
+    cvttps2dq    m0, m0
+    cvttps2dq    m1, m1
+    packssdw     m0, m1
+    pshufb       m0, m3
+%if mmsize == 32
+    vpermq       m0, m0, q3120
+%endif
+    mova [r0+2*r2], m0
+    add          r2, mmsize/2
+    jle .loop
+.skip_loop:
+    sub          r2, mmsize/2
+    jz .end
+    ; Do the remaining values in scalar in order to avoid overreading src.
+.scalar:
+    mulss        xm0, xm2, [r1+4*r2+2*mmsize]
+    cvttss2si    r3d, xm0
+    rol          r3w, 8
+    mov [r0+2*r2+mmsize], r3w
+    inc          r2
+    jl .scalar
+.end:
+    RET
+
+;-----------------------------------------------------------------------------
+; void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
+;-----------------------------------------------------------------------------
+cglobal mbtree_fix8_unpack, 3,4
+%if mmsize == 32
+    vbroadcastf128 m2, [pf_inv256]
+%else
+    movaps       m2, [pf_inv256]
+    mova         m4, [mbtree_fix8_unpack_shuf+16]
+%endif
+    mova         m3, [mbtree_fix8_unpack_shuf]
+    sub          r2d, mmsize/2
+    movsxdifnidn r2, r2d
+    lea          r1, [r1+2*r2]
+    lea          r0, [r0+4*r2]
+    neg          r2
+    jg .skip_loop
+.loop:
+%if mmsize == 32
+    vbroadcasti128 m0, [r1+2*r2]
+    vbroadcasti128 m1, [r1+2*r2+16]
+    pshufb       m0, m3
+    pshufb       m1, m3
+%else
+    mova         m1, [r1+2*r2]
+    pshufb       m0, m1, m3
+    pshufb       m1, m4
+%endif
+    psrad        m0, 16 ; sign-extend
+    psrad        m1, 16
+    cvtdq2ps     m0, m0
+    cvtdq2ps     m1, m1
+    mulps        m0, m2
+    mulps        m1, m2
+    movaps [r0+4*r2], m0
+    movaps [r0+4*r2+mmsize], m1
+    add          r2, mmsize/2
+    jle .loop
+.skip_loop:
+    sub          r2, mmsize/2
+    jz .end
+.scalar:
+    movzx        r3d, word [r1+2*r2+mmsize]
+    rol          r3w, 8
+    movsx        r3d, r3w
+    ; Use 3-arg cvtsi2ss as a workaround for the fact that the instruction has a stupid dependency on
+    ; dst which causes terrible performance when used in a loop otherwise. Blame Intel for poor design.
+    cvtsi2ss     xm0, xm2, r3d
+    mulss        xm0, xm2
+    movss [r0+4*r2+2*mmsize], xm0
+    inc          r2
+    jl .scalar
+.end:
+    RET
+%endmacro
+
+INIT_XMM ssse3
+MBTREE_FIX8
+INIT_YMM avx2
+MBTREE_FIX8
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 7bee9a51..0019f430 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -173,6 +173,10 @@ void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
+void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
+void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
+void x264_mbtree_fix8_unpack_avx2 ( float *dst, uint16_t *src, int count );
 
 #define MC_CHROMA(cpu)\
 void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
@@ -736,6 +740,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
         pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
         pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
+        pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_ssse3;
+        pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3;
 
         if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
             pf->integral_init4v = x264_integral_init4v_ssse3;
@@ -841,6 +847,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
         pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
         pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
+        pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_ssse3;
+        pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3;
 
         if( !(cpu&X264_CPU_SLOW_PSHUFB) )
         {
@@ -928,4 +936,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->plane_copy_swap = x264_plane_copy_swap_avx2;
     pf->get_ref = get_ref_avx2;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
+    pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
+    pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
 }
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 436f3550..723350c5 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -565,11 +565,7 @@ int x264_macroblock_tree_read( x264_t *h, x264_frame_t *frame, float *quant_offs
     }
 
     float *dst = rc->mbtree.rescale_enabled ? rc->mbtree.scale_buffer[0] : frame->f_qp_offset;
-    for( int i = 0; i < rc->mbtree.src_mb_count; i++ )
-    {
-        int16_t qp_fix8 = endian_fix16( rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos][i] );
-        dst[i] = qp_fix8 * (1.f/256.f);
-    }
+    h->mc.mbtree_fix8_unpack( dst, rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos], rc->mbtree.src_mb_count );
     if( rc->mbtree.rescale_enabled )
         x264_macroblock_tree_rescale( h, rc, frame->f_qp_offset );
     if( h->frames.b_have_lowres )
@@ -1889,9 +1885,7 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
     if( h->param.rc.b_mb_tree && h->fenc->b_kept_as_ref && !h->param.rc.b_stat_read )
     {
         uint8_t i_type = h->sh.i_type;
-        /* Values are stored as big-endian FIX8.8 */
-        for( int i = 0; i < h->mb.i_mb_count; i++ )
-            rc->mbtree.qp_buffer[0][i] = endian_fix16( (int16_t)(h->fenc->f_qp_offset[i]*256.0) );
+        h->mc.mbtree_fix8_pack( rc->mbtree.qp_buffer[0], h->fenc->f_qp_offset, h->mb.i_mb_count );
         if( fwrite( &i_type, 1, 1, rc->p_mbtree_stat_file_out ) < 1 )
             goto fail;
         if( fwrite( rc->mbtree.qp_buffer[0], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_out ) < h->mb.i_mb_count )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ab009912..f6d0ad98 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1745,6 +1745,60 @@ static int check_mc( int cpu_ref, int cpu_new )
             call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
         }
     }
+
+    if( mc_a.mbtree_fix8_pack != mc_ref.mbtree_fix8_pack )
+    {
+        set_func_name( "mbtree_fix8_pack" );
+        used_asm = 1;
+        float *fix8_src = (float*)(buf3 + 0x800);
+        uint16_t *dstc = (uint16_t*)buf3;
+        uint16_t *dsta = (uint16_t*)buf4;
+        for( int i = 0; i < 5; i++ )
+        {
+            int count = 256 + i;
+
+            for( int j = 0; j < count; j++ )
+                fix8_src[j] = (int16_t)(rand()) / 256.0f;
+            dsta[count] = 0xAAAA;
+
+            call_c( mc_c.mbtree_fix8_pack, dstc, fix8_src, count );
+            call_a( mc_a.mbtree_fix8_pack, dsta, fix8_src, count );
+
+            if( memcmp( dsta, dstc, count * sizeof(uint16_t) ) || dsta[count] != 0xAAAA )
+            {
+                ok = 0;
+                fprintf( stderr, "mbtree_fix8_pack FAILED\n" );
+                break;
+            }
+        }
+    }
+
+    if( mc_a.mbtree_fix8_unpack != mc_ref.mbtree_fix8_unpack )
+    {
+        set_func_name( "mbtree_fix8_unpack" );
+        used_asm = 1;
+        uint16_t *fix8_src = (uint16_t*)(buf3 + 0x800);
+        float *dstc = (float*)buf3;
+        float *dsta = (float*)buf4;
+        for( int i = 0; i < 5; i++ )
+        {
+            int count = 256 + i;
+
+            for( int j = 0; j < count; j++ )
+                fix8_src[j] = rand();
+            M32( &dsta[count] ) = 0xAAAAAAAA;
+
+            call_c( mc_c.mbtree_fix8_unpack, dstc, fix8_src, count );
+            call_a( mc_a.mbtree_fix8_unpack, dsta, fix8_src, count );
+
+            if( memcmp( dsta, dstc, count * sizeof(float) ) || M32( &dsta[count] ) != 0xAAAAAAAA )
+            {
+                ok = 0;
+                fprintf( stderr, "mbtree_fix8_unpack FAILED\n" );
+                break;
+            }
+        }
+    }
 
     report( "mbtree :" );
 
     if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
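
For reference, a stand-alone scalar sketch of the Q8.8 round trip the new functions implement (not part of the patch): pack scales by 256, truncates toward zero the same way cvttps2dq/cvttss2si do, and byte-swaps to big-endian; unpack undoes the byte swap, sign-extends and scales by 1/256. swap16() is a hypothetical stand-in for x264's endian_fix16() and assumes a little-endian host; the demo values in main() are arbitrary.

#include <stdint.h>
#include <stdio.h>

/* stand-in for endian_fix16(): to/from big-endian on a little-endian host */
static uint16_t swap16( uint16_t x )
{
    return (uint16_t)(x << 8 | x >> 8);
}

static void fix8_pack( uint16_t *dst, const float *src, int count )
{
    for( int i = 0; i < count; i++ )
        dst[i] = swap16( (uint16_t)(int16_t)(src[i] * 256.0f) ); /* truncates toward zero, like cvttps2dq */
}

static void fix8_unpack( float *dst, const uint16_t *src, int count )
{
    for( int i = 0; i < count; i++ )
        dst[i] = (int16_t)swap16( src[i] ) * (1.0f/256.0f); /* sign-extend, then scale back */
}

int main( void )
{
    float qp[4] = { -1.5f, 0.0f, 0.25f, 3.75f }, out[4];
    uint16_t packed[4];
    fix8_pack( packed, qp, 4 );
    fix8_unpack( out, packed, 4 );
    for( int i = 0; i < 4; i++ )
        printf( "%g -> 0x%04x -> %g\n", qp[i], packed[i], out[i] );
    return 0;
}

Any offset that is a multiple of 1/256 and fits in [-128,128) survives the round trip exactly. The SIMD versions in the patch do the same math mmsize/2 values per iteration (8 for SSSE3, 16 for AVX2) and switch to a scalar tail for the remainder so they never read past the end of src.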