From 911e21a306dc7fddb3e8f0acb827ff89eaf9418d Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Wed, 13 Aug 2008 23:35:40 +0000 Subject: [PATCH] simd int->float 20% faster ac3 if downmixing, 15% if not Originally committed as revision 14743 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/ac3dec.c | 4 +-- libavcodec/ac3dec.h | 2 +- libavcodec/dsputil.c | 7 ++++++ libavcodec/dsputil.h | 2 ++ libavcodec/i386/dsputil_mmx.c | 46 +++++++++++++++++++++++++++++++++++ 5 files changed, 57 insertions(+), 4 deletions(-) diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c index 44233b9468f..ed0b1703495 100644 --- a/libavcodec/ac3dec.c +++ b/libavcodec/ac3dec.c @@ -994,9 +994,7 @@ static int decode_audio_block(AC3DecodeContext *s, int blk) } else { gain *= s->dynamic_range[0]; } - for(i=0; i<256; i++) { - s->transform_coeffs[ch][i] = s->fixed_coeffs[ch][i] * gain; - } + s->dsp.int32_to_float_fmul_scalar(s->transform_coeffs[ch], s->fixed_coeffs[ch], gain, 256); } /* downmix and MDCT. order depends on whether block switching is used for diff --git a/libavcodec/ac3dec.h b/libavcodec/ac3dec.h index 8b6c057cfc2..de43609f585 100644 --- a/libavcodec/ac3dec.h +++ b/libavcodec/ac3dec.h @@ -158,7 +158,7 @@ typedef struct { float mul_bias; ///< scaling for float_to_int16 conversion ///@} - int fixed_coeffs[AC3_MAX_CHANNELS][AC3_MAX_COEFS]; ///> fixed-point transform coefficients + DECLARE_ALIGNED_16(int, fixed_coeffs[AC3_MAX_CHANNELS][AC3_MAX_COEFS]); ///> fixed-point transform coefficients ///@defgroup arrays aligned arrays DECLARE_ALIGNED_16(float, transform_coeffs[AC3_MAX_CHANNELS][AC3_MAX_COEFS]); ///< transform coefficients diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index b6df158b3e6..241bad0d4f0 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -3948,6 +3948,12 @@ void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, c } } +static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){ + int i; + 
for(i=0; i<len; i++) + dst[i] = src[i] * mul; +} + [...] c->vector_fmul_reverse = vector_fmul_reverse_c; c->vector_fmul_add_add = ff_vector_fmul_add_add_c; c->vector_fmul_window = ff_vector_fmul_window_c; + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c; c->float_to_int16 = ff_float_to_int16_c; c->float_to_int16_interleave = ff_float_to_int16_interleave_c; c->add_int16 = add_int16_c; diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 83102db5010..eb1ea4f4ee4 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -370,6 +370,8 @@ typedef struct DSPContext { void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step); /* assume len is a multiple of 4, and arrays are 16-byte aligned */ void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len); + /* assume len is a multiple of 8, and arrays are 16-byte aligned */ + void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len); /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767] * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */ diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c index c7bcd3b7836..f42a6bc4ff6 100644 --- a/libavcodec/i386/dsputil_mmx.c +++ b/libavcodec/i386/dsputil_mmx.c @@ -2192,6 +2192,50 @@ static void vector_fmul_window_sse(float *dst, const float *src0, const float *s ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len); } +static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len) +{ + x86_reg i = -4*len; + asm volatile( + "movss %3, %%xmm4 \n" + "shufps $0, %%xmm4, %%xmm4 \n" + "1: \n" + "cvtpi2ps (%2,%0), %%xmm0 \n" + "cvtpi2ps 8(%2,%0), %%xmm1 \n" + "cvtpi2ps 16(%2,%0), %%xmm2 \n" + "cvtpi2ps 24(%2,%0), %%xmm3 \n" + "movlhps %%xmm1, %%xmm0 \n" + "movlhps %%xmm3, %%xmm2 \n" + "mulps %%xmm4, %%xmm0 \n" + "mulps 
%%xmm4, %%xmm2 \n" + "movaps %%xmm0, (%1,%0) \n" + "movaps %%xmm2, 16(%1,%0) \n" + "add $32, %0 \n" + "jl 1b \n" + :"+r"(i) + :"r"(dst+len), "r"(src+len), "xm"(mul) + ); +} + +static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len) +{ + x86_reg i = -4*len; + asm volatile( + "movss %3, %%xmm4 \n" + "shufps $0, %%xmm4, %%xmm4 \n" + "1: \n" + "cvtdq2ps (%2,%0), %%xmm0 \n" + "cvtdq2ps 16(%2,%0), %%xmm1 \n" + "mulps %%xmm4, %%xmm0 \n" + "mulps %%xmm4, %%xmm1 \n" + "movaps %%xmm0, (%1,%0) \n" + "movaps %%xmm1, 16(%1,%0) \n" + "add $32, %0 \n" + "jl 1b \n" + :"+r"(i) + :"r"(dst+len), "r"(src+len), "xm"(mul) + ); +} + static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){ // not bit-exact: pf2id uses different rounding than C and SSE asm volatile( @@ -2786,12 +2830,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->vector_fmul_reverse = vector_fmul_reverse_sse; c->vector_fmul_add_add = vector_fmul_add_add_sse; c->vector_fmul_window = vector_fmul_window_sse; + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; c->float_to_int16 = float_to_int16_sse; c->float_to_int16_interleave = float_to_int16_interleave_sse; } if(mm_flags & MM_3DNOW) c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse if(mm_flags & MM_SSE2){ + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; c->float_to_int16 = float_to_int16_sse2; c->float_to_int16_interleave = float_to_int16_interleave_sse2; c->add_int16 = add_int16_sse2; -- 2.39.2