git.sesse.net Git - ffmpeg/commitdiff
lavc: Move vector_fmul_window to AVFloatDSPContext
author: Justin Ruggles <justin.ruggles@gmail.com>
Mon, 7 Jan 2013 04:47:30 +0000 (23:47 -0500)
committer: Luca Barbato <lu_zero@gentoo.org>
Wed, 16 Jan 2013 09:45:45 +0000 (10:45 +0100)
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
22 files changed:
libavcodec/aacdec.c
libavcodec/ac3dec.c
libavcodec/ac3dec.h
libavcodec/arm/dsputil_init_neon.c
libavcodec/arm/dsputil_neon.S
libavcodec/atrac1.c
libavcodec/dsputil.c
libavcodec/dsputil.h
libavcodec/nellymoserdec.c
libavcodec/ppc/float_altivec.c
libavcodec/twinvq.c
libavcodec/vorbisdec.c
libavcodec/wmaprodec.c
libavcodec/x86/dsputil_mmx.c
libavutil/arm/float_dsp_init_neon.c
libavutil/arm/float_dsp_neon.S
libavutil/float_dsp.c
libavutil/float_dsp.h
libavutil/ppc/float_dsp_altivec.c
libavutil/ppc/float_dsp_altivec.h
libavutil/ppc/float_dsp_init.c
libavutil/x86/float_dsp_init.c

index d10a482cadfbe8a6b26ef405ae628a613985ae83..d59dea4f0ef6088728a9ef8ebf710c0522848c7c 100644 (file)
@@ -2173,35 +2173,35 @@ static void imdct_and_windowing(AACContext *ac, SingleChannelElement *sce)
      */
     if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
             (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
-        ac->dsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
+        ac->fdsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
     } else {
-        memcpy(                        out,               saved,            448 * sizeof(float));
+        memcpy(                         out,               saved,            448 * sizeof(float));
 
         if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-            ac->dsp.vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
-            ac->dsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
-            ac->dsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
-            ac->dsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
-            ac->dsp.vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64);
-            memcpy(                    out + 448 + 4*128, temp, 64 * sizeof(float));
+            ac->fdsp.vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
+            ac->fdsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
+            ac->fdsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
+            ac->fdsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
+            ac->fdsp.vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64);
+            memcpy(                     out + 448 + 4*128, temp, 64 * sizeof(float));
         } else {
-            ac->dsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
-            memcpy(                    out + 576,         buf + 64,         448 * sizeof(float));
+            ac->fdsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
+            memcpy(                     out + 576,         buf + 64,         448 * sizeof(float));
         }
     }
 
     // buffer update
     if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
-        memcpy(                    saved,       temp + 64,         64 * sizeof(float));
-        ac->dsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
-        ac->dsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
-        ac->dsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
-        memcpy(                    saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
+        memcpy(                     saved,       temp + 64,         64 * sizeof(float));
+        ac->fdsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
+        ac->fdsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
+        ac->fdsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
+        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
     } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
-        memcpy(                    saved,       buf + 512,        448 * sizeof(float));
-        memcpy(                    saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
+        memcpy(                     saved,       buf + 512,        448 * sizeof(float));
+        memcpy(                     saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
     } else { // LONG_STOP or ONLY_LONG
-        memcpy(                    saved,       buf + 512,        512 * sizeof(float));
+        memcpy(                     saved,       buf + 512,        512 * sizeof(float));
     }
 }
 
index f15bfa2a07b8f49221dcb894884c82d5ca6b249c..3c5147229f70b1982039991ec6156201d3c7e86b 100644 (file)
@@ -170,6 +170,7 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
     ff_mdct_init(&s->imdct_512, 9, 1, 1.0);
     ff_kbd_window_init(s->window, 5.0, 256);
     ff_dsputil_init(&s->dsp, avctx);
+    avpriv_float_dsp_init(&s->fdsp, avctx->flags & CODEC_FLAG_BITEXACT);
     ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
     ff_fmt_convert_init(&s->fmt_conv, avctx);
     av_lfg_init(&s->dith_state, 0);
@@ -606,15 +607,15 @@ static inline void do_imdct(AC3DecodeContext *s, int channels)
             for (i = 0; i < 128; i++)
                 x[i] = s->transform_coeffs[ch][2 * i];
             s->imdct_256.imdct_half(&s->imdct_256, s->tmp_output, x);
-            s->dsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
-                                      s->tmp_output, s->window, 128);
+            s->fdsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
+                                       s->tmp_output, s->window, 128);
             for (i = 0; i < 128; i++)
                 x[i] = s->transform_coeffs[ch][2 * i + 1];
             s->imdct_256.imdct_half(&s->imdct_256, s->delay[ch - 1], x);
         } else {
             s->imdct_512.imdct_half(&s->imdct_512, s->tmp_output, s->transform_coeffs[ch]);
-            s->dsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
-                                      s->tmp_output, s->window, 128);
+            s->fdsp.vector_fmul_window(s->outptr[ch - 1], s->delay[ch - 1],
+                                       s->tmp_output, s->window, 128);
             memcpy(s->delay[ch - 1], s->tmp_output + 128, 128 * sizeof(float));
         }
     }
index 1e8ee68e614fd61d8584df380099da8b623141a0..8d3a311bfc0ae4f9a16b944ea27716f8d0a9d881 100644 (file)
@@ -50,6 +50,7 @@
 #ifndef AVCODEC_AC3DEC_H
 #define AVCODEC_AC3DEC_H
 
+#include "libavutil/float_dsp.h"
 #include "libavutil/lfg.h"
 #include "ac3.h"
 #include "ac3dsp.h"
@@ -193,6 +194,7 @@ typedef struct AC3DecodeContext {
 
 ///@name Optimization
     DSPContext dsp;                         ///< for optimization
+    AVFloatDSPContext fdsp;
     AC3DSPContext ac3dsp;
     FmtConvertContext fmt_conv;             ///< optimized conversion functions
 ///@}
index b2e7204a609ec81dd0a99ba7b42158ea09192b63..34bb6191f17a223fbcda81c9224bf3ad0ab8c288 100644 (file)
@@ -142,8 +142,6 @@ void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
 void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
 void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
 
-void ff_vector_fmul_window_neon(float *dst, const float *src0,
-                                const float *src1, const float *win, int len);
 void ff_butterflies_float_neon(float *v1, float *v2, int len);
 float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);
 void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
@@ -302,7 +300,6 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
         c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
     }
 
-    c->vector_fmul_window         = ff_vector_fmul_window_neon;
     c->butterflies_float          = ff_butterflies_float_neon;
     c->scalarproduct_float        = ff_scalarproduct_float_neon;
     c->vector_fmul_reverse        = ff_vector_fmul_reverse_neon;
index cf9ad9e583f565de84d9bfc6197566ce8583b902..a0d201cd852003873c959c96670c584537504cb0 100644 (file)
@@ -532,53 +532,6 @@ function ff_add_pixels_clamped_neon, export=1
         bx              lr
 endfunc
 
-function ff_vector_fmul_window_neon, export=1
-        push            {r4,r5,lr}
-        ldr             lr,  [sp, #12]
-        sub             r2,  r2,  #8
-        sub             r5,  lr,  #2
-        add             r2,  r2,  r5, lsl #2
-        add             r4,  r3,  r5, lsl #3
-        add             ip,  r0,  r5, lsl #3
-        mov             r5,  #-16
-        vld1.32         {d0,d1},  [r1,:128]!
-        vld1.32         {d2,d3},  [r2,:128], r5
-        vld1.32         {d4,d5},  [r3,:128]!
-        vld1.32         {d6,d7},  [r4,:128], r5
-1:      subs            lr,  lr,  #4
-        vmul.f32        d22, d0,  d4
-        vrev64.32       q3,  q3
-        vmul.f32        d23, d1,  d5
-        vrev64.32       q1,  q1
-        vmul.f32        d20, d0,  d7
-        vmul.f32        d21, d1,  d6
-        beq             2f
-        vmla.f32        d22, d3,  d7
-        vld1.32         {d0,d1},  [r1,:128]!
-        vmla.f32        d23, d2,  d6
-        vld1.32         {d18,d19},[r2,:128], r5
-        vmls.f32        d20, d3,  d4
-        vld1.32         {d24,d25},[r3,:128]!
-        vmls.f32        d21, d2,  d5
-        vld1.32         {d6,d7},  [r4,:128], r5
-        vmov            q1,  q9
-        vrev64.32       q11, q11
-        vmov            q2,  q12
-        vswp            d22, d23
-        vst1.32         {d20,d21},[r0,:128]!
-        vst1.32         {d22,d23},[ip,:128], r5
-        b               1b
-2:      vmla.f32        d22, d3,  d7
-        vmla.f32        d23, d2,  d6
-        vmls.f32        d20, d3,  d4
-        vmls.f32        d21, d2,  d5
-        vrev64.32       q11, q11
-        vswp            d22, d23
-        vst1.32         {d20,d21},[r0,:128]!
-        vst1.32         {d22,d23},[ip,:128], r5
-        pop             {r4,r5,pc}
-endfunc
-
 #if CONFIG_VORBIS_DECODER
 function ff_vorbis_inverse_coupling_neon, export=1
         vmov.i32        q10, #1<<31
index 268ce86f013894b2a8d1f0759829bcbe507d9ec5..b74bef664ffd728552978d4fd06000dcec016471 100644 (file)
@@ -32,6 +32,7 @@
 #include <stddef.h>
 #include <stdio.h>
 
+#include "libavutil/float_dsp.h"
 #include "avcodec.h"
 #include "get_bits.h"
 #include "dsputil.h"
@@ -81,7 +82,7 @@ typedef struct {
     DECLARE_ALIGNED(32, float, high)[512];
     float*              bands[3];
     FFTContext          mdct_ctx[3];
-    DSPContext          dsp;
+    AVFloatDSPContext   fdsp;
 } AT1Ctx;
 
 /** size of the transform in samples in the long mode for each QMF band */
@@ -141,8 +142,8 @@ static int at1_imdct_block(AT1SUCtx* su, AT1Ctx *q)
             at1_imdct(q, &q->spec[pos], &su->spectrum[0][ref_pos + start_pos], nbits, band_num);
 
             /* overlap and window */
-            q->dsp.vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
-                                      &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 16);
+            q->fdsp.vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
+                                       &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 16);
 
             prev_buf = &su->spectrum[0][ref_pos+start_pos + 16];
             start_pos += block_size;
@@ -357,7 +358,7 @@ static av_cold int atrac1_decode_init(AVCodecContext *avctx)
 
     ff_atrac_generate_tables();
 
-    ff_dsputil_init(&q->dsp, avctx);
+    avpriv_float_dsp_init(&q->fdsp, avctx->flags & CODEC_FLAG_BITEXACT);
 
     q->bands[0] = q->low;
     q->bands[1] = q->mid;
index d297b8a7389760a4f4e1ae257b1833ca73dcb58d..91a4da55327036c60a8d10bbdd880bc6e3dbbbee 100644 (file)
@@ -2367,23 +2367,6 @@ static void vector_fmul_add_c(float *dst, const float *src0, const float *src1,
         dst[i] = src0[i] * src1[i] + src2[i];
 }
 
-static void vector_fmul_window_c(float *dst, const float *src0,
-                                 const float *src1, const float *win, int len)
-{
-    int i,j;
-    dst += len;
-    win += len;
-    src0+= len;
-    for(i=-len, j=len-1; i<0; i++, j--) {
-        float s0 = src0[i];
-        float s1 = src1[j];
-        float wi = win[i];
-        float wj = win[j];
-        dst[i] = s0*wj - s1*wi;
-        dst[j] = s0*wi + s1*wj;
-    }
-}
-
 static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                 int len)
 {
@@ -2839,7 +2822,6 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
 #endif
     c->vector_fmul_reverse = vector_fmul_reverse_c;
     c->vector_fmul_add = vector_fmul_add_c;
-    c->vector_fmul_window = vector_fmul_window_c;
     c->vector_clipf = vector_clipf_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
index 4749be9b54bd0dccb7a1085f3eef537af69574c6..3a5c94a952ee42e67fecbb36f5aabdc4ded26ed1 100644 (file)
@@ -352,8 +352,6 @@ typedef struct DSPContext {
     void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
     /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
     void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
-    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
-    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
     /* assume len is a multiple of 8, and arrays are 16-byte aligned */
     void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
     /**
index b6aa6f5f82f9b47a841ef069199ad104abc3580e..08cc4ab15b7444a71343ecfa7101b9b310775175 100644 (file)
@@ -32,6 +32,7 @@
  */
 
 #include "libavutil/channel_layout.h"
+#include "libavutil/float_dsp.h"
 #include "libavutil/lfg.h"
 #include "libavutil/random_seed.h"
 #include "avcodec.h"
@@ -52,7 +53,7 @@ typedef struct NellyMoserDecodeContext {
     AVLFG           random_state;
     GetBitContext   gb;
     float           scale_bias;
-    DSPContext      dsp;
+    AVFloatDSPContext fdsp;
     FFTContext      imdct_ctx;
     DECLARE_ALIGNED(32, float, imdct_buf)[2][NELLY_BUF_LEN];
     float          *imdct_out;
@@ -107,7 +108,9 @@ static void nelly_decode_block(NellyMoserDecodeContext *s,
                (NELLY_BUF_LEN - NELLY_FILL_LEN) * sizeof(float));
 
         s->imdct_ctx.imdct_half(&s->imdct_ctx, s->imdct_out, aptr);
-        s->dsp.vector_fmul_window(aptr, s->imdct_prev + NELLY_BUF_LEN/2, s->imdct_out, ff_sine_128, NELLY_BUF_LEN/2);
+        s->fdsp.vector_fmul_window(aptr, s->imdct_prev + NELLY_BUF_LEN / 2,
+                                   s->imdct_out, ff_sine_128,
+                                   NELLY_BUF_LEN / 2);
         FFSWAP(float *, s->imdct_out, s->imdct_prev);
     }
 }
@@ -121,7 +124,7 @@ static av_cold int decode_init(AVCodecContext * avctx) {
     av_lfg_init(&s->random_state, 0);
     ff_mdct_init(&s->imdct_ctx, 8, 1, 1.0);
 
-    ff_dsputil_init(&s->dsp, avctx);
+    avpriv_float_dsp_init(&s->fdsp, avctx->flags & CODEC_FLAG_BITEXACT);
 
     s->scale_bias = 1.0/(32768*8);
     avctx->sample_fmt = AV_SAMPLE_FMT_FLT;
index 5068fd4eb838c163dffa0e8a2b899083e4428135..b56440ba10f923f56e0f69f8d425326ea3515d76 100644 (file)
@@ -75,43 +75,8 @@ static void vector_fmul_add_altivec(float *dst, const float *src0,
     }
 }
 
-static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len)
-{
-    vector float zero, t0, t1, s0, s1, wi, wj;
-    const vector unsigned char reverse = vcprm(3,2,1,0);
-    int i,j;
-
-    dst += len;
-    win += len;
-    src0+= len;
-
-    zero = (vector float)vec_splat_u32(0);
-
-    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
-        s0 = vec_ld(i, src0);
-        s1 = vec_ld(j, src1);
-        wi = vec_ld(i, win);
-        wj = vec_ld(j, win);
-
-        s1 = vec_perm(s1, s1, reverse);
-        wj = vec_perm(wj, wj, reverse);
-
-        t0 = vec_madd(s0, wj, zero);
-        t0 = vec_nmsub(s1, wi, t0);
-        t1 = vec_madd(s0, wi, zero);
-        t1 = vec_madd(s1, wj, t1);
-        t1 = vec_perm(t1, t1, reverse);
-
-        vec_st(t0, i, dst);
-        vec_st(t1, j, dst);
-    }
-}
-
 void ff_float_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {   /* Install the AltiVec implementations of vector_fmul_reverse and vector_fmul_add into c. */
     c->vector_fmul_reverse = vector_fmul_reverse_altivec;
     c->vector_fmul_add = vector_fmul_add_altivec;
-    if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
-        c->vector_fmul_window = vector_fmul_window_altivec;
-    }
 }
index 89894692376f10c3ac3550102edef6d2aba13f07..b59031bdb9f127d13fbe75faa5621de39702152e 100644 (file)
@@ -650,11 +650,10 @@ static void imdct_and_window(TwinContext *tctx, enum FrameType ftype, int wtype,
 
         mdct->imdct_half(mdct, buf1 + bsize*j, in + bsize*j);
 
-        tctx->dsp.vector_fmul_window(out2,
-                                     prev_buf + (bsize-wsize)/2,
-                                     buf1 + bsize*j,
-                                     ff_sine_windows[av_log2(wsize)],
-                                     wsize/2);
+        tctx->fdsp.vector_fmul_window(out2, prev_buf + (bsize-wsize) / 2,
+                                      buf1 + bsize * j,
+                                      ff_sine_windows[av_log2(wsize)],
+                                      wsize / 2);
         out2 += wsize;
 
         memcpy(out2, buf1 + bsize*j + wsize/2, (bsize - wsize/2)*sizeof(float));
index aac9019ed683724fd63a9a2f2baa7bfb68eaf918..9bea908cc1dc87d230a1ec41a12dc8fea8f9397d 100644 (file)
@@ -1620,13 +1620,13 @@ static int vorbis_parse_audio_packet(vorbis_context *vc, float **floor_ptr)
         const float *win  = vc->win[blockflag & previous_window];
 
         if (blockflag == previous_window) {
-            vc->dsp.vector_fmul_window(ret, saved, buf, win, blocksize / 4);
+            vc->fdsp.vector_fmul_window(ret, saved, buf, win, blocksize / 4);
         } else if (blockflag > previous_window) {
-            vc->dsp.vector_fmul_window(ret, saved, buf, win, bs0 / 4);
+            vc->fdsp.vector_fmul_window(ret, saved, buf, win, bs0 / 4);
             memcpy(ret+bs0/2, buf+bs0/4, ((bs1-bs0)/4) * sizeof(float));
         } else {
             memcpy(ret, saved, ((bs1 - bs0) / 4) * sizeof(float));
-            vc->dsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
+            vc->fdsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
         }
         memcpy(saved, buf + blocksize / 4, blocksize / 4 * sizeof(float));
     }
index d58278b76def13346531e81a8e6a060fe42f0f81..f04b43fd619c12bab42b507de472fd1d8d85fe93 100644 (file)
@@ -1046,8 +1046,8 @@ static void wmapro_window(WMAProDecodeCtx *s)
 
         winlen >>= 1;
 
-        s->dsp.vector_fmul_window(start, start, start + winlen,
-                                  window, winlen);
+        s->fdsp.vector_fmul_window(start, start, start + winlen,
+                                   window, winlen);
 
         s->channel[c].prev_block_len = s->subframe_len;
     }
index b16f7e4de3c87e79add781b256832fd34373cf87..7ac6ecb2c4f5de0f2bfbcc452360084289f34479 100644 (file)
@@ -1892,72 +1892,6 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
     }
 }
 
-#if HAVE_6REGS
-static void vector_fmul_window_3dnowext(float *dst, const float *src0,
-                                        const float *src1, const float *win,
-                                        int len)
-{
-    x86_reg i = -len * 4;
-    x86_reg j =  len * 4 - 8;
-    __asm__ volatile (
-        "1:                             \n"
-        "pswapd (%5, %1), %%mm1         \n"
-        "movq   (%5, %0), %%mm0         \n"
-        "pswapd (%4, %1), %%mm5         \n"
-        "movq   (%3, %0), %%mm4         \n"
-        "movq      %%mm0, %%mm2         \n"
-        "movq      %%mm1, %%mm3         \n"
-        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
-        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
-        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
-        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
-        "pfadd     %%mm3, %%mm2         \n"
-        "pfsub     %%mm0, %%mm1         \n"
-        "pswapd    %%mm2, %%mm2         \n"
-        "movq      %%mm1, (%2, %0)      \n"
-        "movq      %%mm2, (%2, %1)      \n"
-        "sub          $8, %1            \n"
-        "add          $8, %0            \n"
-        "jl           1b                \n"
-        "femms                          \n"
-        : "+r"(i), "+r"(j)
-        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
-    );
-}
-
-static void vector_fmul_window_sse(float *dst, const float *src0,
-                                   const float *src1, const float *win, int len)
-{
-    x86_reg i = -len * 4;
-    x86_reg j =  len * 4 - 16;
-    __asm__ volatile (
-        "1:                             \n"
-        "movaps      (%5, %1), %%xmm1   \n"
-        "movaps      (%5, %0), %%xmm0   \n"
-        "movaps      (%4, %1), %%xmm5   \n"
-        "movaps      (%3, %0), %%xmm4   \n"
-        "shufps $0x1b, %%xmm1, %%xmm1   \n"
-        "shufps $0x1b, %%xmm5, %%xmm5   \n"
-        "movaps        %%xmm0, %%xmm2   \n"
-        "movaps        %%xmm1, %%xmm3   \n"
-        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
-        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
-        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
-        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
-        "addps         %%xmm3, %%xmm2   \n"
-        "subps         %%xmm0, %%xmm1   \n"
-        "shufps $0x1b, %%xmm2, %%xmm2   \n"
-        "movaps        %%xmm1, (%2, %0) \n"
-        "movaps        %%xmm2, (%2, %1) \n"
-        "sub              $16, %1       \n"
-        "add              $16, %0       \n"
-        "jl                1b           \n"
-        : "+r"(i), "+r"(j)
-        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
-    );
-}
-#endif /* HAVE_6REGS */
-
 static void vector_clipf_sse(float *dst, const float *src,
                              float min, float max, int len)
 {
@@ -2320,14 +2254,6 @@ static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
 #endif /* HAVE_YASM */
 }
 
-static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
-                                  int mm_flags)
-{
-#if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
-    c->vector_fmul_window  = vector_fmul_window_3dnowext;
-#endif
-}
-
 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
 {
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
@@ -2343,10 +2269,6 @@ static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
 
     c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
 
-#if HAVE_6REGS
-    c->vector_fmul_window = vector_fmul_window_sse;
-#endif
-
     c->vector_clipf = vector_clipf_sse;
 #endif /* HAVE_INLINE_ASM */
 
@@ -2530,9 +2452,6 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
     if (mm_flags & AV_CPU_FLAG_3DNOW)
         dsputil_init_3dnow(c, avctx, mm_flags);
 
-    if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
-        dsputil_init_3dnowext(c, avctx, mm_flags);
-
     if (mm_flags & AV_CPU_FLAG_SSE)
         dsputil_init_sse(c, avctx, mm_flags);
 
index 88eb4b3d2a87ed0b479822ef2fad5954b4bbe948..16ea47154a6709618fdf7cb99d479542a49f4101 100644 (file)
@@ -32,9 +32,13 @@ void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul,
 void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
                                 int len);
 
+void ff_vector_fmul_window_neon(float *dst, const float *src0,
+                                const float *src1, const float *win, int len);
+
 void ff_float_dsp_init_neon(AVFloatDSPContext *fdsp)
 {   /* Point the function pointers in fdsp at the NEON routines declared above. */
     fdsp->vector_fmul = ff_vector_fmul_neon;
     fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_neon;
     fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_neon;
+    fdsp->vector_fmul_window = ff_vector_fmul_window_neon; /* windowed overlap-add, now served from AVFloatDSPContext */
 }
index 6d7bd5236ec659410ae3c3e0c2ee0f7979043b5a..540cfc690790328c2bdf65e38823a0e5310a324c 100644 (file)
@@ -146,3 +146,50 @@ NOVFP   vdup.32         q8,  r2
         bx              lr
         .unreq          len
 endfunc
+
+function ff_vector_fmul_window_neon, export=1  @ NEON vector_fmul_window(dst, src0, src1, win, len); notes below assume r0-r3 = dst, src0, src1, win and len on the stack (standard ARM calling convention) -- TODO confirm
+        push            {r4,r5,lr}              @ 12 bytes pushed, so the first stack argument sits at [sp, #12]
+        ldr             lr,  [sp, #12]          @ lr = len (presumably the fifth argument)
+        sub             r2,  r2,  #8
+        sub             r5,  lr,  #2            @ r5 = len - 2, scratch for the address arithmetic below
+        add             r2,  r2,  r5, lsl #2    @ r2 = src1 + 4 * (len - 4) bytes: last four floats of src1
+        add             r4,  r3,  r5, lsl #3    @ r4 = win  + 4 * (2 * len - 4) bytes: last four floats of win
+        add             ip,  r0,  r5, lsl #3    @ ip = dst  + 4 * (2 * len - 4) bytes: last four floats of dst
+        mov             r5,  #-16               @ post-index step for the pointers that walk backwards (r2, r4, ip)
+        vld1.32         {d0,d1},  [r1,:128]!
+        vld1.32         {d2,d3},  [r2,:128], r5
+        vld1.32         {d4,d5},  [r3,:128]!
+        vld1.32         {d6,d7},  [r4,:128], r5
+1:      subs            lr,  lr,  #4            @ four floats from each source per iteration
+        vmul.f32        d22, d0,  d4
+        vrev64.32       q3,  q3
+        vmul.f32        d23, d1,  d5
+        vrev64.32       q1,  q1
+        vmul.f32        d20, d0,  d7
+        vmul.f32        d21, d1,  d6
+        beq             2f                      @ counter reached zero: finish in the tail, which loads no further input
+        vmla.f32        d22, d3,  d7
+        vld1.32         {d0,d1},  [r1,:128]!
+        vmla.f32        d23, d2,  d6
+        vld1.32         {d18,d19},[r2,:128], r5
+        vmls.f32        d20, d3,  d4
+        vld1.32         {d24,d25},[r3,:128]!
+        vmls.f32        d21, d2,  d5
+        vld1.32         {d6,d7},  [r4,:128], r5
+        vmov            q1,  q9
+        vrev64.32       q11, q11
+        vmov            q2,  q12
+        vswp            d22, d23
+        vst1.32         {d20,d21},[r0,:128]!    @ store to the front part of dst, moving forwards
+        vst1.32         {d22,d23},[ip,:128], r5 @ store to the back part of dst, moving backwards
+        b               1b
+2:      vmla.f32        d22, d3,  d7            @ tail: same arithmetic and stores as the loop body, without the loads
+        vmla.f32        d23, d2,  d6
+        vmls.f32        d20, d3,  d4
+        vmls.f32        d21, d2,  d5
+        vrev64.32       q11, q11
+        vswp            d22, d23
+        vst1.32         {d20,d21},[r0,:128]!
+        vst1.32         {d22,d23},[ip,:128], r5
+        pop             {r4,r5,pc}              @ restore r4/r5 and return by popping the saved lr into pc
+endfunc
index 22139defe4035b73ade7454b24c6832b46f3be9a..cf33df303c63f05dcfa64f0c5e75a1715e1d5233 100644 (file)
@@ -52,12 +52,32 @@ static void vector_dmul_scalar_c(double *dst, const double *src, double mul,
         dst[i] = src[i] * mul;
 }
 
+static void vector_fmul_window_c(float *dst, const float *src0,
+                                 const float *src1, const float *win, int len)
+{   /* Portable C version of the windowed overlap-add: reads len floats from each of src0 and src1, 2 * len from win, and writes 2 * len floats to dst. */
+    int i, j;
+
+    dst  += len;    /* re-base dst, win and src0 at their midpoints so that i in [-len, 0) addresses the first half */
+    win  += len;
+    src0 += len;
+
+    for (i = -len, j = len - 1; i < 0; i++, j--) {  /* i walks up from -len while j walks down from len - 1 */
+        float s0 = src0[i];     /* src0 is consumed front to back */
+        float s1 = src1[j];     /* src1 is consumed back to front */
+        float wi = win[i];      /* coefficient from the first half of win */
+        float wj = win[j];      /* mirrored coefficient from the second half of win */
+        dst[i] = s0 * wj - s1 * wi;     /* output sample in the first half of dst */
+        dst[j] = s0 * wi + s1 * wj;     /* output sample at the mirrored position in the second half */
+    }
+}
+
 void avpriv_float_dsp_init(AVFloatDSPContext *fdsp, int bit_exact)
 {
     fdsp->vector_fmul = vector_fmul_c;
     fdsp->vector_fmac_scalar = vector_fmac_scalar_c;
     fdsp->vector_fmul_scalar = vector_fmul_scalar_c;
     fdsp->vector_dmul_scalar = vector_dmul_scalar_c;
+    fdsp->vector_fmul_window = vector_fmul_window_c;
 
 #if ARCH_ARM
     ff_float_dsp_init_arm(fdsp);
index 41b73c5b265634b3ece78cb24b0e04d8ac1d3361..d0ceaaf5e1b7c66a57e362e7a7e41bf9e607d12d 100644 (file)
@@ -81,6 +81,25 @@ typedef struct AVFloatDSPContext {
      */
     void (*vector_dmul_scalar)(double *dst, const double *src, double mul,
                                int len);
+
+    /**
+     * Overlap/add with window function.
+     * Used primarily by MDCT-based audio codecs.
+     * Source and destination vectors must overlap exactly or not at all.
+     *
+     * @param dst  result vector
+     *             constraints: 16-byte aligned
+     * @param src0 first source vector
+     *             constraints: 16-byte aligned
+     * @param src1 second source vector
+     *             constraints: 16-byte aligned
+     * @param win  half-window vector
+     *             constraints: 16-byte aligned
+     * @param len  number of elements in each of src0 and src1; dst and win
+     *             each span 2 * len elements
+     *             constraints: multiple of 4
+     */
+    void (*vector_fmul_window)(float *dst, const float *src0,
+                               const float *src1, const float *win, int len);
 } AVFloatDSPContext;
 
 /**
index 55e3fbe91f7e9c8bc2c19b6083d70857cb79095c..e5fd9aba33cae8154a62e57bc8feeeb4d2c6bc94 100644 (file)
@@ -36,3 +36,36 @@ void ff_vector_fmul_altivec(float *dst, const float *src0, const float *src1,
         vec_st(d1, 16, dst + i);
     }
 }
+
+void ff_vector_fmul_window_altivec(float *dst, const float *src0,
+                                   const float *src1, const float *win, int len)
+{   /* AltiVec version of the windowed overlap-add; each iteration handles one 16-byte vector from each half. */
+    vector float zero, t0, t1, s0, s1, wi, wj;
+    const vector unsigned char reverse = vcprm(3, 2, 1, 0);    /* presumably a permute mask that reverses the four elements -- vcprm is defined elsewhere, TODO confirm */
+    int i, j;   /* used as byte offsets below (steps of 16), not element indices */
+
+    dst  += len;    /* re-base at the midpoints so that negative offsets address the first half */
+    win  += len;
+    src0 += len;
+
+    zero = (vector float)vec_splat_u32(0);  /* zero addend for the multiply-adds that act as plain multiplies */
+
+    for (i = -len * 4, j = len * 4 - 16; i < 0; i += 16, j -= 16) {    /* len * 4 is presumably len * sizeof(float) */
+        s0 = vec_ld(i, src0);
+        s1 = vec_ld(j, src1);
+        wi = vec_ld(i, win);
+        wj = vec_ld(j, win);
+
+        s1 = vec_perm(s1, s1, reverse);     /* bring the operands loaded at offset j into the same element order as those at i */
+        wj = vec_perm(wj, wj, reverse);
+
+        t0 = vec_madd(s0, wj, zero);
+        t0 = vec_nmsub(s1, wi, t0);         /* t0 = s0 * wj - s1 * wi */
+        t1 = vec_madd(s0, wi, zero);
+        t1 = vec_madd(s1, wj, t1);          /* t1 = s0 * wi + s1 * wj */
+        t1 = vec_perm(t1, t1, reverse);     /* permute back before storing at offset j */
+
+        vec_st(t0, i, dst);
+        vec_st(t1, j, dst);
+    }
+}
index 0b9425bef4137ebe3ff0fb9708b3c28baada1a00..4d46edf61a0d5e3a5d30facadedade8df9168a8d 100644 (file)
@@ -24,4 +24,8 @@
 extern void ff_vector_fmul_altivec(float *dst, const float *src0,
                                    const float *src1, int len);
 
+extern void ff_vector_fmul_window_altivec(float *dst, const float *src0,
+                                          const float *src1, const float *win,
+                                          int len);
+
 #endif /* AVUTIL_PPC_FLOAT_DSP_ALTIVEC_H */
index 20527642ccbca06ce2e2e67011c62c08c8197d85..1134b56926775b95299e569ab572937b863655a4 100644 (file)
@@ -32,5 +32,9 @@ void ff_float_dsp_init_ppc(AVFloatDSPContext *fdsp, int bit_exact)
         return;
 
     fdsp->vector_fmul = ff_vector_fmul_altivec;
+
+    if (!bit_exact) {
+        fdsp->vector_fmul_window = ff_vector_fmul_window_altivec;
+    }
 #endif
 }
index b3b7ff4c3a29698ac2a1e5c6380bc045ff55511a..56fb32b0d54b27edf12bc9720c94758e4da40fa2 100644 (file)
@@ -21,6 +21,7 @@
 #include "libavutil/cpu.h"
 #include "libavutil/float_dsp.h"
 #include "cpu.h"
+#include "asm.h"
 
 extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
                                int len);
@@ -40,10 +41,84 @@ extern void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
 extern void ff_vector_dmul_scalar_avx(double *dst, const double *src,
                                       double mul, int len);
 
+#if HAVE_6REGS
+static void vector_fmul_window_3dnowext(float *dst, const float *src0,
+                                        const float *src1, const float *win,
+                                        int len)
+{   /* 3DNow!ext inline-asm version of the windowed overlap-add; each iteration handles 8 bytes (two floats) from each half. */
+    x86_reg i = -len * 4;       /* byte offset from the midpoints passed in as dst + len, src0 + len, win + len; counts up towards 0 */
+    x86_reg j =  len * 4 - 8;   /* byte offset of the last two floats; counts down */
+    __asm__ volatile (
+        "1:                             \n"
+        "pswapd (%5, %1), %%mm1         \n" // win at offset j, the two floats swapped
+        "movq   (%5, %0), %%mm0         \n" // win at offset i
+        "pswapd (%4, %1), %%mm5         \n" // src1 at offset j, the two floats swapped
+        "movq   (%3, %0), %%mm4         \n" // src0 at offset i
+        "movq      %%mm0, %%mm2         \n"
+        "movq      %%mm1, %%mm3         \n"
+        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
+        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
+        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
+        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
+        "pfadd     %%mm3, %%mm2         \n" // sum that goes to the second half of dst
+        "pfsub     %%mm0, %%mm1         \n" // difference that goes to the first half of dst
+        "pswapd    %%mm2, %%mm2         \n" // swap back before storing at offset j
+        "movq      %%mm1, (%2, %0)      \n"
+        "movq      %%mm2, (%2, %1)      \n"
+        "sub          $8, %1            \n"
+        "add          $8, %0            \n"
+        "jl           1b                \n" // loop while i is still negative
+        "femms                          \n"
+        : "+r"(i), "+r"(j)
+        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)   /* NOTE(review): no clobber list, although the asm stores through %2 and uses mm0-mm5 -- confirm this is intentional */
+    );
+}
+
+static void vector_fmul_window_sse(float *dst, const float *src0,
+                                   const float *src1, const float *win, int len)
+{   /* SSE inline-asm version of the windowed overlap-add; each iteration handles 16 bytes (four floats) from each half using aligned moves. */
+    x86_reg i = -len * 4;       /* byte offset from the midpoints passed in as dst + len, src0 + len, win + len; counts up towards 0 */
+    x86_reg j =  len * 4 - 16;  /* byte offset of the last four floats; counts down */
+    __asm__ volatile (
+        "1:                             \n"
+        "movaps      (%5, %1), %%xmm1   \n" // win at offset j
+        "movaps      (%5, %0), %%xmm0   \n" // win at offset i
+        "movaps      (%4, %1), %%xmm5   \n" // src1 at offset j
+        "movaps      (%3, %0), %%xmm4   \n" // src0 at offset i
+        "shufps $0x1b, %%xmm1, %%xmm1   \n" // 0x1b picks elements 3,2,1,0: reverse the vector loaded at j
+        "shufps $0x1b, %%xmm5, %%xmm5   \n"
+        "movaps        %%xmm0, %%xmm2   \n"
+        "movaps        %%xmm1, %%xmm3   \n"
+        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
+        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
+        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
+        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
+        "addps         %%xmm3, %%xmm2   \n" // sum that goes to the second half of dst
+        "subps         %%xmm0, %%xmm1   \n" // difference that goes to the first half of dst
+        "shufps $0x1b, %%xmm2, %%xmm2   \n" // reverse back before storing at offset j
+        "movaps        %%xmm1, (%2, %0) \n"
+        "movaps        %%xmm2, (%2, %1) \n"
+        "sub              $16, %1       \n"
+        "add              $16, %0       \n"
+        "jl                1b           \n" // loop while i is still negative
+        : "+r"(i), "+r"(j)
+        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)   /* NOTE(review): no clobber list, although the asm stores through %2 and uses xmm0-xmm5 -- confirm this is intentional */
+    );
+}
+#endif /* HAVE_6REGS */
+
 void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
 {
     int mm_flags = av_get_cpu_flags();
 
+#if HAVE_6REGS
+    if (INLINE_AMD3DNOWEXT(mm_flags)) {
+        fdsp->vector_fmul_window  = vector_fmul_window_3dnowext;
+    }
+    if (INLINE_SSE(mm_flags)) {
+        fdsp->vector_fmul_window = vector_fmul_window_sse;
+    }
+#endif
     if (EXTERNAL_SSE(mm_flags)) {
         fdsp->vector_fmul = ff_vector_fmul_sse;
         fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;