#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
+#include "fft.h"
-static const int m1m1m1m1[4] __attribute__((aligned(16))) =
+DECLARE_ALIGNED(16, static const int, m1m1m1m1)[4] =
{ 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
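The alignment change above replaces the raw GCC attribute with FFmpeg's portable DECLARE_ALIGNED macro from libavutil/mem.h. As a rough sketch (the exact compiler branches vary between FFmpeg versions, and additional cases exist for ICC, Sun Studio and others), it expands along these lines:

    /* Approximate shape of DECLARE_ALIGNED in libavutil/mem.h; real
       versions carry further compiler-specific branches. */
    #if defined(__GNUC__)
        #define DECLARE_ALIGNED(n, t, v)    t __attribute__ ((aligned (n))) v
    #elif defined(_MSC_VER)
        #define DECLARE_ALIGNED(n, t, v)    __declspec(align(n)) t v
    #else
        #define DECLARE_ALIGNED(n, t, v)    t v
    #endif

Under GCC the new declaration therefore expands to something semantically equivalent to the removed line; the macro only buys portability to compilers that spell alignment differently.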
void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
}
-void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
+void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
av_unused x86_reg i, j, k, l;
- long n = 1 << s->nbits;
+ long n = 1 << s->mdct_bits;
long n2 = n >> 1;
long n4 = n >> 2;
long n8 = n >> 3;
- const uint16_t *revtab = s->fft.revtab + n8;
+ const uint16_t *revtab = s->revtab + n8;
const FFTSample *tcos = s->tcos;
const FFTSample *tsin = s->tsin;
FFTComplex *z = (FFTComplex *)output;
::"r"(-4*k), "r"(4*k),
"r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
);
-#ifdef ARCH_X86_64
+#if ARCH_X86_64
// if we have enough regs, don't let gcc make the luts latency-bound
// but if not, latency is faster than spilling
__asm__("movlps %%xmm0, %0 \n"
#endif
}
- ff_fft_dispatch_sse(z, s->fft.nbits);
+ ff_fft_dispatch_sse(z, s->nbits);
/* post rotation + reinterleave + reorder */
);
}
-void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input)
+void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
{
x86_reg j, k;
- long n = 1 << s->nbits;
+ long n = 1 << s->mdct_bits;
long n4 = n >> 2;
ff_imdct_half_sse(s, output+n4, input);
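For reference, every rename in this patch (the MDCTContext parameters becoming FFTContext, s->nbits becoming s->mdct_bits, s->fft.revtab becoming s->revtab) follows from folding the MDCT state into FFTContext. An abridged sketch of the merged context, limited to the members this file actually touches (the real struct in libavcodec/fft.h carries more fields, such as the fft/imdct function pointers):

    typedef struct FFTContext {
        int          nbits;      /* log2 of the FFT size; n/4 for the MDCT path        */
        uint16_t    *revtab;     /* bit-reversal table, formerly s->fft.revtab         */
        FFTComplex  *tmp_buf;
        int          mdct_bits;  /* log2 of the full MDCT size, formerly MDCTContext.nbits */
        FFTSample   *tcos;       /* pre/post-rotation twiddle tables                   */
        FFTSample   *tsin;
    } FFTContext;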