* ac3_imdct_sse.c: accelerated SSE ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_sse.c,v 1.5 2001/07/26 20:00:33 reno Exp $
+ * $Id: ac3_imdct_sse.c,v 1.13 2002/07/31 20:56:51 sam Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
-#define MODULE_NAME imdctsse
-#include "modules_inner.h"
-
/*****************************************************************************
* Preamble
*****************************************************************************/
-#include "defs.h"
-
#include <math.h>
#include <stdio.h>
-#include "config.h"
-#include "common.h"
-#include "threads.h"
-#include "mtime.h"
+#include <vlc/vlc.h>
#include "ac3_imdct.h"
#include "ac3_imdct_common.h"
# define M_PI 3.14159265358979323846
#endif
-void _M( fft_64p ) ( complex_t *x );
-void _M( fft_128p ) ( complex_t *a );
-
-static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
-static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse);
-static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
-static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+void E_( fft_64p ) ( complex_t *x ); /* no definition or call visible in this part of the file; presumably provided by another unit - confirm */
+void E_( fft_128p ) ( complex_t *a ); /* called on p_imdct->buf by imdct_do_512 and imdct_do_512_nol */
+static void imdct512_pre_ifft_twiddle_sse ( const int *, complex_t *,
+ float *, float * ); /* args, in order: pmt, buf, data, xcos_sin_sse */
+static void imdct512_post_ifft_twiddle_sse ( complex_t *, float * ); /* args, in order: buf, xcos_sin_sse */
+static void imdct512_window_delay_sse ( complex_t *, float *,
+ float *, float * ); /* args, in order: buf, data_ptr, window_prt, delay_prt */
+static void imdct512_window_delay_nol_sse ( complex_t *, float *,
+ float *, float * ); /* args, in order: buf, data_ptr, window_prt, delay_prt */
-void _M( imdct_init ) (imdct_t * p_imdct)
+void E_( imdct_init ) (imdct_t * p_imdct)
{
int i;
float scale = 181.019;
}
}
-void _M( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[])
+void E_( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[]) /* runs the four SSE stages below, in order, using p_imdct->buf as the work buffer */
{
- imdct512_pre_ifft_twiddle_sse (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
- _M( fft_128p ) ( p_imdct->buf );
- imdct512_post_ifft_twiddle_sse (p_imdct->buf, p_imdct->xcos_sin_sse);
- imdct512_window_delay_sse (p_imdct->buf, data, window, delay);
+ imdct512_pre_ifft_twiddle_sse( pm128, p_imdct->buf, data,
+ p_imdct->xcos_sin_sse ); /* stage 1: pre-IFFT twiddle; pm128 is passed as the helper's pmt argument */
+ E_( fft_128p ) ( p_imdct->buf ); /* stage 2: fft_128p; only buf is passed, so presumably transformed in place - confirm */
+ imdct512_post_ifft_twiddle_sse( p_imdct->buf, p_imdct->xcos_sin_sse ); /* stage 3: post-IFFT twiddle with the same xcos_sin_sse table */
+ imdct512_window_delay_sse( p_imdct->buf, data, window, delay ); /* stage 4: window/delay step; `window` is not declared in this function - presumably a shared table, confirm */
}
-void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
+void E_( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[]) /* same first three stages as imdct_do_512; only the final window/delay helper differs */
{
- imdct512_pre_ifft_twiddle_sse (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
- _M( fft_128p ) ( p_imdct->buf );
- imdct512_post_ifft_twiddle_sse (p_imdct->buf, p_imdct->xcos_sin_sse);
- imdct512_window_delay_nol_sse (p_imdct->buf, data, window, delay);
+ imdct512_pre_ifft_twiddle_sse( pm128, p_imdct->buf, data,
+ p_imdct->xcos_sin_sse ); /* stage 1: pre-IFFT twiddle; pm128 is passed as the helper's pmt argument */
+ E_( fft_128p ) ( p_imdct->buf ); /* stage 2: fft_128p; only buf is passed, so presumably transformed in place - confirm */
+ imdct512_post_ifft_twiddle_sse( p_imdct->buf, p_imdct->xcos_sin_sse ); /* stage 3: post-IFFT twiddle with the same xcos_sin_sse table */
+ imdct512_window_delay_nol_sse( p_imdct->buf, data, window, delay ); /* stage 4: the "_nol" window/delay variant; NOTE(review): "nol" presumably means "no overlap" - confirm against the helper body */
}
static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
"movl $64, -4(%%ebp)\n"
".align 16\n"
-".loop:\n"
+"0:\n"
"movl (%%eax), %%esi\n"
"movl 4(%%eax), %%edi\n"
"movss (%%ecx, %%esi, 8), %%xmm1\n" /* 2j */
"movaps %%xmm0, -16(%%ebx)\n"
"decl -4(%%ebp)\n"
- "jnz .loop\n"
+ "jnz 0b\n"
"popl %%esi\n"
"popl %%edi\n"
"movl $32, %%ebx\n" /* loop counter */
".align 16\n"
-".loop1:\n"
+"0:\n"
"movaps (%%eax), %%xmm0\n" /* im1 | re1 | im0 | re0 */
"movaps (%%ecx), %%xmm2\n" /* -c | -s | -s | c */
"addl $64, %%ecx\n"
"addl $32, %%eax\n"
"decl %%ebx\n"
- "jnz .loop1\n"
+ "jnz 0b\n"
"popl %%ebx\n"
: "=a" (buf)
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
- "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movaps (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"movaps (%%ecx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"addps %%xmm5, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
- "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
+ "movaps 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"movaps 16(%%ecx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
"subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */
- "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movaps (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"movaps (%%ecx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
- "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
+ "movaps 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"addps %%xmm5, %%xmm0\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
- "movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movaps -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
"movss 24(%%esi), %%xmm7\n" /* re3 */
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n"
- "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
+ "movaps (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movaps %%xmm0, (%%ecx)\n"
"addl $32, %%esi\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
- "movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movaps -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
"movss 24(%%esi), %%xmm7\n" /* im3 */
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"mulps %%xmm4, %%xmm1\n"
- "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
+ "movaps (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movaps %%xmm1, (%%ecx)\n"
"addl $32, %%esi\n"
}
-static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
+static void imdct512_window_delay_nol_sse( complex_t *buf, float *data_ptr,
+ float *window_prt, float *delay_prt )
{
__asm__ __volatile__ (
".align 16\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
- "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movaps (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
- "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
+ "movaps 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
"addl $32, %%edx\n"
"movaps %%xmm0, (%%eax)\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */
- "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movaps (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
- "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
+ "movaps 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
"addl $32, %%esi\n"
"subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
"mulps %%xmm4, %%xmm6\n"
"leal 512(%%ebp), %%esi\n" /* buf[64].re */
"leal 508(%%ebp), %%edi\n" /* buf[63].im */
"movl $16, %%ebx\n" /* loop count */
- "addl $-1024, %%ecx\n" /* delay */
".align 16\n"
".first_128_delays:\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
- "movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movaps -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
"movss 24(%%esi), %%xmm7\n" /* re3 */
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n"
- "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
+ "movaps (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movaps %%xmm0, (%%ecx)\n"
"addl $32, %%esi\n"
"movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
- "movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movaps -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
"movss 24(%%esi), %%xmm7\n" /* im3 */
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"mulps %%xmm4, %%xmm1\n"
- "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
+ "movaps (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movaps %%xmm1, (%%ecx)\n"
"addl $32, %%esi\n"