/*****************************************************************************
* ac3_srfft_sse.c: accelerated SSE ac3 fft functions
*****************************************************************************
/*****************************************************************************
* ac3_srfft_sse.c: accelerated SSE ac3 fft functions
*****************************************************************************
- * Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_srfft_sse.c,v 1.6 2001/10/30 19:34:53 reno Exp $
+ * Copyright (C) 1999-2001 VideoLAN
+ * $Id: ac3_srfft_sse.c,v 1.13 2002/06/01 12:31:59 sam Exp $
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
/*****************************************************************************
* Preamble
*****************************************************************************/
/*****************************************************************************
* Preamble
*****************************************************************************/
+
+static float hsqrt2_sse[] ATTR_ALIGN(16) = /* four-float constant; 0.707106781188 is approximately sqrt(2)/2 */
+ { 0.707106781188, 0.707106781188, -0.707106781188, -0.707106781188 }; /* element order in memory: +h, +h, -h, -h. NOTE(review): ATTR_ALIGN is defined outside this file view; presumably requests 16-byte alignment for aligned SSE loads - confirm */
+
+static float C_1_sse[] ATTR_ALIGN(16) = /* four-float alternating sign table */
+ { -1.0, 1.0, -1.0, 1.0 }; /* element order in memory: -1, +1, -1, +1; the asm comments that read "1.0 | -1.0 | 1.0 | -1.0" list the same pattern from the highest element down. NOTE(review): presumably the table loaded through (%ebx) in fft_asmb_sse - confirm */
+
+typedef struct { /* argument block passed to the inline asm of fft_asmb_sse in %ecx (input constraint "c" (ck)) */
+ int k; /* fetched by the asm with "movl (%%ecx), %%ecx", i.e. at offset 0: must remain the first member */
+ void * C1; /* fetched by the asm with "movl 4(%%ecx), %%ebx", i.e. at offset 4. NOTE(review): that offset assumes a 4-byte int with no padding (ia32) - confirm; presumably points at a 16-byte constant table such as C_1_sse, initialisation is not visible here */
+ } ck_sse_t; /* do not reorder or resize members without updating the hard-coded offsets in the asm */
+
+
-static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
+static void fft_asmb_sse (ck_sse_t * ck, int k, complex_t *x, complex_t *wTB,
const complex_t *d, const complex_t *d_3);
void _M( fft_64p ) ( complex_t *a )
{
const complex_t *d, const complex_t *d_3);
void _M( fft_64p ) ( complex_t *a )
{
- fft_asmb_sse(2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
+ fft_asmb_sse(&ck, 2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
- fft_asmb_sse(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
+ fft_asmb_sse(&ck, 4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
- fft_asmb_sse(2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
+ fft_asmb_sse(&ck, 2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
- fft_asmb_sse(2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
+ fft_asmb_sse(&ck, 2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
- fft_asmb_sse(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
+ fft_asmb_sse(&ck, 8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
- fft_asmb_sse(2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
+ fft_asmb_sse(&ck, 2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
- fft_asmb_sse(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
+ fft_asmb_sse(&ck, 4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
- fft_asmb_sse(2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
+ fft_asmb_sse(&ck, 2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
- fft_asmb_sse(2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
+ fft_asmb_sse(&ck, 2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
- fft_asmb_sse(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
+ fft_asmb_sse(&ck, 8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
- fft_asmb_sse(2, &a[64], &a[72], &delta16[0], &delta16_3[0]);
+ fft_asmb_sse(&ck, 2, &a[64], &a[72], &delta16[0], &delta16_3[0]);
- fft_asmb_sse(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
+ fft_asmb_sse(&ck, 4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
- fft_asmb_sse(2, &a[96], &a[104], &delta16[0], &delta16_3[0]);
+ fft_asmb_sse(&ck, 2, &a[96], &a[104], &delta16[0], &delta16_3[0]);
- fft_asmb_sse(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
+ fft_asmb_sse(&ck, 4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
- fft_asmb_sse(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
+ fft_asmb_sse(&ck, 16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
-static float hsqrt2_sse[] ATTR_ALIGN(16) =
- { 0.707106781188, 0.707106781188, -0.707106781188, -0.707106781188 };
-
-static float C_1_sse[] ATTR_ALIGN(16) =
- { -1.0, 1.0, -1.0, 1.0 };
-
-static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
+static void fft_asmb_sse (ck_sse_t * ck, int k, complex_t *x, complex_t *wTB,
- "movl %%ecx, -4(%%ebp)\n" /* k */
- "shll $4, %%ecx\n" /* 16k */ ///
+ "movl 4(%%ecx), %%ebx\n"
+ "movl %%ebx, -4(%%ebp)\n"
+ "movl (%%ecx), %%ecx\n"
+
+ "movl %%ecx, -8(%%ebp)\n" /* k */
"movhlps %%xmm5, %%xmm7\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im */
"movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
"shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
"movhlps %%xmm5, %%xmm7\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im */
"movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
"shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
"movaps (%%ebx), %%xmm4\n"
"mulps %%xmm4, %%xmm7\n"
"addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */
"movlhps %%xmm5, %%xmm1\n" /* d[1] * wT[1] | wT[0] */
"shufps $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */
"movaps %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */
"movaps (%%ebx), %%xmm4\n"
"mulps %%xmm4, %%xmm7\n"
"addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */
"movlhps %%xmm5, %%xmm1\n" /* d[1] * wT[1] | wT[0] */
"shufps $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */
"movaps %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */
- "leal (%%eax, %%ecx, 2), %%esp\n"
+ "leal (%%eax, %%ecx, 2), %%ebx\n"
"addps %%xmm2, %%xmm1\n" /* u */
"subps %%xmm2, %%xmm3\n" /* v */
"mulps %%xmm4, %%xmm3\n"
"addps %%xmm2, %%xmm1\n" /* u */
"subps %%xmm2, %%xmm3\n" /* v */
"mulps %%xmm4, %%xmm3\n"
"movaps (%%edi), %%xmm0\n" /* wT[1] | wT[0] */
"movaps (%%edx), %%xmm1\n" /* d[1] | d[0] */
"movaps (%%edi), %%xmm0\n" /* wT[1] | wT[0] */
"movaps (%%edx), %%xmm1\n" /* d[1] | d[0] */
"mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
"mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
"shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
"mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
"shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"movaps (%%ebx), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
"movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
"movaps (%%ebx), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
"movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
"addps %%xmm4, %%xmm0\n" /* u */
"subps %%xmm4, %%xmm1\n" /* v */
"movaps (%%eax), %%xmm6\n" /* x[1] | x[0] */
"addps %%xmm4, %%xmm0\n" /* u */
"subps %%xmm4, %%xmm1\n" /* v */
"movaps (%%eax), %%xmm6\n" /* x[1] | x[0] */
- "leal (%%eax, %%ecx, 2), %%esp\n"
+ "leal (%%eax, %%ecx, 2), %%ebx\n"
- "popl %%esp\n"
- : "=c" (k), "=a" (x), "=D" (wTB)
- : "c" (k), "a" (x), "D" (wTB), "d" (d), "S" (d_3), "b" (C_1_sse) );
+ : "=a" (x), "=D" (wTB)
+ : "c" (ck), "a" (x), "D" (wTB), "d" (d), "S" (d_3) );