1 /*****************************************************************************
2 * ac3_srfft_sse.c: accelerated SSE ac3 fft functions
3 *****************************************************************************
4 * Copyright (C) 1999, 2000, 2001 VideoLAN
5 * $Id: ac3_srfft_sse.c,v 1.8 2001/11/09 10:02:31 reno Exp $
7 * Authors: Renaud Dartus <reno@videolan.org>
8 * Aaron Holtzman <aholtzma@engr.uvic.ca>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
25 #define MODULE_NAME imdctsse
26 #include "modules_inner.h"
28 /*****************************************************************************
30 *****************************************************************************/
43 #include "ac3_imdct.h"
44 #include "ac3_srfft.h"
/* Constant vector { h, h, -h, -h } with h = 0.707106781188 (1/sqrt(2)).
 * fft_8_sse receives its address in ecx and loads it with movaps, so it must
 * sit on a 16-byte boundary (ATTR_ALIGN(16) presumably provides that; the
 * macro is defined outside this file). */
47 static float hsqrt2_sse[] ATTR_ALIGN(16) =
48 { 0.707106781188, 0.707106781188, -0.707106781188, -0.707106781188 };
/* Sign mask { -1, +1, -1, +1 }. fft_8_sse receives its address in edx and
 * multiplies v by it before swapping re/im inside each pair, which together
 * yields -i * v. */
50 static float C_1_sse[] ATTR_ALIGN(16) =
51 { -1.0, 1.0, -1.0, 1.0 };
/* Forward declarations of the file-local SSE kernels used by the public
 * fft_64p / fft_128p entry points below: two in-place small transforms and
 * the combine pass that merges already-transformed sub-blocks. */
59 static void fft_4_sse (complex_t *x);
60 static void fft_8_sse (complex_t *x);
61 static void fft_asmb_sse (ck_sse_t * ck, int k, complex_t *x, complex_t *wTB,
62 const complex_t *d, const complex_t *d_3);
/*
 * fft_64p: in-place transform over a[0..63], built from the 4- and 8-point
 * SSE kernels and successive fft_asmb_sse combine passes (k = 2, 4, 8).
 * The delta16/delta32/delta64 tables and their *_3 companions are defined
 * outside this file.
 * NOTE(review): `ck` is used below but its declaration/initialisation is not
 * present in this listing (original lines 65-68 are missing) - confirm there.
 * _M() comes from modules_inner.h; presumably it decorates the symbol with
 * MODULE_NAME.
 */
64 void _M( fft_64p ) ( complex_t *a )
/* a[0..15]: 8 + 4 + 4 points, combined with k = 2 */
69 fft_8_sse(&a[0]); fft_4_sse(&a[8]); fft_4_sse(&a[12]);
70 fft_asmb_sse(&ck, 2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
/* a[16..31]: two 8-point blocks (the comma operator still evaluates both
 * calls left to right), then combine a[0..31] with k = 4 */
72 fft_8_sse(&a[16]), fft_8_sse(&a[24]);
73 fft_asmb_sse(&ck, 4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
/* a[32..47]: 8 + 4 + 4 points, combined with k = 2 */
75 fft_8_sse(&a[32]); fft_4_sse(&a[40]); fft_4_sse(&a[44]);
76 fft_asmb_sse(&ck, 2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
/* a[48..63]: 8 + 4 + 4 points, combined with k = 2 */
78 fft_8_sse(&a[48]); fft_4_sse(&a[56]); fft_4_sse(&a[60]);
79 fft_asmb_sse(&ck, 2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
/* final combine of a[0..31] with a[32..63], k = 8 */
81 fft_asmb_sse(&ck, 8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
/*
 * fft_128p: in-place transform over a[0..127]. The first nine statements
 * are the same call sequence as fft_64p on a[0..63]; the rest builds
 * a[64..95] and a[96..127] and merges everything with a k = 16 combine pass.
 * The deltaNN / deltaNN_3 tables are defined outside this file.
 * NOTE(review): `ck` is used below but its declaration/initialisation is not
 * present in this listing (original lines 85-88 are missing) - confirm there.
 */
84 void _M( fft_128p ) ( complex_t *a )
/* a[0..15]: 8 + 4 + 4 points, combined with k = 2 */
89 fft_8_sse(&a[0]); fft_4_sse(&a[8]); fft_4_sse(&a[12]);
90 fft_asmb_sse(&ck, 2, &a[0], &a[8], &delta16[0], &delta16_3[0]);
/* a[16..31]: two 8-point blocks (comma operator, evaluated left to right),
 * then combine a[0..31] with k = 4 */
92 fft_8_sse(&a[16]), fft_8_sse(&a[24]);
93 fft_asmb_sse(&ck, 4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
/* a[32..47]: 8 + 4 + 4 points, combined with k = 2 */
95 fft_8_sse(&a[32]); fft_4_sse(&a[40]); fft_4_sse(&a[44]);
96 fft_asmb_sse(&ck, 2, &a[32], &a[40], &delta16[0], &delta16_3[0]);
/* a[48..63]: 8 + 4 + 4 points, combined with k = 2 */
98 fft_8_sse(&a[48]); fft_4_sse(&a[56]); fft_4_sse(&a[60]);
99 fft_asmb_sse(&ck, 2, &a[48], &a[56], &delta16[0], &delta16_3[0]);
/* combine a[0..31] with a[32..63], k = 8 */
101 fft_asmb_sse(&ck, 8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
/* a[64..79]: 8 + 4 + 4 points */
103 fft_8_sse(&a[64]); fft_4_sse(&a[72]); fft_4_sse(&a[76]);
104 /* fft_16(&a[64]); */
105 fft_asmb_sse(&ck, 2, &a[64], &a[72], &delta16[0], &delta16_3[0]);
/* a[80..95]: two 8-point blocks */
107 fft_8_sse(&a[80]); fft_8_sse(&a[88]);
109 /* fft_32(&a[64]); */
110 fft_asmb_sse(&ck, 4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
/* a[96..111]: 8 + 4 + 4 points */
112 fft_8_sse(&a[96]); fft_4_sse(&a[104]), fft_4_sse(&a[108]);
113 /* fft_16(&a[96]); */
114 fft_asmb_sse(&ck, 2, &a[96], &a[104], &delta16[0], &delta16_3[0]);
/* a[112..127]: two 8-point blocks */
116 fft_8_sse(&a[112]), fft_8_sse(&a[120]);
117 /* fft_32(&a[96]); */
118 fft_asmb_sse(&ck, 4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
120 /* fft_128(&a[0]); */
121 fft_asmb_sse(&ck, 16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
/*
 * fft_4_sse: in-place 4-point butterfly on x[0..3] (two floats, re then im,
 * per element as the 8-byte lane layout below shows).
 * The asm addresses x through eax and uses movaps, so x must be 16-byte
 * aligned. With s = x[1]+x[3] and t = x[1]-x[3] it stores
 *   x[0] = (x[0]+x[2]) + s      x[1] = (x[0]-x[2]) - i*t
 *   x[2] = (x[0]+x[2]) - s      x[3] = (x[0]-x[2]) + i*t
 * Lane comments read high | low.
 * NOTE(review): the operand/constraint lines that bind x to eax (original
 * lines 145-147) are not present in this listing.
 */
124 static void fft_4_sse (complex_t *x)
126 __asm__ __volatile__ (
128 "movaps (%%eax), %%xmm0\n" /* x[1] | x[0] */
129 "movaps 16(%%eax), %%xmm2\n" /* x[3] | x[2] */
130 "movaps %%xmm0, %%xmm1\n" /* x[1] | x[0] */
131 "addps %%xmm2, %%xmm0\n" /* x[1] + x[3] | x[0] + x[2] */
132 "subps %%xmm2, %%xmm1\n" /* x[1] - x[3] | x[0] - x[2] */
133 "xorps %%xmm6, %%xmm6\n" /* xmm6 = 0, used to negate a scalar below */
134 "movhlps %%xmm1, %%xmm4\n" /* ? | x[1] - x[3] */
135 "movhlps %%xmm0, %%xmm3\n" /* ? | x[1] + x[3] */
136 "subss %%xmm4, %%xmm6\n" /* 0 | 0 | 0 | -(x[1] - x[3]).re */
137 "movlhps %%xmm1, %%xmm0\n" /* x[0] - x[2] | x[0] + x[2] */
138 "movlhps %%xmm6, %%xmm4\n" /* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[1] - x[3]).re */
139 "movaps %%xmm0, %%xmm2\n" /* x[0] - x[2] | x[0] + x[2] */
140 "shufps $0x94, %%xmm4, %%xmm3\n" /* -i*(x[1] - x[3]) | x[1] + x[3]  (picks lanes 1 and 2 of xmm4: im, -re) */
141 "addps %%xmm3, %%xmm0\n" /* new x[1] | new x[0] */
142 "subps %%xmm3, %%xmm2\n" /* new x[3] | new x[2] */
143 "movaps %%xmm0, (%%eax)\n"
144 "movaps %%xmm2, 16(%%eax)\n"
/*
 * fft_8_sse: in-place 8-point transform on x[0..7].
 *
 * Inputs (see the constraint line at the end): eax = x, ecx = hsqrt2_sse,
 * edx = C_1_sse. Both constant tables and x[0], x[2], x[4], x[6] are accessed
 * with movaps, so all three addresses must be 16-byte aligned.
 *
 * Steps, lane comments reading high | low:
 *   1. even inputs:  yt = (x0-x4) - i*(x2-x6) | (x0+x4) + (x2+x6)
 *                    yb = (x0-x4) + i*(x2-x6) | (x0+x4) - (x2+x6)
 *   2. odd inputs, pre-multiplied by the 8th roots of unity:
 *                    u  = W1*(x1-x5) + W3'*(x3-x7) | (x1+x5) + (x3+x7)
 *                    v  = W1*(x1-x5) - W3'*(x3-x7) | (x1+x5) - (x3+x7)
 *      with W1 = (1-i)/sqrt2 and W3' = (-1-i)/sqrt2
 *   3. outputs:      x[0..1] = yt + u     x[4..5] = yt - u
 *                    x[2..3] = yb - i*v   x[6..7] = yb + i*v
 *
 * Fix: the copy of -(x2-x6).re into xmm4 must be a scalar movss. The previous
 * movaps replaced all four lanes of xmm4 with (0,0,0,-re), wiping the
 * (x2-x6).im lane that the following shufps $0x14 reads, so the real part of
 * the rotated term was always 0.
 *
 * NOTE(review): some original lines (braces, lines 152-153, 206-207) are not
 * present in this listing and are left as found.
 */
149 static void fft_8_sse (complex_t *x)
151 __asm__ __volatile__ (
154 "movlps (%%eax), %%xmm0\n" /* x[0] */
155 "movlps 32(%%eax), %%xmm1\n" /* x[4] */
156 "movhps 16(%%eax), %%xmm0\n" /* x[2] | x[0] */
157 "movhps 48(%%eax), %%xmm1\n" /* x[6] | x[4] */
158 "movaps %%xmm0, %%xmm2\n" /* x[2] | x[0] */
159 "xorps %%xmm3, %%xmm3\n" /* xmm3 = 0, used to negate a scalar below */
160 "addps %%xmm1, %%xmm0\n" /* x[2] + x[6] | x[0] + x[4] */
161 "subps %%xmm1, %%xmm2\n" /* x[2] - x[6] | x[0] - x[4] */
162 "movhlps %%xmm0, %%xmm5\n" /* ? | x[2] + x[6] */
163 "movhlps %%xmm2, %%xmm4\n" /* ? | x[2] - x[6] */
164 "movlhps %%xmm2, %%xmm0\n" /* x[0] - x[4] | x[0] + x[4] */
165 "subss %%xmm4, %%xmm3\n" /* 0 | 0 | 0 | -(x[2]-x[6]).re */
166 "movaps %%xmm0, %%xmm7\n" /* x[0] - x[4] | x[0] + x[4] */
/* scalar move: only lane 0 of xmm4 is replaced, lane 1 keeps (x[2]-x[6]).im */
167 "movss %%xmm3, %%xmm4\n" /* ? | ? | (x[2]-x[6]).im | -(x[2]-x[6]).re */
168 "movlps 8(%%eax), %%xmm1\n" /* x[1] */
169 "shufps $0x14, %%xmm4, %%xmm5\n" /* -i*(x[2] - x[6]) | x[2] + x[6]  (takes lanes 1 and 0 of xmm4: im, -re) */
171 "addps %%xmm5, %%xmm0\n" /* yt = -i*(x2-x6)+x0-x4 | x2+x6+x0+x4 */
172 "subps %%xmm5, %%xmm7\n" /* yb = i*(x2-x6)+x0-x4 | -x6-x2+x0+x4 */
174 "movhps 24(%%eax), %%xmm1\n" /* x[3] | x[1] */
175 "movlps 40(%%eax), %%xmm2\n" /* x[5] */
176 "movhps 56(%%eax), %%xmm2\n" /* x[7] | x[5] */
177 "movaps %%xmm1, %%xmm3\n" /* x[3] | x[1] */
178 "addps %%xmm2, %%xmm1\n" /* x[3] + x[7] | x[1] + x[5] */
179 "subps %%xmm2, %%xmm3\n" /* x[3] - x[7] | x[1] - x[5] */
180 "movaps (%%ecx), %%xmm4\n" /* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
181 "movaps %%xmm3, %%xmm6\n" /* x[3] - x[7] | x[1] - x[5] */
182 "mulps %%xmm4, %%xmm3\n" /* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */
183 "shufps $0xc8, %%xmm4, %%xmm4\n" /* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
184 "shufps $0xb1, %%xmm6, %%xmm6\n" /* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */
185 "mulps %%xmm4, %%xmm6\n" /* (x7-x3).re/s2|(x3-x7).im/s2|(x5-x1).re/s2|(x1-x5).im/s2 */
186 "addps %%xmm3, %%xmm6\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5]) */
187 "movhlps %%xmm1, %%xmm5\n" /* ? | x[3] + x[7] */
188 "movlhps %%xmm6, %%xmm1\n" /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
189 "shufps $0xe4, %%xmm6, %%xmm5\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
190 "movaps %%xmm1, %%xmm3\n" /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
191 "addps %%xmm5, %%xmm1\n" /* u */
192 "subps %%xmm5, %%xmm3\n" /* v */
193 "movaps %%xmm0, %%xmm2\n" /* yt */
194 "movaps %%xmm7, %%xmm4\n" /* yb */
195 "movaps (%%edx), %%xmm5\n" /* 1.0 | -1.0 | 1.0 | -1.0 (C_1_sse) */
196 "mulps %%xmm5, %%xmm3\n" /* v.im | -v.re in each pair */
197 "addps %%xmm1, %%xmm0\n" /* yt + u */
198 "subps %%xmm1, %%xmm2\n" /* yt - u */
199 "shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */
200 "movaps %%xmm0, (%%eax)\n" /* x[1] | x[0] */
201 "movaps %%xmm2, 32(%%eax)\n" /* x[5] | x[4] */
202 "addps %%xmm3, %%xmm4\n" /* yb - i*v */
203 "subps %%xmm3, %%xmm7\n" /* yb + i*v */
204 "movaps %%xmm4, 16(%%eax)\n" /* x[3] | x[2] */
205 "movaps %%xmm7, 48(%%eax)\n" /* x[7] | x[6] */
208 : "a" (x), "c" (hsqrt2_sse), "d" (C_1_sse));
/*
 * fft_asmb_sse: combine pass used by fft_64p / fft_128p to merge
 * already-transformed sub-blocks.
 *
 * Register bindings (constraint lines at the end): ecx = ck, eax = x,
 * edi = wTB, edx = d, esi = d_3; x and wTB are also declared as outputs in
 * eax and edi.
 *
 * What the visible code does:
 *   - reads two 32-bit words from *ck: word 0 is treated as k and shifted
 *     left by 4 to give a byte stride in ecx; word 1 is treated as the
 *     address of a 16-byte vector that the comments below describe as
 *     1.0 | -1.0 | 1.0 | -1.0. Both are parked at -4/-8(%ebp).
 *   - for two complex values at a time forms p = wT*d and q = wB*d3
 *     (wT at edi, wB at edi+ecx), u = p + q, v = p - q, and stores
 *       (eax)        <- x  + u        (eax+2*ecx) <- x  - u
 *       (eax+ecx)    <- xk - i*v      (eax+3*ecx) <- xk + i*v
 *     where xk is the pair loaded from (eax+ecx).
 *   - the first group leaves element 0 unmultiplied and only multiplies
 *     element 1; the second group multiplies both elements.
 * All vector loads/stores use movaps, so the addressed data must be 16-byte
 * aligned. Lane comments read high | low.
 *
 * NOTE(review): the C parameter `k` is not referenced in any line visible
 * here; k is taken from *ck instead.
 * NOTE(review): many original lines are absent from this listing (213-215,
 * 217-218, 220-229, 280-287, 351-367, ...), presumably stack setup, loop
 * control and epilogue. Do not assume the sequence below is complete.
 * NOTE(review): the constraint list ends after the inputs with no clobber
 * section, although ebx, ebp, xmm0-xmm7 and memory are written in the lines
 * shown. Whether ebx/ebp are saved and restored is in the missing lines -
 * confirm against the full source.
 */
211 static void fft_asmb_sse (ck_sse_t * ck, int k, complex_t *x, complex_t *wTB,
212 const complex_t *d, const complex_t *d_3)
216 __asm__ __volatile__ (
219 "movl %%esp, %%ebp\n" /* ebp = frame base for the two scratch slots below */
230 "movl 4(%%ecx), %%ebx\n" /* second word of *ck: address of the sign vector */
231 "movl %%ebx, -4(%%ebp)\n" /* park it so ebx can be reused */
232 "movl (%%ecx), %%ecx\n" /* first word of *ck */
234 "movl %%ecx, -8(%%ebp)\n" /* k */
237 "shll $4, %%ecx\n" /* 16k */
239 /* TRANSZERO and TRANS */
241 "movaps (%%eax), %%xmm0\n" /* x[1] | x[0] */
242 "movaps (%%edi), %%xmm1\n" /* wT[1] | wT[0] */
243 "movaps (%%edi, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */
/* NOTE(review): the two loads below read the complex value at the address
 * currently in edx / esi; the following comments label it d[1] / d3[1].
 * Whether edx/esi were advanced past element 0 is not visible here. */
244 "movlps (%%edx), %%xmm3\n" /* d */
245 "movlps (%%esi), %%xmm4\n" /* d3 */
246 "movhlps %%xmm1, %%xmm5\n" /* wT[1] */
247 "movhlps %%xmm2, %%xmm6\n" /* wB[1] */
248 "shufps $0x50, %%xmm3, %%xmm3\n" /* d[1].im | d[1].im | d[1].re | d[1].re */
249 "shufps $0x50, %%xmm4, %%xmm4\n" /* d3[1].im | d3[1].im | d3[1].re | d3[1].re */
250 "movlhps %%xmm5, %%xmm5\n" /* wT[1] | wT[1] */
251 "movlhps %%xmm6, %%xmm6\n" /* wB[1] | wB[1] */
252 "mulps %%xmm3, %%xmm5\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im | wT[1].im * d[1].re | wT[1].re * d[1].re */
253 "mulps %%xmm4, %%xmm6\n" /* same four products for wB[1] and d3[1] */
254 "movhlps %%xmm5, %%xmm7\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im */
255 "movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
256 "shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
257 "movl -4(%%ebp), %%ebx\n" /* reload the sign-vector address */
258 "movaps (%%ebx), %%xmm4\n" /* sign vector (see 1.0 | -1.0 | 1.0 | -1.0 comment further down) */
259 "mulps %%xmm4, %%xmm7\n" /* negate the im*im products */
260 "addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */
261 "movlhps %%xmm5, %%xmm1\n" /* d[1] * wT[1] | wT[0] */
262 "shufps $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */
263 "movaps %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */
264 "leal (%%eax, %%ecx, 2), %%ebx\n" /* ebx = eax + 2*ecx */
265 "addps %%xmm2, %%xmm1\n" /* u */
266 "subps %%xmm2, %%xmm3\n" /* v */
267 "mulps %%xmm4, %%xmm3\n" /* v.im | -v.re in each pair */
268 "movaps (%%eax, %%ecx), %%xmm5\n" /* xk[1] | xk[0] */
269 "shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */
270 "movaps %%xmm0, %%xmm2\n" /* x[1] | x[0] */
271 "movaps %%xmm5, %%xmm6\n" /* xk[1] | xk[0] */
272 "addps %%xmm1, %%xmm0\n" /* x + u */
273 "subps %%xmm1, %%xmm2\n" /* x - u */
274 "addps %%xmm3, %%xmm5\n" /* xk - i*v */
275 "subps %%xmm3, %%xmm6\n" /* xk + i*v */
276 "movaps %%xmm0, (%%eax)\n"
277 "movaps %%xmm2, (%%ebx)\n"
278 "movaps %%xmm5, (%%eax, %%ecx)\n"
279 "movaps %%xmm6, (%%ebx, %%ecx)\n"
/* NOTE(review): original lines 280-287 are not present in this listing.
 * The block below repeats the butterfly with twiddles applied to both
 * elements of the pair - presumably the body of a loop whose control and
 * pointer updates are in the missing lines. */
288 "movaps (%%edi), %%xmm0\n" /* wT[1] | wT[0] */
289 "movaps (%%edx), %%xmm1\n" /* d[1] | d[0] */
291 "movaps (%%edi, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */
292 "movaps (%%esi), %%xmm5\n" /* d3[1] | d3[0] */
294 "movhlps %%xmm0, %%xmm2\n" /* wT[1] */
295 "movhlps %%xmm1, %%xmm3\n" /* d[1] */
297 "movhlps %%xmm4, %%xmm6\n" /* wB[1] */
298 "movhlps %%xmm5, %%xmm7\n" /* d3[1] */
300 "shufps $0x50, %%xmm1, %%xmm1\n" /* d[0].im | d[0].im | d[0].re | d[0].re */
301 "shufps $0x50, %%xmm3, %%xmm3\n" /* d[1].im | d[1].im | d[1].re | d[1].re */
303 "movlhps %%xmm0, %%xmm0\n" /* wT[0] | wT[0] */
304 "shufps $0x50, %%xmm5, %%xmm5\n" /* d3[0].im | d3[0].im | d3[0].re | d3[0].re */
305 "movlhps %%xmm2, %%xmm2\n" /* wT[1] | wT[1] */
306 "shufps $0x50, %%xmm7, %%xmm7\n" /* d3[1].im | d3[1].im | d3[1].re | d3[1].re */
308 "mulps %%xmm1, %%xmm0\n" /* d[0].im * wT[0].im | d[0].im * wT[0].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
309 "mulps %%xmm3, %%xmm2\n" /* d[1].im * wT[1].im | d[1].im * wT[1].re | d[1].re * wT[1].im | d[1].re * wT[1].re */
310 "movlhps %%xmm4, %%xmm4\n" /* wB[0] | wB[0] */
311 "movlhps %%xmm6, %%xmm6\n" /* wB[1] | wB[1] */
313 "movhlps %%xmm0, %%xmm1\n" /* d[0].im * wT[0].im | d[0].im * wT[0].re */
314 "movlhps %%xmm2, %%xmm0\n" /* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
315 "mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
316 "mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
317 "shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
318 "movl -4(%%ebp), %%ebx\n" /* reload the sign-vector address */
319 "movaps (%%ebx), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
321 "movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
322 "mulps %%xmm3, %%xmm1\n" /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
323 "movlhps %%xmm6, %%xmm4\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
324 "addps %%xmm1, %%xmm0\n" /* wT[1] * d[1] | wT[0] * d[0] */
326 "shufps $0xb1, %%xmm6, %%xmm5\n" /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
327 "mulps %%xmm3, %%xmm5\n" /* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
328 "addps %%xmm5, %%xmm4\n" /* wB[1] * d3[1] | wB[0] * d3[0] */
330 "movaps %%xmm0, %%xmm1\n" /* wT[1] * d[1] | wT[0] * d[0] */
331 "addps %%xmm4, %%xmm0\n" /* u */
332 "subps %%xmm4, %%xmm1\n" /* v */
333 "movaps (%%eax), %%xmm6\n" /* x[1] | x[0] */
334 "leal (%%eax, %%ecx, 2), %%ebx\n" /* ebx = eax + 2*ecx */
335 "mulps %%xmm3, %%xmm1\n" /* v.im | -v.re in each pair */
338 "shufps $0xb1, %%xmm1, %%xmm1\n" /* -i * v */
339 "movaps (%%eax, %%ecx), %%xmm7\n" /* xk[1] | xk[0] */
340 "movaps %%xmm6, %%xmm2\n" /* x[1] | x[0] */
341 "movaps %%xmm7, %%xmm4\n" /* xk[1] | xk[0] */
342 "addps %%xmm0, %%xmm6\n" /* x + u */
343 "subps %%xmm0, %%xmm2\n" /* x - u */
344 "movaps %%xmm6, (%%eax)\n"
345 "movaps %%xmm2, (%%ebx)\n"
346 "addps %%xmm1, %%xmm7\n" /* xk - i*v */
347 "subps %%xmm1, %%xmm4\n" /* xk + i*v */
349 "movaps %%xmm7, (%%eax, %%ecx)\n"
350 "movaps %%xmm4, (%%ebx, %%ecx)\n"
/* outputs: x in eax, wTB in edi; inputs: ck in ecx, x, wTB, d in edx,
 * d_3 in esi. No clobber section follows (see NOTE(review) in the header). */
368 : "=a" (x), "=D" (wTB)
369 : "c" (ck), "a" (x), "D" (wTB), "d" (d), "S" (d_3) );