1 /*****************************************************************************
2 * ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
3 *****************************************************************************
4 * Copyright (C) 1999, 2000 VideoLAN
5 * $Id: ac3_imdct_3dn.c,v 1.6 2001/07/26 20:00:33 reno Exp $
7 * Authors: Renaud Dartus <reno@videolan.org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 *****************************************************************************/
24 #define MODULE_NAME imdct3dn
25 #include "modules_inner.h"
27 /*****************************************************************************
29 *****************************************************************************/
40 #include "ac3_imdct.h"
41 #include "ac3_imdct_common.h"
42 #include "ac3_retables.h"
/* Double-precision value of pi; imdct_init() below uses it when computing the
 * cosine/sine table entries. */
45 # define M_PI 3.14159265358979323846
/* FFT kernels declared here; no body for them appears in this part of the
 * file. fft_128p is the one invoked by the imdct_do_512 entry points below. */
48 void _M( fft_64p ) ( complex_t *x );
49 void _M( fft_128p ) ( complex_t *a );
/* File-local 3DNow! stages of the 512-point IMDCT. They are called, in this
 * order, by imdct_do_512 (pre-twiddle, post-twiddle, window_delay) and by
 * imdct_do_512_nol (pre-twiddle, post-twiddle, window_delay_nol). */
51 static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
52 static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse);
53 static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
54 static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
/*****************************************************************************
 * imdct_init: fill the interleaved cosine/sine twiddle table
 *****************************************************************************
 * For every i in [0, 128) four consecutive floats are written to
 * p_imdct->xcos_sin_sse, starting at index 4*i:
 *      c, -s, -s, -c
 * where c = cos(2*pi*(8*i+1)/(8*N)) * scale
 *   and s = sin(2*pi*(8*i+1)/(8*N)) * scale.
 * Indices 0..511 of the table are therefore written.
 * NOTE(review): N is not defined in this file; presumably it comes from one
 * of the included ac3_imdct headers -- confirm its value.
 *****************************************************************************/
57 void _M( imdct_init ) (imdct_t * p_imdct)
/* Gain folded into every table entry. 181.019 is roughly 128*sqrt(2), so
 * its square is roughly 32768. Both twiddle stages are passed this table by
 * imdct_do_512, so the gain is presumably applied twice on purpose
 * -- TODO confirm the intended overall scaling. */
60 float scale = 181.019;
62 for (i=0; i < 128; i++)
/* The angle is evaluated in double precision (M_PI is a double constant)
 * and the scaled result is then narrowed to float. */
64 float xcos_i = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
65 float xsin_i = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
/* Layout per entry: (c, -s) in the first 8 bytes, (-s, -c) in the next 8.
 * The assembler stages load these as two 64-bit quantities. */
66 p_imdct->xcos_sin_sse[i * 4] = xcos_i;
67 p_imdct->xcos_sin_sse[i * 4 + 1] = -xsin_i;
68 p_imdct->xcos_sin_sse[i * 4 + 2] = -xsin_i;
69 p_imdct->xcos_sin_sse[i * 4 + 3] = -xcos_i;
/*****************************************************************************
 * imdct_do_512: run the 512-point IMDCT pipeline with delay overlap
 *****************************************************************************
 * Runs four stages in sequence on p_imdct->buf:
 *   1. pre-IFFT twiddle, reading data[] through the pm128 index table,
 *   2. the 128-point complex FFT,
 *   3. post-IFFT twiddle,
 *   4. windowing; this variant adds the contents of delay[] to the samples
 *      written to data[] and then stores new values in delay[].
 * data[] is both the input and the output of the transform.
 * NOTE(review): pm128 and window are defined outside this file, presumably in
 * the included ac3_imdct/ac3_retables headers -- confirm.
 *****************************************************************************/
73 void _M( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[])
75 imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
76 _M( fft_128p ) (p_imdct->buf);
77 imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse);
78 imdct512_window_delay_3dn (p_imdct->buf, data, window, delay);
/*****************************************************************************
 * imdct_do_512_nol: run the 512-point IMDCT pipeline without delay overlap
 *****************************************************************************
 * Same first three stages as imdct_do_512 (pre-IFFT twiddle through pm128,
 * 128-point FFT, post-IFFT twiddle). The final stage is the "_nol" windowing
 * helper, which writes windowed samples to data[] without adding delay[] and
 * then stores new values in delay[].
 * NOTE(review): pm128 and window are defined outside this file -- confirm
 * which header provides them.
 *****************************************************************************/
81 void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
83 imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
84 _M( fft_128p ) (p_imdct->buf);
85 imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse);
86 imdct512_window_delay_nol_3dn (p_imdct->buf, data, window, delay);
/*****************************************************************************
 * imdct512_pre_ifft_twiddle_3dn: pre-IFFT twiddle stage (3DNow!)
 *****************************************************************************
 * Register bindings: %eax = pmt, %ecx = data, %edx = xcos_sin_sse,
 * %edi = buf. The loop counter in %ebx is initialised to 128.
 * Each pass fetches an index from the table at %eax into %esi, loads one
 * float from data[] and duplicates it into both halves of %mm1, loads two
 * 64-bit table entries, loads a second float relative to data+1020 bytes
 * (element 255) and duplicates it into %mm4, then forms
 *      %mm0 = %mm0 * %mm4 + %mm2 * %mm1
 * and stores the 64-bit result at -8(%edi).
 * Following the inline comments this is
 *      low  half = data[255-2j] *  c_j + data[2j] * -s_j
 *      high half = data[255-2j] * -s_j + data[2j] * -c_j
 * NOTE(review): how %esi is rescaled between the loads and how %eax/%edi
 * advance should be confirmed against the surrounding instructions.
 * NOTE(review): %ebx, %esi and %mm0-%mm4 are written here; confirm that they
 * are saved/restored or declared as clobbers.
 *****************************************************************************/
89 static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
91 __asm__ __volatile__ (
96 "movl $128, %%ebx\n" /* loop counter */
/* index j read through pmt */
100 "movl (%%eax), %%esi\n"
101 "movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
102 "punpckldq %%mm1, %%mm1\n" /* 2j | 2j */
/* two halves of a table entry: (c, -s) then (-s, -c), as laid out by
 * imdct_init */
106 "movq (%%edx, %%esi, 8), %%mm0\n" /* -s_j | c_j */
107 "movq 8(%%edx, %%esi, 8), %%mm2\n" /* -c_j | -s_j */
/* 1020 bytes = 255 floats past the start of data */
111 "movd 1020(%%ecx, %%esi, 4), %%mm4\n" /* 255-2j */
112 "punpckldq %%mm4, %%mm4\n" /* 255-2j | 255-2j */
115 "pfmul %%mm4, %%mm0\n" /* 255-2j * -s_j | 255-2j * c_j */
116 "pfmul %%mm1, %%mm2\n" /* 2j * -c_j | 2j * -s_j */
118 "pfadd %%mm2, %%mm0\n" /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */
/* one complex value (8 bytes) written just below the current %edi */
120 "movq %%mm0, -8(%%edi)\n"
129 : "a" (pmt), "c" (data), "d" (xcos_sin_sse), "D" (buf));
/*****************************************************************************
 * imdct512_post_ifft_twiddle_3dn: post-IFFT twiddle stage (3DNow!)
 *****************************************************************************
 * Register bindings: %eax = buf, %ecx = xcos_sin_sse. The loop counter in
 * %ebx is initialised to 64 and every pass handles two complex values
 * (at (%eax) and 8(%eax)) with two table entries (16 bytes each, at (%ecx)
 * and 16(%ecx)), updating buf in place:
 *      re' = -c * re - s * im
 *      im' = -s * re + c * im
 * where c and s are the (already scaled) table values written by
 * imdct_init, whose entries are laid out as (c, -s, -s, -c).
 * NOTE(review): the pointer increments and the counter decrement should be
 * confirmed against the surrounding instructions.
 * NOTE(review): %ebx and %mm0-%mm7 are written here; confirm that they are
 * saved/restored or declared as clobbers.
 *****************************************************************************/
132 static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
134 __asm__ __volatile__ (
137 "movl $64, %%ebx\n" /* loop counter */
/* broadcast the real and imaginary part of the first value */
141 "movq (%%eax), %%mm0\n" /* im0 | re0 */
142 "movq %%mm0, %%mm1\n" /* im0 | re0 */
143 "punpckldq %%mm0, %%mm0\n" /* re0 | re0 */
144 "punpckhdq %%mm1, %%mm1\n" /* im0 | im0 */
/* rearrange the table entry into the two multiplier pairs (-s | -c) and
 * (c | -s) */
146 "movq (%%ecx), %%mm2\n" /* -s | c */
147 "movq 8(%%ecx), %%mm3\n" /* -c | -s */
148 "movq %%mm3, %%mm4\n"
150 "punpckhdq %%mm2,%%mm3\n" /* -s | -c */
151 "punpckldq %%mm2,%%mm4\n" /* c | -s */
/* same broadcast for the second value */
153 "movq 8(%%eax), %%mm2\n" /* im1 | re1 */
154 "movq %%mm2, %%mm5\n" /* im1 | re1 */
155 "punpckldq %%mm2, %%mm2\n" /* re1 | re1 */
156 "punpckhdq %%mm5, %%mm5\n" /* im1 | im1 */
158 "pfmul %%mm3, %%mm0\n" /* -s * re0 | -c * re0 */
159 "pfmul %%mm4, %%mm1\n" /* c * im0 | -s * im0 */
/* second table entry, 16 bytes after the first; %mm4 is reused now that the
 * first product has consumed it */
161 "movq 16(%%ecx), %%mm6\n" /* -s1 | c1 */
162 "movq 24(%%ecx), %%mm7\n" /* -c1 | -s1 */
163 "movq %%mm7, %%mm4\n"
165 "punpckhdq %%mm6,%%mm7\n" /* -s1 | -c1 */
166 "punpckldq %%mm6,%%mm4\n" /* c1 | -s1 */
168 "pfmul %%mm7, %%mm2\n" /* -s1*re1 | -c1*re1 */
169 "pfmul %%mm4, %%mm5\n" /* c1*im1 | -s1*im1 */
171 "pfadd %%mm1, %%mm0\n" /* -s * re0 + c * im0 | -c * re0 - s * im0 */
172 "pfadd %%mm5, %%mm2\n" /* -s1 * re1 + c1 * im1 | -c1 * re1 - s1 * im1 */
/* write both rotated values back over the inputs */
174 "movq %%mm0, (%%eax)\n"
175 "movq %%mm2, 8(%%eax)\n"
184 : "a" (buf), "c" (xcos_sin_sse) );
/*****************************************************************************
 * imdct512_window_delay_3dn: window, overlap-add and delay update (3DNow!)
 *****************************************************************************
 * Register bindings: %esi = buf (copied to %ebp), %eax = data_ptr,
 * %ecx = delay_prt, %edx = window_prt. The code consists of four loops, each
 * with its counter initialised to 32 and each handling two buf elements from
 * the ascending pointer (%esi) and two from the descending pointer (%edi)
 * per pass:
 *   1. .first_128_samples  : imag parts from buf[64] upwards, real parts from
 *                            buf[63] downwards; stores
 *                            window * (-im, re) + delay to (%eax).
 *   2. .second_128_samples : real parts from buf[0] upwards, imag parts from
 *                            buf[127] downwards; stores
 *                            window * (-re, im) + delay to (%eax).
 *   3. .first_128_delay    : real parts from buf[64] upwards, imag parts from
 *                            buf[63] downwards; stores window * (-re, im)
 *                            to (%ecx), using the window values just below
 *                            %edx.
 *   4. .second_128_delay   : imag parts from buf[0] upwards, real parts from
 *                            buf[127] downwards; stores window * (im, -re)
 *                            to (%ecx), using the window values just below
 *                            %edx.
 * Between loops 2 and 3 %ecx is moved back by 1024 bytes (256 floats).
 * NOTE(review): the per-pass pointer increments and counter decrements should
 * be confirmed against the surrounding instructions.
 * NOTE(review): %ebx, %ebp, %edi and %mm0-%mm5 are written here; confirm that
 * they are saved/restored or declared as clobbers.
 * NOTE(review): the loop labels are ordinary assembler symbols, so this asm
 * statement cannot be emitted twice in one object (e.g. if the function is
 * inlined at several call sites); numeric local labels would avoid that.
 *****************************************************************************/
187 static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
189 __asm__ __volatile__ (
200 "movl %%esi, %%ebp\n" /* buf */
201 "movl $32, %%ebx\n" /* loop count */
202 "leal 516(%%ebp), %%esi\n" /* buf[64].im */
203 "leal 504(%%ebp), %%edi\n" /* buf[63].re */
/* Loop 1: data = window * (-im, re) + delay */
207 ".first_128_samples:\n"
208 "movd (%%esi), %%mm0\n" /* im0 */
209 "movd 8(%%esi), %%mm2\n" /* im1 */
210 "movd (%%edi), %%mm1\n" /* re0 */
211 "movd -8(%%edi), %%mm3\n" /* re1 */
/* negate by subtracting from zero */
213 "pxor %%mm4, %%mm4\n"
214 "pxor %%mm5, %%mm5\n"
215 "pfsub %%mm0, %%mm4\n" /* -im0 */
216 "pfsub %%mm2, %%mm5\n" /* -im1 */
218 "punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
219 "punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
221 "movq (%%edx), %%mm0\n" /* w1 | w0 */
222 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
223 "movq (%%ecx), %%mm2\n" /* d1 | d0 */
224 "movq 8(%%ecx), %%mm3\n" /* d3 | d2 */
226 "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
227 "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
229 "pfadd %%mm2, %%mm0\n" /* w1*re0+d1 | -w0*im0+d0 */
230 "pfadd %%mm3, %%mm1\n" /* w3*re1+d3 | -w2*im1+d2 */
/* four output samples per pass */
233 "movq %%mm0, (%%eax)\n"
234 "movq %%mm1, 8(%%eax)\n"
240 "jnz .first_128_samples\n"
242 "movl %%ebp, %%esi\n" /* buf[0].re */
243 "movl $32, %%ebx\n" /* loop count */
244 "leal 1020(%%ebp), %%edi\n" /* buf[127].im */
/* Loop 2: data = window * (-re, im) + delay */
247 ".second_128_samples:\n"
248 "movd (%%esi), %%mm0\n" /* buf[i].re */
249 "movd 8(%%esi), %%mm2\n" /* re1 */
250 "movd (%%edi), %%mm1\n" /* buf[127-i].im */
251 "movd -8(%%edi), %%mm3\n" /* im1 */
253 "pxor %%mm4, %%mm4\n"
254 "pxor %%mm5, %%mm5\n"
255 "pfsub %%mm0, %%mm4\n" /* -re0 */
256 "pfsub %%mm2, %%mm5\n" /* -re1 */
258 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
259 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
261 "movq (%%edx), %%mm0\n" /* w1 | w0 */
262 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
263 "movq (%%ecx), %%mm2\n" /* d1 | d0 */
264 "movq 8(%%ecx), %%mm3\n" /* d3 | d2 */
268 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
269 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
271 "pfadd %%mm2, %%mm0\n" /* w1*im0+d1 | -w0*re0+d0 */
272 "pfadd %%mm3, %%mm1\n" /* w3*im1+d3 | -w2*re1+d2 */
276 "movq %%mm0, (%%eax)\n"
277 "movq %%mm1, 8(%%eax)\n"
283 "jnz .second_128_samples\n"
285 "leal 512(%%ebp), %%esi\n" /* buf[64].re */
286 "leal 508(%%ebp), %%edi\n" /* buf[63].im */
287 "movl $32, %%ebx\n" /* loop count */
/* step %ecx back by 256 floats before the delay line is rewritten */
288 "addl $-1024, %%ecx\n" /* delay */
/* Loop 3: delay = window * (-re, im), no accumulation */
291 ".first_128_delay:\n"
292 "movd (%%esi), %%mm0\n" /* re0 */
293 "movd 8(%%esi), %%mm2\n" /* re1 */
294 "movd (%%edi), %%mm1\n" /* im0 */
295 "movd -8(%%edi), %%mm3\n" /* im1 */
297 "pxor %%mm4, %%mm4\n"
298 "pxor %%mm5, %%mm5\n"
299 "pfsub %%mm0, %%mm4\n" /* -re0 */
300 "pfsub %%mm2, %%mm5\n" /* -re1 */
302 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
303 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
/* window values are now taken from just below %edx */
306 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
307 "movq -8(%%edx), %%mm0\n" /* w1 | w0 */
311 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
312 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
314 "movq %%mm0, (%%ecx)\n"
315 "movq %%mm1, 8(%%ecx)\n"
320 "jnz .first_128_delay\n"
322 "leal 4(%%ebp), %%esi\n" /* buf[0].im */
323 "leal 1016(%%ebp), %%edi\n" /* buf[127].re */
324 "movl $32, %%ebx\n" /* loop count */
/* Loop 4: delay = window * (im, -re), no accumulation */
327 ".second_128_delay:\n"
328 "movd (%%esi), %%mm0\n" /* im0 */
329 "movd 8(%%esi), %%mm2\n" /* im1 */
330 "movd (%%edi), %%mm1\n" /* re0 */
331 "movd -8(%%edi), %%mm3\n" /* re1 */
333 "pxor %%mm4, %%mm4\n"
334 "pxor %%mm5, %%mm5\n"
335 "pfsub %%mm1, %%mm4\n" /* -re0 */
336 "pfsub %%mm3, %%mm5\n" /* -re1 */
338 "punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
339 "punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
342 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
343 "movq -8(%%edx), %%mm3\n" /* w1 | w0 */
347 "pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
348 "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
351 "movq %%mm1, (%%ecx)\n"
352 "movq %%mm3, 8(%%ecx)\n"
357 "jnz .second_128_delay\n"
/* Each pointer is listed as an output in the same register it enters in,
 * which tells the compiler that %esi, %eax, %ecx and %edx are modified. */
368 : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt)
369 : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt));
/*****************************************************************************
 * imdct512_window_delay_nol_3dn: window and delay update, no overlap (3DNow!)
 *****************************************************************************
 * Same register bindings and the same four-loop structure as
 * imdct512_window_delay_3dn (%esi = buf copied to %ebp, %eax = data_ptr,
 * %ecx = delay_prt, %edx = window_prt; every loop counter starts at 32).
 * The difference is in the two sample loops: the windowed products are
 * stored to (%eax) directly, without loading or adding values from (%ecx).
 *   1. .first_128_samples2  : stores window * (-im, re) to (%eax).
 *   2. .second_128_samples2 : stores window * (-re, im) to (%eax).
 *   3. .first_128_delays    : stores window * (-re, im) to (%ecx).
 *   4. .second_128_delays   : stores window * (im, -re) to (%ecx).
 * %ecx is still moved back by 1024 bytes before loop 3.
 * NOTE(review): that rewind presumes %ecx was advanced during loops 1 and 2
 * even though they do not read it -- confirm against the surrounding
 * instructions, together with the other pointer increments.
 * NOTE(review): %ebx, %ebp, %edi and %mm0-%mm5 are written here; confirm that
 * they are saved/restored or declared as clobbers.
 * NOTE(review): the loop labels are ordinary assembler symbols, so this asm
 * statement cannot be emitted twice in one object (e.g. if the function is
 * inlined at several call sites); numeric local labels would avoid that.
 *****************************************************************************/
372 static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
374 __asm__ __volatile__ (
385 "movl %%esi, %%ebp\n" /* buf */
386 "movl $32, %%ebx\n" /* loop count */
387 "leal 516(%%ebp), %%esi\n" /* buf[64].im */
388 "leal 504(%%ebp), %%edi\n" /* buf[63].re */
/* Loop 1: data = window * (-im, re) */
391 ".first_128_samples2:\n"
392 "movd (%%esi), %%mm0\n" /* im0 */
393 "movd 8(%%esi), %%mm2\n" /* im1 */
394 "movd (%%edi), %%mm1\n" /* re0 */
395 "movd -8(%%edi), %%mm3\n" /* re1 */
/* negate by subtracting from zero */
397 "pxor %%mm4, %%mm4\n"
398 "pxor %%mm5, %%mm5\n"
399 "pfsub %%mm0, %%mm4\n" /* -im0 */
400 "pfsub %%mm2, %%mm5\n" /* -im1 */
402 "punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
403 "punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
405 "movq (%%edx), %%mm0\n" /* w1 | w0 */
406 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
408 "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
409 "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
/* products are stored as they are; nothing from the delay line is added */
412 "movq %%mm0, (%%eax)\n"
413 "movq %%mm1, 8(%%eax)\n"
419 "jnz .first_128_samples2\n"
421 "movl %%ebp, %%esi\n" /* buf[0].re */
422 "movl $32, %%ebx\n" /* loop count */
423 "leal 1020(%%ebp), %%edi\n" /* buf[127].im */
/* Loop 2: data = window * (-re, im) */
426 ".second_128_samples2:\n"
427 "movd (%%esi), %%mm0\n" /* buf[i].re */
428 "movd 8(%%esi), %%mm2\n" /* re1 */
429 "movd (%%edi), %%mm1\n" /* buf[127-i].im */
430 "movd -8(%%edi), %%mm3\n" /* im1 */
432 "pxor %%mm4, %%mm4\n"
433 "pxor %%mm5, %%mm5\n"
434 "pfsub %%mm0, %%mm4\n" /* -re0 */
435 "pfsub %%mm2, %%mm5\n" /* -re1 */
437 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
438 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
440 "movq (%%edx), %%mm0\n" /* w1 | w0 */
441 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
445 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
446 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
450 "movq %%mm0, (%%eax)\n"
451 "movq %%mm1, 8(%%eax)\n"
457 "jnz .second_128_samples2\n"
459 "leal 512(%%ebp), %%esi\n" /* buf[64].re */
460 "leal 508(%%ebp), %%edi\n" /* buf[63].im */
461 "movl $32, %%ebx\n" /* loop count */
/* step %ecx back by 256 floats before the delay line is rewritten */
462 "addl $-1024, %%ecx\n" /* delay */
/* Loop 3: delay = window * (-re, im) */
465 ".first_128_delays:\n"
466 "movd (%%esi), %%mm0\n" /* re0 */
467 "movd 8(%%esi), %%mm2\n" /* re1 */
468 "movd (%%edi), %%mm1\n" /* im0 */
469 "movd -8(%%edi), %%mm3\n" /* im1 */
471 "pxor %%mm4, %%mm4\n"
472 "pxor %%mm5, %%mm5\n"
473 "pfsub %%mm0, %%mm4\n" /* -re0 */
474 "pfsub %%mm2, %%mm5\n" /* -re1 */
476 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
477 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
/* window values are now taken from just below %edx */
480 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
481 "movq -8(%%edx), %%mm0\n" /* w1 | w0 */
485 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
486 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
489 "movq %%mm0, (%%ecx)\n"
490 "movq %%mm1, 8(%%ecx)\n"
495 "jnz .first_128_delays\n"
497 "leal 4(%%ebp), %%esi\n" /* buf[0].im */
498 "leal 1016(%%ebp), %%edi\n" /* buf[127].re */
499 "movl $32, %%ebx\n" /* loop count */
/* Loop 4: delay = window * (im, -re) */
502 ".second_128_delays:\n"
503 "movd (%%esi), %%mm0\n" /* im0 */
504 "movd 8(%%esi), %%mm2\n" /* im1 */
505 "movd (%%edi), %%mm1\n" /* re0 */
506 "movd -8(%%edi), %%mm3\n" /* re1 */
508 "pxor %%mm4, %%mm4\n"
509 "pxor %%mm5, %%mm5\n"
510 "pfsub %%mm1, %%mm4\n" /* -re0 */
511 "pfsub %%mm3, %%mm5\n" /* -re1 */
513 "punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
514 "punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
517 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
518 "movq -8(%%edx), %%mm3\n" /* w1 | w0 */
522 "pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
523 "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
526 "movq %%mm1, (%%ecx)\n"
527 "movq %%mm3, 8(%%ecx)\n"
532 "jnz .second_128_delays\n"
/* Each pointer is listed as an output in the same register it enters in,
 * which tells the compiler that %esi, %eax, %ecx and %edx are modified. */
543 : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt)
544 : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt));