1 /*****************************************************************************
2 * ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
3 *****************************************************************************
4 * Copyright (C) 1999, 2000 VideoLAN
5 * $Id: ac3_imdct_3dn.c,v 1.2 2001/05/28 02:38:48 sam Exp $
7 * Authors: Renaud Dartus <reno@videolan.org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 *****************************************************************************/
24 #define MODULE_NAME imdct3dn
25 #include "modules_inner.h"
27 /*****************************************************************************
29 *****************************************************************************/
40 #include "ac3_imdct.h"
41 #include "ac3_imdct_common.h"
42 #include "ac3_retables.h"
45 # define M_PI 3.14159265358979323846
/* FFT kernels declared for this module. Of the two, only fft_128p is called
 * in the code shown in this file. */
48 void _M( fft_64p ) ( complex_t *x );
49 void _M( fft_128p ) ( complex_t *a );
/* File-local 3D Now! stages of the 512-point IMDCT; each is defined below and
 * called from imdct_do_512 / imdct_do_512_nol. */
51 static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
52 static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse);
53 static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
54 static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
57 void _M( imdct_init ) (imdct_t * p_imdct)
60 float scale = 181.019;
62 fprintf(stderr,"imct_init\n");
63 for (i=0; i < 128; i++)
65 float xcos_i = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
66 float xsin_i = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
67 p_imdct->xcos_sin_sse[i * 4] = xcos_i;
68 p_imdct->xcos_sin_sse[i * 4 + 1] = -xsin_i;
69 p_imdct->xcos_sin_sse[i * 4 + 2] = -xsin_i;
70 p_imdct->xcos_sin_sse[i * 4 + 3] = -xcos_i;
72 fprintf(stderr,"done imct_init\n");
/* imdct_do_512: run the 512-point IMDCT pipeline on `data`.
 * The stages, in order: pre-IFFT twiddle into p_imdct->buf (indexed through
 * pm128), the fft_128p kernel on that buffer, post-IFFT twiddle in place, and
 * finally the window stage that receives data, window and delay. */
75 void _M( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[])
77 imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
78 _M( fft_128p ) (p_imdct->buf);
79 imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse);
80 imdct512_window_delay_3dn (p_imdct->buf, data, window, delay);
/* imdct_do_512_nol: same pipeline as imdct_do_512 (pre-IFFT twiddle,
 * fft_128p, post-IFFT twiddle), except that the last stage is
 * imdct512_window_delay_nol_3dn, whose visible output loops do not add the
 * previous contents of `delay` to the windowed samples. */
83 void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
85 imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
86 _M( fft_128p ) (p_imdct->buf);
87 imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse);
88 imdct512_window_delay_nol_3dn (p_imdct->buf, data, window, delay);
/* imdct512_pre_ifft_twiddle_3dn: pre-IFFT twiddle stage in 3D Now! inline
 * assembly.
 *
 * In the lines shown, the asm does not receive the C arguments as operands:
 * it reads them from the stack frame at 8/12/16/20(%ebp) (pmt, buf, data,
 * xcos_sin_sse) and keeps a loop counter, initialised to 128, in a stack slot
 * at -4(%ebp).
 * NOTE(review): addressing arguments through %ebp presumes a conventional
 * frame-pointer layout for this function -- confirm the build guarantees it
 * (no inlining, no frame-pointer omission). */
91 static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
93 __asm__ __volatile__ (
96 "addl $-4, %%esp\n" /* local variable, loop counter */
105 "movl 8(%%ebp), %%eax\n" /* pmt */
106 "movl 12(%%ebp), %%ebx\n" /* buf */
107 "movl 16(%%ebp), %%ecx\n" /* data */
108 "movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
109 "movl $128, -4(%%ebp)\n"
/* Body: take an index from pmt, broadcast two input samples across both
 * halves of an MMX register, multiply each by one twiddle pair, add the two
 * products and store one 64-bit (complex) result. */
112 "movl (%%eax), %%esi\n"
113 "movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
114 "punpckldq %%mm1, %%mm1\n" /* 2j | 2j */
118 "movq (%%edx, %%esi, 8), %%mm0\n" /* -s_j | c_j */
119 "movq 8(%%edx, %%esi, 8), %%mm2\n" /* -c_j | -s_j */
123 "movd 1020(%%ecx, %%esi, 4), %%mm4\n" /* 255-2j */
124 "punpckldq %%mm4, %%mm4\n" /* 255-2j | 255-2j */
127 "pfmul %%mm4, %%mm0\n" /* 255-2j * -s_j | 255-2j * c_j */
128 "pfmul %%mm1, %%mm2\n" /* 2j * -c_j | 2j * -s_j */
130 "pfadd %%mm2, %%mm0\n" /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */
/* The result goes 8 bytes below the current %ebx; presumably %ebx has already
 * been advanced past this slot -- TODO confirm against the full loop. */
132 "movq %%mm0, -8(%%ebx)\n"
/* imdct512_post_ifft_twiddle_3dn: post-IFFT twiddle stage in 3D Now! inline
 * assembly, operating on buf in place.
 *
 * buf is bound to %eax and xcos_sin_sse to %ecx by the "a" and "c"
 * constraints; %ebx is loaded with a loop count of 64. The body shown handles
 * two complex values per pass (offsets 0 and 8 from %eax) using the twiddle
 * entries at offsets 0..31 from %ecx, and writes both results back to the
 * same two slots. */
149 static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
151 __asm__ __volatile__ (
153 "movl $64, %%ebx\n" /* loop counter */
/* First complex value: split into broadcast real and imaginary parts. */
156 "movq (%%eax), %%mm0\n" /* im0 | re0 */
157 "movq %%mm0, %%mm1\n" /* im0 | re0 */
158 "punpckldq %%mm0, %%mm0\n" /* re0 | re0 */
159 "punpckhdq %%mm1, %%mm1\n" /* im0 | im0 */
/* Rearrange the stored twiddle pairs into the order needed for the
 * real-part and imaginary-part multiplies. */
161 "movq (%%ecx), %%mm2\n" /* -s | c */
162 "movq 8(%%ecx), %%mm3\n" /* -c | -s */
163 "movq %%mm3, %%mm4\n"
165 "punpckhdq %%mm2,%%mm3\n" /* -s | -c */
166 "punpckldq %%mm2,%%mm4\n" /* c | -s */
/* Second complex value, same split. */
168 "movq 8(%%eax), %%mm2\n" /* im1 | re1 */
169 "movq %%mm2, %%mm5\n" /* im1 | re1 */
170 "punpckldq %%mm2, %%mm2\n" /* re1 | re1 */
171 "punpckhdq %%mm5, %%mm5\n" /* im1 | im1 */
173 "pfmul %%mm3, %%mm0\n" /* -s * re0 | -c * re0 */
174 "pfmul %%mm4, %%mm1\n" /* c * im0 | -s * im0 */
/* Twiddle pairs for the second value (next 16 bytes of the table). */
176 "movq 16(%%ecx), %%mm6\n" /* -s1 | c1 */
177 "movq 24(%%ecx), %%mm7\n" /* -c1 | -s1 */
178 "movq %%mm7, %%mm4\n"
180 "punpckhdq %%mm6,%%mm7\n" /* -s1 | -c1 */
181 "punpckldq %%mm6,%%mm4\n" /* c1 | -s1 */
183 "pfmul %%mm7, %%mm2\n" /* -s1*re1 | -c1*re1 */
184 "pfmul %%mm4, %%mm5\n" /* c1*im1 | -s1*im1 */
186 "pfadd %%mm1, %%mm0\n" /* -s * re0 + c * im0 | -c * re0 - s * im0 */
187 "pfadd %%mm5, %%mm2\n" /* -s1 * re1 + c1 * im1 | -c1 * re1 - s1 * im1 */
/* Write both results back over the inputs. */
189 "movq %%mm0, (%%eax)\n"
190 "movq %%mm2, 8(%%eax)\n"
199 : "a" (buf), "c" (xcos_sin_sse) );
/* imdct512_window_delay_3dn: windowing / overlap stage in 3D Now! inline
 * assembly.
 *
 * Arguments are read from the stack frame: 8(%ebp)=buf, 12(%ebp)=data,
 * 16(%ebp)=window, 20(%ebp)=delay. The code shown consists of four loops,
 * each started with a count of 32 in %ecx and each producing four floats per
 * pass in the visible body:
 *   .first_128_samples / .second_128_samples
 *       store  window * (+/- buf component) + delay  through %eax (data);
 *   .first_128_delay / .second_128_delay
 *       store  window * (+/- buf component)  through %eax (delay), reading
 *       the window at negative offsets from %edx.
 * NOTE(review): %ebp is set from %esp at the top, so the argument offsets
 * depend on what was pushed before it -- confirm against the full prologue.
 * NOTE(review): the loop labels are plain assembler symbols; if the compiler
 * ever emits this asm more than once the labels would presumably collide --
 * confirm the function cannot be duplicated or inlined. */
202 static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
204 __asm__ __volatile__ (
206 "movl %%esp, %%ebp\n"
215 "movl 20(%%ebp), %%ebx\n" /* delay */
216 "movl 16(%%ebp), %%edx\n" /* window */
218 "movl 8(%%ebp), %%eax\n" /* buf */
219 "movl $32, %%ecx\n" /* loop count */
220 "leal 516(%%eax), %%esi\n" /* buf[64].im */
221 "leal 504(%%eax), %%edi\n" /* buf[63].re */
/* %eax is reused: from here on it is the output (data) pointer. */
222 "movl 12(%%ebp), %%eax\n" /* data */
/* Loop 1: -im read upward from buf[64], re read downward from buf[63];
 * windowed, added to delay, stored to data. */
224 ".first_128_samples:\n"
225 "movd (%%esi), %%mm0\n" /* im0 */
226 "movd 8(%%esi), %%mm2\n" /* im1 */
227 "movd (%%edi), %%mm1\n" /* re0 */
228 "movd -8(%%edi), %%mm3\n" /* re1 */
/* Negate by subtracting from a zeroed register. */
230 "pxor %%mm4, %%mm4\n"
231 "pxor %%mm5, %%mm5\n"
232 "pfsub %%mm0, %%mm4\n" /* -im0 */
233 "pfsub %%mm2, %%mm5\n" /* -im1 */
235 "punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
236 "punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
238 "movq (%%edx), %%mm0\n" /* w1 | w0 */
239 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
240 "movq (%%ebx), %%mm2\n" /* d1 | d0 */
241 "movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
243 "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
244 "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
246 "pfadd %%mm2, %%mm0\n" /* w1*re0+d1 | -w0*im0+d0 */
247 "pfadd %%mm3, %%mm1\n" /* w3*re1+d3 | -w2*im1+d2 */
250 "movq %%mm0, (%%eax)\n"
251 "movq %%mm1, 8(%%eax)\n"
257 "jnz .first_128_samples\n"
/* Loop 2: -re read upward from buf[0], im read downward from buf[127];
 * %eax, %edx and %ebx are not reloaded in the lines shown. */
259 "movl 8(%%ebp), %%esi\n" /* buf[0].re */
260 "leal 1020(%%esi), %%edi\n" /* buf[127].im */
261 "movl $32, %%ecx\n" /* loop count */
263 ".second_128_samples:\n"
264 "movd (%%esi), %%mm0\n" /* buf[i].re */
265 "movd 8(%%esi), %%mm2\n" /* re1 */
266 "movd (%%edi), %%mm1\n" /* buf[127-i].im */
267 "movd -8(%%edi), %%mm3\n" /* im1 */
269 "pxor %%mm4, %%mm4\n"
270 "pxor %%mm5, %%mm5\n"
271 "pfsub %%mm0, %%mm4\n" /* -re0 */
272 "pfsub %%mm2, %%mm5\n" /* -re1 */
274 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
275 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
277 "movq (%%edx), %%mm0\n" /* w1 | w0 */
278 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
279 "movq (%%ebx), %%mm2\n" /* d1 | d0 */
280 "movq 8(%%ebx), %%mm3\n" /* d3 | d2 */
284 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
285 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
287 "pfadd %%mm2, %%mm0\n" /* w1*im0+d1 | -w0*re0+d0 */
288 "pfadd %%mm3, %%mm1\n" /* w3*im1+d3 | -w2*re1+d2 */
292 "movq %%mm0, (%%eax)\n"
293 "movq %%mm1, 8(%%eax)\n"
299 "jnz .second_128_samples\n"
/* Loop 3: -re read upward from buf[64], im read downward from buf[63];
 * %eax now points at delay, and no delay term is added before the store. */
301 "movl 8(%%ebp), %%eax\n"
302 "leal 512(%%eax), %%esi\n" /* buf[64].re */
303 "leal 508(%%eax), %%edi\n" /* buf[63].im */
304 "movl $32, %%ecx\n" /* loop count */
305 "movl 20(%%ebp), %%eax\n" /* delay */
307 ".first_128_delay:\n"
308 "movd (%%esi), %%mm0\n" /* re0 */
309 "movd 8(%%esi), %%mm2\n" /* re1 */
310 "movd (%%edi), %%mm1\n" /* im0 */
311 "movd -8(%%edi), %%mm3\n" /* im1 */
313 "pxor %%mm4, %%mm4\n"
314 "pxor %%mm5, %%mm5\n"
315 "pfsub %%mm0, %%mm4\n" /* -re0 */
316 "pfsub %%mm2, %%mm5\n" /* -re1 */
318 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
319 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
/* The window is read at negative offsets from %edx here. */
322 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
323 "movq -8(%%edx), %%mm0\n" /* w1 | w0 */
327 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
328 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
331 "movq %%mm0, (%%eax)\n"
332 "movq %%mm1, 8(%%eax)\n"
337 "jnz .first_128_delay\n"
/* Loop 4: im read upward from buf[0], -re read downward from buf[127];
 * %ebx is only a scratch base for the address computation, and stores keep
 * going through %eax. */
339 "movl 8(%%ebp), %%ebx\n"
340 "leal 4(%%ebx), %%esi\n" /* buf[0].im */
341 "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
342 "movl $32, %%ecx\n" /* loop count */
344 ".second_128_delay:\n"
345 "movd (%%esi), %%mm0\n" /* im0 */
346 "movd 8(%%esi), %%mm2\n" /* im1 */
347 "movd (%%edi), %%mm1\n" /* re0 */
348 "movd -8(%%edi), %%mm3\n" /* re1 */
350 "pxor %%mm4, %%mm4\n"
351 "pxor %%mm5, %%mm5\n"
352 "pfsub %%mm1, %%mm4\n" /* -re0 */
353 "pfsub %%mm3, %%mm5\n" /* -re1 */
355 "punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
356 "punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
359 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
360 "movq -8(%%edx), %%mm3\n" /* w1 | w0 */
364 "pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
365 "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
368 "movq %%mm1, (%%eax)\n"
369 "movq %%mm3, 8(%%eax)\n"
374 "jnz .second_128_delay\n"
/* imdct512_window_delay_nol_3dn: windowing stage without overlap-add, in
 * 3D Now! inline assembly.
 *
 * Arguments are read from the stack frame: 8(%ebp)=buf, 12(%ebp)=data,
 * 16(%ebp)=window, 20(%ebp)=delay. The code shown consists of four loops,
 * each started with a count of 32 in %ecx:
 *   .first_128_samples2 / .second_128_samples2
 *       store  window * (+/- buf component)  through %eax (data); unlike
 *       imdct512_window_delay_3dn, no delay values are loaded or added;
 *   .first_128_delays / .second_128_delays
 *       store  window * (+/- buf component)  through %eax (delay), reading
 *       the window at negative offsets from %edx.
 * NOTE(review): %ebp is set from %esp at the top, so the argument offsets
 * depend on what was pushed before it -- confirm against the full prologue.
 * NOTE(review): the loop labels are plain assembler symbols; if the compiler
 * ever emits this asm more than once the labels would presumably collide --
 * confirm the function cannot be duplicated or inlined. */
388 static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
390 __asm__ __volatile__ (
392 "movl %%esp, %%ebp\n"
/* %ebx receives the delay pointer, but the two sample loops shown below
 * never read through it. */
401 "movl 20(%%ebp), %%ebx\n" /* delay */
402 "movl 16(%%ebp), %%edx\n" /* window */
404 "movl 8(%%ebp), %%eax\n" /* buf */
405 "movl $32, %%ecx\n" /* loop count */
406 "leal 516(%%eax), %%esi\n" /* buf[64].im */
407 "leal 504(%%eax), %%edi\n" /* buf[63].re */
/* %eax is reused: from here on it is the output (data) pointer. */
408 "movl 12(%%ebp), %%eax\n" /* data */
/* Loop 1: -im read upward from buf[64], re read downward from buf[63];
 * windowed and stored to data. */
410 ".first_128_samples2:\n"
411 "movd (%%esi), %%mm0\n" /* im0 */
412 "movd 8(%%esi), %%mm2\n" /* im1 */
413 "movd (%%edi), %%mm1\n" /* re0 */
414 "movd -8(%%edi), %%mm3\n" /* re1 */
/* Negate by subtracting from a zeroed register. */
416 "pxor %%mm4, %%mm4\n"
417 "pxor %%mm5, %%mm5\n"
418 "pfsub %%mm0, %%mm4\n" /* -im0 */
419 "pfsub %%mm2, %%mm5\n" /* -im1 */
421 "punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
422 "punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
424 "movq (%%edx), %%mm0\n" /* w1 | w0 */
425 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
427 "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
428 "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
431 "movq %%mm0, (%%eax)\n"
432 "movq %%mm1, 8(%%eax)\n"
438 "jnz .first_128_samples2\n"
/* Loop 2: -re read upward from buf[0], im read downward from buf[127];
 * %eax and %edx are not reloaded in the lines shown. */
440 "movl 8(%%ebp), %%esi\n" /* buf[0].re */
441 "leal 1020(%%esi), %%edi\n" /* buf[127].im */
442 "movl $32, %%ecx\n" /* loop count */
444 ".second_128_samples2:\n"
445 "movd (%%esi), %%mm0\n" /* buf[i].re */
446 "movd 8(%%esi), %%mm2\n" /* re1 */
447 "movd (%%edi), %%mm1\n" /* buf[127-i].im */
448 "movd -8(%%edi), %%mm3\n" /* im1 */
450 "pxor %%mm4, %%mm4\n"
451 "pxor %%mm5, %%mm5\n"
452 "pfsub %%mm0, %%mm4\n" /* -re0 */
453 "pfsub %%mm2, %%mm5\n" /* -re1 */
455 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
456 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
458 "movq (%%edx), %%mm0\n" /* w1 | w0 */
459 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
463 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
464 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
468 "movq %%mm0, (%%eax)\n"
469 "movq %%mm1, 8(%%eax)\n"
475 "jnz .second_128_samples2\n"
/* Loop 3: -re read upward from buf[64], im read downward from buf[63];
 * %eax now points at delay. */
477 "movl 8(%%ebp), %%eax\n"
478 "leal 512(%%eax), %%esi\n" /* buf[64].re */
479 "leal 508(%%eax), %%edi\n" /* buf[63].im */
480 "movl $32, %%ecx\n" /* loop count */
481 "movl 20(%%ebp), %%eax\n" /* delay */
483 ".first_128_delays:\n"
484 "movd (%%esi), %%mm0\n" /* re0 */
485 "movd 8(%%esi), %%mm2\n" /* re1 */
486 "movd (%%edi), %%mm1\n" /* im0 */
487 "movd -8(%%edi), %%mm3\n" /* im1 */
489 "pxor %%mm4, %%mm4\n"
490 "pxor %%mm5, %%mm5\n"
491 "pfsub %%mm0, %%mm4\n" /* -re0 */
492 "pfsub %%mm2, %%mm5\n" /* -re1 */
494 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
495 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
/* The window is read at negative offsets from %edx here. */
498 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
499 "movq -8(%%edx), %%mm0\n" /* w1 | w0 */
503 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
504 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
507 "movq %%mm0, (%%eax)\n"
508 "movq %%mm1, 8(%%eax)\n"
513 "jnz .first_128_delays\n"
/* Loop 4: im read upward from buf[0], -re read downward from buf[127];
 * %ebx is only a scratch base for the address computation, and stores keep
 * going through %eax. */
515 "movl 8(%%ebp), %%ebx\n"
516 "leal 4(%%ebx), %%esi\n" /* buf[0].im */
517 "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
518 "movl $32, %%ecx\n" /* loop count */
520 ".second_128_delays:\n"
521 "movd (%%esi), %%mm0\n" /* im0 */
522 "movd 8(%%esi), %%mm2\n" /* im1 */
523 "movd (%%edi), %%mm1\n" /* re0 */
524 "movd -8(%%edi), %%mm3\n" /* re1 */
526 "pxor %%mm4, %%mm4\n"
527 "pxor %%mm5, %%mm5\n"
528 "pfsub %%mm1, %%mm4\n" /* -re0 */
529 "pfsub %%mm3, %%mm5\n" /* -re1 */
531 "punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
532 "punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
535 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
536 "movq -8(%%edx), %%mm3\n" /* w1 | w0 */
540 "pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
541 "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
544 "movq %%mm1, (%%eax)\n"
545 "movq %%mm3, 8(%%eax)\n"
550 "jnz .second_128_delays\n"