1 /*****************************************************************************
2 * ac3_imdct_3dn.c: 3DNow!-accelerated AC3 inverse MDCT (IMDCT)
3 *****************************************************************************
4 * Copyright (C) 1999, 2000 VideoLAN
5 * $Id: ac3_imdct_3dn.c,v 1.12 2002/07/31 20:56:51 sam Exp $
7 * Authors: Renaud Dartus <reno@videolan.org>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
22 *****************************************************************************/
24 /*****************************************************************************
26 *****************************************************************************/
32 #include "ac3_imdct.h"
33 #include "ac3_imdct_common.h"
34 #include "ac3_retables.h"
/* Value of pi used by imdct_init when building the twiddle-factor table. */
37 # define M_PI 3.14159265358979323846
/*
 * FFT entry points used by the transform below. Only fft_128p is called from
 * this file's imdct_do_512 / imdct_do_512_nol; fft_64p is declared but not
 * referenced by them. E_() is a project macro that is not defined in this
 * file (presumably it decorates the symbol name per build variant - TODO
 * confirm in the project headers).
 */
40 void E_( fft_64p ) ( complex_t *x );
41 void E_( fft_128p ) ( complex_t *a );
/*
 * File-local 3DNow! helpers, declared ahead of their use:
 *   - pre/post IFFT twiddle: complex multiplies by the table built in
 *     imdct_init (the parameter keeps the name xcos_sin_sse even though this
 *     is the 3DNow! implementation);
 *   - window_delay / window_delay_nol: windowing of the IFFT output, with
 *     and without adding the saved delay samples.
 */
43 static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
44 static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse);
45 static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
46 static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
/*
 * imdct_init: fill p_imdct->xcos_sin_sse with the scaled twiddle factors.
 *
 * For every i in [0, 128) four consecutive floats are written:
 *   [4i]   =  cos(2*pi*(8i+1)/(8N)) * scale
 *   [4i+1] = -sin(2*pi*(8i+1)/(8N)) * scale
 *   [4i+2] = -sin(2*pi*(8i+1)/(8N)) * scale
 *   [4i+3] = -cos(2*pi*(8i+1)/(8N)) * scale
 * i.e. the pair (c, -s) followed by the pair (-s, -c). The twiddle routines
 * in this file fetch each pair with one 64-bit load.
 *
 * N is not defined in this file; it presumably comes from one of the
 * included headers - TODO confirm its value.
 */
49 void E_( imdct_init ) (imdct_t * p_imdct)
/*
 * Scale factor applied to every table entry.
 * NOTE(review): 181.019 is close to 128*sqrt(2); the rationale is not stated
 * in this file - confirm before changing it.
 */
52 float scale = 181.019;
54 for (i=0; i < 128; i++)
/*
 * M_PI is a double constant, so the angle and cos()/sin() are evaluated in
 * double precision and narrowed to float on assignment.
 */
56 float xcos_i = cos(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
57 float xsin_i = sin(2.0f * M_PI * (8*i+1)/(8*N)) * scale;
/* Low/high halves of the first 64-bit pair: c, -s. */
58 p_imdct->xcos_sin_sse[i * 4] = xcos_i;
59 p_imdct->xcos_sin_sse[i * 4 + 1] = -xsin_i;
/* Low/high halves of the second 64-bit pair: -s, -c. */
60 p_imdct->xcos_sin_sse[i * 4 + 2] = -xsin_i;
61 p_imdct->xcos_sin_sse[i * 4 + 3] = -xcos_i;
/*
 * imdct_do_512: run the 512-sample inverse transform on one block, with
 * overlap-add against the saved delay samples.
 *
 * Steps, in call order:
 *   1. pre-IFFT twiddle: reads data[] through the pm128 index table and
 *      writes complex values into p_imdct->buf;
 *   2. fft_128p on p_imdct->buf;
 *   3. post-IFFT twiddle on p_imdct->buf, in place;
 *   4. window_delay: windows p_imdct->buf, adds delay[] and stores the sums
 *      into data[], then stores a further set of windowed values into
 *      delay[].
 *
 * p_imdct : transform state; buf and xcos_sin_sse are used here
 * data    : input coefficients on entry, overwritten by step 4
 * delay   : overlap state, read and then rewritten by step 4
 *
 * pm128 and window are not declared in this file (presumably provided by the
 * included headers - TODO confirm).
 */
65 void E_( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[])
67 imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
68 E_( fft_128p ) (p_imdct->buf);
69 imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse);
70 imdct512_window_delay_3dn (p_imdct->buf, data, window, delay);
/*
 * imdct_do_512_nol: same pipeline as imdct_do_512 (pre-IFFT twiddle,
 * fft_128p, post-IFFT twiddle), but the final step calls
 * imdct512_window_delay_nol_3dn. That variant stores the windowed values
 * into data[] without loading or adding delay[]; it still writes the new
 * windowed values into delay[] afterwards.
 *
 * p_imdct : transform state; buf and xcos_sin_sse are used here
 * data    : input coefficients on entry, overwritten with windowed output
 * delay   : rewritten by the final step; its previous contents are not
 *           added to the output
 *
 * pm128 and window are not declared in this file (presumably provided by the
 * included headers - TODO confirm).
 */
73 void E_( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[])
75 imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse);
76 E_( fft_128p ) (p_imdct->buf);
77 imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse);
78 imdct512_window_delay_nol_3dn (p_imdct->buf, data, window, delay);
/*
 * imdct512_pre_ifft_twiddle_3dn: pre-IFFT complex multiply.
 *
 * Register inputs (see the constraint list): eax = pmt, ecx = data,
 * edx = xcos_sin_sse, edi = buf. ebx is loaded with a loop count of 128.
 *
 * Per iteration, an index is read from pmt into esi, two input samples are
 * fetched from data (labelled "2j" and "255-2j" in the inline comments) and
 * each is broadcast to both halves of an MMX register. They are multiplied
 * by the (c, -s) and (-s, -c) twiddle pairs, the products are summed, and
 * the 64-bit result is stored through edi:
 *   low  half = data[2j] * -s_j + data[255-2j] *  c_j
 *   high half = data[2j] * -c_j + data[255-2j] * -s_j
 *
 * NOTE(review): the three scaled-index loads only address the elements named
 * in their comments if esi is rescaled between them (doubled before the
 * table loads, negated before the 1020(...) load) - confirm against the
 * complete instruction stream.
 */
81 static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
83 __asm__ __volatile__ (
88 "movl $128, %%ebx\n" /* loop counter */
/* Fetch the permuted index for this iteration and the first input sample. */
92 "movl (%%eax), %%esi\n"
93 "movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
94 "punpckldq %%mm1, %%mm1\n" /* 2j | 2j */
/* Two 64-bit twiddle pairs for this index, as laid out by imdct_init. */
98 "movq (%%edx, %%esi, 8), %%mm0\n" /* -s_j | c_j */
99 "movq 8(%%edx, %%esi, 8), %%mm2\n" /* -c_j | -s_j */
/* Second input sample; 1020 bytes is float index 255. */
103 "movd 1020(%%ecx, %%esi, 4), %%mm4\n" /* 255-2j */
104 "punpckldq %%mm4, %%mm4\n" /* 255-2j | 255-2j */
107 "pfmul %%mm4, %%mm0\n" /* 255-2j * -s_j | 255-2j * c_j */
108 "pfmul %%mm1, %%mm2\n" /* 2j * -c_j | 2j * -s_j */
110 "pfadd %%mm2, %%mm0\n" /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */
/* The -8 offset implies edi has already been advanced past this element. */
112 "movq %%mm0, -8(%%edi)\n"
/* Input operands: eax = pmt, ecx = data, edx = xcos_sin_sse, edi = buf. */
121 : "a" (pmt), "c" (data), "d" (xcos_sin_sse), "D" (buf));
/*
 * imdct512_post_ifft_twiddle_3dn: post-IFFT complex multiply, in place.
 *
 * Register inputs (see the constraint list): eax = buf, ecx = xcos_sin_sse.
 * ebx is loaded with a loop count of 64; each iteration handles two complex
 * elements (at eax and eax+8) using two twiddle entries (32 bytes at ecx).
 *
 * For an element (re, im) and twiddle (c, s), the value written back is:
 *   low  half (re slot) = -c * re - s * im
 *   high half (im slot) = -s * re + c * im
 * c and s carry the scale factor applied when the table was built.
 */
124 static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
126 __asm__ __volatile__ (
129 "movl $64, %%ebx\n" /* loop counter */
/* Element 0: split into broadcast real and broadcast imaginary parts. */
133 "movq (%%eax), %%mm0\n" /* im0 | re0 */
134 "movq %%mm0, %%mm1\n" /* im0 | re0 */
135 "punpckldq %%mm0, %%mm0\n" /* re0 | re0 */
136 "punpckhdq %%mm1, %%mm1\n" /* im0 | im0 */
/* Twiddle entry 0: rearrange the two stored pairs into the multiplier
 * vectors needed for the real and imaginary products. */
138 "movq (%%ecx), %%mm2\n" /* -s | c */
139 "movq 8(%%ecx), %%mm3\n" /* -c | -s */
140 "movq %%mm3, %%mm4\n"
142 "punpckhdq %%mm2,%%mm3\n" /* -s | -c */
143 "punpckldq %%mm2,%%mm4\n" /* c | -s */
/* Element 1: same split as element 0. */
145 "movq 8(%%eax), %%mm2\n" /* im1 | re1 */
146 "movq %%mm2, %%mm5\n" /* im1 | re1 */
147 "punpckldq %%mm2, %%mm2\n" /* re1 | re1 */
148 "punpckhdq %%mm5, %%mm5\n" /* im1 | im1 */
150 "pfmul %%mm3, %%mm0\n" /* -s * re0 | -c * re0 */
151 "pfmul %%mm4, %%mm1\n" /* c * im0 | -s * im0 */
/* Twiddle entry 1 (next 16 bytes of the table); mm4 is reused here. */
153 "movq 16(%%ecx), %%mm6\n" /* -s1 | c1 */
154 "movq 24(%%ecx), %%mm7\n" /* -c1 | -s1 */
155 "movq %%mm7, %%mm4\n"
157 "punpckhdq %%mm6,%%mm7\n" /* -s1 | -c1 */
158 "punpckldq %%mm6,%%mm4\n" /* c1 | -s1 */
160 "pfmul %%mm7, %%mm2\n" /* -s1*re1 | -c1*re1 */
161 "pfmul %%mm4, %%mm5\n" /* c1*im1 | -s1*im1 */
163 "pfadd %%mm1, %%mm0\n" /* -s * re0 + c * im0 | -c * re0 - s * im0 */
164 "pfadd %%mm5, %%mm2\n" /* -s1 * re1 + c1 * im1 | -c1 * re1 - s1 * im1 */
/* Write both results back over the elements they were computed from. */
166 "movq %%mm0, (%%eax)\n"
167 "movq %%mm2, 8(%%eax)\n"
/* Input operands: eax = buf, ecx = xcos_sin_sse. */
176 : "a" (buf), "c" (xcos_sin_sse) );
/*
 * imdct512_window_delay_3dn: window the IFFT output, add the saved delay
 * samples to produce output, then refill the delay buffer.
 *
 * Register inputs (see the constraint list): esi = buf, eax = data_ptr,
 * ecx = delay_prt, edx = window_prt. buf is copied to ebp and esi/edi are
 * then used as two cursors into buf; within an iteration the second element
 * is read at +8 from esi and at -8 from edi.
 *
 * Four loops, each set up with a count of 32 in ebx and storing four floats
 * per iteration (buf offsets are as labelled by the inline comments):
 *   .first_128_samples  : (re | -im), edi from buf[63].re, esi from
 *                         buf[64].im; times window, plus delay -> data
 *   .second_128_samples : (im | -re), edi from buf[127].im, esi from
 *                         buf[0].re; times window, plus delay -> data
 *   .first_128_delay    : (im | -re), edi from buf[63].im, esi from
 *                         buf[64].re; times window read at negative offsets
 *                         from edx -> delay
 *   .second_128_delay   : (-re | im), edi from buf[127].re, esi from
 *                         buf[0].im; times window read at negative offsets
 *                         from edx -> delay
 *
 * NOTE(review): the statement declares no clobber list although it writes
 * memory through eax/ecx and uses ebx, edi, ebp and the MMX registers.
 * Confirm those registers are saved/restored inside the asm, and consider
 * whether a "memory" clobber is needed.
 * NOTE(review): ebp is normally the frame pointer unless the build omits it.
 * NOTE(review): the loop labels are ordinary assembler symbols, so the asm
 * cannot be emitted twice in one translation unit (e.g. by inlining).
 */
179 static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
181 __asm__ __volatile__ (
/* Keep the base of buf in ebp so esi/edi can be re-derived for each loop. */
192 "movl %%esi, %%ebp\n" /* buf */
193 "movl $32, %%ebx\n" /* loop count */
194 "leal 516(%%ebp), %%esi\n" /* buf[64].im */
195 "leal 504(%%ebp), %%edi\n" /* buf[63].re */
/* Loop 1: data = window * (re | -im) + delay. */
199 ".first_128_samples:\n"
200 "movd (%%esi), %%mm0\n" /* im0 */
201 "movd 8(%%esi), %%mm2\n" /* im1 */
202 "movd (%%edi), %%mm1\n" /* re0 */
203 "movd -8(%%edi), %%mm3\n" /* re1 */
/* Negate by subtracting from zero. */
205 "pxor %%mm4, %%mm4\n"
206 "pxor %%mm5, %%mm5\n"
207 "pfsub %%mm0, %%mm4\n" /* -im0 */
208 "pfsub %%mm2, %%mm5\n" /* -im1 */
210 "punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
211 "punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
213 "movq (%%edx), %%mm0\n" /* w1 | w0 */
214 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
215 "movq (%%ecx), %%mm2\n" /* d1 | d0 */
216 "movq 8(%%ecx), %%mm3\n" /* d3 | d2 */
218 "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
219 "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
221 "pfadd %%mm2, %%mm0\n" /* w1*re0+d1 | -w0*im0+d0 */
222 "pfadd %%mm3, %%mm1\n" /* w3*re1+d3 | -w2*im1+d2 */
225 "movq %%mm0, (%%eax)\n"
226 "movq %%mm1, 8(%%eax)\n"
232 "jnz .first_128_samples\n"
/* Loop 2 setup: cursors move to the two ends of buf. */
234 "movl %%ebp, %%esi\n" /* buf[0].re */
235 "movl $32, %%ebx\n" /* loop count */
236 "leal 1020(%%ebp), %%edi\n" /* buf[127].im */
/* Loop 2: data = window * (im | -re) + delay. */
239 ".second_128_samples:\n"
240 "movd (%%esi), %%mm0\n" /* buf[i].re */
241 "movd 8(%%esi), %%mm2\n" /* re1 */
242 "movd (%%edi), %%mm1\n" /* buf[127-i].im */
243 "movd -8(%%edi), %%mm3\n" /* im1 */
245 "pxor %%mm4, %%mm4\n"
246 "pxor %%mm5, %%mm5\n"
247 "pfsub %%mm0, %%mm4\n" /* -re0 */
248 "pfsub %%mm2, %%mm5\n" /* -re1 */
250 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
251 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
253 "movq (%%edx), %%mm0\n" /* w1 | w0 */
254 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
255 "movq (%%ecx), %%mm2\n" /* d1 | d0 */
256 "movq 8(%%ecx), %%mm3\n" /* d3 | d2 */
260 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
261 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
263 "pfadd %%mm2, %%mm0\n" /* w1*im0+d1 | -w0*re0+d0 */
264 "pfadd %%mm3, %%mm1\n" /* w3*im1+d3 | -w2*re1+d2 */
268 "movq %%mm0, (%%eax)\n"
269 "movq %%mm1, 8(%%eax)\n"
275 "jnz .second_128_samples\n"
/* Loop 3 setup: move ecx back by 1024 bytes (256 floats) - presumably to the
 * start of the delay buffer after the two loops above advanced it. */
277 "leal 512(%%ebp), %%esi\n" /* buf[64].re */
278 "leal 508(%%ebp), %%edi\n" /* buf[63].im */
279 "movl $32, %%ebx\n" /* loop count */
280 "addl $-1024, %%ecx\n" /* delay */
/* Loop 3: delay = window * (im | -re); window is read below edx. */
283 ".first_128_delay:\n"
284 "movd (%%esi), %%mm0\n" /* re0 */
285 "movd 8(%%esi), %%mm2\n" /* re1 */
286 "movd (%%edi), %%mm1\n" /* im0 */
287 "movd -8(%%edi), %%mm3\n" /* im1 */
289 "pxor %%mm4, %%mm4\n"
290 "pxor %%mm5, %%mm5\n"
291 "pfsub %%mm0, %%mm4\n" /* -re0 */
292 "pfsub %%mm2, %%mm5\n" /* -re1 */
294 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
295 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
298 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
299 "movq -8(%%edx), %%mm0\n" /* w1 | w0 */
303 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
304 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
306 "movq %%mm0, (%%ecx)\n"
307 "movq %%mm1, 8(%%ecx)\n"
312 "jnz .first_128_delay\n"
/* Loop 4 setup: cursors move to the two ends of buf again. */
314 "leal 4(%%ebp), %%esi\n" /* buf[0].im */
315 "leal 1016(%%ebp), %%edi\n" /* buf[127].re */
316 "movl $32, %%ebx\n" /* loop count */
/* Loop 4: delay = window * (-re | im). Here the value from -16(edx) scales
 * element 0 and the value from -8(edx) scales element 1. */
319 ".second_128_delay:\n"
320 "movd (%%esi), %%mm0\n" /* im0 */
321 "movd 8(%%esi), %%mm2\n" /* im1 */
322 "movd (%%edi), %%mm1\n" /* re0 */
323 "movd -8(%%edi), %%mm3\n" /* re1 */
325 "pxor %%mm4, %%mm4\n"
326 "pxor %%mm5, %%mm5\n"
327 "pfsub %%mm1, %%mm4\n" /* -re0 */
328 "pfsub %%mm3, %%mm5\n" /* -re1 */
330 "punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
331 "punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
334 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
335 "movq -8(%%edx), %%mm3\n" /* w1 | w0 */
339 "pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
340 "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
343 "movq %%mm1, (%%ecx)\n"
344 "movq %%mm3, 8(%%ecx)\n"
349 "jnz .second_128_delay\n"
/* The four pointer registers are both inputs and outputs, which tells the
 * compiler that the asm modifies them; no clobbers are listed. */
360 : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt)
361 : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt));
/*
 * imdct512_window_delay_nol_3dn: window the IFFT output without adding the
 * saved delay samples, then refill the delay buffer.
 *
 * Register inputs (see the constraint list): esi = buf, eax = data_ptr,
 * ecx = delay_prt, edx = window_prt. buf is copied to ebp and esi/edi are
 * then used as two cursors into buf; within an iteration the second element
 * is read at +8 from esi and at -8 from edi.
 *
 * Four loops, each set up with a count of 32 in ebx and storing four floats
 * per iteration (buf offsets are as labelled by the inline comments):
 *   .first_128_samples2  : (re | -im), edi from buf[63].re, esi from
 *                          buf[64].im; times window -> data
 *   .second_128_samples2 : (im | -re), edi from buf[127].im, esi from
 *                          buf[0].re; times window -> data
 *   .first_128_delays    : (im | -re), edi from buf[63].im, esi from
 *                          buf[64].re; times window read at negative offsets
 *                          from edx -> delay
 *   .second_128_delays   : (-re | im), edi from buf[127].re, esi from
 *                          buf[0].im; times window read at negative offsets
 *                          from edx -> delay
 * The first two loops neither load from ecx nor perform a pfadd, so the
 * previous delay contents do not contribute to data.
 *
 * NOTE(review): the statement declares no clobber list although it writes
 * memory through eax/ecx and uses ebx, edi, ebp and the MMX registers.
 * Confirm those registers are saved/restored inside the asm, and consider
 * whether a "memory" clobber is needed.
 * NOTE(review): ebp is normally the frame pointer unless the build omits it.
 * NOTE(review): the loop labels are ordinary assembler symbols, so the asm
 * cannot be emitted twice in one translation unit (e.g. by inlining).
 */
364 static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
366 __asm__ __volatile__ (
/* Keep the base of buf in ebp so esi/edi can be re-derived for each loop. */
377 "movl %%esi, %%ebp\n" /* buf */
378 "movl $32, %%ebx\n" /* loop count */
379 "leal 516(%%ebp), %%esi\n" /* buf[64].im */
380 "leal 504(%%ebp), %%edi\n" /* buf[63].re */
/* Loop 1: data = window * (re | -im); no delay term. */
383 ".first_128_samples2:\n"
384 "movd (%%esi), %%mm0\n" /* im0 */
385 "movd 8(%%esi), %%mm2\n" /* im1 */
386 "movd (%%edi), %%mm1\n" /* re0 */
387 "movd -8(%%edi), %%mm3\n" /* re1 */
/* Negate by subtracting from zero. */
389 "pxor %%mm4, %%mm4\n"
390 "pxor %%mm5, %%mm5\n"
391 "pfsub %%mm0, %%mm4\n" /* -im0 */
392 "pfsub %%mm2, %%mm5\n" /* -im1 */
394 "punpckldq %%mm1, %%mm4\n" /* re0 | -im0 */
395 "punpckldq %%mm3, %%mm5\n" /* re1 | -im1 */
397 "movq (%%edx), %%mm0\n" /* w1 | w0 */
398 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
400 "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */
401 "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */
404 "movq %%mm0, (%%eax)\n"
405 "movq %%mm1, 8(%%eax)\n"
411 "jnz .first_128_samples2\n"
/* Loop 2 setup: cursors move to the two ends of buf. */
413 "movl %%ebp, %%esi\n" /* buf[0].re */
414 "movl $32, %%ebx\n" /* loop count */
415 "leal 1020(%%ebp), %%edi\n" /* buf[127].im */
/* Loop 2: data = window * (im | -re); no delay term. */
418 ".second_128_samples2:\n"
419 "movd (%%esi), %%mm0\n" /* buf[i].re */
420 "movd 8(%%esi), %%mm2\n" /* re1 */
421 "movd (%%edi), %%mm1\n" /* buf[127-i].im */
422 "movd -8(%%edi), %%mm3\n" /* im1 */
424 "pxor %%mm4, %%mm4\n"
425 "pxor %%mm5, %%mm5\n"
426 "pfsub %%mm0, %%mm4\n" /* -re0 */
427 "pfsub %%mm2, %%mm5\n" /* -re1 */
429 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
430 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
432 "movq (%%edx), %%mm0\n" /* w1 | w0 */
433 "movq 8(%%edx), %%mm1\n" /* w3 | w2 */
437 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
438 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
442 "movq %%mm0, (%%eax)\n"
443 "movq %%mm1, 8(%%eax)\n"
449 "jnz .second_128_samples2\n"
/* Loop 3 setup: ecx is moved back by 1024 bytes (256 floats) before the
 * delay stores. NOTE(review): the loops above do not read through ecx, so
 * confirm ecx is still advanced there; otherwise this adjustment would point
 * before the start of the delay buffer. */
451 "leal 512(%%ebp), %%esi\n" /* buf[64].re */
452 "leal 508(%%ebp), %%edi\n" /* buf[63].im */
453 "movl $32, %%ebx\n" /* loop count */
454 "addl $-1024, %%ecx\n" /* delay */
/* Loop 3: delay = window * (im | -re); window is read below edx. */
457 ".first_128_delays:\n"
458 "movd (%%esi), %%mm0\n" /* re0 */
459 "movd 8(%%esi), %%mm2\n" /* re1 */
460 "movd (%%edi), %%mm1\n" /* im0 */
461 "movd -8(%%edi), %%mm3\n" /* im1 */
463 "pxor %%mm4, %%mm4\n"
464 "pxor %%mm5, %%mm5\n"
465 "pfsub %%mm0, %%mm4\n" /* -re0 */
466 "pfsub %%mm2, %%mm5\n" /* -re1 */
468 "punpckldq %%mm1, %%mm4\n" /* im0 | -re0 */
469 "punpckldq %%mm3, %%mm5\n" /* im1 | -re1 */
472 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
473 "movq -8(%%edx), %%mm0\n" /* w1 | w0 */
477 "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */
478 "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */
481 "movq %%mm0, (%%ecx)\n"
482 "movq %%mm1, 8(%%ecx)\n"
487 "jnz .first_128_delays\n"
/* Loop 4 setup: cursors move to the two ends of buf again. */
489 "leal 4(%%ebp), %%esi\n" /* buf[0].im */
490 "leal 1016(%%ebp), %%edi\n" /* buf[127].re */
491 "movl $32, %%ebx\n" /* loop count */
/* Loop 4: delay = window * (-re | im). Here the value from -16(edx) scales
 * element 0 and the value from -8(edx) scales element 1. */
494 ".second_128_delays:\n"
495 "movd (%%esi), %%mm0\n" /* im0 */
496 "movd 8(%%esi), %%mm2\n" /* im1 */
497 "movd (%%edi), %%mm1\n" /* re0 */
498 "movd -8(%%edi), %%mm3\n" /* re1 */
500 "pxor %%mm4, %%mm4\n"
501 "pxor %%mm5, %%mm5\n"
502 "pfsub %%mm1, %%mm4\n" /* -re0 */
503 "pfsub %%mm3, %%mm5\n" /* -re1 */
505 "punpckldq %%mm4, %%mm0\n" /* -re0 | im0 */
506 "punpckldq %%mm5, %%mm2\n" /* -re1 | im1 */
509 "movq -16(%%edx), %%mm1\n" /* w3 | w2 */
510 "movq -8(%%edx), %%mm3\n" /* w1 | w0 */
514 "pfmul %%mm0, %%mm1\n" /* -w1*re0 | w0*im0 */
515 "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */
518 "movq %%mm1, (%%ecx)\n"
519 "movq %%mm3, 8(%%ecx)\n"
524 "jnz .second_128_delays\n"
/* The four pointer registers are both inputs and outputs, which tells the
 * compiler that the asm modifies them; no clobbers are listed. */
535 : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt)
536 : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt));