X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=plugins%2Fimdct%2Fac3_imdct_3dn.c;h=f21159bff5cbfa14b4d9e38adfab6ccd62dc1cb7;hb=976dfc3eb46069ea3b920dec899e0c13ccf17c96;hp=b476b8fefbabdd9567f7a8af47b0a337547af96f;hpb=f42dc7826cee6211c5ada73f5ed94cfa4742bce5;p=vlc diff --git a/plugins/imdct/ac3_imdct_3dn.c b/plugins/imdct/ac3_imdct_3dn.c index b476b8fefb..f21159bff5 100644 --- a/plugins/imdct/ac3_imdct_3dn.c +++ b/plugins/imdct/ac3_imdct_3dn.c @@ -2,7 +2,7 @@ * ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT ***************************************************************************** * Copyright (C) 1999, 2000 VideoLAN - * $Id: ac3_imdct_3dn.c,v 1.4 2001/06/03 12:47:21 sam Exp $ + * $Id: ac3_imdct_3dn.c,v 1.12 2002/07/31 20:56:51 sam Exp $ * * Authors: Renaud Dartus * @@ -21,21 +21,13 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. *****************************************************************************/ -#define MODULE_NAME imdct3dn -#include "modules_inner.h" - /***************************************************************************** * Preamble *****************************************************************************/ -#include "defs.h" - #include #include -#include "config.h" -#include "common.h" -#include "threads.h" -#include "mtime.h" +#include #include "ac3_imdct.h" #include "ac3_imdct_common.h" @@ -45,8 +37,8 @@ # define M_PI 3.14159265358979323846 #endif -void _M( fft_64p ) ( complex_t *x ); -void _M( fft_128p ) ( complex_t *a ); +void E_( fft_64p ) ( complex_t *x ); +void E_( fft_128p ) ( complex_t *a ); static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse); static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse); @@ -54,7 +46,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt); -void _M( imdct_init ) (imdct_t * p_imdct) +void E_( imdct_init ) (imdct_t * p_imdct) { int i; float scale = 181.019; @@ -70,18 +62,18 @@ void _M( imdct_init ) (imdct_t * p_imdct) } } -void _M( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[]) +void E_( imdct_do_512 ) (imdct_t * p_imdct, float data[], float delay[]) { imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse); - _M( fft_128p ) (p_imdct->buf); + E_( fft_128p ) (p_imdct->buf); imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse); imdct512_window_delay_3dn (p_imdct->buf, data, window, delay); } -void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[]) +void E_( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[]) { imdct512_pre_ifft_twiddle_3dn (pm128, p_imdct->buf, data, p_imdct->xcos_sin_sse); - _M( fft_128p ) (p_imdct->buf); + E_( fft_128p ) (p_imdct->buf); imdct512_post_ifft_twiddle_3dn (p_imdct->buf, p_imdct->xcos_sin_sse); imdct512_window_delay_nol_3dn (p_imdct->buf, data, window, delay); } @@ -89,24 +81,14 @@ void _M( imdct_do_512_nol ) (imdct_t * p_imdct, float data[], float delay[]) static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse) { __asm__ __volatile__ ( - "pushl %%ebp\n" - "movl %%esp, %%ebp\n" - "addl $-4, %%esp\n" /* local variable, loop counter */ - - "pushl %%eax\n" + ".align 16\n" "pushl %%ebx\n" - "pushl %%ecx\n" - "pushl %%edx\n" - "pushl %%edi\n" "pushl %%esi\n" + + "movl $128, %%ebx\n" /* loop counter */ - "movl 8(%%ebp), %%eax\n" /* pmt */ - "movl 12(%%ebp), %%ebx\n" /* buf */ - "movl 16(%%ebp), %%ecx\n" /* data */ - "movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */ - "movl $128, -4(%%ebp)\n" - -".loop:\n" + ".align 16\n" +"0:\n" "movl (%%eax), %%esi\n" "movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */ "punpckldq %%mm1, %%mm1\n" /* 2j | 2j */ @@ -124,33 +106,30 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float "pfmul %%mm4, %%mm0\n" /* 255-2j * -s_j | 255-2j * c_j */ "pfmul %%mm1, %%mm2\n" /* 2j * -c_j | 2j * -s_j */ - "addl $8, %%ebx\n" + "addl $8, %%edi\n" "pfadd %%mm2, %%mm0\n" /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */ - "movq %%mm0, -8(%%ebx)\n" - "decl -4(%%ebp)\n" - "jnz .loop\n" + "movq %%mm0, -8(%%edi)\n" + "decl %%ebx\n" + "jnz 0b\n" "popl %%esi\n" - "popl %%edi\n" - "popl %%edx\n" - "popl %%ecx\n" "popl %%ebx\n" - "popl %%eax\n" - "addl $4, %%esp\n" - "popl %%ebp\n" "femms\n" - ::); + : "=D" (buf) + : "a" (pmt), "c" (data), "d" (xcos_sin_sse), "D" (buf)); } static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse) { __asm__ __volatile__ ( + ".align 16\n" "pushl %%ebx\n" "movl $64, %%ebx\n" /* loop counter */ -".loop1:\n" + ".align 16\n" +"0:\n" "movq (%%eax), %%mm0\n" /* im0 | re0 */ "movq %%mm0, %%mm1\n" /* im0 | re0 */ "punpckldq %%mm0, %%mm0\n" /* re0 | re0 */ @@ -189,7 +168,7 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse) "addl $32, %%ecx\n" "addl $16, %%eax\n" "decl %%ebx\n" - "jnz .loop1\n" + "jnz 0b\n" "popl %%ebx\n" "femms\n" @@ -200,25 +179,23 @@ static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse) static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) { __asm__ __volatile__ ( - "pushl %%ebp\n" - "movl %%esp, %%ebp\n" + ".align 16\n" "pushl %%eax\n" "pushl %%ebx\n" "pushl %%ecx\n" "pushl %%edx\n" - "pushl %%esi\n" "pushl %%edi\n" + "pushl %%esi\n" + "pushl %%ebp\n" - "movl 20(%%ebp), %%ebx\n" /* delay */ - "movl 16(%%ebp), %%edx\n" /* window */ - - "movl 8(%%ebp), %%eax\n" /* buf */ - "movl $32, %%ecx\n" /* loop count */ - "leal 516(%%eax), %%esi\n" /* buf[64].im */ - "leal 504(%%eax), %%edi\n" /* buf[63].re */ - "movl 12(%%ebp), %%eax\n" /* data */ + "movl %%esi, %%ebp\n" /* buf */ + "movl $32, %%ebx\n" /* loop count */ + "leal 516(%%ebp), %%esi\n" /* buf[64].im */ + "leal 504(%%ebp), %%edi\n" /* buf[63].re */ + + ".align 16\n" ".first_128_samples:\n" "movd (%%esi), %%mm0\n" /* im0 */ "movd 8(%%esi), %%mm2\n" /* im1 */ @@ -235,8 +212,8 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "movq (%%edx), %%mm0\n" /* w1 | w0 */ "movq 8(%%edx), %%mm1\n" /* w3 | w2 */ - "movq (%%ebx), %%mm2\n" /* d1 | d0 */ - "movq 8(%%ebx), %%mm3\n" /* d3 | d2 */ + "movq (%%ecx), %%mm2\n" /* d1 | d0 */ + "movq 8(%%ecx), %%mm3\n" /* d3 | d2 */ "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */ "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */ @@ -247,23 +224,24 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "addl $16, %%edx\n" "movq %%mm0, (%%eax)\n" "movq %%mm1, 8(%%eax)\n" - "addl $16, %%ebx\n" + "addl $16, %%ecx\n" "addl $16, %%esi\n" "addl $16, %%eax\n" "addl $-16, %%edi\n" - "decl %%ecx\n" + "decl %%ebx\n" "jnz .first_128_samples\n" - "movl 8(%%ebp), %%esi\n" /* buf[0].re */ - "leal 1020(%%esi), %%edi\n" /* buf[127].im */ - "movl $32, %%ecx\n" /* loop count */ + "movl %%ebp, %%esi\n" /* buf[0].re */ + "movl $32, %%ebx\n" /* loop count */ + "leal 1020(%%ebp), %%edi\n" /* buf[127].im */ + ".align 16\n" ".second_128_samples:\n" "movd (%%esi), %%mm0\n" /* buf[i].re */ "movd 8(%%esi), %%mm2\n" /* re1 */ "movd (%%edi), %%mm1\n" /* buf[127-i].im */ "movd -8(%%edi), %%mm3\n" /* im1 */ - + "pxor %%mm4, %%mm4\n" "pxor %%mm5, %%mm5\n" "pfsub %%mm0, %%mm4\n" /* -re0 */ @@ -274,8 +252,8 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "movq (%%edx), %%mm0\n" /* w1 | w0 */ "movq 8(%%edx), %%mm1\n" /* w3 | w2 */ - "movq (%%ebx), %%mm2\n" /* d1 | d0 */ - "movq 8(%%ebx), %%mm3\n" /* d3 | d2 */ + "movq (%%ecx), %%mm2\n" /* d1 | d0 */ + "movq 8(%%ecx), %%mm3\n" /* d3 | d2 */ "addl $16, %%esi\n" @@ -292,16 +270,16 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "addl $16, %%edx\n" "addl $16, %%eax\n" - "addl $16, %%ebx\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .second_128_samples\n" - "movl 8(%%ebp), %%eax\n" - "leal 512(%%eax), %%esi\n" /* buf[64].re */ - "leal 508(%%eax), %%edi\n" /* buf[63].im */ - "movl $32, %%ecx\n" /* loop count */ - "movl 20(%%ebp), %%eax\n" /* delay */ + "leal 512(%%ebp), %%esi\n" /* buf[64].re */ + "leal 508(%%ebp), %%edi\n" /* buf[63].im */ + "movl $32, %%ebx\n" /* loop count */ + "addl $-1024, %%ecx\n" /* delay */ + ".align 16\n" ".first_128_delay:\n" "movd (%%esi), %%mm0\n" /* re0 */ "movd 8(%%esi), %%mm2\n" /* re1 */ @@ -325,20 +303,19 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */ "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */ - - "movq %%mm0, (%%eax)\n" - "movq %%mm1, 8(%%eax)\n" + "movq %%mm0, (%%ecx)\n" + "movq %%mm1, 8(%%ecx)\n" "addl $16, %%esi\n" "addl $-16, %%edi\n" - "addl $16, %%eax\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .first_128_delay\n" - "movl 8(%%ebp), %%ebx\n" - "leal 4(%%ebx), %%esi\n" /* buf[0].im */ - "leal 1016(%%ebx), %%edi\n" /* buf[127].re */ - "movl $32, %%ecx\n" /* loop count */ + "leal 4(%%ebp), %%esi\n" /* buf[0].im */ + "leal 1016(%%ebp), %%edi\n" /* buf[127].re */ + "movl $32, %%ebx\n" /* loop count */ + ".align 16\n" ".second_128_delay:\n" "movd (%%esi), %%mm0\n" /* im0 */ "movd 8(%%esi), %%mm2\n" /* im1 */ @@ -363,48 +340,46 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */ - "movq %%mm1, (%%eax)\n" - "movq %%mm3, 8(%%eax)\n" + "movq %%mm1, (%%ecx)\n" + "movq %%mm3, 8(%%ecx)\n" "addl $16, %%esi\n" "addl $-16, %%edi\n" - "addl $16, %%eax\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .second_128_delay\n" - "popl %%edi\n" + "popl %%ebp\n" "popl %%esi\n" + "popl %%edi\n" "popl %%edx\n" "popl %%ecx\n" "popl %%ebx\n" "popl %%eax\n" - "leave\n" "femms\n" - ::); + : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt) + : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt)); } static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) { __asm__ __volatile__ ( - "pushl %%ebp\n" - "movl %%esp, %%ebp\n" - + ".align 16\n" + "pushl %%eax\n" "pushl %%ebx\n" "pushl %%ecx\n" "pushl %%edx\n" - "pushl %%esi\n" "pushl %%edi\n" + "pushl %%esi\n" + "pushl %%ebp\n" - "movl 20(%%ebp), %%ebx\n" /* delay */ - "movl 16(%%ebp), %%edx\n" /* window */ - - "movl 8(%%ebp), %%eax\n" /* buf */ - "movl $32, %%ecx\n" /* loop count */ - "leal 516(%%eax), %%esi\n" /* buf[64].im */ - "leal 504(%%eax), %%edi\n" /* buf[63].re */ - "movl 12(%%ebp), %%eax\n" /* data */ + "movl %%esi, %%ebp\n" /* buf */ + "movl $32, %%ebx\n" /* loop count */ + "leal 516(%%ebp), %%esi\n" /* buf[64].im */ + "leal 504(%%ebp), %%edi\n" /* buf[63].re */ + ".align 16\n" ".first_128_samples2:\n" "movd (%%esi), %%mm0\n" /* im0 */ "movd 8(%%esi), %%mm2\n" /* im1 */ @@ -428,17 +403,18 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa "addl $16, %%edx\n" "movq %%mm0, (%%eax)\n" "movq %%mm1, 8(%%eax)\n" - "addl $16, %%ebx\n" + "addl $16, %%ecx\n" "addl $16, %%esi\n" "addl $16, %%eax\n" "addl $-16, %%edi\n" - "decl %%ecx\n" + "decl %%ebx\n" "jnz .first_128_samples2\n" - "movl 8(%%ebp), %%esi\n" /* buf[0].re */ - "leal 1020(%%esi), %%edi\n" /* buf[127].im */ - "movl $32, %%ecx\n" /* loop count */ + "movl %%ebp, %%esi\n" /* buf[0].re */ + "movl $32, %%ebx\n" /* loop count */ + "leal 1020(%%ebp), %%edi\n" /* buf[127].im */ + ".align 16\n" ".second_128_samples2:\n" "movd (%%esi), %%mm0\n" /* buf[i].re */ "movd 8(%%esi), %%mm2\n" /* re1 */ @@ -468,16 +444,16 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa "addl $16, %%edx\n" "addl $16, %%eax\n" - "addl $16, %%ebx\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .second_128_samples2\n" - "movl 8(%%ebp), %%eax\n" - "leal 512(%%eax), %%esi\n" /* buf[64].re */ - "leal 508(%%eax), %%edi\n" /* buf[63].im */ - "movl $32, %%ecx\n" /* loop count */ - "movl 20(%%ebp), %%eax\n" /* delay */ + "leal 512(%%ebp), %%esi\n" /* buf[64].re */ + "leal 508(%%ebp), %%edi\n" /* buf[63].im */ + "movl $32, %%ebx\n" /* loop count */ + "addl $-1024, %%ecx\n" /* delay */ + ".align 16\n" ".first_128_delays:\n" "movd (%%esi), %%mm0\n" /* re0 */ "movd 8(%%esi), %%mm2\n" /* re1 */ @@ -502,19 +478,19 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */ - "movq %%mm0, (%%eax)\n" - "movq %%mm1, 8(%%eax)\n" + "movq %%mm0, (%%ecx)\n" + "movq %%mm1, 8(%%ecx)\n" "addl $16, %%esi\n" "addl $-16, %%edi\n" - "addl $16, %%eax\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .first_128_delays\n" - "movl 8(%%ebp), %%ebx\n" - "leal 4(%%ebx), %%esi\n" /* buf[0].im */ - "leal 1016(%%ebx), %%edi\n" /* buf[127].re */ - "movl $32, %%ecx\n" /* loop count */ + "leal 4(%%ebp), %%esi\n" /* buf[0].im */ + "leal 1016(%%ebp), %%edi\n" /* buf[127].re */ + "movl $32, %%ebx\n" /* loop count */ + ".align 16\n" ".second_128_delays:\n" "movd (%%esi), %%mm0\n" /* im0 */ "movd 8(%%esi), %%mm2\n" /* im1 */ @@ -539,23 +515,24 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */ - "movq %%mm1, (%%eax)\n" - "movq %%mm3, 8(%%eax)\n" + "movq %%mm1, (%%ecx)\n" + "movq %%mm3, 8(%%ecx)\n" "addl $16, %%esi\n" "addl $-16, %%edi\n" - "addl $16, %%eax\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .second_128_delays\n" - "popl %%edi\n" + "popl %%ebp\n" "popl %%esi\n" + "popl %%edi\n" "popl %%edx\n" "popl %%ecx\n" "popl %%ebx\n" "popl %%eax\n" - "leave\n" "femms\n" - ::); + : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt) + : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt)); }