From bad0a366cf1f68575c5fb00256b0a38a0d9510f8 Mon Sep 17 00:00:00 2001 From: Renaud Dartus Date: Thu, 26 Jul 2001 20:00:33 +0000 Subject: [PATCH] * Format asm functions for gcc -> fixed the segfaults with imdct_sse -> sound is ugly with imdct_sse in debug mode --- doc/vlc.1 | 4 +- plugins/imdct/ac3_imdct_3dn.c | 187 ++++++++++++++-------------------- plugins/imdct/ac3_imdct_sse.c | 159 +++++++++++++---------------- plugins/imdct/ac3_srfft_sse.c | 52 +++++----- 4 files changed, 177 insertions(+), 225 deletions(-) diff --git a/doc/vlc.1 b/doc/vlc.1 index b09f29b8f4..d205219e88 100644 --- a/doc/vlc.1 +++ b/doc/vlc.1 @@ -53,10 +53,10 @@ Choose stereo or mono audio output. Activate hardware AC3 pass-through mode. .TP .B \-\-downmix -Specify a module for AC3 downmix: "downmix", "downmixsse", for instance. +Specify a module for AC3 downmix: "downmix", "sse" or "3dn" for instance. .TP .B \-\-imdct -Specify a module for AC3 IMDCT: "imdct", "imdctsse", for instance. +Specify a module for AC3 IMDCT: "imdct", "sse" or "3dn" for instance. .TP .B \-\-novideo Disable video output. diff --git a/plugins/imdct/ac3_imdct_3dn.c b/plugins/imdct/ac3_imdct_3dn.c index 0bca93204d..e5af410f6e 100644 --- a/plugins/imdct/ac3_imdct_3dn.c +++ b/plugins/imdct/ac3_imdct_3dn.c @@ -2,7 +2,7 @@ * ac3_imdct_3dn.c: accelerated 3D Now! 
ac3 DCT ***************************************************************************** * Copyright (C) 1999, 2000 VideoLAN - * $Id: ac3_imdct_3dn.c,v 1.5 2001/07/08 23:15:11 reno Exp $ + * $Id: ac3_imdct_3dn.c,v 1.6 2001/07/26 20:00:33 reno Exp $ * * Authors: Renaud Dartus * @@ -90,23 +90,11 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float { __asm__ __volatile__ ( ".align 16\n" - "pushl %%ebp\n" - "movl %%esp, %%ebp\n" - "addl $-4, %%esp\n" /* local variable, loop counter */ - - "pushl %%eax\n" "pushl %%ebx\n" - "pushl %%ecx\n" - "pushl %%edx\n" - "pushl %%edi\n" "pushl %%esi\n" + + "movl $128, %%ebx\n" /* loop counter */ - "movl 8(%%ebp), %%eax\n" /* pmt */ - "movl 12(%%ebp), %%ebx\n" /* buf */ - "movl 16(%%ebp), %%ecx\n" /* data */ - "movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */ - "movl $128, -4(%%ebp)\n" - ".align 16\n" ".loop:\n" "movl (%%eax), %%esi\n" @@ -126,24 +114,19 @@ static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float "pfmul %%mm4, %%mm0\n" /* 255-2j * -s_j | 255-2j * c_j */ "pfmul %%mm1, %%mm2\n" /* 2j * -c_j | 2j * -s_j */ - "addl $8, %%ebx\n" + "addl $8, %%edi\n" "pfadd %%mm2, %%mm0\n" /* 2j * -c_j + 255-2j * -s_j | 2j * -s_j + 255-2j * c_j */ - "movq %%mm0, -8(%%ebx)\n" - "decl -4(%%ebp)\n" + "movq %%mm0, -8(%%edi)\n" + "decl %%ebx\n" "jnz .loop\n" "popl %%esi\n" - "popl %%edi\n" - "popl %%edx\n" - "popl %%ecx\n" "popl %%ebx\n" - "popl %%eax\n" - "addl $4, %%esp\n" - "popl %%ebp\n" "femms\n" - ::); + : "=D" (buf) + : "a" (pmt), "c" (data), "d" (xcos_sin_sse), "D" (buf)); } static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse) @@ -205,25 +188,21 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w { __asm__ __volatile__ ( ".align 16\n" - "pushl %%ebp\n" - "movl %%esp, %%ebp\n" "pushl %%eax\n" "pushl %%ebx\n" "pushl %%ecx\n" "pushl %%edx\n" - "pushl %%esi\n" "pushl %%edi\n" + "pushl %%esi\n" + "pushl %%ebp\n" - "movl 20(%%ebp), 
%%ebx\n" /* delay */ - "movl 16(%%ebp), %%edx\n" /* window */ - - "movl 8(%%ebp), %%eax\n" /* buf */ - "movl $32, %%ecx\n" /* loop count */ - "leal 516(%%eax), %%esi\n" /* buf[64].im */ - "leal 504(%%eax), %%edi\n" /* buf[63].re */ - "movl 12(%%ebp), %%eax\n" /* data */ + "movl %%esi, %%ebp\n" /* buf */ + "movl $32, %%ebx\n" /* loop count */ + "leal 516(%%ebp), %%esi\n" /* buf[64].im */ + "leal 504(%%ebp), %%edi\n" /* buf[63].re */ + ".align 16\n" ".first_128_samples:\n" "movd (%%esi), %%mm0\n" /* im0 */ @@ -241,8 +220,8 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "movq (%%edx), %%mm0\n" /* w1 | w0 */ "movq 8(%%edx), %%mm1\n" /* w3 | w2 */ - "movq (%%ebx), %%mm2\n" /* d1 | d0 */ - "movq 8(%%ebx), %%mm3\n" /* d3 | d2 */ + "movq (%%ecx), %%mm2\n" /* d1 | d0 */ + "movq 8(%%ecx), %%mm3\n" /* d3 | d2 */ "pfmul %%mm4, %%mm0\n" /* w1*re0 | -w0*im0 */ "pfmul %%mm5, %%mm1\n" /* w3*re1 | -w2*im1 */ @@ -253,16 +232,16 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "addl $16, %%edx\n" "movq %%mm0, (%%eax)\n" "movq %%mm1, 8(%%eax)\n" - "addl $16, %%ebx\n" + "addl $16, %%ecx\n" "addl $16, %%esi\n" "addl $16, %%eax\n" "addl $-16, %%edi\n" - "decl %%ecx\n" + "decl %%ebx\n" "jnz .first_128_samples\n" - "movl 8(%%ebp), %%esi\n" /* buf[0].re */ - "leal 1020(%%esi), %%edi\n" /* buf[127].im */ - "movl $32, %%ecx\n" /* loop count */ + "movl %%ebp, %%esi\n" /* buf[0].re */ + "movl $32, %%ebx\n" /* loop count */ + "leal 1020(%%ebp), %%edi\n" /* buf[127].im */ ".align 16\n" ".second_128_samples:\n" @@ -270,7 +249,7 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "movd 8(%%esi), %%mm2\n" /* re1 */ "movd (%%edi), %%mm1\n" /* buf[127-i].im */ "movd -8(%%edi), %%mm3\n" /* im1 */ - + "pxor %%mm4, %%mm4\n" "pxor %%mm5, %%mm5\n" "pfsub %%mm0, %%mm4\n" /* -re0 */ @@ -281,8 +260,8 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "movq (%%edx), %%mm0\n" 
/* w1 | w0 */ "movq 8(%%edx), %%mm1\n" /* w3 | w2 */ - "movq (%%ebx), %%mm2\n" /* d1 | d0 */ - "movq 8(%%ebx), %%mm3\n" /* d3 | d2 */ + "movq (%%ecx), %%mm2\n" /* d1 | d0 */ + "movq 8(%%ecx), %%mm3\n" /* d3 | d2 */ "addl $16, %%esi\n" @@ -299,15 +278,14 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "addl $16, %%edx\n" "addl $16, %%eax\n" - "addl $16, %%ebx\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .second_128_samples\n" - "movl 8(%%ebp), %%eax\n" - "leal 512(%%eax), %%esi\n" /* buf[64].re */ - "leal 508(%%eax), %%edi\n" /* buf[63].im */ - "movl $32, %%ecx\n" /* loop count */ - "movl 20(%%ebp), %%eax\n" /* delay */ + "leal 512(%%ebp), %%esi\n" /* buf[64].re */ + "leal 508(%%ebp), %%edi\n" /* buf[63].im */ + "movl $32, %%ebx\n" /* loop count */ + "addl $-1024, %%ecx\n" /* delay */ ".align 16\n" ".first_128_delay:\n" @@ -333,19 +311,17 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "pfmul %%mm4, %%mm0\n" /* w1*im0 | -w0*re0 */ "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */ - - "movq %%mm0, (%%eax)\n" - "movq %%mm1, 8(%%eax)\n" + "movq %%mm0, (%%ecx)\n" + "movq %%mm1, 8(%%ecx)\n" "addl $16, %%esi\n" "addl $-16, %%edi\n" - "addl $16, %%eax\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .first_128_delay\n" - "movl 8(%%ebp), %%ebx\n" - "leal 4(%%ebx), %%esi\n" /* buf[0].im */ - "leal 1016(%%ebx), %%edi\n" /* buf[127].re */ - "movl $32, %%ecx\n" /* loop count */ + "leal 4(%%ebp), %%esi\n" /* buf[0].im */ + "leal 1016(%%ebp), %%edi\n" /* buf[127].re */ + "movl $32, %%ebx\n" /* loop count */ ".align 16\n" ".second_128_delay:\n" @@ -372,48 +348,44 @@ static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *w "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */ - "movq %%mm1, (%%eax)\n" - "movq %%mm3, 8(%%eax)\n" + "movq %%mm1, (%%ecx)\n" + "movq %%mm3, 8(%%ecx)\n" "addl $16, %%esi\n" "addl $-16, %%edi\n" - "addl $16, %%eax\n" - "decl %%ecx\n" + "addl 
$16, %%ecx\n" + "decl %%ebx\n" "jnz .second_128_delay\n" - "popl %%edi\n" + "popl %%ebp\n" "popl %%esi\n" + "popl %%edi\n" "popl %%edx\n" "popl %%ecx\n" "popl %%ebx\n" "popl %%eax\n" - "leave\n" "femms\n" - ::); + : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt) + : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt)); } static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) { __asm__ __volatile__ ( ".align 16\n" - "pushl %%ebp\n" - "movl %%esp, %%ebp\n" - + "pushl %%eax\n" "pushl %%ebx\n" "pushl %%ecx\n" "pushl %%edx\n" - "pushl %%esi\n" "pushl %%edi\n" + "pushl %%esi\n" + "pushl %%ebp\n" - "movl 20(%%ebp), %%ebx\n" /* delay */ - "movl 16(%%ebp), %%edx\n" /* window */ - - "movl 8(%%ebp), %%eax\n" /* buf */ - "movl $32, %%ecx\n" /* loop count */ - "leal 516(%%eax), %%esi\n" /* buf[64].im */ - "leal 504(%%eax), %%edi\n" /* buf[63].re */ - "movl 12(%%ebp), %%eax\n" /* data */ + "movl %%esi, %%ebp\n" /* buf */ + "movl $32, %%ebx\n" /* loop count */ + "leal 516(%%ebp), %%esi\n" /* buf[64].im */ + "leal 504(%%ebp), %%edi\n" /* buf[63].re */ ".align 16\n" ".first_128_samples2:\n" @@ -439,16 +411,16 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa "addl $16, %%edx\n" "movq %%mm0, (%%eax)\n" "movq %%mm1, 8(%%eax)\n" - "addl $16, %%ebx\n" + "addl $16, %%ecx\n" "addl $16, %%esi\n" "addl $16, %%eax\n" "addl $-16, %%edi\n" - "decl %%ecx\n" + "decl %%ebx\n" "jnz .first_128_samples2\n" - "movl 8(%%ebp), %%esi\n" /* buf[0].re */ - "leal 1020(%%esi), %%edi\n" /* buf[127].im */ - "movl $32, %%ecx\n" /* loop count */ + "movl %%ebp, %%esi\n" /* buf[0].re */ + "movl $32, %%ebx\n" /* loop count */ + "leal 1020(%%ebp), %%edi\n" /* buf[127].im */ ".align 16\n" ".second_128_samples2:\n" @@ -480,15 +452,14 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa "addl $16, %%edx\n" "addl $16, %%eax\n" - "addl $16, %%ebx\n" - "decl %%ecx\n" + 
"addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .second_128_samples2\n" - "movl 8(%%ebp), %%eax\n" - "leal 512(%%eax), %%esi\n" /* buf[64].re */ - "leal 508(%%eax), %%edi\n" /* buf[63].im */ - "movl $32, %%ecx\n" /* loop count */ - "movl 20(%%ebp), %%eax\n" /* delay */ + "leal 512(%%ebp), %%esi\n" /* buf[64].re */ + "leal 508(%%ebp), %%edi\n" /* buf[63].im */ + "movl $32, %%ebx\n" /* loop count */ + "addl $-1024, %%ecx\n" /* delay */ ".align 16\n" ".first_128_delays:\n" @@ -515,18 +486,17 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa "pfmul %%mm5, %%mm1\n" /* w3*im1 | -w2*re1 */ - "movq %%mm0, (%%eax)\n" - "movq %%mm1, 8(%%eax)\n" + "movq %%mm0, (%%ecx)\n" + "movq %%mm1, 8(%%ecx)\n" "addl $16, %%esi\n" "addl $-16, %%edi\n" - "addl $16, %%eax\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .first_128_delays\n" - "movl 8(%%ebp), %%ebx\n" - "leal 4(%%ebx), %%esi\n" /* buf[0].im */ - "leal 1016(%%ebx), %%edi\n" /* buf[127].re */ - "movl $32, %%ecx\n" /* loop count */ + "leal 4(%%ebp), %%esi\n" /* buf[0].im */ + "leal 1016(%%ebp), %%edi\n" /* buf[127].re */ + "movl $32, %%ebx\n" /* loop count */ ".align 16\n" ".second_128_delays:\n" @@ -553,23 +523,24 @@ static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, floa "pfmul %%mm2, %%mm3\n" /* -w3*re1 | w2*im1 */ - "movq %%mm1, (%%eax)\n" - "movq %%mm3, 8(%%eax)\n" + "movq %%mm1, (%%ecx)\n" + "movq %%mm3, 8(%%ecx)\n" "addl $16, %%esi\n" "addl $-16, %%edi\n" - "addl $16, %%eax\n" - "decl %%ecx\n" + "addl $16, %%ecx\n" + "decl %%ebx\n" "jnz .second_128_delays\n" - "popl %%edi\n" + "popl %%ebp\n" "popl %%esi\n" + "popl %%edi\n" "popl %%edx\n" "popl %%ecx\n" "popl %%ebx\n" "popl %%eax\n" - "leave\n" "femms\n" - ::); + : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt) + : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt)); } diff --git a/plugins/imdct/ac3_imdct_sse.c b/plugins/imdct/ac3_imdct_sse.c index 50ff4d1c64..f55817611d 100644 
--- a/plugins/imdct/ac3_imdct_sse.c +++ b/plugins/imdct/ac3_imdct_sse.c @@ -2,7 +2,7 @@ * ac3_imdct_sse.c: accelerated SSE ac3 DCT ***************************************************************************** * Copyright (C) 1999, 2000 VideoLAN - * $Id: ac3_imdct_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $ + * $Id: ac3_imdct_sse.c,v 1.5 2001/07/26 20:00:33 reno Exp $ * * Authors: Renaud Dartus * Aaron Holtzman @@ -103,10 +103,7 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float "pushl %%edi\n" "pushl %%esi\n" - "movl 8(%%ebp), %%eax\n" /* pmt */ - "movl 12(%%ebp), %%ebx\n" /* buf */ - "movl 16(%%ebp), %%ecx\n" /* data */ - "movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */ + "movl %%edi, %%ebx\n" /* buf */ "movl $64, -4(%%ebp)\n" ".align 16\n" @@ -153,7 +150,9 @@ static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float "addl $4, %%esp\n" "popl %%ebp\n" - ::); + : "=D" (buf) + : "a" (pmt), "c" (data), "d" (xcos_sin_sse), "D" (buf)); + } static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse) @@ -226,24 +225,19 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w { __asm__ __volatile__ ( ".align 16\n" - "pushl %%ebp\n" - "movl %%esp, %%ebp\n" - + "pushl %%eax\n" "pushl %%ebx\n" "pushl %%ecx\n" "pushl %%edx\n" - "pushl %%esi\n" "pushl %%edi\n" + "pushl %%esi\n" + "pushl %%ebp\n" - "movl 20(%%ebp), %%ebx\n" /* delay */ - "movl 16(%%ebp), %%edx\n" /* window */ - - "movl 8(%%ebp), %%eax\n" /* buf */ - "movl $16, %%ecx\n" /* loop count */ - "leal 516(%%eax), %%esi\n" /* buf[64].im */ - "leal 504(%%eax), %%edi\n" /* buf[63].re */ - "movl 12(%%ebp), %%eax\n" /* data */ + "movl %%esi, %%ebp\n" /* buf */ + "movl $16, %%ebx\n" /* loop count */ + "leal 516(%%ebp), %%esi\n" /* buf[64].im */ + "leal 504(%%ebp), %%edi\n" /* buf[63].re */ ".align 16\n" ".first_128_samples:\n" @@ -256,7 +250,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w 
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */ - "movaps (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */ + "movaps (%%ecx), %%xmm5\n" /* d3 | d2 | d1 | d0 */ "shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */ "movss 16(%%esi), %%xmm6\n" /* im2 */ @@ -270,23 +264,23 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w "addps %%xmm5, %%xmm0\n" "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */ "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */ - "movaps 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */ + "movaps 16(%%ecx), %%xmm5\n" /* d7 | d6 | d5 | d4 */ "subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */ "addl $32, %%edx\n" "movaps %%xmm0, (%%eax)\n" - "addl $32, %%ebx\n" + "addl $32, %%ecx\n" "mulps %%xmm4, %%xmm6\n" "addl $32, %%esi\n" "addl $32, %%eax\n" "addps %%xmm5, %%xmm6\n" "addl $-32, %%edi\n" "movaps %%xmm6, -16(%%eax)\n" - "decl %%ecx\n" + "decl %%ebx\n" "jnz .first_128_samples\n" - "movl 8(%%ebp), %%esi\n" /* buf[0].re */ - "leal 1020(%%esi), %%edi\n" /* buf[127].im */ - "movl $16, %%ecx\n" /* loop count */ + "movl %%ebp, %%esi\n" /* buf[0].re */ + "movl $16, %%ebx\n" /* loop count */ + "leal 1020(%%ebp), %%edi\n" /* buf[127].im */ ".align 16\n" ".second_128_samples:\n" @@ -299,7 +293,7 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w "movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */ - "movaps (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */ + "movaps (%%ecx), %%xmm5\n" /* d3 | d2 | d1 | d0 */ "shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */ "movss 16(%%esi), %%xmm6\n" /* re2 */ @@ -317,21 +311,20 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w "addps %%xmm5, %%xmm0\n" "mulps %%xmm4, %%xmm6\n" "addl $-32, %%edi\n" - "movaps 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */ + "movaps 16(%%ecx), %%xmm5\n" /* d7 | d6 | d5 | d4 */ 
"movaps %%xmm0, (%%eax)\n" "addps %%xmm5, %%xmm6\n" "addl $32, %%edx\n" "addl $32, %%eax\n" - "addl $32, %%ebx\n" + "addl $32, %%ecx\n" "movaps %%xmm6, -16(%%eax)\n" - "decl %%ecx\n" + "decl %%ebx\n" "jnz .second_128_samples\n" - "movl 8(%%ebp), %%eax\n" - "leal 512(%%eax), %%esi\n" /* buf[64].re */ - "leal 508(%%eax), %%edi\n" /* buf[63].im */ - "movl $16, %%ecx\n" /* loop count */ - "movl 20(%%ebp), %%eax\n" /* delay */ + "leal 512(%%ebp), %%esi\n" /* buf[64].re */ + "leal 508(%%ebp), %%edi\n" /* buf[63].im */ + "movl $16, %%ebx\n" /* loop count */ + "addl $-1024, %%ecx\n" /* delay */ ".align 16\n" ".first_128_delay:\n" @@ -356,20 +349,19 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w "mulps %%xmm4, %%xmm0\n" "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */ - "movaps %%xmm0, (%%eax)\n" + "movaps %%xmm0, (%%ecx)\n" "addl $32, %%esi\n" "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */ "addl $-32, %%edi\n" "mulps %%xmm5, %%xmm6\n" - "addl $32, %%eax\n" - "movaps %%xmm6, -16(%%eax)\n" - "decl %%ecx\n" + "addl $32, %%ecx\n" + "movaps %%xmm6, -16(%%ecx)\n" + "decl %%ebx\n" "jnz .first_128_delay\n" - "movl 8(%%ebp), %%ebx\n" - "leal 4(%%ebx), %%esi\n" /* buf[0].im */ - "leal 1016(%%ebx), %%edi\n" /* buf[127].re */ - "movl $16, %%ecx\n" /* loop count */ + "leal 4(%%ebp), %%esi\n" /* buf[0].im */ + "leal 1016(%%ebp), %%edi\n" /* buf[127].re */ + "movl $16, %%ebx\n" /* loop count */ ".align 16\n" ".second_128_delay:\n" @@ -394,49 +386,45 @@ static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *w "mulps %%xmm4, %%xmm1\n" "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */ - "movaps %%xmm1, (%%eax)\n" + "movaps %%xmm1, (%%ecx)\n" "addl $32, %%esi\n" "subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */ "addl $-32, %%edi\n" "mulps %%xmm5, %%xmm2\n" - "addl $32, %%eax\n" - "movaps %%xmm2, 
-16(%%eax)\n" - "decl %%ecx\n" + "addl $32, %%ecx\n" + "movaps %%xmm2, -16(%%ecx)\n" + "decl %%ebx\n" "jnz .second_128_delay\n" - "popl %%edi\n" + "popl %%ebp\n" "popl %%esi\n" + "popl %%edi\n" "popl %%edx\n" "popl %%ecx\n" "popl %%ebx\n" "popl %%eax\n" + : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt) + : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt)); - "leave\n" - ::); } static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt) { __asm__ __volatile__ ( ".align 16\n" - "pushl %%ebp\n" - "movl %%esp, %%ebp\n" "pushl %%eax\n" "pushl %%ebx\n" "pushl %%ecx\n" "pushl %%edx\n" - "pushl %%esi\n" "pushl %%edi\n" + "pushl %%esi\n" + "pushl %%ebp\n" - /* movl 20(%%ebp), %%ebx delay */ - "movl 16(%%ebp), %%edx\n" /* window */ - - "movl 8(%%ebp), %%eax\n" /* buf */ - "movl $16, %%ecx\n" /* loop count */ - "leal 516(%%eax), %%esi\n" /* buf[64].im */ - "leal 504(%%eax), %%edi\n" /* buf[63].re */ - "movl 12(%%ebp), %%eax\n" /* data */ + "movl %%esi, %%ebp\n" /* buf */ + "movl $16, %%ebx\n" /* loop count */ + "leal 516(%%ebp), %%esi\n" /* buf[64].im */ + "leal 504(%%ebp), %%edi\n" /* buf[63].re */ ".align 16\n" ".first_128_sample:\n" @@ -469,12 +457,12 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa "addl $32, %%eax\n" "addl $-32, %%edi\n" "movaps %%xmm6, -16(%%eax)\n" - "decl %%ecx\n" + "decl %%ebx\n" "jnz .first_128_sample\n" - "movl 8(%%ebp), %%esi\n" /* buf[0].re */ - "leal 1020(%%esi), %%edi\n" /* buf[127].im */ - "movl $16, %%ecx\n" /* loop count */ + "movl %%ebp, %%esi\n" /* buf[0].re */ + "movl $16, %%ebx\n" /* loop count */ + "leal 1020(%%ebp), %%edi\n" /* buf[127].im */ ".align 16\n" ".second_128_sample:\n" @@ -507,14 +495,13 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa "addl $32, %%edx\n" "addl $32, %%eax\n" "movaps %%xmm6, -16(%%eax)\n" - "decl %%ecx\n" + "decl %%ebx\n" "jnz .second_128_sample\n" - "movl 
8(%%ebp), %%eax\n" - "leal 512(%%eax), %%esi\n" /* buf[64].re */ - "leal 508(%%eax), %%edi\n" /* buf[63].im */ - "movl $16, %%ecx\n" /* loop count */ - "movl 20(%%ebp), %%eax\n" /* delay */ + "leal 512(%%ebp), %%esi\n" /* buf[64].re */ + "leal 508(%%ebp), %%edi\n" /* buf[63].im */ + "movl $16, %%ebx\n" /* loop count */ + "addl $-1024, %%ecx\n" /* delay */ ".align 16\n" ".first_128_delays:\n" @@ -539,20 +526,19 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa "mulps %%xmm4, %%xmm0\n" "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */ - "movaps %%xmm0, (%%eax)\n" + "movaps %%xmm0, (%%ecx)\n" "addl $32, %%esi\n" "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */ "addl $-32, %%edi\n" "mulps %%xmm5, %%xmm6\n" - "addl $32, %%eax\n" - "movaps %%xmm6, -16(%%eax)\n" - "decl %%ecx\n" + "addl $32, %%ecx\n" + "movaps %%xmm6, -16(%%ecx)\n" + "decl %%ebx\n" "jnz .first_128_delays\n" - "movl 8(%%ebp), %%ebx\n" - "leal 4(%%ebx), %%esi\n" /* buf[0].im */ - "leal 1016(%%ebx), %%edi\n" /* buf[127].re */ - "movl $16, %%ecx\n" /* loop count */ + "leal 4(%%ebp), %%esi\n" /* buf[0].im */ + "leal 1016(%%ebp), %%edi\n" /* buf[127].re */ + "movl $16, %%ebx\n" /* loop count */ ".align 16\n" ".second_128_delays:\n" @@ -577,23 +563,24 @@ static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, floa "mulps %%xmm4, %%xmm1\n" "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */ - "movaps %%xmm1, (%%eax)\n" + "movaps %%xmm1, (%%ecx)\n" "addl $32, %%esi\n" "subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */ "addl $-32, %%edi\n" "mulps %%xmm5, %%xmm2\n" - "addl $32, %%eax\n" - "movaps %%xmm2, -16(%%eax)\n" - "decl %%ecx\n" + "addl $32, %%ecx\n" + "movaps %%xmm2, -16(%%ecx)\n" + "decl %%ebx\n" "jnz .second_128_delays\n" - "popl %%edi\n" + "popl %%ebp\n" "popl %%esi\n" + "popl %%edi\n" "popl %%edx\n" "popl %%ecx\n" "popl 
%%ebx\n" "popl %%eax\n" - - "leave\n" - ::); + : "=S" (buf), "=a" (data_ptr), "=c" (delay_prt), "=d" (window_prt) + : "S" (buf), "a" (data_ptr), "c" (delay_prt), "d" (window_prt)); + } diff --git a/plugins/imdct/ac3_srfft_sse.c b/plugins/imdct/ac3_srfft_sse.c index f7e4b64060..8f2443b644 100644 --- a/plugins/imdct/ac3_srfft_sse.c +++ b/plugins/imdct/ac3_srfft_sse.c @@ -2,7 +2,7 @@ * ac3_srfft_sse.c: accelerated SSE ac3 fft functions ***************************************************************************** * Copyright (C) 1999, 2000, 2001 VideoLAN - * $Id: ac3_srfft_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $ + * $Id: ac3_srfft_sse.c,v 1.5 2001/07/26 20:00:33 reno Exp $ * * Authors: Renaud Dartus * Aaron Holtzman @@ -228,28 +228,21 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, "pushl %%eax\n" "pushl %%ebx\n" - "pushl %%ecx\n" // + "pushl %%ecx\n" "pushl %%edx\n" "pushl %%esi\n" -// "movl %%edi, %%ecx\n" /* k */ - "pushl %%edi\n" // + "pushl %%edi\n" - "movl 8(%%ebp), %%ecx\n" /* k */ - "movl 12(%%ebp), %%eax\n" /* x */ "movl %%ecx, -4(%%ebp)\n" /* k */ - "movl 16(%%ebp), %%ebx\n" /* wT */ - "movl 20(%%ebp), %%edx\n" /* d */ - "movl 24(%%ebp), %%esi\n" /* d3 */ "shll $4, %%ecx\n" /* 16k */ /// "addl $8, %%edx\n" - "leal (%%eax, %%ecx, 2), %%edi\n" "addl $8, %%esi\n" /* TRANSZERO and TRANS */ ".align 16\n" "movaps (%%eax), %%xmm0\n" /* x[1] | x[0] */ - "movaps (%%ebx), %%xmm1\n" /* wT[1] | wT[0] */ - "movaps (%%ebx, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */ + "movaps (%%edi), %%xmm1\n" /* wT[1] | wT[0] */ + "movaps (%%edi, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */ "movlps (%%edx), %%xmm3\n" /* d */ "movlps (%%esi), %%xmm4\n" /* d3 */ "movhlps %%xmm1, %%xmm5\n" /* wT[1] */ @@ -263,14 +256,14 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, "movhlps %%xmm5, %%xmm7\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im */ "movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */ "shufps 
$0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */ - "movl $C_1_sse, %%edi\n" - "movaps (%%edi), %%xmm4\n" + "movl $C_1_sse, %%ebx\n" + "movaps (%%ebx), %%xmm4\n" "mulps %%xmm4, %%xmm7\n" "addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */ "movlhps %%xmm5, %%xmm1\n" /* d[1] * wT[1] | wT[0] */ "shufps $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */ "movaps %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */ - "leal (%%eax, %%ecx, 2), %%edi\n" + "leal (%%eax, %%ecx, 2), %%ebx\n" "addps %%xmm2, %%xmm1\n" /* u */ "subps %%xmm2, %%xmm3\n" /* v */ "mulps %%xmm4, %%xmm3\n" @@ -283,21 +276,21 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, "addps %%xmm3, %%xmm5\n" "subps %%xmm3, %%xmm6\n" "movaps %%xmm0, (%%eax)\n" - "movaps %%xmm2, (%%edi)\n" + "movaps %%xmm2, (%%ebx)\n" "movaps %%xmm5, (%%eax, %%ecx)\n" - "movaps %%xmm6, (%%edi, %%ecx)\n" + "movaps %%xmm6, (%%ebx, %%ecx)\n" "addl $16, %%eax\n" - "addl $16, %%ebx\n" + "addl $16, %%edi\n" "addl $8, %%edx\n" "addl $8, %%esi\n" "decl -4(%%ebp)\n" ".align 16\n" ".loop:\n" - "movaps (%%ebx), %%xmm0\n" /* wT[1] | wT[0] */ + "movaps (%%edi), %%xmm0\n" /* wT[1] | wT[0] */ "movaps (%%edx), %%xmm1\n" /* d[1] | d[0] */ - "movaps (%%ebx, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */ + "movaps (%%edi, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */ "movaps (%%esi), %%xmm5\n" /* d3[1] | d3[0] */ "movhlps %%xmm0, %%xmm2\n" /* wT[1] */ @@ -324,8 +317,8 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, "mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */ "mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */ "shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */ - "movl $C_1_sse, %%edi\n" - "movaps (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */ + "movl $C_1_sse, 
%%ebx\n" + "movaps (%%ebx), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */ "movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */ "mulps %%xmm3, %%xmm1\n" /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */ @@ -340,9 +333,9 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, "addps %%xmm4, %%xmm0\n" /* u */ "subps %%xmm4, %%xmm1\n" /* v */ "movaps (%%eax), %%xmm6\n" /* x[1] | x[0] */ - "leal (%%eax, %%ecx, 2), %%edi\n" + "leal (%%eax, %%ecx, 2), %%ebx\n" "mulps %%xmm3, %%xmm1\n" - "addl $16, %%ebx\n" + "addl $16, %%edi\n" "addl $16, %%esi\n" "shufps $0xb1, %%xmm1, %%xmm1\n" /* -i * v */ "movaps (%%eax, %%ecx), %%xmm7\n" /* xk[1] | xk[0] */ @@ -351,12 +344,12 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, "addps %%xmm0, %%xmm6\n" "subps %%xmm0, %%xmm2\n" "movaps %%xmm6, (%%eax)\n" - "movaps %%xmm2, (%%edi)\n" + "movaps %%xmm2, (%%ebx)\n" "addps %%xmm1, %%xmm7\n" "subps %%xmm1, %%xmm4\n" "addl $16, %%edx\n" "movaps %%xmm7, (%%eax, %%ecx)\n" - "movaps %%xmm4, (%%edi, %%ecx)\n" + "movaps %%xmm4, (%%ebx, %%ecx)\n" "addl $16, %%eax\n" "decl -4(%%ebp)\n" @@ -364,16 +357,17 @@ static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB, ".align 16\n" ".end:\n" - "popl %%edi\n" // + "popl %%edi\n" "popl %%esi\n" "popl %%edx\n" - "popl %%ecx\n" // + "popl %%ecx\n" "popl %%ebx\n" "popl %%eax\n" "addl $4, %%esp\n" "leave\n" - ::); + : "=c" (k), "=a" (x), "=D" (wTB) + : "c" (k), "a" (x), "D" (wTB), "d" (d), "S" (d_3)); } -- 2.39.5