src/audio_output/aout_s8.o \
src/audio_output/aout_u16.o \
src/audio_output/aout_s16.o \
- src/audio_output/aout_spdif.o
+ src/audio_output/aout_spdif.o
VIDEO_OUTPUT = src/video_output/video_output.o \
src/video_output/video_text.o \
src/ac3_decoder/ac3_downmix_c.o
AC3_SPDIF = src/ac3_spdif/ac3_spdif.o \
- src/ac3_spdif/ac3_iec958.o
+ src/ac3_spdif/ac3_iec958.o
LPCM_DECODER = src/lpcm_decoder/lpcm_decoder_thread.o \
src/lpcm_decoder/lpcm_decoder.o
* tests.h: several test functions needed by the plugins
*****************************************************************************
* Copyright (C) 1996, 1997, 1998, 1999, 2000 VideoLAN
- * $Id: tests.h,v 1.9 2001/03/21 13:42:33 sam Exp $
+ * $Id: tests.h,v 1.10 2001/05/14 15:58:03 reno Exp $
*
* Authors: Samuel Hocevar <sam@zoy.org>
*
#define CPU_CAPABILITY_MMX 1<<3
#define CPU_CAPABILITY_3DNOW 1<<4
#define CPU_CAPABILITY_MMXEXT 1<<5
+#define CPU_CAPABILITY_SSE 1<<6
#define CPU_CAPABILITY_ALTIVEC 1<<16
/*****************************************************************************
* ac3_bit_allocate.c: ac3 allocation tables
*****************************************************************************
* Copyright (C) 2000 VideoLAN
- * $Id: ac3_bit_allocate.c,v 1.20 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_bit_allocate.c,v 1.21 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
#include "input_ext-dec.h"
#include "ac3_decoder.h"
-#include "ac3_internal.h"
+#include "ac3_internal.h" /* DELTA_BIT_REUSE */
static void ba_compute_psd (bit_allocate_t * p_bit, s16 start, s16 end, s16 exps[]);
* ac3_decoder.c: core ac3 decoder
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder.c,v 1.32 2001/05/07 03:14:09 stef Exp $
+ * $Id: ac3_decoder.c,v 1.33 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Michel Lespinasse <walken@zoy.org>
#include "audio_output.h"
#include "ac3_decoder.h"
-#include "ac3_decoder_thread.h"
+#include "ac3_decoder_thread.h" /* ac3dec_thread_t */
#include "ac3_internal.h"
-#include <stdio.h>
-
-void imdct_init (imdct_t * p_imdct);
-void downmix_init (downmix_t * p_downmix);
-
-static float cmixlev_lut[4] = { 0.707, 0.595, 0.500, 0.707 };
-static float smixlev_lut[4] = { 0.707, 0.500, 0.0 , 0.500 };
+static const float cmixlev_lut[4] = { 0.707, 0.595, 0.500, 0.707 };
+static const float smixlev_lut[4] = { 0.707, 0.500, 0.0 , 0.500 };
int ac3_init (ac3dec_t * p_ac3dec)
{
-// p_ac3dec->bit_stream.buffer = 0;
-// p_ac3dec->bit_stream.i_available = 0;
- p_ac3dec->mantissa.lfsr_state = 1; /* dither_gen initialization */
+ p_ac3dec->mantissa.lfsr_state = 1; /* dither_gen initialization */
imdct_init(&p_ac3dec->imdct);
downmix_init(&p_ac3dec->downmix);
if (parse_bsi (p_ac3dec))
{
- intf_WarnMsg (3,"Error during ac3parsing");
+ intf_WarnMsg (3,"ac3dec warn: error during parsing");
parse_auxdata (p_ac3dec);
return 1;
}
if (parse_audblk (p_ac3dec, i))
{
- intf_WarnMsg (3,"Error during ac3audioblock");
+ intf_WarnMsg (3,"ac3dec warn: error during audioblock");
parse_auxdata (p_ac3dec);
return 1;
}
if (exponent_unpack (p_ac3dec))
{
- intf_WarnMsg (3,"Error during ac3unpack");
+ intf_WarnMsg (3,"ac3dec warn: error during unpack");
parse_auxdata (p_ac3dec);
return 1;
}
* ac3_decoder.h : ac3 decoder interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder.h,v 1.7 2001/04/30 21:04:20 reno Exp $
+ * $Id: ac3_decoder.h,v 1.8 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
float xsin1[N/4];
float xcos2[N/8];
float xsin2[N/8];
-
+
/* Twiddle factor LUT */
complex_t *w[7];
complex_t w_1[1];
* ac3_decoder_thread.h : ac3 decoder thread interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder_thread.h,v 1.6 2001/05/01 04:18:18 sam Exp $
+ * $Id: ac3_decoder_thread.h,v 1.7 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
*
* Thread properties
*/
vlc_thread_t thread_id; /* id for thread functions */
-// bit_stream_t bit_stream;
-
/*
* Input properties
*/
decoder_fifo_t * p_fifo; /* stores the PES stream data */
-// data_packet_t * p_data;
int sync_ptr; /* sync ptr from ac3 magic header */
adec_config_t * p_config;
* ac3_downmix.c: ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_downmix.c,v 1.22 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_downmix.c,v 1.23 2001/05/14 15:58:03 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
#include "threads.h"
#include "mtime.h"
+#include "intf_msg.h" /* intf_DbgMsg(), intf_ErrMsg() */
#include "tests.h"
#include "stream_control.h"
#include "input_ext-dec.h"
#include "ac3_decoder.h"
-#include "ac3_internal.h"
#include "ac3_downmix.h"
void downmix_init (downmix_t * p_downmix)
{
#if 0
- if ( TestCPU (CPU_CAPABILITY_MMX) )
+ if ( TestCPU (CPU_CAPABILITY_SSE) )
{
- fprintf(stderr,"Using MMX for downmix\n");
- p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_kni;
- p_downmix->downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_kni;
- p_downmix->downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_kni;
- p_downmix->downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_kni;
- p_downmix->downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_kni;
- p_downmix->stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_kni;
- p_downmix->stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_kni;
- } else
+ intf_WarnMsg (1,"ac3dec: using MMX_SSE for downmix");
+ p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_sse;
+ p_downmix->downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_sse;
+ p_downmix->downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_sse;
+ p_downmix->downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_sse;
+ p_downmix->downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_sse;
+ p_downmix->stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_sse;
+ p_downmix->stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_sse;
+ }
+ else if ( TestCPU (CPU_CAPABILITY_3DNOW) )
+ {
+ intf_WarnMsg (1,"ac3dec: using MMX_3DNOW for downmix");
+ p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_3dn;
+ p_downmix->downmix_2f_2r_to_2ch = downmix_2f_2r_to_2ch_3dn;
+ p_downmix->downmix_3f_1r_to_2ch = downmix_3f_1r_to_2ch_3dn;
+ p_downmix->downmix_2f_1r_to_2ch = downmix_2f_1r_to_2ch_3dn;
+ p_downmix->downmix_3f_0r_to_2ch = downmix_3f_0r_to_2ch_3dn;
+ p_downmix->stream_sample_2ch_to_s16 = stream_sample_2ch_to_s16_3dn;
+ p_downmix->stream_sample_1ch_to_s16 = stream_sample_1ch_to_s16_3dn;
+ }
+ else
#endif
{
p_downmix->downmix_3f_2r_to_2ch = downmix_3f_2r_to_2ch_c;
* ac3_downmix.h: ac3 downmix functions
*****************************************************************************
* Copyright (C) 2000, 2001 VideoLAN
- * $Id: ac3_downmix.h,v 1.6 2001/04/30 21:04:20 reno Exp $
+ * $Id: ac3_downmix.h,v 1.7 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
void stream_sample_2ch_to_s16_c(s16 *s16_samples, float *left, float *right);
void stream_sample_1ch_to_s16_c(s16 *s16_samples, float *center);
-#if 0
-/* Kni functions */
-void downmix_3f_2r_to_2ch_kni(float *samples, dm_par_t * dm_par);
-void downmix_3f_1r_to_2ch_kni(float *samples, dm_par_t * dm_par);
-void downmix_2f_2r_to_2ch_kni(float *samples, dm_par_t * dm_par);
-void downmix_2f_1r_to_2ch_kni(float *samples, dm_par_t * dm_par);
-void downmix_3f_0r_to_2ch_kni(float *samples, dm_par_t * dm_par);
-void stream_sample_2ch_to_s16_kni(s16 *s16_samples, float *left, float *right);
-void stream_sample_1ch_to_s16_kni(s16 *s16_samples, float *center);
-#endif
+/* SSE functions */
+void downmix_3f_2r_to_2ch_sse(float *samples, dm_par_t * dm_par);
+void downmix_3f_1r_to_2ch_sse(float *samples, dm_par_t * dm_par);
+void downmix_2f_2r_to_2ch_sse(float *samples, dm_par_t * dm_par);
+void downmix_2f_1r_to_2ch_sse(float *samples, dm_par_t * dm_par);
+void downmix_3f_0r_to_2ch_sse(float *samples, dm_par_t * dm_par);
+void stream_sample_2ch_to_s16_sse(s16 *s16_samples, float *left, float *right);
+void stream_sample_1ch_to_s16_sse(s16 *s16_samples, float *center);
+
+/* 3DNow! functions */
+void downmix_3f_2r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
+void downmix_3f_1r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
+void downmix_2f_2r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
+void downmix_2f_1r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
+void downmix_3f_0r_to_2ch_3dn(float *samples, dm_par_t * dm_par);
+void stream_sample_2ch_to_s16_3dn(s16 *s16_samples, float *left, float *right);
+void stream_sample_1ch_to_s16_3dn(s16 *s16_samples, float *center);
+
+
--- /dev/null
+/*****************************************************************************
+ * ac3_downmix_3dn.c: ac3 downmix functions
+ *****************************************************************************
+ * Copyright (C) 1999, 2000, 2001 VideoLAN
+ * $Id: ac3_downmix_3dn.c,v 1.1 2001/05/14 15:58:04 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "defs.h"
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+#include "tests.h"
+
+#include "stream_control.h"
+#include "input_ext-dec.h"
+#include "ac3_decoder.h"
+
+
+/*****************************************************************************
+ * downmix_3f_2r_to_2ch_3dn: mix 3 front + 2 rear channels to stereo (3DNow!)
+ *****************************************************************************
+ * Works in place on samples, two packed floats per channel per iteration,
+ * 128 iterations. Channel planes are read at byte offsets 0 (left),
+ * 1024 (center), 2048 (right), 3072 (left surround), 4096 (right surround).
+ * Results:  left  = unit*left  + clev*center + slev*leftsur   -> offset 0
+ *           right = unit*right + clev*center + slev*rightsur  -> offset 1024
+ * The gains are loaded from byte offsets 0, 4 and 8 of dm_par, which the
+ * inline comments name unit, clev and slev.
+ * ecx is saved/restored by hand; mm0-mm7 are used without being declared.
+ * NOTE(review): ".loop" is not an assembler-local label; it would clash if
+ * this asm were ever emitted twice in one object file.
+ *****************************************************************************/
+void downmix_3f_2r_to_2ch_3dn (float * samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $128, %%ecx\n" /* loop counter */
+
+ "movd (%%ebx), %%mm5\n" /* unit */
+ "punpckldq %%mm5, %%mm5\n" /* unit | unit */
+
+ "movd 4(%%ebx), %%mm6\n" /* clev */
+ "punpckldq %%mm6, %%mm6\n" /* clev | clev */
+
+ "movd 8(%%ebx), %%mm7\n" /* slev */
+ "punpckldq %%mm7, %%mm7\n" /* slev | slev */
+
+".loop:\n"
+ "movq (%%eax), %%mm0\n" /* left */
+ "movq 2048(%%eax), %%mm1\n" /* right */
+ "movq 1024(%%eax), %%mm2\n" /* center */
+ "movq 3072(%%eax), %%mm3\n" /* leftsur */
+ "movq 4096(%%eax), %%mm4\n" /* rightsur */
+ "pfmul %%mm5, %%mm0\n"
+ "pfmul %%mm5, %%mm1\n"
+ "pfmul %%mm6, %%mm2\n"
+ "pfadd %%mm2, %%mm0\n"
+ "pfadd %%mm2, %%mm1\n"
+ "pfmul %%mm7, %%mm3\n"
+ "pfmul %%mm7, %%mm4\n"
+ "pfadd %%mm3, %%mm0\n"
+ "pfadd %%mm4, %%mm1\n"
+
+ "movq %%mm0, (%%eax)\n"
+ "movq %%mm1, 1024(%%eax)\n"
+
+ "addl $8, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop\n"
+
+ "popl %%ecx\n"
+ "femms\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+
+/*****************************************************************************
+ * downmix_2f_2r_to_2ch_3dn: mix 2 front + 2 rear channels to stereo (3DNow!)
+ *****************************************************************************
+ * Works in place on samples, two packed floats per channel per iteration,
+ * 128 iterations. Channel planes are read at byte offsets 0 (left),
+ * 1024 (right), 2048 (left surround), 3072 (right surround).
+ * Results:  left  = unit*left  + slev*leftsur   -> offset 0
+ *           right = unit*right + slev*rightsur  -> offset 1024
+ * unit and slev are loaded from byte offsets 0 and 8 of dm_par.
+ * ecx is saved/restored by hand; the MMX registers used are not declared.
+ *****************************************************************************/
+void downmix_2f_2r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $128, %%ecx\n" /* loop counter */
+
+ "movd (%%ebx), %%mm5\n" /* unit */
+ "punpckldq %%mm5, %%mm5\n" /* unit | unit */
+
+ "movd 8(%%ebx), %%mm7\n" /* slev */
+ "punpckldq %%mm7, %%mm7\n" /* slev | slev */
+
+".loop3:\n"
+ "movq (%%eax), %%mm0\n" /* left */
+ "movq 1024(%%eax), %%mm1\n" /* right */
+ "movq 2048(%%eax), %%mm3\n" /* leftsur */
+ "movq 3072(%%eax), %%mm4\n" /* rightsur */
+ "pfmul %%mm5, %%mm0\n"
+ "pfmul %%mm5, %%mm1\n"
+ "pfmul %%mm7, %%mm3\n"
+ "pfmul %%mm7, %%mm4\n"
+ "pfadd %%mm3, %%mm0\n"
+ "pfadd %%mm4, %%mm1\n"
+
+ "movq %%mm0, (%%eax)\n"
+ "movq %%mm1, 1024(%%eax)\n"
+
+ "addl $8, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop3\n"
+
+ "popl %%ecx\n"
+ "femms\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+/*****************************************************************************
+ * downmix_3f_1r_to_2ch_3dn: mix 3 front + 1 rear channel to stereo (3DNow!)
+ *****************************************************************************
+ * Works in place on samples, two packed floats per channel per iteration,
+ * 128 iterations. Channel planes are read at byte offsets 0 (left),
+ * 1024 (center), 2048 (right), 3072 (surround).
+ * Results:  left  = unit*left  + clev*center - slev*sur  -> offset 0
+ *           right = unit*right + clev*center + slev*sur  -> offset 1024
+ * unit, clev and slev are loaded from byte offsets 0, 4 and 8 of dm_par.
+ * ecx is saved/restored by hand; the MMX registers used are not declared.
+ *****************************************************************************/
+void downmix_3f_1r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+
+ "pushl %%ecx\n"
+ "movl $128, %%ecx\n" /* loop counter */
+
+ "movd (%%ebx), %%mm5\n" /* unit */
+ "punpckldq %%mm5, %%mm5\n" /* unit | unit */
+
+ "movd 4(%%ebx), %%mm6\n" /* clev */
+ "punpckldq %%mm6, %%mm6\n" /* clev | clev */
+
+ "movd 8(%%ebx), %%mm7\n" /* slev */
+ "punpckldq %%mm7, %%mm7\n" /* slev | slev */
+
+".loop4:\n"
+ "movq (%%eax), %%mm0\n" /* left */
+ "movq 2048(%%eax), %%mm1\n" /* right */
+ "movq 1024(%%eax), %%mm2\n" /* center */
+ "movq 3072(%%eax), %%mm3\n" /* sur */
+ "pfmul %%mm5, %%mm0\n"
+ "pfmul %%mm5, %%mm1\n"
+ "pfmul %%mm6, %%mm2\n"
+ "pfadd %%mm2, %%mm0\n"
+ "pfmul %%mm7, %%mm3\n"
+ "pfadd %%mm2, %%mm1\n"
+ "pfsub %%mm3, %%mm0\n" /* surround is subtracted on the left... */
+ "pfadd %%mm3, %%mm1\n" /* ...and added on the right */
+
+ "movq %%mm0, (%%eax)\n"
+ "movq %%mm1, 1024(%%eax)\n"
+
+ "addl $8, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop4\n"
+
+ "popl %%ecx\n"
+ "femms\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+/*****************************************************************************
+ * downmix_2f_1r_to_2ch_3dn: mix 2 front + 1 rear channel to stereo (3DNow!)
+ *****************************************************************************
+ * Works in place on samples, two packed floats per channel per iteration,
+ * 128 iterations. Channel planes are read at byte offsets 0 (left),
+ * 1024 (right), 2048 (surround).
+ * Results:  left  = unit*left  - slev*sur  -> offset 0
+ *           right = unit*right + slev*sur  -> offset 1024
+ * unit and slev are loaded from byte offsets 0 and 8 of dm_par.
+ * ecx is saved/restored by hand; the MMX registers used are not declared.
+ *****************************************************************************/
+void downmix_2f_1r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $128, %%ecx\n" /* loop counter */
+
+ "movd (%%ebx), %%mm5\n" /* unit */
+ "punpckldq %%mm5, %%mm5\n" /* unit | unit */
+
+ "movd 8(%%ebx), %%mm7\n" /* slev */
+ "punpckldq %%mm7, %%mm7\n" /* slev | slev */
+
+".loop5:\n"
+ "movq (%%eax), %%mm0\n" /* left */
+ "movq 1024(%%eax), %%mm1\n" /* right */
+ "movq 2048(%%eax), %%mm3\n" /* sur */
+ "pfmul %%mm5, %%mm0\n"
+ "pfmul %%mm5, %%mm1\n"
+ "pfmul %%mm7, %%mm3\n"
+ "pfsub %%mm3, %%mm0\n" /* surround is subtracted on the left... */
+ "pfadd %%mm3, %%mm1\n" /* ...and added on the right */
+
+ "movq %%mm0, (%%eax)\n"
+ "movq %%mm1, 1024(%%eax)\n"
+
+ "addl $8, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop5\n"
+
+ "popl %%ecx\n"
+ "femms\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+
+/*****************************************************************************
+ * downmix_3f_0r_to_2ch_3dn: mix 3 front channels to stereo (3DNow!)
+ *****************************************************************************
+ * Works in place on samples, two packed floats per channel per iteration,
+ * 128 iterations. Channel planes are read at byte offsets 0 (left),
+ * 1024 (center), 2048 (right).
+ * Results:  left  = unit*left  + clev*center  -> offset 0
+ *           right = unit*right + clev*center  -> offset 1024
+ * unit and clev are loaded from byte offsets 0 and 4 of dm_par.
+ * ecx is saved/restored by hand; the MMX registers used are not declared.
+ *****************************************************************************/
+void downmix_3f_0r_to_2ch_3dn (float *samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $128, %%ecx\n" /* loop counter */
+
+ "movd (%%ebx), %%mm5\n" /* unit */
+ "punpckldq %%mm5, %%mm5\n" /* unit | unit */
+
+ "movd 4(%%ebx), %%mm6\n" /* clev */
+ "punpckldq %%mm6, %%mm6\n" /* clev | clev */
+
+".loop6:\n"
+ "movq (%%eax), %%mm0\n" /* left */
+ "movq 2048(%%eax), %%mm1\n" /* right */
+ "movq 1024(%%eax), %%mm2\n" /* center */
+ "pfmul %%mm5, %%mm0\n"
+ "pfmul %%mm5, %%mm1\n"
+ "pfmul %%mm6, %%mm2\n"
+ "pfadd %%mm2, %%mm0\n"
+ "pfadd %%mm2, %%mm1\n"
+
+ "movq %%mm0, (%%eax)\n"
+ "movq %%mm1, 1024(%%eax)\n"
+
+ "addl $8, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop6\n"
+
+ "popl %%ecx\n"
+ "femms\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+
+/*****************************************************************************
+ * stream_sample_1ch_to_s16_3dn: mono float plane to interleaved s16 (3DNow!)
+ *****************************************************************************
+ * Reads two floats per iteration from left (128 iterations), scales them by
+ * the 32-bit value found at the address of the symbol sqrt2, converts them
+ * to saturated 16-bit integers and stores each sample twice in a row, i.e.
+ * c0 c0 c1 c1, so both output channels of a frame carry the same value.
+ * 8 bytes are written to s16_samples per iteration.
+ * ecx/edx are saved/restored by hand; mm0 and mm7 are used undeclared.
+ * NOTE(review): sqrt2 is not defined in this file; it must resolve at link
+ * time to the address of a float constant -- confirm.
+ *****************************************************************************/
+void stream_sample_1ch_to_s16_3dn (s16 *s16_samples, float *left)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "pushl %%edx\n"
+
+ "movl $sqrt2, %%edx\n"
+ "movd (%%edx), %%mm7\n"
+ "punpckldq %%mm7, %%mm7\n" /* sqrt2 | sqrt2 */
+ "movl $128, %%ecx\n"
+
+".loop2:\n"
+ "movq (%%ebx), %%mm0\n" /* c1 | c0 */
+ "pfmul %%mm7, %%mm0\n"
+
+ "pf2id %%mm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */
+
+ "packssdw %%mm0, %%mm0\n" /* c1 c0 c1 c0 --> mm0, int_16 */
+ /* packing a register with itself repeats the PAIR, not each sample;
+  * interleave the low words so every sample fills both channels */
+ "punpcklwd %%mm0, %%mm0\n" /* c1 c1 c0 c0 --> mm0, int_16 */
+
+ "movq %%mm0, (%%eax)\n"
+ "addl $8, %%eax\n"
+ "addl $8, %%ebx\n"
+
+ "decl %%ecx\n"
+ "jnz .loop2\n"
+
+ "popl %%edx\n"
+ "popl %%ecx\n"
+ "femms\n"
+ : "=a" (s16_samples), "=b" (left)
+ : "a" (s16_samples), "b" (left)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+
+/*****************************************************************************
+ * stream_sample_2ch_to_s16_3dn: two float planes to interleaved s16 (3DNow!)
+ *****************************************************************************
+ * Reads two floats per iteration from each of left and right
+ * (128 iterations), converts them to saturated 16-bit integers and stores
+ * them interleaved as l0 r0 l1 r1. Exactly 8 bytes are written to
+ * s16_samples per iteration, 1024 bytes in total.
+ * ecx is saved/restored by hand; mm0-mm2 are used without being declared.
+ *****************************************************************************/
+void stream_sample_2ch_to_s16_3dn (s16 *s16_samples, float *left, float *right)
+{
+
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $128, %%ecx\n"
+
+".loop1:\n"
+ "movq (%%ebx), %%mm0\n" /* l1 | l0 */
+ "movq (%%edx), %%mm1\n" /* r1 | r0 */
+ "movq %%mm0, %%mm2\n" /* l1 | l0 */
+ "punpckldq %%mm1, %%mm0\n" /* r0 | l0 */
+ "punpckhdq %%mm1, %%mm2\n" /* r1 | l1 */
+
+ "pf2id %%mm0, %%mm0\n" /* r0 l0 --> mm0, int_32 */
+ "pf2id %%mm2, %%mm2\n" /* r1 l1 --> mm2, int_32 */
+
+ "packssdw %%mm2, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */
+
+ /* mm0 already holds all four packed samples; storing mm2 as well would
+  * write unpacked int32 data 8 bytes ahead, past the end of the output
+  * block on the last iteration */
+ "movq %%mm0, (%%eax)\n"
+ "addl $8, %%eax\n"
+ "addl $8, %%ebx\n"
+ "addl $8, %%edx\n"
+
+ "decl %%ecx\n"
+ "jnz .loop1\n"
+
+ "popl %%ecx\n"
+ "femms\n"
+ : "=a" (s16_samples), "=b" (left), "=d" (right)
+ : "a" (s16_samples), "b" (left), "d" (right)
+ /* the asm loads and stores through eax/ebx/edx behind the compiler's back */
+ : "memory");
+
+}
* ac3_downmix_c.c: ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_downmix_c.c,v 1.7 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_downmix_c.c,v 1.8 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
#include "input_ext-dec.h"
#include "ac3_decoder.h"
-#include "ac3_internal.h"
-#include "ac3_downmix.h"
-
-void __inline__ downmix_3f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_3f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *center, *left_sur, *right_sur;
}
}
-void __inline__ downmix_2f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_2f_2r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *left_sur, *right_sur;
}
}
-void __inline__ downmix_3f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_3f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *center, *right_sur;
}
-void __inline__ downmix_2f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_2f_1r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *right_sur;
}
-void __inline__ downmix_3f_0r_to_2ch_c (float *samples, dm_par_t *dm_par)
+void downmix_3f_0r_to_2ch_c (float *samples, dm_par_t *dm_par)
{
int i;
float *left, *right, *center;
}
-void __inline__ stream_sample_2ch_to_s16_c (s16 *out_buf, float *left, float *right)
+void stream_sample_2ch_to_s16_c (s16 *out_buf, float *left, float *right)
{
int i;
for (i=0; i < 256; i++) {
}
-void __inline__ stream_sample_1ch_to_s16_c (s16 *out_buf, float *center)
+void stream_sample_1ch_to_s16_c (s16 *out_buf, float *center)
{
int i;
float tmp;
--- /dev/null
+/*****************************************************************************
+ * ac3_downmix_sse.c: ac3 downmix functions
+ *****************************************************************************
+ * Copyright (C) 1999, 2000, 2001 VideoLAN
+ * $Id: ac3_downmix_sse.c,v 1.1 2001/05/14 15:58:04 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ * Aaron Holtzman <aholtzma@engr.uvic.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "defs.h"
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+#include "tests.h"
+
+#include "stream_control.h"
+#include "input_ext-dec.h"
+#include "ac3_decoder.h"
+
+
+/*****************************************************************************
+ * sqrt2: holder for a float constant, not a routine meant to be called
+ *****************************************************************************
+ * The body is a single assembler .float directive emitting 0.7071068
+ * (about 1/sqrt(2), despite the name). The mono conversion code in this file
+ * takes the address of this symbol with "movl $sqrt2" and loads a float
+ * from it.
+ * NOTE(review): that load only yields the constant if the compiler emits no
+ * instructions (e.g. a frame-pointer prologue) before the directive --
+ * confirm against the build flags.
+ *****************************************************************************/
+void sqrt2 (void)
+{
+ __asm__ (".float 0f0.7071068");
+}
+
+/*****************************************************************************
+ * downmix_3f_2r_to_2ch_sse: mix 3 front + 2 rear channels to stereo (SSE)
+ *****************************************************************************
+ * Works in place on samples, four packed floats per channel per iteration,
+ * 64 iterations, using unaligned loads/stores. Channel planes are read at
+ * byte offsets 0 (left), 1024 (center), 2048 (right), 3072 (left surround),
+ * 4096 (right surround).
+ * Results:  left  = unit*left  + clev*center + slev*leftsur   -> offset 0
+ *           right = unit*right + clev*center + slev*rightsur  -> offset 1024
+ * The gains are loaded from byte offsets 0, 4 and 8 of dm_par, which the
+ * inline comments name unit, clev and slev.
+ * ecx is saved/restored by hand; xmm0-xmm7 are used without being declared.
+ * NOTE(review): ".loop" is not an assembler-local label; it would clash if
+ * this asm were ever emitted twice in one object file.
+ *****************************************************************************/
+void downmix_3f_2r_to_2ch_sse (float * samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $64, %%ecx\n" /* loop counter */
+
+ "movss (%%ebx), %%xmm5\n" /* unit */
+ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
+
+ "movss 4(%%ebx), %%xmm6\n" /* clev */
+ "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
+
+ "movss 8(%%ebx), %%xmm7\n" /* slev */
+ "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
+
+".loop:\n"
+ "movups (%%eax), %%xmm0\n" /* left */
+ "movups 2048(%%eax), %%xmm1\n" /* right */
+ "movups 1024(%%eax), %%xmm2\n" /* center */
+ "movups 3072(%%eax), %%xmm3\n" /* leftsur */
+ "movups 4096(%%eax), %%xmm4\n" /* rightsur */
+ "mulps %%xmm5, %%xmm0\n"
+ "mulps %%xmm5, %%xmm1\n"
+ "mulps %%xmm6, %%xmm2\n"
+ "addps %%xmm2, %%xmm0\n"
+ "addps %%xmm2, %%xmm1\n"
+ "mulps %%xmm7, %%xmm3\n"
+ "mulps %%xmm7, %%xmm4\n"
+ "addps %%xmm3, %%xmm0\n"
+ "addps %%xmm4, %%xmm1\n"
+
+ "movups %%xmm0, (%%eax)\n"
+ "movups %%xmm1, 1024(%%eax)\n"
+
+ "addl $16, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop\n"
+
+ "popl %%ecx\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+
+/*****************************************************************************
+ * downmix_2f_2r_to_2ch_sse: mix 2 front + 2 rear channels to stereo (SSE)
+ *****************************************************************************
+ * Works in place on samples, four packed floats per channel per iteration,
+ * 64 iterations, using unaligned loads/stores. Channel planes are read at
+ * byte offsets 0 (left), 1024 (right), 2048 (left surround),
+ * 3072 (right surround).
+ * Results:  left  = unit*left  + slev*leftsur   -> offset 0
+ *           right = unit*right + slev*rightsur  -> offset 1024
+ * unit and slev are loaded from byte offsets 0 and 8 of dm_par.
+ * ecx is saved/restored by hand; the XMM registers used are not declared.
+ *****************************************************************************/
+void downmix_2f_2r_to_2ch_sse (float *samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $64, %%ecx\n" /* loop counter */
+
+ "movss (%%ebx), %%xmm5\n" /* unit */
+ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
+
+ "movss 8(%%ebx), %%xmm7\n" /* slev */
+ "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
+
+".loop3:\n"
+ "movups (%%eax), %%xmm0\n" /* left */
+ "movups 1024(%%eax), %%xmm1\n" /* right */
+ "movups 2048(%%eax), %%xmm3\n" /* leftsur */
+ "movups 3072(%%eax), %%xmm4\n" /* rightsur */
+ "mulps %%xmm5, %%xmm0\n"
+ "mulps %%xmm5, %%xmm1\n"
+ "mulps %%xmm7, %%xmm3\n"
+ "mulps %%xmm7, %%xmm4\n"
+ "addps %%xmm3, %%xmm0\n"
+ "addps %%xmm4, %%xmm1\n"
+
+ "movups %%xmm0, (%%eax)\n"
+ "movups %%xmm1, 1024(%%eax)\n"
+
+ "addl $16, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop3\n"
+
+ "popl %%ecx\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+/*****************************************************************************
+ * downmix_3f_1r_to_2ch_sse: mix 3 front + 1 rear channel to stereo (SSE)
+ *****************************************************************************
+ * Works in place on samples, four packed floats per channel per iteration,
+ * 64 iterations, using unaligned loads/stores. Channel planes are read at
+ * byte offsets 0 (left), 1024 (center), 2048 (right), 3072 (surround).
+ * Results:  left  = unit*left  + clev*center - slev*sur  -> offset 0
+ *           right = unit*right + clev*center + slev*sur  -> offset 1024
+ * unit, clev and slev are loaded from byte offsets 0, 4 and 8 of dm_par.
+ * ecx is saved/restored by hand; the XMM registers used are not declared.
+ *****************************************************************************/
+void downmix_3f_1r_to_2ch_sse (float *samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+
+ "pushl %%ecx\n"
+ "movl $64, %%ecx\n" /* loop counter */
+
+ "movss (%%ebx), %%xmm5\n" /* unit */
+ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
+
+ "movss 4(%%ebx), %%xmm6\n" /* clev */
+ "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
+
+ "movss 8(%%ebx), %%xmm7\n" /* slev */
+ "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
+
+".loop4:\n"
+ "movups (%%eax), %%xmm0\n" /* left */
+ "movups 2048(%%eax), %%xmm1\n" /* right */
+ "movups 1024(%%eax), %%xmm2\n" /* center */
+ "movups 3072(%%eax), %%xmm3\n" /* sur */
+ "mulps %%xmm5, %%xmm0\n"
+ "mulps %%xmm5, %%xmm1\n"
+ "mulps %%xmm6, %%xmm2\n"
+ "addps %%xmm2, %%xmm0\n"
+ "mulps %%xmm7, %%xmm3\n"
+ "addps %%xmm2, %%xmm1\n"
+ "subps %%xmm3, %%xmm0\n" /* surround is subtracted on the left... */
+ "addps %%xmm3, %%xmm1\n" /* ...and added on the right */
+
+ "movups %%xmm0, (%%eax)\n"
+ "movups %%xmm1, 1024(%%eax)\n"
+
+ "addl $16, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop4\n"
+
+ "popl %%ecx\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+
+}
+/*****************************************************************************
+ * downmix_2f_1r_to_2ch_sse: mix 2 front + 1 rear channel to stereo (SSE)
+ *****************************************************************************
+ * Works in place on samples, four packed floats per channel per iteration,
+ * 64 iterations, using unaligned loads/stores. Channel planes are read at
+ * byte offsets 0 (left), 1024 (right), 2048 (surround).
+ * Results:  left  = unit*left  - slev*sur  -> offset 0
+ *           right = unit*right + slev*sur  -> offset 1024
+ * unit and slev are loaded from byte offsets 0 and 8 of dm_par.
+ * ecx is saved/restored by hand; the XMM registers used are not declared.
+ *****************************************************************************/
+void downmix_2f_1r_to_2ch_sse (float *samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $64, %%ecx\n" /* loop counter */
+
+ "movss (%%ebx), %%xmm5\n" /* unit */
+ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
+
+ "movss 8(%%ebx), %%xmm7\n" /* slev */
+ "shufps $0, %%xmm7, %%xmm7\n" /* slev | slev | slev | slev */
+
+".loop5:\n"
+ "movups (%%eax), %%xmm0\n" /* left */
+ "movups 1024(%%eax), %%xmm1\n" /* right */
+ "movups 2048(%%eax), %%xmm3\n" /* sur */
+ "mulps %%xmm5, %%xmm0\n"
+ "mulps %%xmm5, %%xmm1\n"
+ "mulps %%xmm7, %%xmm3\n"
+ "subps %%xmm3, %%xmm0\n" /* surround is subtracted on the left... */
+ "addps %%xmm3, %%xmm1\n" /* ...and added on the right */
+
+ "movups %%xmm0, (%%eax)\n"
+ "movups %%xmm1, 1024(%%eax)\n"
+
+ "addl $16, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop5\n"
+
+ "popl %%ecx\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+
+
+}
+/*****************************************************************************
+ * downmix_3f_0r_to_2ch_sse: mix 3 front channels to stereo (SSE)
+ *****************************************************************************
+ * Works in place on samples, four packed floats per channel per iteration,
+ * 64 iterations, using unaligned loads/stores. Channel planes are read at
+ * byte offsets 0 (left), 1024 (center), 2048 (right).
+ * Results:  left  = unit*left  + clev*center  -> offset 0
+ *           right = unit*right + clev*center  -> offset 1024
+ * unit and clev are loaded from byte offsets 0 and 4 of dm_par.
+ * ecx is saved/restored by hand; the XMM registers used are not declared.
+ *****************************************************************************/
+void downmix_3f_0r_to_2ch_sse (float *samples, dm_par_t * dm_par)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $64, %%ecx\n" /* loop counter */
+
+ "movss (%%ebx), %%xmm5\n" /* unit */
+ "shufps $0, %%xmm5, %%xmm5\n" /* unit | unit | unit | unit */
+
+ "movss 4(%%ebx), %%xmm6\n" /* clev */
+ "shufps $0, %%xmm6, %%xmm6\n" /* clev | clev | clev | clev */
+
+".loop6:\n"
+ "movups (%%eax), %%xmm0\n" /* left */
+ "movups 2048(%%eax), %%xmm1\n" /* right */
+ "movups 1024(%%eax), %%xmm2\n" /* center */
+ "mulps %%xmm5, %%xmm0\n"
+ "mulps %%xmm5, %%xmm1\n"
+ "mulps %%xmm6, %%xmm2\n"
+ "addps %%xmm2, %%xmm0\n"
+ "addps %%xmm2, %%xmm1\n"
+
+ "movups %%xmm0, (%%eax)\n"
+ "movups %%xmm1, 1024(%%eax)\n"
+
+ "addl $16, %%eax\n"
+ "decl %%ecx\n"
+ "jnz .loop6\n"
+
+ "popl %%ecx\n"
+ : "=a" (samples)
+ : "a" (samples), "b" (dm_par)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+
+/*****************************************************************************
+ * stream_sample_1ch_to_s16_sse: mono float plane to interleaved s16 (SSE)
+ *****************************************************************************
+ * Reads four floats per iteration from left (64 iterations), scales them by
+ * the 32-bit value found at the address of the symbol sqrt2, converts them
+ * to saturated 16-bit integers and stores each sample twice in a row, i.e.
+ * c0 c0 c1 c1 c2 c2 c3 c3, so both output channels of a frame carry the
+ * same value. 16 bytes are written to s16_samples per iteration.
+ * ecx/edx are saved/restored by hand; the XMM/MMX registers used are not
+ * declared. emms is issued at the end because MMX registers were written.
+ * NOTE(review): the load through $sqrt2 relies on the sqrt2 symbol pointing
+ * directly at the float constant -- confirm.
+ *****************************************************************************/
+void stream_sample_1ch_to_s16_sse (s16 *s16_samples, float *left)
+{
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "pushl %%edx\n"
+
+ "movl $sqrt2, %%edx\n"
+ "movss (%%edx), %%xmm7\n"
+ "shufps $0, %%xmm7, %%xmm7\n" /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
+ "movl $64, %%ecx\n"
+
+".loop2:\n"
+ "movups (%%ebx), %%xmm0\n" /* c3 | c2 | c1 | c0 */
+ "mulps %%xmm7, %%xmm0\n"
+ "movhlps %%xmm0, %%xmm2\n" /* c3 | c2 */
+
+ "cvtps2pi %%xmm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */
+ "cvtps2pi %%xmm2, %%mm1\n" /* c3 c2 --> mm1, int_32 */
+
+ "packssdw %%mm0, %%mm0\n" /* c1 c0 c1 c0 --> mm0, int_16 */
+ "packssdw %%mm1, %%mm1\n" /* c3 c2 c3 c2 --> mm1, int_16 */
+ /* packing a register with itself repeats the PAIR, not each sample;
+  * interleave the low words so every sample fills both channels */
+ "punpcklwd %%mm0, %%mm0\n" /* c1 c1 c0 c0 --> mm0, int_16 */
+ "punpcklwd %%mm1, %%mm1\n" /* c3 c3 c2 c2 --> mm1, int_16 */
+
+ "movq %%mm0, (%%eax)\n"
+ "movq %%mm1, 8(%%eax)\n"
+ "addl $16, %%eax\n"
+ "addl $16, %%ebx\n"
+
+ "decl %%ecx\n"
+ "jnz .loop2\n"
+
+ "popl %%edx\n"
+ "popl %%ecx\n"
+ "emms\n"
+ : "=a" (s16_samples), "=b" (left)
+ : "a" (s16_samples), "b" (left)
+ /* the asm loads and stores through eax/ebx behind the compiler's back */
+ : "memory");
+}
+
+/*****************************************************************************
+ * stream_sample_2ch_to_s16_sse: two float planes to interleaved s16 (SSE)
+ *****************************************************************************
+ * Reads four floats per iteration from each of left and right
+ * (64 iterations), converts them to saturated 16-bit integers and stores
+ * them interleaved as l0 r0 l1 r1 l2 r2 l3 r3. 16 bytes are written to
+ * s16_samples per iteration.
+ * ecx is saved/restored by hand; the XMM/MMX registers used are not
+ * declared. emms is issued at the end because MMX registers were written.
+ *****************************************************************************/
+void stream_sample_2ch_to_s16_sse (s16 *s16_samples, float *left, float *right)
+{
+
+ __asm__ __volatile__ (
+ "pushl %%ecx\n"
+ "movl $64, %%ecx\n"
+
+".loop1:\n"
+ "movups (%%ebx), %%xmm0\n" /* l3 | l2 | l1 | l0 */
+ "movups (%%edx), %%xmm1\n" /* r3 | r2 | r1 | r0 */
+ "movhlps %%xmm0, %%xmm2\n" /* l3 | l2 */
+ "movhlps %%xmm1, %%xmm3\n" /* r3 | r2 */
+ "unpcklps %%xmm1, %%xmm0\n" /* r1 | l1 | r0 | l0 */
+ "unpcklps %%xmm3, %%xmm2\n" /* r3 | l3 | r2 | l2 */
+
+ "cvtps2pi %%xmm0, %%mm0\n" /* r0 l0 --> mm0, int_32 */
+ "movhlps %%xmm0, %%xmm0\n" /* bring r1 | l1 down to the low half */
+ "cvtps2pi %%xmm0, %%mm1\n" /* r1 l1 --> mm1, int_32 */
+ "cvtps2pi %%xmm2, %%mm2\n" /* r2 l2 --> mm2, int_32 */
+ "movhlps %%xmm2, %%xmm2\n" /* bring r3 | l3 down to the low half */
+ "cvtps2pi %%xmm2, %%mm3\n" /* r3 l3 --> mm3, int_32 */
+
+ "packssdw %%mm1, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */
+ "packssdw %%mm3, %%mm2\n" /* r3 l3 r2 l2 --> mm2, int_16 */
+
+ "movq %%mm0, (%%eax)\n"
+ "movq %%mm2, 8(%%eax)\n"
+ "addl $16, %%eax\n"
+ "addl $16, %%ebx\n"
+ "addl $16, %%edx\n"
+
+ "decl %%ecx\n"
+ "jnz .loop1\n"
+
+ "popl %%ecx\n"
+ "emms\n"
+ : "=a" (s16_samples), "=b" (left), "=d" (right)
+ : "a" (s16_samples), "b" (left), "d" (right)
+ /* the asm loads and stores through eax/ebx/edx behind the compiler's back */
+ : "memory");
+
+}
* ac3_exponent.c: ac3 exponent calculations
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_exponent.c,v 1.23 2001/04/20 12:14:34 reno Exp $
+ * $Id: ac3_exponent.c,v 1.24 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Michel Lespinasse <walken@zoy.org>
#include "threads.h"
#include "mtime.h"
+#include "intf_msg.h" /* intf_DbgMsg(), intf_ErrMsg() */
+
#include "stream_control.h"
#include "input_ext-dec.h"
#include "audio_output.h"
#include "ac3_decoder.h"
-#include "ac3_decoder_thread.h"
-
-#include "intf_msg.h"
#include "ac3_internal.h"
* ac3_imdct.c: ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct.c,v 1.18 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_imdct.c,v 1.19 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
#include "input_ext-dec.h"
#include "ac3_decoder.h"
-#include "ac3_internal.h"
-#include "ac3_downmix.h"
-#include "ac3_imdct_c.h"
-#if 0
-#include "ac3_imdct_kni.h"
-#endif
+#include "ac3_imdct_c.h" /* imdct_init_c */
+#include "ac3_imdct_sse.h" /* imdct_init_sse */
-#include "tests.h"
+#include "tests.h" /* TestCPU */
#ifndef M_PI
# define M_PI 3.14159265358979323846
void imdct_init(imdct_t * p_imdct)
{
int i;
- float scale = 255.99609372;
-
+ float scale = 181.019;
#if 0
- if ( TestCPU (CPU_CAPABILITY_MMX) )
+ if ( TestCPU (CPU_CAPABILITY_SSE) )
{
- imdct_init_kni (p_imdct);
- } else
+ imdct_init_sse (p_imdct);
+ }
+ else
#endif
{
imdct_init_c (p_imdct);
* ac3_imdct_c.c: ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_c.c,v 1.2 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_imdct_c.c,v 1.3 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
#include "input_ext-dec.h"
#include "ac3_decoder.h"
-#include "ac3_internal.h"
+#include "ac3_imdct_c.h"
#ifndef M_PI
# define M_PI 3.14159265358979323846
void fft_64p_c (complex_t *x);
void fft_128p_c (complex_t *x);
-void imdct_do_512_c (imdct_t * p_imdct, float data[], float delay[]);
-void imdct_do_512_nol_c (imdct_t * p_imdct, float data[], float delay[]);
-
static float window[] = {
0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
int imdct_init_c (imdct_t * p_imdct)
{
int i;
- float scale = 255.99609372;
+ float scale = 181.019;
p_imdct->imdct_do_512 = imdct_do_512_c;
p_imdct->imdct_do_512_nol = imdct_do_512_nol_c;
--- /dev/null
+/*****************************************************************************
+ * ac3_imdct_sse.c: ac3 DCT
+ *****************************************************************************
+ * Copyright (C) 1999, 2000 VideoLAN
+ * $Id: ac3_imdct_sse.c,v 1.1 2001/05/14 15:58:04 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ * Aaron Holtzman <aholtzma@engr.uvic.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "defs.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+
+#include "intf_msg.h" /* intf_DbgMsg(), intf_ErrMsg() */
+
+#include "stream_control.h"
+#include "input_ext-dec.h"
+
+#include "ac3_decoder.h"
+
+#include "ac3_imdct_sse.h"
+/*****************************************************************************
+ * window: 256 window coefficients used by the window/delay stage
+ *****************************************************************************
+ * The values rise from 0.00014 to 1.00000. The table is passed by
+ * imdct_do_512_sse and imdct_do_512_nol_sse to the window/delay routines,
+ * which read it four floats at a time.
+ *****************************************************************************/
+static const float window[] = {
+ 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
+ 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
+ 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
+ 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
+ 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
+ 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
+ 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
+ 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
+ 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
+ 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
+ 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
+ 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
+ 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
+ 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
+ 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
+ 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
+ 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
+ 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
+ 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
+ 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
+ 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
+ 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
+ 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
+ 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
+ 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
+ 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
+ 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
+ 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
+ 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
+ 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
+ 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
+ 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
+};
+/*****************************************************************************
+ * pm128: order in which the pre-IFFT twiddle visits its 128 inputs
+ *****************************************************************************
+ * Every value from 0 to 127 appears exactly once. In
+ * imdct512_pre_ifft_twiddle_sse, entry k selects both the data samples and
+ * the xcos_sin_sse entry that are combined into buf[k].
+ * NOTE(review): presumably this is the input ordering fft_128p_sse expects;
+ * confirm against the FFT implementation.
+ *****************************************************************************/
+static const int pm128[128] =
+{
+ 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
+ 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
+ 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
+ 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
+ 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
+ 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
+ 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
+ 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
+};
+/*****************************************************************************
+ * Forward declarations
+ *****************************************************************************
+ * fft_64p_sse and fft_128p_sse are only declared in this file; their
+ * definitions live elsewhere. The four static routines are the stages that
+ * imdct_do_512_sse and imdct_do_512_nol_sse run around the 128-point FFT.
+ *****************************************************************************/
+void fft_64p_sse (complex_t *x);
+void fft_128p_sse(complex_t *a);
+static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse);
+static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse);
+static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt);
+
+
+/*****************************************************************************
+ * imdct_init_sse: install the SSE imdct routines and build their twiddles
+ *****************************************************************************
+ * Logs the choice, points the imdct_do_512, imdct_do_512_nol and fft_64p
+ * callbacks of p_imdct at the SSE versions, then fills xcos_sin_sse with 128
+ * groups of four floats laid out as ( c, -s, -s, -c ), where c and s are the
+ * scaled cosine and sine of 2*pi*(8*i+1)/(8*N).
+ * Always returns 0.
+ *****************************************************************************/
+int imdct_init_sse (imdct_t * p_imdct)
+{
+    const float scale = 181.019;
+    int i_entry = 0;
+
+    intf_WarnMsg (1, "ac3dec: using MMX_SSE for imdct");
+
+    /* Callbacks used by the generic imdct code */
+    p_imdct->fft_64p = fft_64p_sse;
+    p_imdct->imdct_do_512_nol = imdct_do_512_nol_sse;
+    p_imdct->imdct_do_512 = imdct_do_512_sse;
+
+    while (i_entry < 128)
+    {
+        /* One 16-byte group per twiddle factor, ready for a single movups */
+        float *p_quad = &p_imdct->xcos_sin_sse[i_entry * 4];
+        float xcos = cos(2.0f * M_PI * (8*i_entry+1)/(8*N)) * scale;
+        float xsin = sin(2.0f * M_PI * (8*i_entry+1)/(8*N)) * scale;
+
+        p_quad[0] = xcos;
+        p_quad[1] = -xsin;
+        p_quad[2] = -xsin;
+        p_quad[3] = -xcos;
+
+        i_entry++;
+    }
+
+    return 0;
+}
+
+/*****************************************************************************
+ * imdct_do_512_sse: 512-point imdct of one block, overlapped with delay
+ *****************************************************************************
+ * Runs the four stages in order on p_imdct->buf: pre-IFFT twiddle of data,
+ * 128-point FFT, post-IFFT twiddle, then windowing. The windowing stage
+ * writes the result back into data (adding the previous delay) and refreshes
+ * delay.
+ *****************************************************************************/
+void imdct_do_512_sse (imdct_t * p_imdct, float data[], float delay[])
+{
+    complex_t * const p_work = p_imdct->buf;
+    float * const p_twiddle = p_imdct->xcos_sin_sse;
+
+    imdct512_pre_ifft_twiddle_sse (pm128, p_work, data, p_twiddle);
+    fft_128p_sse (p_work);
+    imdct512_post_ifft_twiddle_sse (p_work, p_twiddle);
+    imdct512_window_delay_sse (p_work, data, window, delay);
+}
+
+
+/*****************************************************************************
+ * imdct_do_512_nol_sse: 512-point imdct of one block, without overlap
+ *****************************************************************************
+ * Same pipeline as imdct_do_512_sse, except that the final stage is the
+ * "nol" windowing routine, which does not add the previous delay into data.
+ *****************************************************************************/
+void imdct_do_512_nol_sse (imdct_t * p_imdct, float data[], float delay[])
+{
+    complex_t * const p_work = p_imdct->buf;
+    float * const p_twiddle = p_imdct->xcos_sin_sse;
+
+    imdct512_pre_ifft_twiddle_sse (pm128, p_work, data, p_twiddle);
+    fft_128p_sse (p_work);
+    imdct512_post_ifft_twiddle_sse (p_work, p_twiddle);
+    imdct512_window_delay_nol_sse (p_work, data, window, delay);
+}
+/*****************************************************************************
+ * imdct512_pre_ifft_twiddle_sse: build the 128 complex FFT inputs
+ *****************************************************************************
+ * For k = 0..127, with j = pmt[k] and ( c, -s, -s, -c ) the j-th four-float
+ * entry of xcos_sin_sse:
+ *     buf[k].re =  c * data[255-2j] - s * data[2j]
+ *     buf[k].im = -s * data[255-2j] - c * data[2j]
+ * Two values of k are produced per iteration, 64 iterations in total.
+ *
+ * NOTE(review): the arguments are not passed as asm operands. The code pushes
+ * %ebp itself and then reads them at 8..20(%ebp), which is only right if the
+ * return address is on top of the stack when the asm starts (no compiler
+ * prologue, no inlining). No clobbers are declared either: the integer
+ * registers are saved by hand, but xmm0-xmm5, the flags and the stores to
+ * buf are invisible to the compiler. ".loop" is a plain assembler label, so
+ * a second copy of this body would not assemble.
+ *****************************************************************************/
+static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
+{
+ __asm__ __volatile__ (
+ "pushl %%ebp\n"
+ "movl %%esp, %%ebp\n"
+ "addl $-4, %%esp\n" /* local variable, loop counter */
+
+ "pushl %%eax\n"
+ "pushl %%ebx\n"
+ "pushl %%ecx\n"
+ "pushl %%edx\n"
+ "pushl %%edi\n"
+ "pushl %%esi\n"
+
+ "movl 8(%%ebp), %%eax\n" /* pmt */
+ "movl 12(%%ebp), %%ebx\n" /* buf */
+ "movl 16(%%ebp), %%ecx\n" /* data */
+ "movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
+ "movl $64, -4(%%ebp)\n" /* 64 iterations, two complex outputs each */
+
+".loop:\n"
+ "movl (%%eax), %%esi\n" /* j = pmt[k] */
+ "movl 4(%%eax), %%edi\n" /* j+1 below stands for pmt[k+1] */
+ "movss (%%ecx, %%esi, 8), %%xmm1\n" /* 2j */
+ "movss (%%ecx, %%edi, 8), %%xmm3\n" /* 2(j+1) */
+
+ "shll $1, %%esi\n" /* 2j: scaled by 8 this indexes 16-byte twiddle entries */
+ "shll $1, %%edi\n"
+
+ "movups (%%edx, %%esi, 8), %%xmm0\n" /* -c_j | -s_j | -s_j | c_j */
+ "movups (%%edx, %%edi, 8), %%xmm2\n" /* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
+
+ "negl %%esi\n" /* -2j, so that 1020 - 4*2j addresses data[255-2j] */
+ "negl %%edi\n"
+
+ "movss 1020(%%ecx, %%esi, 4), %%xmm4\n" /* 255-2j */
+ "addl $8, %%eax\n" /* two permutation entries consumed */
+ "movss 1020(%%ecx, %%edi, 4), %%xmm5\n" /* 255-2(j+1) */
+
+ "shufps $0, %%xmm1, %%xmm4\n" /* 2j | 2j | 255-2j | 255-2j */
+ "shufps $0, %%xmm3, %%xmm5\n" /* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */
+ "mulps %%xmm4, %%xmm0\n"
+ "mulps %%xmm5, %%xmm2\n"
+ "movhlps %%xmm0, %%xmm1\n" /* bring the data[2j] products down to the low half */
+ "movhlps %%xmm2, %%xmm3\n"
+ "addl $16, %%ebx\n"
+ "addps %%xmm1, %%xmm0\n" /* low half now holds ( im | re ) of buf[k] */
+ "addps %%xmm3, %%xmm2\n"
+ "movlhps %%xmm2, %%xmm0\n" /* pack buf[k] and buf[k+1] into one register */
+
+ "movups %%xmm0, -16(%%ebx)\n" /* ebx was already advanced above */
+ "decl -4(%%ebp)\n"
+ "jnz .loop\n"
+
+ "popl %%esi\n"
+ "popl %%edi\n"
+ "popl %%edx\n"
+ "popl %%ecx\n"
+ "popl %%ebx\n"
+ "popl %%eax\n"
+
+ "addl $4, %%esp\n" /* drop the loop counter */
+ "popl %%ebp\n"
+ ::);
+}
+
+/*****************************************************************************
+ * imdct512_post_ifft_twiddle_sse: rotate the 128 FFT outputs in place
+ *****************************************************************************
+ * For k = 0..127, with ( c, -s, -s, -c ) the k-th four-float entry of
+ * xcos_sin_sse and ( re, im ) the value of buf[k] on entry:
+ *     buf[k].re = -( c * re + s * im )
+ *     buf[k].im =    c * im - s * re
+ * Four complex values are processed per iteration, 32 iterations in total.
+ *
+ * Both pointers are advanced by the asm, so they are declared as read-write
+ * operands; the loop counter, the flags and the stores through buf are
+ * listed as clobbers instead of being hidden from the compiler. The loop
+ * uses a numeric local label so that the body can be emitted more than once
+ * (for instance when inlined into both callers) without a duplicate symbol.
+ * NOTE(review): xmm0-xmm7 are overwritten as well; add them to the clobber
+ * list if every supported compiler accepts XMM register names there.
+ *****************************************************************************/
+static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
+{
+    __asm__ __volatile__ (
+    "movl $32, %%ecx\n"                 /* loop counter */
+
+"1:\n"
+    "movups (%%eax), %%xmm0\n"          /* im1 | re1 | im0 | re0 */
+
+    "movups (%%ebx), %%xmm2\n"          /* -c | -s | -s | c */
+    "movhlps %%xmm0, %%xmm1\n"          /* im1 | re1 */
+    "movups 16(%%ebx), %%xmm3\n"        /* -c1 | -s1 | -s1 | c1 */
+
+    "shufps $0x50, %%xmm0, %%xmm0\n"    /* im0 | im0 | re0 | re0 */
+    "shufps $0x50, %%xmm1, %%xmm1\n"    /* im1 | im1 | re1 | re1 */
+
+    "movups 16(%%eax), %%xmm4\n"        /* im3 | re3 | im2 | re2 */
+
+    "shufps $0x27, %%xmm2, %%xmm2\n"    /* c | -s | -s | -c */
+    "movhlps %%xmm4, %%xmm5\n"          /* im3 | re3 */
+    "shufps $0x27, %%xmm3, %%xmm3\n"    /* c1 | -s1 | -s1 | -c1 */
+
+    "movups 32(%%ebx), %%xmm6\n"        /* -c2 | -s2 | -s2 | c2 */
+    "movups 48(%%ebx), %%xmm7\n"        /* -c3 | -s3 | -s3 | c3 */
+
+    "shufps $0x50, %%xmm4, %%xmm4\n"    /* im2 | im2 | re2 | re2 */
+    "shufps $0x50, %%xmm5, %%xmm5\n"    /* im3 | im3 | re3 | re3 */
+
+    "mulps %%xmm2, %%xmm0\n"
+    "mulps %%xmm3, %%xmm1\n"
+
+    "shufps $0x27, %%xmm6, %%xmm6\n"    /* c2 | -s2 | -s2 | -c2 */
+    "shufps $0x27, %%xmm7, %%xmm7\n"    /* c3 | -s3 | -s3 | -c3 */
+
+    "movhlps %%xmm0, %%xmm2\n"          /* im products down to the low half */
+    "movhlps %%xmm1, %%xmm3\n"
+
+    "mulps %%xmm6, %%xmm4\n"
+    "mulps %%xmm7, %%xmm5\n"
+
+    "addps %%xmm2, %%xmm0\n"            /* low half: new ( im | re ) of value 0 */
+    "addps %%xmm3, %%xmm1\n"            /* low half: new ( im | re ) of value 1 */
+
+    "movhlps %%xmm4, %%xmm6\n"
+    "movhlps %%xmm5, %%xmm7\n"
+
+    "addps %%xmm6, %%xmm4\n"
+    "addps %%xmm7, %%xmm5\n"
+
+    "movlhps %%xmm1, %%xmm0\n"          /* pack values 0 and 1 */
+    "movlhps %%xmm5, %%xmm4\n"          /* pack values 2 and 3 */
+
+    "movups %%xmm0, (%%eax)\n"
+    "movups %%xmm4, 16(%%eax)\n"
+    "addl $64, %%ebx\n"                 /* four twiddle entries consumed */
+    "addl $32, %%eax\n"                 /* four complex values rewritten */
+    "decl %%ecx\n"
+    "jnz 1b\n"
+    : "+a" (buf), "+b" (xcos_sin_sse)
+    :
+    : "ecx", "cc", "memory" );
+}
+/*****************************************************************************
+ * imdct512_window_delay_sse: window the FFT output and overlap with delay
+ *****************************************************************************
+ * Four loops of 16 iterations, each iteration producing 8 floats. With
+ * i = 0..63 and two output floats per i:
+ *   data[0..127]    = delay[0..127]   + window[0..127]   * {  buf[64+i].im, -buf[63-i].re  }
+ *   data[128..255]  = delay[128..255] + window[128..255] * {  buf[i].re,    -buf[127-i].im }
+ *   delay[0..127]   = window walked backwards            * {  buf[64+i].re, -buf[63-i].im  }
+ *   delay[128..255] = window walked backwards            * { -buf[i].im,     buf[127-i].re }
+ * The old delay values are all read by the first two loops before the last
+ * two loops overwrite them.
+ *
+ * NOTE(review): in the two delay loops the window pointer steps backwards
+ * four floats at a time, but the four floats inside each step are still
+ * applied in ascending order; confirm this is the intended coefficient order.
+ * NOTE(review): the arguments are read at 8..20(%ebp) after the asm pushes
+ * %ebp itself, which is only right if the return address is on top of the
+ * stack when the asm starts (no compiler prologue, no inlining). No operands
+ * or clobbers are declared; xmm0-xmm7, the flags and the stores to data and
+ * delay are invisible to the compiler. The loop labels are plain assembler
+ * symbols, so a second copy of this body would not assemble.
+ *****************************************************************************/
+static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
+{
+ __asm__ __volatile__ (
+ "pushl %%ebp\n"
+ "movl %%esp, %%ebp\n"
+
+ "pushl %%eax\n"
+ "pushl %%ebx\n"
+ "pushl %%ecx\n"
+ "pushl %%edx\n"
+ "pushl %%esi\n"
+ "pushl %%edi\n"
+
+ "movl 20(%%ebp), %%ebx\n" /* delay */
+ "movl 16(%%ebp), %%edx\n" /* window */
+
+ "movl 8(%%ebp), %%eax\n" /* buf */
+ "movl $16, %%ecx\n" /* loop count */
+ "leal 516(%%eax), %%esi\n" /* buf[64].im */
+ "leal 504(%%eax), %%edi\n" /* buf[63].re */
+ "movl 12(%%ebp), %%eax\n" /* data */
+ /* Loop 1: data[0..127], esi walks up through .im, edi walks down through .re */
+".first_128_samples:\n"
+ "movss (%%esi), %%xmm0\n"
+ "movss 8(%%esi), %%xmm2\n"
+ "movss (%%edi), %%xmm1\n"
+ "movss -8(%%edi), %%xmm3\n"
+
+ "movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
+ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
+
+ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movups (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
+ "shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
+
+ "movss 16(%%esi), %%xmm6\n" /* im2 */
+ "movss 24(%%esi), %%xmm7\n" /* im3 */
+ "subps %%xmm1, %%xmm0\n" /* -re1 | im1 | -re0 | im0 */
+ "movss -16(%%edi), %%xmm2\n" /* re2 */
+ "movss -24(%%edi), %%xmm3\n" /* re3 */
+ "mulps %%xmm4, %%xmm0\n"
+ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
+ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
+ "addps %%xmm5, %%xmm0\n"
+ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
+ "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
+ "movups 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
+ "subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
+ "addl $32, %%edx\n"
+ "movups %%xmm0, (%%eax)\n"
+ "addl $32, %%ebx\n"
+ "mulps %%xmm4, %%xmm6\n"
+ "addl $32, %%esi\n"
+ "addl $32, %%eax\n"
+ "addps %%xmm5, %%xmm6\n"
+ "addl $-32, %%edi\n"
+ "movups %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
+ "jnz .first_128_samples\n"
+
+ "movl 8(%%ebp), %%esi\n" /* buf[0].re */
+ "leal 1020(%%esi), %%edi\n" /* buf[127].im */
+ "movl $16, %%ecx\n" /* loop count */
+ /* Loop 2: data[128..255]; eax, ebx and edx carry on from loop 1 */
+".second_128_samples:\n"
+ "movss (%%esi), %%xmm0\n" /* buf[i].re */
+ "movss 8(%%esi), %%xmm2\n" /* re1 */
+ "movss (%%edi), %%xmm1\n" /* buf[127-i].im */
+ "movss -8(%%edi), %%xmm3\n" /* im1 */
+
+ "movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
+ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
+
+ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "movups (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
+
+ "shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
+ "movss 16(%%esi), %%xmm6\n" /* re2 */
+ "movss 24(%%esi), %%xmm7\n" /* re3 */
+ "movss -16(%%edi), %%xmm2\n" /* im2 */
+ "movss -24(%%edi), %%xmm3\n" /* im3 */
+ "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
+ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
+ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
+ "mulps %%xmm4, %%xmm0\n"
+ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
+ "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
+ "addl $32, %%esi\n"
+ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
+ "addps %%xmm5, %%xmm0\n"
+ "mulps %%xmm4, %%xmm6\n"
+ "addl $-32, %%edi\n"
+ "movups 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
+ "movups %%xmm0, (%%eax)\n"
+ "addps %%xmm5, %%xmm6\n"
+ "addl $32, %%edx\n"
+ "addl $32, %%eax\n"
+ "addl $32, %%ebx\n"
+ "movups %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
+ "jnz .second_128_samples\n"
+
+ "movl 8(%%ebp), %%eax\n"
+ "leal 512(%%eax), %%esi\n" /* buf[64].re */
+ "leal 508(%%eax), %%edi\n" /* buf[63].im */
+ "movl $16, %%ecx\n" /* loop count */
+ "movl 20(%%ebp), %%eax\n" /* delay */
+ /* Loop 3: delay[0..127]; eax is now the output, edx walks back from the table end */
+".first_128_delay:\n"
+ "movss (%%esi), %%xmm0\n"
+ "movss 8(%%esi), %%xmm2\n"
+ "movss (%%edi), %%xmm1\n"
+ "movss -8(%%edi), %%xmm3\n"
+
+ "movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
+ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
+
+ "movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
+ "movss 16(%%esi), %%xmm6\n" /* re2 */
+ "movss 24(%%esi), %%xmm7\n" /* re3 */
+ "movss -16(%%edi), %%xmm2\n" /* im2 */
+ "movss -24(%%edi), %%xmm3\n" /* im3 */
+ "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
+ "addl $-32, %%edx\n"
+ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
+ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
+ "mulps %%xmm4, %%xmm0\n"
+ "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
+ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
+ "movups %%xmm0, (%%eax)\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
+ "addl $-32, %%edi\n"
+ "mulps %%xmm5, %%xmm6\n"
+ "addl $32, %%eax\n"
+ "movups %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
+ "jnz .first_128_delay\n"
+
+ "movl 8(%%ebp), %%ebx\n"
+ "leal 4(%%ebx), %%esi\n" /* buf[0].im */
+ "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
+ "movl $16, %%ecx\n" /* loop count */
+ /* Loop 4: delay[128..255]; eax and edx carry on from loop 3 */
+".second_128_delay:\n"
+ "movss (%%esi), %%xmm0\n"
+ "movss 8(%%esi), %%xmm2\n"
+ "movss (%%edi), %%xmm1\n"
+ "movss -8(%%edi), %%xmm3\n"
+
+ "movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
+ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
+
+ "movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
+ "movss 16(%%esi), %%xmm6\n" /* im2 */
+ "movss 24(%%esi), %%xmm7\n" /* im3 */
+ "movss -16(%%edi), %%xmm2\n" /* re2 */
+ "movss -24(%%edi), %%xmm3\n" /* re3 */
+ "subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
+ "addl $-32, %%edx\n"
+ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
+ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
+ "mulps %%xmm4, %%xmm1\n"
+ "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
+ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
+ "movups %%xmm1, (%%eax)\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm6, %%xmm2\n" /* re3 | -im3 | re2 | -im2 */
+ "addl $-32, %%edi\n"
+ "mulps %%xmm5, %%xmm2\n"
+ "addl $32, %%eax\n"
+ "movups %%xmm2, -16(%%eax)\n"
+ "decl %%ecx\n"
+ "jnz .second_128_delay\n"
+
+ "popl %%edi\n"
+ "popl %%esi\n"
+ "popl %%edx\n"
+ "popl %%ecx\n"
+ "popl %%ebx\n"
+ "popl %%eax\n"
+
+ "leave\n" /* restores %esp and pops the %ebp pushed at the top */
+ ::);
+}
+/*****************************************************************************
+ * imdct512_window_delay_nol_sse: window the FFT output, no overlap-add
+ *****************************************************************************
+ * Same four loops as imdct512_window_delay_sse, but the first two loops do
+ * not read delay: the instructions that loaded and added it are commented
+ * out. With i = 0..63 and two output floats per i:
+ *   data[0..127]    = window[0..127]   * {  buf[64+i].im, -buf[63-i].re  }
+ *   data[128..255]  = window[128..255] * {  buf[i].re,    -buf[127-i].im }
+ *   delay[0..127]   = window walked backwards * {  buf[64+i].re, -buf[63-i].im  }
+ *   delay[128..255] = window walked backwards * { -buf[i].im,     buf[127-i].re }
+ *
+ * NOTE(review): in the two delay loops the window pointer steps backwards
+ * four floats at a time, but the four floats inside each step are still
+ * applied in ascending order; confirm this is the intended coefficient order.
+ * NOTE(review): the arguments are read at 8..20(%ebp) after the asm pushes
+ * %ebp itself, which is only right if the return address is on top of the
+ * stack when the asm starts (no compiler prologue, no inlining). No operands
+ * or clobbers are declared; xmm0-xmm7, the flags and the stores to data and
+ * delay are invisible to the compiler. The loop labels are plain assembler
+ * symbols, so a second copy of this body would not assemble.
+ *****************************************************************************/
+static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
+{
+ __asm__ __volatile__ (
+ "pushl %%ebp\n"
+ "movl %%esp, %%ebp\n"
+
+ "pushl %%eax\n"
+ "pushl %%ebx\n"
+ "pushl %%ecx\n"
+ "pushl %%edx\n"
+ "pushl %%esi\n"
+ "pushl %%edi\n"
+
+ /* movl 20(%%ebp), %%ebx delay */
+ "movl 16(%%ebp), %%edx\n" /* window */
+
+ "movl 8(%%ebp), %%eax\n" /* buf */
+ "movl $16, %%ecx\n" /* loop count */
+ "leal 516(%%eax), %%esi\n" /* buf[64].im */
+ "leal 504(%%eax), %%edi\n" /* buf[63].re */
+ "movl 12(%%ebp), %%eax\n" /* data */
+ /* Loop 1: data[0..127], esi walks up through .im, edi walks down through .re */
+".first_128_sample:\n"
+ "movss (%%esi), %%xmm0\n"
+ "movss 8(%%esi), %%xmm2\n"
+ "movss (%%edi), %%xmm1\n"
+ "movss -8(%%edi), %%xmm3\n"
+
+ "movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
+ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
+
+ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ /* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
+ "shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
+
+ "movss 16(%%esi), %%xmm6\n" /* im2 */
+ "movss 24(%%esi), %%xmm7\n" /* im3 */
+ "subps %%xmm1, %%xmm0\n" /* -re1 | im1 | -re0 | im0 */
+ "movss -16(%%edi), %%xmm2\n" /* re2 */
+ "movss -24(%%edi), %%xmm3\n" /* re3 */
+ "mulps %%xmm4, %%xmm0\n"
+ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
+ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
+ /* addps %%xmm5, %%xmm0 */
+ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
+ "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
+ /* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
+ "subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
+ "addl $32, %%edx\n"
+ "movups %%xmm0, (%%eax)\n"
+ /* addl $32, %%ebx */
+ "mulps %%xmm4, %%xmm6\n"
+ "addl $32, %%esi\n"
+ "addl $32, %%eax\n"
+ /* addps %%xmm5, %%xmm6 */
+ "addl $-32, %%edi\n"
+ "movups %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
+ "jnz .first_128_sample\n"
+
+ "movl 8(%%ebp), %%esi\n" /* buf[0].re */
+ "leal 1020(%%esi), %%edi\n" /* buf[127].im */
+ "movl $16, %%ecx\n" /* loop count */
+ /* Loop 2: data[128..255]; eax and edx carry on from loop 1 */
+".second_128_sample:\n"
+ "movss (%%esi), %%xmm0\n" /* buf[i].re */
+ "movss 8(%%esi), %%xmm2\n" /* re1 */
+ "movss (%%edi), %%xmm1\n" /* buf[127-i].im */
+ "movss -8(%%edi), %%xmm3\n" /* im1 */
+
+ "movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
+ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
+
+ "movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ /* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
+
+ "shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
+ "movss 16(%%esi), %%xmm6\n" /* re2 */
+ "movss 24(%%esi), %%xmm7\n" /* re3 */
+ "movss -16(%%edi), %%xmm2\n" /* im2 */
+ "movss -24(%%edi), %%xmm3\n" /* im3 */
+ "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
+ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
+ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
+ "mulps %%xmm4, %%xmm0\n"
+ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
+ "movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
+ "addl $32, %%esi\n"
+ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
+ /* addps %%xmm5, %%xmm0 */
+ "mulps %%xmm4, %%xmm6\n"
+ "addl $-32, %%edi\n"
+ /* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
+ "movups %%xmm0, (%%eax)\n"
+ /* addps %%xmm5, %%xmm6 */
+ "addl $32, %%edx\n"
+ "addl $32, %%eax\n"
+ /* addl $32, %%ebx */
+ "movups %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
+ "jnz .second_128_sample\n"
+
+ "movl 8(%%ebp), %%eax\n"
+ "leal 512(%%eax), %%esi\n" /* buf[64].re */
+ "leal 508(%%eax), %%edi\n" /* buf[63].im */
+ "movl $16, %%ecx\n" /* loop count */
+ "movl 20(%%ebp), %%eax\n" /* delay */
+ /* Loop 3: delay[0..127]; eax is now the output, edx walks back from the table end */
+".first_128_delays:\n"
+ "movss (%%esi), %%xmm0\n"
+ "movss 8(%%esi), %%xmm2\n"
+ "movss (%%edi), %%xmm1\n"
+ "movss -8(%%edi), %%xmm3\n"
+
+ "movlhps %%xmm2, %%xmm0\n" /* 0.0 | re1 | 0.0 | re0 */
+ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im0 */
+
+ "movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
+ "movss 16(%%esi), %%xmm6\n" /* re2 */
+ "movss 24(%%esi), %%xmm7\n" /* re3 */
+ "movss -16(%%edi), %%xmm2\n" /* im2 */
+ "movss -24(%%edi), %%xmm3\n" /* im3 */
+ "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
+ "addl $-32, %%edx\n"
+ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
+ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
+ "mulps %%xmm4, %%xmm0\n"
+ "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
+ "shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
+ "movups %%xmm0, (%%eax)\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
+ "addl $-32, %%edi\n"
+ "mulps %%xmm5, %%xmm6\n"
+ "addl $32, %%eax\n"
+ "movups %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
+ "jnz .first_128_delays\n"
+
+ "movl 8(%%ebp), %%ebx\n"
+ "leal 4(%%ebx), %%esi\n" /* buf[0].im */
+ "leal 1016(%%ebx), %%edi\n" /* buf[127].re */
+ "movl $16, %%ecx\n" /* loop count */
+ /* Loop 4: delay[128..255]; eax and edx carry on from loop 3 */
+".second_128_delays:\n"
+ "movss (%%esi), %%xmm0\n"
+ "movss 8(%%esi), %%xmm2\n"
+ "movss (%%edi), %%xmm1\n"
+ "movss -8(%%edi), %%xmm3\n"
+
+ "movlhps %%xmm2, %%xmm0\n" /* 0.0 | im1 | 0.0 | im0 */
+ "movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
+
+ "movups -16(%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
+ "shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
+ "movss 16(%%esi), %%xmm6\n" /* im2 */
+ "movss 24(%%esi), %%xmm7\n" /* im3 */
+ "movss -16(%%edi), %%xmm2\n" /* re2 */
+ "movss -24(%%edi), %%xmm3\n" /* re3 */
+ "subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
+ "addl $-32, %%edx\n"
+ "movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
+ "movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
+ "mulps %%xmm4, %%xmm1\n"
+ "movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
+ "shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
+ "movups %%xmm1, (%%eax)\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm6, %%xmm2\n" /* re3 | -im3 | re2 | -im2 */
+ "addl $-32, %%edi\n"
+ "mulps %%xmm5, %%xmm2\n"
+ "addl $32, %%eax\n"
+ "movups %%xmm2, -16(%%eax)\n"
+ "decl %%ecx\n"
+ "jnz .second_128_delays\n"
+
+ "popl %%edi\n"
+ "popl %%esi\n"
+ "popl %%edx\n"
+ "popl %%ecx\n"
+ "popl %%ebx\n"
+ "popl %%eax\n"
+
+ "leave\n" /* restores %esp and pops the %ebp pushed at the top */
+ ::);
+}
--- /dev/null
+int imdct_init_sse (imdct_t * p_imdct); /* installs the SSE callbacks in p_imdct and fills xcos_sin_sse; returns 0 */
+void imdct_do_512_sse(imdct_t * p_imdct, float data[], float delay[]); /* imdct of one block; the window stage adds the old delay into data */
+void imdct_do_512_nol_sse(imdct_t * p_imdct, float data[], float delay[]); /* same pipeline, but the old delay is not added into data */
* ac3_internals.h: needed by the ac3 decoder
*****************************************************************************
* Copyright (C) 2000 VideoLAN
- * $Id: ac3_internal.h,v 1.8 2001/03/21 13:42:34 sam Exp $
+ * $Id: ac3_internal.h,v 1.9 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Lespinasse <walken@zoy.org>
*
void bit_allocate (ac3dec_t *);
/* ac3_downmix.c */
-int downmix (ac3dec_t *, float *, s16 *);
+void downmix_init (downmix_t * p_downmix);
/* ac3_exponent.c */
int exponent_unpack (ac3dec_t *);
/* ac3_imdct.c */
+void imdct_init (imdct_t * p_imdct);
void imdct (ac3dec_t * p_ac3dec, s16 * buffer);
/* ac3_mantissa.c */
* ac3_mantissa.c: ac3 mantissa computation
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_mantissa.c,v 1.27 2001/05/07 03:14:09 stef Exp $
+ * $Id: ac3_mantissa.c,v 1.28 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
#include "audio_output.h"
#include "ac3_decoder.h"
-#include "ac3_decoder_thread.h"
-
-#include "ac3_internal.h"
#include "intf_msg.h"
p_ac3dec->total_bits_read += 5;
if ((group_code = GetBits (&p_ac3dec->bit_stream,5)) > 26)
{
- intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (1)" );
+ intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (1)" );
return 0;
}
p_ac3dec->total_bits_read += 7;
if ((group_code = GetBits (&p_ac3dec->bit_stream,7)) > 124)
{
- intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (2)" );
+ intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (2)" );
return 0;
}
p_ac3dec->total_bits_read += 3;
if ((group_code = GetBits (&p_ac3dec->bit_stream,3)) > 6)
{
- intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (3)" );
+ intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (3)" );
return 0;
}
p_ac3dec->total_bits_read += 7;
if ((group_code = GetBits (&p_ac3dec->bit_stream,7)) > 120)
{
- intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (4)" );
+ intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (4)" );
return 0;
}
p_ac3dec->total_bits_read += 4;
if ((group_code = GetBits (&p_ac3dec->bit_stream,4)) > 14)
{
- intf_WarnMsg ( 3, "ac3dec error: invalid mantissa (5)" );
+ intf_WarnMsg ( 3, "ac3dec warn: invalid mantissa (5)" );
return 0;
}
* ac3_parse.c: ac3 parsing procedures
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_parse.c,v 1.21 2001/05/07 04:42:42 sam Exp $
+ * $Id: ac3_parse.c,v 1.22 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
#include "intf_msg.h"
#include "ac3_decoder.h"
-#include "ac3_decoder_thread.h"
+#include "ac3_decoder_thread.h" /* ac3dec_thread_t */
-#include "ac3_internal.h"
+#include "ac3_internal.h" /* EXP_REUSE */
/* Misc LUT */
static const u16 nfchans[] = { 2, 1, 2, 3, 3, 4, 4, 5 };
static const int fscod_tbl[] = {48000, 44100, 32000};
/* Some internal functions */
-void parse_bsi_stats (ac3dec_t * p_ac3dec);
-void parse_audblk_stats (ac3dec_t * p_ac3dec);
+#ifdef STATS
+static void parse_bsi_stats (ac3dec_t * p_ac3dec);
+static void parse_audblk_stats (ac3dec_t * p_ac3dec);
+#endif
/* Parse a syncinfo structure */
int ac3_sync_frame (ac3dec_t * p_ac3dec, ac3_sync_info_t * p_sync_info)
}
#ifdef STATS
-// parse_audblk_stats(p_ac3dec);
+ parse_audblk_stats(p_ac3dec);
#endif
return 0;
RemoveBits (&p_ac3dec->bit_stream,16);
}
-void parse_bsi_stats (ac3dec_t * p_ac3dec) /*Some stats */
+#ifdef STATS
+static void parse_bsi_stats (ac3dec_t * p_ac3dec) /* Some stats */
{
struct mixlev_s
{
i = 0;
}
-void parse_audblk_stats (ac3dec_t * p_ac3dec)
+static void parse_audblk_stats (ac3dec_t * p_ac3dec)
{
char *exp_strat_tbl[4] = {"R ","D15 ","D25 ","D45 "};
u32 i;
intf_ErrMsg ("(ac3dec_parseaudblk) ");
- intf_ErrMsg ("%s ",p_ac3dec->audblk.cplinu ? "cpl on" : "cpl off");
- intf_ErrMsg ("%s ",p_ac3dec->audblk.baie? "bai" : " ");
- intf_ErrMsg ("%s ",p_ac3dec->audblk.snroffste? "snroffst" : " ");
- intf_ErrMsg ("%s ",p_ac3dec->audblk.deltbaie? "deltba" : " ");
- intf_ErrMsg ("%s ",p_ac3dec->audblk.phsflginu? "phsflg" : " ");
- intf_ErrMsg ("(%s %s %s %s %s) ",exp_strat_tbl[p_ac3dec->audblk.chexpstr[0]],
- exp_strat_tbl[p_ac3dec->audblk.chexpstr[1]],exp_strat_tbl[p_ac3dec->audblk.chexpstr[2]],
- exp_strat_tbl[p_ac3dec->audblk.chexpstr[3]],exp_strat_tbl[p_ac3dec->audblk.chexpstr[4]]);
- intf_ErrMsg ("[");
- for(i=0;i<p_ac3dec->bsi.nfchans;i++)
- intf_ErrMsg ("%1d",p_ac3dec->audblk.blksw[i]);
- intf_ErrMsg ("]");
-
- intf_ErrMsg ("\n");
+ intf_ErrMsg ("%s ",p_ac3dec->audblk.cplinu ? "cpl on" : "cpl off");
+ intf_ErrMsg ("%s ",p_ac3dec->audblk.baie? "bai" : " ");
+ intf_ErrMsg ("%s ",p_ac3dec->audblk.snroffste? "snroffst" : " ");
+ intf_ErrMsg ("%s ",p_ac3dec->audblk.deltbaie? "deltba" : " ");
+ intf_ErrMsg ("%s ",p_ac3dec->audblk.phsflginu? "phsflg" : " ");
+ intf_ErrMsg ("(%s %s %s %s %s) ",exp_strat_tbl[p_ac3dec->audblk.chexpstr[0]],
+ exp_strat_tbl[p_ac3dec->audblk.chexpstr[1]],exp_strat_tbl[p_ac3dec->audblk.chexpstr[2]],
+ exp_strat_tbl[p_ac3dec->audblk.chexpstr[3]],exp_strat_tbl[p_ac3dec->audblk.chexpstr[4]]);
+ intf_ErrMsg ("[");
+ for(i=0;i<p_ac3dec->bsi.nfchans;i++)
+ intf_ErrMsg ("%1d",p_ac3dec->audblk.blksw[i]);
+ intf_ErrMsg ("]");
+
+ intf_ErrMsg ("\n");
}
+#endif
* ac3_rematrix.c: ac3 audio rematrixing
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_rematrix.c,v 1.16 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_rematrix.c,v 1.17 2001/05/14 15:58:04 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
#include "input_ext-dec.h"
#include "ac3_decoder.h"
-#include "ac3_internal.h"
struct rematrix_band_s {
u32 start;
* ac3_srfft.c: ac3 FFT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_srfft.c,v 1.3 2001/05/06 04:32:02 sam Exp $
+ * $Id: ac3_srfft.c,v 1.4 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
#include "ac3_decoder.h"
#include "ac3_srfft.h"
-void fft_8 (complex_t *x);
+static void fft_8 (complex_t *x);
-void fft_4(complex_t *x)
+static void fft_4(complex_t *x)
{
/* delta_p = 1 here */
/* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
}
-void fft_8 (complex_t *x)
+static void fft_8 (complex_t *x)
{
/* delta_p = diag{1, sqrt(i)} here */
/* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
}
-void fft_asmb(int k, complex_t *x, complex_t *wTB,
+static void fft_asmb(int k, complex_t *x, complex_t *wTB,
const complex_t *d, const complex_t *d_3)
{
register complex_t *x2k, *x3k, *x4k, *wB;
}
-void fft_asmb16(complex_t *x, complex_t *wTB)
+static void fft_asmb16(complex_t *x, complex_t *wTB)
{
register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
int k = 2;
* ac3_srfft.h: ac3 FFT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_srfft.h,v 1.2 2001/04/30 21:10:25 reno Exp $
+ * $Id: ac3_srfft.h,v 1.3 2001/05/14 15:58:04 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
-static complex_t delta16[4] =
+static const complex_t delta16[4] =
{ {1.00000000000000, 0.00000000000000},
{0.92387953251129, -0.38268343236509},
{0.70710678118655, -0.70710678118655},
{0.38268343236509, -0.92387953251129}};
-static complex_t delta16_3[4] =
+static const complex_t delta16_3[4] =
{ {1.00000000000000, 0.00000000000000},
{0.38268343236509, -0.92387953251129},
{-0.70710678118655, -0.70710678118655},
{-0.92387953251129, 0.38268343236509}};
-static complex_t delta32[8] =
+static const complex_t delta32[8] =
{ {1.00000000000000, 0.00000000000000},
{0.98078528040323, -0.19509032201613},
{0.92387953251129, -0.38268343236509},
{0.38268343236509, -0.92387953251129},
{0.19509032201613, -0.98078528040323}};
-static complex_t delta32_3[8] =
+static const complex_t delta32_3[8] =
{ {1.00000000000000, 0.00000000000000},
{0.83146961230255, -0.55557023301960},
{0.38268343236509, -0.92387953251129},
{-0.92387953251129, 0.38268343236509},
{-0.55557023301960, 0.83146961230255}};
-static complex_t delta64[16] =
+static const complex_t delta64[16] =
{ {1.00000000000000, 0.00000000000000},
{0.99518472667220, -0.09801714032956},
{0.98078528040323, -0.19509032201613},
{0.19509032201613, -0.98078528040323},
{0.09801714032956, -0.99518472667220}};
-static complex_t delta64_3[16] =
+static const complex_t delta64_3[16] =
{ {1.00000000000000, 0.00000000000000},
{0.95694033573221, -0.29028467725446},
{0.83146961230255, -0.55557023301960},
{-0.55557023301960, 0.83146961230255},
{-0.29028467725446, 0.95694033573221}};
-static complex_t delta128[32] =
+static const complex_t delta128[32] =
{ {1.00000000000000, 0.00000000000000},
{0.99879545620517, -0.04906767432742},
{0.99518472667220, -0.09801714032956},
{0.09801714032956, -0.99518472667220},
{0.04906767432742, -0.99879545620517}};
-static complex_t delta128_3[32] =
+static const complex_t delta128_3[32] =
{ {1.00000000000000, 0.00000000000000},
{0.98917650996478, -0.14673047445536},
{0.95694033573221, -0.29028467725446},
--- /dev/null
+/*****************************************************************************
+ * ac3_srfft_sse.c: ac3 fft functions
+ *****************************************************************************
+ * Copyright (C) 1999, 2000, 2001 VideoLAN
+ * $Id: ac3_srfft_sse.c,v 1.1 2001/05/14 15:58:04 reno Exp $
+ *
+ * Authors: Renaud Dartus <reno@videolan.org>
+ * Aaron Holtzman <aholtzma@engr.uvic.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include <stdio.h>
+
+#include "defs.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include "config.h"
+#include "common.h"
+#include "threads.h"
+#include "mtime.h"
+
+#include "stream_control.h"
+#include "input_ext-dec.h"
+
+#include "ac3_decoder.h"
+#include "ac3_srfft.h"
+
+void hsqrt2 (void); /* constant table; the asm below only takes its address */
+void C_1 (void); /* constant table; the asm below only takes its address */
+static void fft_4_sse (complex_t *x); /* 4-point kernel, in place on x[0..3] */
+static void fft_8_sse (complex_t *x); /* 8-point kernel, in place on x[0..7] */
+static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
+ const complex_t *d, const complex_t *d_3); /* merge pass, see definition */
+
+/*****************************************************************************
+ * fft_64p_sse: in-place 64-point complex FFT built from the SSE kernels
+ *****************************************************************************
+ * The transform is assembled bottom-up: fft_8_sse() and fft_4_sse() run on
+ * sub-ranges of a[], then fft_asmb_sse() merges them into 16-, 32- and
+ * finally 64-point results. The call order matters: each merge has to come
+ * after the sub-transforms it combines.
+ *****************************************************************************/
+void fft_64p_sse(complex_t *a)
+{
+    /* a[0..15]: 16-point transform */
+    fft_8_sse(a);
+    fft_4_sse(a + 8);
+    fft_4_sse(a + 12);
+    fft_asmb_sse(2, a, a + 8, delta16, delta16_3);
+
+    /* a[0..31]: two more 8-point blocks, merged into 32 points */
+    fft_8_sse(a + 16);
+    fft_8_sse(a + 24);
+    fft_asmb_sse(4, a, a + 16, delta32, delta32_3);
+
+    /* a[32..47]: 16-point transform */
+    fft_8_sse(a + 32);
+    fft_4_sse(a + 40);
+    fft_4_sse(a + 44);
+    fft_asmb_sse(2, a + 32, a + 40, delta16, delta16_3);
+
+    /* a[48..63]: 16-point transform */
+    fft_8_sse(a + 48);
+    fft_4_sse(a + 56);
+    fft_4_sse(a + 60);
+    fft_asmb_sse(2, a + 48, a + 56, delta16, delta16_3);
+
+    /* final merge over a[0..63] */
+    fft_asmb_sse(8, a, a + 32, delta64, delta64_3);
+}
+
+
+/*****************************************************************************
+ * fft_128p_sse: in-place 128-point complex FFT built from the SSE kernels
+ *****************************************************************************
+ * The lower 64 points go through exactly the call sequence of
+ * fft_64p_sse(). The upper 64 points are built as two 32-point transforms
+ * (a[64..95] and a[96..127]), and a last fft_asmb_sse() pass merges
+ * everything. Each merge has to come after the sub-transforms it combines.
+ *****************************************************************************/
+void fft_128p_sse(complex_t *a)
+{
+    complex_t *const p_upper = a + 64;
+
+    /* a[0..63] */
+    fft_64p_sse(a);
+
+    /* a[64..79]: 16-point transform */
+    fft_8_sse(p_upper);
+    fft_4_sse(p_upper + 8);
+    fft_4_sse(p_upper + 12);
+    fft_asmb_sse(2, p_upper, p_upper + 8, delta16, delta16_3);
+
+    /* a[64..95]: two more 8-point blocks, merged into 32 points */
+    fft_8_sse(p_upper + 16);
+    fft_8_sse(p_upper + 24);
+    fft_asmb_sse(4, p_upper, p_upper + 16, delta32, delta32_3);
+
+    /* a[96..111]: 16-point transform */
+    fft_8_sse(p_upper + 32);
+    fft_4_sse(p_upper + 40);
+    fft_4_sse(p_upper + 44);
+    fft_asmb_sse(2, p_upper + 32, p_upper + 40, delta16, delta16_3);
+
+    /* a[96..127]: two more 8-point blocks, merged into 32 points */
+    fft_8_sse(p_upper + 48);
+    fft_8_sse(p_upper + 56);
+    fft_asmb_sse(4, p_upper + 32, p_upper + 48, delta32, delta32_3);
+
+    /* final merge over a[0..127] */
+    fft_asmb_sse(16, a, p_upper, delta128, delta128_3);
+}
+/*****************************************************************************
+ * hsqrt2: four-float constant table emitted through inline assembly
+ *****************************************************************************
+ * Emits 0.707106781188, 0.707106781188, -0.707106781188, -0.707106781188 as
+ * consecutive .float values. fft_8_sse() does not call this symbol: it puts
+ * the symbol address in a register and loads 16 bytes from it with movups.
+ * NOTE(review): that load only sees the table if no compiler-generated
+ * instructions precede the asm statement in this function - confirm that
+ * the build flags guarantee this.
+ *****************************************************************************/
+void hsqrt2 (void)
+{
+ __asm__ (
+ ".float 0f0.707106781188\n"
+ ".float 0f0.707106781188\n"
+ ".float 0f-0.707106781188\n"
+ ".float 0f-0.707106781188\n"
+ );
+}
+/*****************************************************************************
+ * C_1: four-float sign mask emitted through inline assembly
+ *****************************************************************************
+ * Emits -1.0, 1.0, -1.0, 1.0 as consecutive .float values. fft_8_sse() and
+ * fft_asmb_sse() do not call this symbol: they load 16 bytes from its
+ * address and multiply a register by them, which flips the sign of lanes 0
+ * and 2.
+ * NOTE(review): as with hsqrt2, the load only sees the table if no
+ * compiler-generated instructions precede the asm statement - confirm.
+ *****************************************************************************/
+void C_1 (void)
+{
+ __asm__ (
+ ".float 0f-1.0\n"
+ ".float 0f1.0\n"
+ ".float 0f-1.0\n"
+ ".float 0f1.0\n"
+ );
+}
+/*****************************************************************************
+ * fft_4_sse: 4-point butterfly computed in place with SSE
+ *****************************************************************************
+ * Reads 32 bytes at x (labelled x[0]..x[3] in the comments below), combines
+ * sums and differences of the pairs (x[0], x[2]) and (x[1], x[3]), and
+ * stores the four results back over the same 32 bytes.
+ * x is bound to %eax as both input and output operand.
+ * NOTE(review): the asm writes xmm0-xmm4, xmm6 and the memory at x but
+ * declares no clobber list - confirm this is safe with the compiler in use.
+ *****************************************************************************/
+static void fft_4_sse (complex_t *x)
+{
+ __asm__ __volatile__ (
+ "movups (%%eax), %%xmm0\n" /* x[1] | x[0] */
+ "movups 16(%%eax), %%xmm2\n" /* x[3] | x[2] */
+ "movups %%xmm0, %%xmm1\n" /* x[1] | x[0] */
+ "addps %%xmm2, %%xmm0\n" /* x[1] + x[3] | x[0] + x[2] */
+ "subps %%xmm2, %%xmm1\n" /* x[1] - x[3] | x[0] - x[2] */
+ "xorps %%xmm6, %%xmm6\n" /* xmm6 = 0 */
+ "movhlps %%xmm1, %%xmm4\n" /* ? | x[1] - x[3] */
+ "movhlps %%xmm0, %%xmm3\n" /* ? | x[1] + x[3] */
+ "subss %%xmm4, %%xmm6\n" /* 0 | -(x[1] - x[3]).re */
+ "movlhps %%xmm1, %%xmm0\n" /* x[0] - x[2] | x[0] + x[2] */
+ "movlhps %%xmm6, %%xmm4\n" /* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[3]-x[1]).re */
+ "movups %%xmm0, %%xmm2\n" /* x[0] - x[2] | x[0] + x[2] */
+ "shufps $0x94, %%xmm4, %%xmm3\n" /* i*(x[1] - x[3]) | x[1] + x[3] */
+ "addps %%xmm3, %%xmm0\n" /* sums: results for the first 16 bytes */
+ "subps %%xmm3, %%xmm2\n" /* differences: results for the last 16 bytes */
+ "movups %%xmm0, (%%eax)\n" /* store over x[0], x[1] */
+ "movups %%xmm2, 16(%%eax)\n" /* store over x[2], x[3] */
+ : "=a" (x)
+ : "a" (x) );
+}
+/*****************************************************************************
+ * fft_8_sse: 8-point butterfly computed in place with SSE
+ *****************************************************************************
+ * Reads 64 bytes at x (labelled x[0]..x[7] in the comments below): first the
+ * even entries, then the odd ones, which are scaled with the hsqrt2 table.
+ * The C_1 sign mask plus a lane swap produce the -i * v term. Results are
+ * stored back over the same 64 bytes.
+ * x is bound to %eax; %ebx is saved, used to address the two tables, and
+ * restored before the asm ends.
+ * NOTE(review): the tables are addressed through absolute symbol
+ * immediates, and xmm0-xmm7 plus the memory at x are written without a
+ * clobber list - confirm both are acceptable for the build in use.
+ *****************************************************************************/
+static void fft_8_sse (complex_t *x)
+{
+ __asm__ __volatile__ (
+ "pushl %%ebx\n" /* ebx is used below to address hsqrt2 and C_1 */
+
+ "movlps (%%eax), %%xmm0\n" /* x[0] */
+ "movlps 32(%%eax), %%xmm1\n" /* x[4] */
+ "movhps 16(%%eax), %%xmm0\n" /* x[2] | x[0] */
+ "movhps 48(%%eax), %%xmm1\n" /* x[6] | x[4] */
+ "movups %%xmm0, %%xmm2\n" /* x[2] | x[0] */
+ "xorps %%xmm3, %%xmm3\n" /* xmm3 = 0 */
+ "addps %%xmm1, %%xmm0\n" /* x[2] + x[6] | x[0] + x[4] */
+ "subps %%xmm1, %%xmm2\n" /* x[2] - x[6] | x[0] - x[4] */
+ "movhlps %%xmm0, %%xmm5\n" /* x[2] + x[6] */
+ "movhlps %%xmm2, %%xmm4\n" /* x[2] - x[6] */
+ "movlhps %%xmm2, %%xmm0\n" /* x[0] - x[4] | x[0] + x[4] */
+ "subss %%xmm4, %%xmm3\n" /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
+ "movups %%xmm0, %%xmm7\n" /* x[0] - x[4] | x[0] + x[4] */
+ "movups %%xmm3, %%xmm4\n" /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
+ "movlps 8(%%eax), %%xmm1\n" /* x[1] */
+ "shufps $0x14, %%xmm4, %%xmm5\n" /* i*(x[2] - x[6]) | x[2] + x[6] */
+
+ "addps %%xmm5, %%xmm0\n" /* yt = i*(x2-x6)+x0-x4 | x2+x6+x0+x4 */
+ "subps %%xmm5, %%xmm7\n" /* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */
+
+ "movhps 24(%%eax), %%xmm1\n" /* x[3] | x[1] */
+ "movl $hsqrt2, %%ebx\n" /* ebx = address of the hsqrt2 table */
+ "movlps 40(%%eax), %%xmm2\n" /* x[5] */
+ "movhps 56(%%eax), %%xmm2\n" /* x[7] | x[5] */
+ "movups %%xmm1, %%xmm3\n" /* x[3] | x[1] */
+ "addps %%xmm2, %%xmm1\n" /* x[3] + x[7] | x[1] + x[5] */
+ "subps %%xmm2, %%xmm3\n" /* x[3] - x[7] | x[1] - x[5] */
+ "movups (%%ebx), %%xmm4\n" /* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
+ "movups %%xmm3, %%xmm6\n" /* x[3] - x[7] | x[1] - x[5] */
+ "mulps %%xmm4, %%xmm3\n" /* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */
+ "shufps $0xc8, %%xmm4, %%xmm4\n" /* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
+ "shufps $0xb1, %%xmm6, %%xmm6\n" /* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */
+ "mulps %%xmm4, %%xmm6\n" /* (x7-x3).re/s2|(x3-x7).im/s2|(x5-x1).re/s2|(x1-x5).im/s2 */
+ "addps %%xmm3, %%xmm6\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5]) */
+ "movhlps %%xmm1, %%xmm5\n" /* x[3] + x[7] */
+ "movlhps %%xmm6, %%xmm1\n" /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+ "shufps $0xe4, %%xmm6, %%xmm5\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
+ "movups %%xmm1, %%xmm3\n" /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+ "movl $C_1, %%ebx\n" /* ebx = address of the C_1 sign mask */
+ "addps %%xmm5, %%xmm1\n" /* u */
+ "subps %%xmm5, %%xmm3\n" /* v */
+ "movups %%xmm0, %%xmm2\n" /* copy of yt (xmm0 holds yt, see above) */
+ "movups %%xmm7, %%xmm4\n" /* copy of yb (xmm7 holds yb, see above) */
+ "movups (%%ebx), %%xmm5\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
+ "mulps %%xmm5, %%xmm3\n" /* flip the sign of lanes 0 and 2 of v */
+ "addps %%xmm1, %%xmm0\n" /* yt + u */
+ "subps %%xmm1, %%xmm2\n" /* yt - u */
+ "shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */
+ "movups %%xmm0, (%%eax)\n" /* store over x[0], x[1] */
+ "movups %%xmm2, 32(%%eax)\n" /* store over x[4], x[5] */
+ "addps %%xmm3, %%xmm4\n" /* yb - i*v */
+ "subps %%xmm3, %%xmm7\n" /* yb + i*v */
+ "movups %%xmm4, 16(%%eax)\n" /* store over x[2], x[3] */
+ "movups %%xmm7, 48(%%eax)\n" /* store over x[6], x[7] */
+
+ "popl %%ebx\n" /* restore ebx */
+ : "=a" (x)
+ : "a" (x));
+}
+
+/*****************************************************************************
+ * fft_asmb_sse: SSE merge pass combining sub-transforms in place
+ *****************************************************************************
+ * k     : number of two-element steps. The first step runs before the loop
+ *         and the loop body then runs without an initial test, so k is
+ *         expected to be at least 2 (callers in this file pass 2, 4, 8, 16).
+ * x     : data read and rewritten at byte offsets 0, 16k, 32k and 48k.
+ * wTB   : read at offset 0 (wT) and at 16k bytes (wB). In every call in
+ *         this file wTB equals x + 32k bytes, so the stores at 32k and 48k
+ *         land on the wT and wB inputs that were just consumed.
+ * d,d_3 : factor tables. Entry 0 is never read: the first element of wT and
+ *         wB is used unscaled and the tables are entered at entry 1.
+ * Per element the code forms u = wT*d + wB*d3 and v = wT*d - wB*d3, then
+ * writes x + u, x - u, xk + (-i*v) and xk - (-i*v), xk being x at 16k bytes.
+ * NOTE(review): the asm declares no operands or clobbers. It builds its own
+ * frame and fetches the arguments from 8(%ebp)..24(%ebp), which presumably
+ * relies on the compiler emitting no frame setup or stack adjustment of its
+ * own before the asm, and on the function not being inlined - confirm.
+ * NOTE(review): .loop and .end are plain labels, so a second copy of this
+ * asm in the same object would presumably redefine them - confirm.
+ *****************************************************************************/
+static void fft_asmb_sse (int k, complex_t *x, complex_t *wTB,
+ const complex_t *d, const complex_t *d_3)
+{
+ __asm__ __volatile__ (
+ "pushl %%ebp\n" /* hand-built frame: save ebp ... */
+ "movl %%esp, %%ebp\n" /* ... and point it at the saved value */
+
+ "subl $4, %%esp\n" /* 4-byte local at -4(%ebp): step counter */
+
+ "pushl %%eax\n" /* save every general register used below */
+ "pushl %%ebx\n"
+ "pushl %%ecx\n"
+ "pushl %%edx\n"
+ "pushl %%esi\n"
+ "pushl %%edi\n"
+
+ "movl 8(%%ebp), %%ecx\n" /* k */
+ "movl 12(%%ebp), %%eax\n" /* x */
+ "movl %%ecx, -4(%%ebp)\n" /* k */
+ "movl 16(%%ebp), %%ebx\n" /* wT */
+ "movl 20(%%ebp), %%edx\n" /* d */
+ "movl 24(%%ebp), %%esi\n" /* d3 */
+ "shll $4, %%ecx\n" /* 16k */
+ "addl $8, %%edx\n" /* edx -> d[1] */
+ "leal (%%eax, %%ecx, 2), %%edi\n" /* edi = x + 32k bytes (overwritten below, then recomputed) */
+ "addl $8, %%esi\n" /* esi -> d3[1] */
+
+ /* TRANSZERO and TRANS */
+ "movups (%%eax), %%xmm0\n" /* x[1] | x[0] */
+ "movups (%%ebx), %%xmm1\n" /* wT[1] | wT[0] */
+ "movups (%%ebx, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */
+ "movlps (%%edx), %%xmm3\n" /* d */
+ "movlps (%%esi), %%xmm4\n" /* d3 */
+ "movhlps %%xmm1, %%xmm5\n" /* wT[1] */
+ "movhlps %%xmm2, %%xmm6\n" /* wB[1] */
+ "shufps $0x50, %%xmm3, %%xmm3\n" /* d[1].im | d[1].im | d[1].re | d[1].re */
+ "shufps $0x50, %%xmm4, %%xmm4\n" /* d3[1].im | d3[1].im | d3[i].re | d3[i].re */
+ "movlhps %%xmm5, %%xmm5\n" /* wT[1] | wT[1] */
+ "movlhps %%xmm6, %%xmm6\n" /* wB[1] | wB[1] */
+ "mulps %%xmm3, %%xmm5\n" /* partial products of wT[1] * d[1] */
+ "mulps %%xmm4, %%xmm6\n" /* partial products of wB[1] * d3[1] */
+ "movhlps %%xmm5, %%xmm7\n" /* wT[1].im * d[1].im | wT[1].re * d[1].im */
+ "movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
+ "shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
+ "movl $C_1, %%edi\n" /* edi temporarily holds the address of C_1 */
+ "movups (%%edi), %%xmm4\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
+ "mulps %%xmm4, %%xmm7\n" /* flip the sign of lanes 0 and 2 */
+ "addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */
+ "movlhps %%xmm5, %%xmm1\n" /* d[1] * wT[1] | wT[0] */
+ "shufps $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */
+ "movups %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */
+ "leal (%%eax, %%ecx, 2), %%edi\n" /* edi = x + 32k bytes again */
+ "addps %%xmm2, %%xmm1\n" /* u */
+ "subps %%xmm2, %%xmm3\n" /* v */
+ "mulps %%xmm4, %%xmm3\n" /* flip the sign of lanes 0 and 2 of v */
+ "movups (%%eax, %%ecx), %%xmm5\n" /* xk[1] | xk[0] */
+ "shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */
+ "movups %%xmm0, %%xmm2\n" /* x[1] | x[0] */
+ "movups %%xmm5, %%xmm6\n" /* xk[1] | xk[0] */
+ "addps %%xmm1, %%xmm0\n" /* x + u */
+ "subps %%xmm1, %%xmm2\n" /* x - u */
+ "addps %%xmm3, %%xmm5\n" /* xk + (-i * v) */
+ "subps %%xmm3, %%xmm6\n" /* xk - (-i * v) */
+ "movups %%xmm0, (%%eax)\n" /* store at x */
+ "movups %%xmm2, (%%edi)\n" /* store at x + 32k bytes */
+ "movups %%xmm5, (%%eax, %%ecx)\n" /* store at x + 16k bytes */
+ "movups %%xmm6, (%%edi, %%ecx)\n" /* store at x + 48k bytes */
+ "addl $16, %%eax\n" /* advance x by 16 bytes */
+ "addl $16, %%ebx\n" /* advance wT by 16 bytes */
+ "addl $8, %%edx\n" /* edx -> d[2] */
+ "addl $8, %%esi\n" /* esi -> d3[2] */
+ "decl -4(%%ebp)\n" /* counter = k - 1; not tested before the loop */
+
+".loop:\n" /* each pass handles 16 bytes of every input */
+ "movups (%%ebx), %%xmm0\n" /* wT[1] | wT[0] */
+ "movups (%%edx), %%xmm1\n" /* d[1] | d[0] */
+
+ "movups (%%ebx, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */
+ "movups (%%esi), %%xmm5\n" /* d3[1] | d3[0] */
+
+ "movhlps %%xmm0, %%xmm2\n" /* wT[1] */
+ "movhlps %%xmm1, %%xmm3\n" /* d[1] */
+
+ "movhlps %%xmm4, %%xmm6\n" /* wB[1] */
+ "movhlps %%xmm5, %%xmm7\n" /* d3[1] */
+
+ "shufps $0x50, %%xmm1, %%xmm1\n" /* d[0].im | d[0].im | d[0].re | d[0].re */
+ "shufps $0x50, %%xmm3, %%xmm3\n" /* d[1].im | d[1].im | d[1].re | d[1].re */
+
+ "movlhps %%xmm0, %%xmm0\n" /* wT[0] | wT[0] */
+ "shufps $0x50, %%xmm5, %%xmm5\n" /* d3[0].im | d3[0].im | d3[0].re | d3[0].re */
+ "movlhps %%xmm2, %%xmm2\n" /* wT[1] | wT[1] */
+ "shufps $0x50, %%xmm7, %%xmm7\n" /* d3[1].im | d3[1].im | d3[1].re | d3[1].re */
+
+ "mulps %%xmm1, %%xmm0\n" /* d[0].im * wT[0].im | d[0].im * wT[0].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
+ "mulps %%xmm3, %%xmm2\n" /* d[1].im * wT[1].im | d[1].im * wT[1].re | d[1].re * wT[1].im | d[1].re * wT[1].re */
+ "movlhps %%xmm4, %%xmm4\n" /* wB[0] | wB[0] */
+ "movlhps %%xmm6, %%xmm6\n" /* wB[1] | wB[1] */
+
+ "movhlps %%xmm0, %%xmm1\n" /* d[0].im * wT[0].im | d[0].im * wT[0].re */
+ "movlhps %%xmm2, %%xmm0\n" /* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
+ "mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
+ "mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
+ "shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
+ "movl $C_1, %%edi\n" /* edi temporarily holds the address of C_1 */
+ "movups (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
+
+ "movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
+ "mulps %%xmm3, %%xmm1\n" /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
+ "movlhps %%xmm6, %%xmm4\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
+ "addps %%xmm1, %%xmm0\n" /* wT[1] * d[1] | wT[0] * d[0] */
+
+ "shufps $0xb1, %%xmm6, %%xmm5\n" /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
+ "mulps %%xmm3, %%xmm5\n" /* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
+ "addps %%xmm5, %%xmm4\n" /* wB[1] * d3[1] | wB[0] * d3[0] */
+
+ "movups %%xmm0, %%xmm1\n" /* wT[1] * d[1] | wT[0] * d[0] */
+ "addps %%xmm4, %%xmm0\n" /* u */
+ "subps %%xmm4, %%xmm1\n" /* v */
+ "movups (%%eax), %%xmm6\n" /* x[1] | x[0] */
+ "leal (%%eax, %%ecx, 2), %%edi\n" /* edi = x + 32k bytes */
+ "mulps %%xmm3, %%xmm1\n" /* flip the sign of lanes 0 and 2 of v */
+ "addl $16, %%ebx\n" /* advance wT by 16 bytes */
+ "addl $16, %%esi\n" /* advance d3 by 16 bytes */
+ "shufps $0xb1, %%xmm1, %%xmm1\n" /* -i * v */
+ "movups (%%eax, %%ecx), %%xmm7\n" /* xk[1] | xk[0] */
+ "movups %%xmm6, %%xmm2\n" /* copy of x[1] | x[0] */
+ "movups %%xmm7, %%xmm4\n" /* copy of xk[1] | xk[0] */
+ "addps %%xmm0, %%xmm6\n" /* x + u */
+ "subps %%xmm0, %%xmm2\n" /* x - u */
+ "movups %%xmm6, (%%eax)\n" /* store at x */
+ "movups %%xmm2, (%%edi)\n" /* store at x + 32k bytes */
+ "addps %%xmm1, %%xmm7\n" /* xk + (-i * v) */
+ "subps %%xmm1, %%xmm4\n" /* xk - (-i * v) */
+ "addl $16, %%edx\n" /* advance d by 16 bytes */
+ "movups %%xmm7, (%%eax, %%ecx)\n" /* store at x + 16k bytes */
+ "movups %%xmm4, (%%edi, %%ecx)\n" /* store at x + 48k bytes */
+
+ "addl $16, %%eax\n" /* advance x by 16 bytes */
+ "decl -4(%%ebp)\n" /* one step done */
+ "jnz .loop\n" /* repeat until the counter reaches zero */
+
+".end:\n" /* no jump above targets this label */
+ "popl %%edi\n" /* restore the saved registers in reverse order */
+ "popl %%esi\n"
+ "popl %%edx\n"
+ "popl %%ecx\n"
+ "popl %%ebx\n"
+ "popl %%eax\n"
+
+ "addl $4, %%esp\n" /* drop the counter slot */
+
+ "leave\n" /* undo the hand-built frame */
+ ::); /* no operands and no clobbers are declared */
+}
* and spawn threads.
*****************************************************************************
* Copyright (C) 1998, 1999, 2000 VideoLAN
- * $Id: main.c,v 1.93 2001/05/07 03:14:09 stef Exp $
+ * $Id: main.c,v 1.94 2001/05/14 15:58:04 reno Exp $
*
* Authors: Vincent Seguin <seguin@via.ecp.fr>
* Samuel Hocevar <sam@zoy.org>
if( i_edx & 0x02000000 )
{
i_capabilities |= CPU_CAPABILITY_MMXEXT;
+ i_capabilities |= CPU_CAPABILITY_SSE;
}
/* test for additional capabilities */
{
i_capabilities |= CPU_CAPABILITY_MMXEXT;
}
-
#else
/* default behaviour */