* ac3_imdct.h : AC3 IMDCT types
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct.h,v 1.4 2001/06/12 00:30:41 reno Exp $
+ * $Id: ac3_imdct.h,v 1.5 2001/07/08 23:15:11 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
float xsin1[N/4] __attribute__ ((aligned(16)));
float xcos2[N/8] __attribute__ ((aligned(16)));
float xsin2[N/8] __attribute__ ((aligned(16)));
+ float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
/* Twiddle factor LUT */
- complex_t *w[7] __attribute__ ((aligned(16)));
complex_t w_1[1] __attribute__ ((aligned(16)));
+ float used_for_alignement1;
+ float used_for_alignement2;
complex_t w_2[2] __attribute__ ((aligned(16)));
complex_t w_4[4] __attribute__ ((aligned(16)));
complex_t w_8[8] __attribute__ ((aligned(16)));
complex_t w_16[16] __attribute__ ((aligned(16)));
complex_t w_32[32] __attribute__ ((aligned(16)));
complex_t w_64[64] __attribute__ ((aligned(16)));
-
- float xcos_sin_sse[128 * 4] __attribute__ ((aligned(16)));
+ complex_t *w[7] __attribute__ ((aligned(16)));
/* Module used and shortcuts */
struct module_s * p_module;
* ac3_downmix_3dn.c: accelerated 3D Now! ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_downmix_3dn.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $
+ * $Id: ac3_downmix_3dn.c,v 1.4 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
+ ".align 16\n"
".loop:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 2048(%%eax), %%mm1\n" /* right */
void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
+ ".align 16\n"
".loop3:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 1024(%%eax), %%mm1\n" /* right */
void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
-
+ ".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
+ ".align 16\n"
".loop4:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 2048(%%eax), %%mm1\n" /* right */
void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
"movd 8(%%ecx), %%mm7\n" /* slev */
"punpckldq %%mm7, %%mm7\n" /* slev | slev */
+ ".align 16\n"
".loop5:\n"
"movq (%%eax), %%mm0\n" /* left */
"movq 1024(%%eax), %%mm1\n" /* right */
void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n" /* loop counter */
"movd 4(%%ecx), %%mm6\n" /* clev */
"punpckldq %%mm6, %%mm6\n" /* clev | clev */
+ ".align 16\n"
".loop6:\n"
"movq (%%eax), %%mm0\n" /*left */
"movq 2048(%%eax), %%mm1\n" /* right */
void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
"pushl %%edx\n"
"punpckldq %%mm7, %%mm7\n" /* sqrt2 | sqrt2 */
"movl $128, %%ebx\n"
+ ".align 16\n"
".loop2:\n"
"movq (%%ecx), %%mm0\n" /* c1 | c0 */
"pfmul %%mm7, %%mm0\n"
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
"movl $128, %%ebx\n"
+ ".align 16\n"
".loop1:\n"
"movq (%%ecx), %%mm0\n" /* l1 | l0 */
"movq (%%edx), %%mm1\n" /* r1 | r0 */
* ac3_downmix_sse.c: accelerated SSE ac3 downmix functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_downmix_sse.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $
+ * $Id: ac3_downmix_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
/* sqrt2_sse: not executable code but 4 bytes of read-only data dressed up
 * as a function.  The .float is 1/sqrt(2) (~0.7071068); consumers fetch it
 * through the symbol's address (see stream_sample_1ch_to_s16:
 * "movl $sqrt2_sse, %%edx" / "movss (%%edx), ...").  The .align 16 pads the
 * constant onto a 16-byte boundary.
 * NOTE(review): the float sits after any prologue the compiler emits for
 * this "function", so this trick relies on the build producing no prologue
 * code before the asm -- confirm against the build flags. */
void sqrt2_sse (void) __asm__ ("sqrt2_sse");
void sqrt2_sse (void)
{
    __asm__ (".align 16\n"
             ".float 0f0.7071068");
}
/*****************************************************************************
 * downmix_3f_2r_to_2ch: 3 front + 2 rear channels -> stereo (SSE)
 *****************************************************************************
 * Channels are stored back to back, 256 floats (1024 bytes) each:
 * left at samples+0, center at +1024 bytes, right at +2048,
 * left surround at +3072, right surround at +4096.  The result overwrites
 * the first two channel slots:
 *   L = unit*left  + clev*center + slev*leftsur
 *   R = unit*right + clev*center + slev*rightsur
 * dm_par is read as three packed floats: unit (+0), clev (+4), slev (+8).
 * All loads/stores use movaps, so `samples` must be 16-byte aligned.
 *****************************************************************************/
void _M( downmix_3f_2r_to_2ch ) (float * samples, dm_par_t * dm_par)
{
    __asm__ __volatile__ (
    ".align 16\n"
    "pushl %%ebx\n"
    "movl $64, %%ebx\n"             /* 64 iterations x 4 floats = 256 samples */

    "movss (%%ecx), %%xmm5\n"       /* unit */
    "shufps $0, %%xmm5, %%xmm5\n"   /* unit | unit | unit | unit */
    "movss 4(%%ecx), %%xmm6\n"      /* clev */
    "shufps $0, %%xmm6, %%xmm6\n"   /* clev | clev | clev | clev */
    "movss 8(%%ecx), %%xmm7\n"      /* slev */
    "shufps $0, %%xmm7, %%xmm7\n"   /* slev | slev | slev | slev */

    ".align 16\n"
    ".loop:\n"
    "movaps (%%eax), %%xmm0\n"      /* left */
    "movaps 2048(%%eax), %%xmm1\n"  /* right */
    "movaps 1024(%%eax), %%xmm2\n"  /* center */
    "movaps 3072(%%eax), %%xmm3\n"  /* leftsur */
    "movaps 4096(%%eax), %%xmm4\n"  /* rightsur */
    "mulps %%xmm5, %%xmm0\n"        /* scale fronts by unit */
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm6, %%xmm2\n"        /* scale center by clev */
    "addps %%xmm2, %%xmm0\n"        /* mix center into both */
    "addps %%xmm2, %%xmm1\n"
    "mulps %%xmm7, %%xmm3\n"        /* scale surrounds by slev */
    "mulps %%xmm7, %%xmm4\n"
    "addps %%xmm3, %%xmm0\n"        /* leftsur  -> L */
    "addps %%xmm4, %%xmm1\n"        /* rightsur -> R */

    "movaps %%xmm0, (%%eax)\n"      /* store downmixed left */
    "movaps %%xmm1, 1024(%%eax)\n"  /* store downmixed right */

    "addl $16, %%eax\n"             /* advance 4 floats */
    "decl %%ebx\n"
    "jnz .loop\n"

    "popl %%ebx\n"
    : "=a" (samples)
    : "a" (samples), "c" (dm_par));
}
/*****************************************************************************
 * downmix_2f_2r_to_2ch: 2 front + 2 rear channels -> stereo (SSE)
 *****************************************************************************
 * Channel layout (256 floats / 1024 bytes per channel): left at samples+0,
 * right at +1024 bytes, left surround at +2048, right surround at +3072.
 * Result, stored in place over the front channels:
 *   L = unit*left  + slev*leftsur
 *   R = unit*right + slev*rightsur
 * dm_par: unit at +0, slev at +8.  movaps requires `samples` to be
 * 16-byte aligned.
 *****************************************************************************/
void _M( downmix_2f_2r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
    __asm__ __volatile__ (
    ".align 16\n"
    "pushl %%ebx\n"
    "movl $64, %%ebx\n"             /* 64 iterations x 4 floats = 256 samples */

    "movss (%%ecx), %%xmm5\n"       /* unit */
    "shufps $0, %%xmm5, %%xmm5\n"   /* unit | unit | unit | unit */
    "movss 8(%%ecx), %%xmm7\n"      /* slev */
    "shufps $0, %%xmm7, %%xmm7\n"   /* slev | slev | slev | slev */

    ".align 16\n"
    ".loop3:\n"
    "movaps (%%eax), %%xmm0\n"      /* left */
    "movaps 1024(%%eax), %%xmm1\n"  /* right */
    "movaps 2048(%%eax), %%xmm3\n"  /* leftsur */
    "movaps 3072(%%eax), %%xmm4\n"  /* rightsur */
    "mulps %%xmm5, %%xmm0\n"        /* scale fronts by unit */
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm7, %%xmm3\n"        /* scale surrounds by slev */
    "mulps %%xmm7, %%xmm4\n"
    "addps %%xmm3, %%xmm0\n"        /* leftsur  -> L */
    "addps %%xmm4, %%xmm1\n"        /* rightsur -> R */

    "movaps %%xmm0, (%%eax)\n"      /* store downmixed left */
    "movaps %%xmm1, 1024(%%eax)\n"  /* store downmixed right */

    "addl $16, %%eax\n"             /* advance 4 floats */
    "decl %%ebx\n"
    "jnz .loop3\n"

    "popl %%ebx\n"
    : "=a" (samples)
    : "a" (samples), "c" (dm_par));
}
/*****************************************************************************
 * downmix_3f_1r_to_2ch: 3 front + 1 rear channel -> stereo (SSE)
 *****************************************************************************
 * Channel layout (1024 bytes per channel): left at samples+0, center at
 * +1024 bytes, right at +2048, mono surround at +3072.  Result in place:
 *   L = unit*left  + clev*center - slev*sur
 *   R = unit*right + clev*center + slev*sur
 * (the surround is phase-inverted on the left -- presumably a matrixed
 * surround downmix; confirm against the AC3 downmix spec).
 * dm_par: unit +0, clev +4, slev +8.  movaps requires `samples` to be
 * 16-byte aligned.
 *****************************************************************************/
void _M( downmix_3f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
    __asm__ __volatile__ (
    ".align 16\n"
    "pushl %%ebx\n"
    "movl $64, %%ebx\n"             /* 64 iterations x 4 floats = 256 samples */

    "movss (%%ecx), %%xmm5\n"       /* unit */
    "shufps $0, %%xmm5, %%xmm5\n"   /* unit | unit | unit | unit */
    "movss 4(%%ecx), %%xmm6\n"      /* clev */
    "shufps $0, %%xmm6, %%xmm6\n"   /* clev | clev | clev | clev */
    "movss 8(%%ecx), %%xmm7\n"      /* slev */
    "shufps $0, %%xmm7, %%xmm7\n"   /* slev | slev | slev | slev */

    ".align 16\n"
    ".loop4:\n"
    "movaps (%%eax), %%xmm0\n"      /* left */
    "movaps 2048(%%eax), %%xmm1\n"  /* right */
    "movaps 1024(%%eax), %%xmm2\n"  /* center */
    "movaps 3072(%%eax), %%xmm3\n"  /* sur */
    "mulps %%xmm5, %%xmm0\n"        /* scale fronts by unit */
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm6, %%xmm2\n"        /* scale center by clev */
    "addps %%xmm2, %%xmm0\n"        /* center -> L */
    "mulps %%xmm7, %%xmm3\n"        /* scale surround by slev */
    "addps %%xmm2, %%xmm1\n"        /* center -> R */
    "subps %%xmm3, %%xmm0\n"        /* sur subtracted from L */
    "addps %%xmm3, %%xmm1\n"        /* sur added to R */

    "movaps %%xmm0, (%%eax)\n"      /* store downmixed left */
    "movaps %%xmm1, 1024(%%eax)\n"  /* store downmixed right */

    "addl $16, %%eax\n"             /* advance 4 floats */
    "decl %%ebx\n"
    "jnz .loop4\n"

    "popl %%ebx\n"
    : "=a" (samples)
    : "a" (samples), "c" (dm_par));
}
/*****************************************************************************
 * downmix_2f_1r_to_2ch: 2 front + 1 rear channel -> stereo (SSE)
 *****************************************************************************
 * Channel layout (1024 bytes per channel): left at samples+0, right at
 * +1024 bytes, mono surround at +2048.  Result in place:
 *   L = unit*left  - slev*sur
 *   R = unit*right + slev*sur
 * (surround phase-inverted on the left, matching downmix_3f_1r_to_2ch).
 * dm_par: unit +0, slev +8.  movaps requires `samples` to be 16-byte
 * aligned.
 *****************************************************************************/
void _M( downmix_2f_1r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
    __asm__ __volatile__ (
    ".align 16\n"
    "pushl %%ebx\n"
    "movl $64, %%ebx\n"             /* 64 iterations x 4 floats = 256 samples */

    "movss (%%ecx), %%xmm5\n"       /* unit */
    "shufps $0, %%xmm5, %%xmm5\n"   /* unit | unit | unit | unit */
    "movss 8(%%ecx), %%xmm7\n"      /* slev */
    "shufps $0, %%xmm7, %%xmm7\n"   /* slev | slev | slev | slev */

    ".align 16\n"
    ".loop5:\n"
    "movaps (%%eax), %%xmm0\n"      /* left */
    "movaps 1024(%%eax), %%xmm1\n"  /* right */
    "movaps 2048(%%eax), %%xmm3\n"  /* sur */
    "mulps %%xmm5, %%xmm0\n"        /* scale fronts by unit */
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm7, %%xmm3\n"        /* scale surround by slev */
    "subps %%xmm3, %%xmm0\n"        /* sur subtracted from L */
    "addps %%xmm3, %%xmm1\n"        /* sur added to R */

    "movaps %%xmm0, (%%eax)\n"      /* store downmixed left */
    "movaps %%xmm1, 1024(%%eax)\n"  /* store downmixed right */
    "addl $16, %%eax\n"             /* advance 4 floats */
    "decl %%ebx\n"
    "jnz .loop5\n"
    "popl %%ebx\n"
    : "=a" (samples)
    : "a" (samples), "c" (dm_par));
}
/*****************************************************************************
 * downmix_3f_0r_to_2ch: 3 front channels, no rear -> stereo (SSE)
 *****************************************************************************
 * Channel layout (1024 bytes per channel): left at samples+0, center at
 * +1024 bytes, right at +2048.  Result in place:
 *   L = unit*left  + clev*center
 *   R = unit*right + clev*center
 * dm_par: unit +0, clev +4.  movaps requires `samples` to be 16-byte
 * aligned.
 *****************************************************************************/
void _M( downmix_3f_0r_to_2ch ) (float *samples, dm_par_t * dm_par)
{
    __asm__ __volatile__ (
    ".align 16\n"
    "pushl %%ebx\n"
    "movl $64, %%ebx\n"             /* 64 iterations x 4 floats = 256 samples */

    "movss (%%ecx), %%xmm5\n"       /* unit */
    "shufps $0, %%xmm5, %%xmm5\n"   /* unit | unit | unit | unit */
    "movss 4(%%ecx), %%xmm6\n"      /* clev */
    "shufps $0, %%xmm6, %%xmm6\n"   /* clev | clev | clev | clev */

    ".align 16\n"
    ".loop6:\n"
    "movaps (%%eax), %%xmm0\n"      /* left */
    "movaps 2048(%%eax), %%xmm1\n"  /* right */
    "movaps 1024(%%eax), %%xmm2\n"  /* center */
    "mulps %%xmm5, %%xmm0\n"        /* scale fronts by unit */
    "mulps %%xmm5, %%xmm1\n"
    "mulps %%xmm6, %%xmm2\n"        /* scale center by clev */
    "addps %%xmm2, %%xmm0\n"        /* center -> L */
    "addps %%xmm2, %%xmm1\n"        /* center -> R */

    "movaps %%xmm0, (%%eax)\n"      /* store downmixed left */
    "movaps %%xmm1, 1024(%%eax)\n"  /* store downmixed right */

    "addl $16, %%eax\n"             /* advance 4 floats */
    "decl %%ebx\n"
    "jnz .loop6\n"

    "popl %%ebx\n"
    : "=a" (samples)
    : "a" (samples), "c" (dm_par));
}
void _M( stream_sample_1ch_to_s16 ) (s16 *s16_samples, float *left)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
"pushl %%edx\n"
"movl $sqrt2_sse, %%edx\n"
- "movss (%%edx), %%xmm7\n"
- "shufps $0, %%xmm7, %%xmm7\n" /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
- "movl $64, %%ebx\n"
-
+ "movss (%%edx), %%xmm7\n"
+ "shufps $0, %%xmm7, %%xmm7\n" /* sqrt2 | sqrt2 | sqrt2 | sqrt2 */
+ "movl $64, %%ebx\n"
+
+ ".align 16\n"
".loop2:\n"
- "movups (%%ecx), %%xmm0\n" /* c3 | c2 | c1 | c0 */
+ "movaps (%%ecx), %%xmm0\n" /* c3 | c2 | c1 | c0 */
"mulps %%xmm7, %%xmm0\n"
- "movhlps %%xmm0, %%xmm2\n" /* c3 | c2 */
+ "movhlps %%xmm0, %%xmm2\n" /* c3 | c2 */
- "cvtps2pi %%xmm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */
- "cvtps2pi %%xmm2, %%mm1\n" /* c3 c2 --> mm1, int_32 */
+ "cvtps2pi %%xmm0, %%mm0\n" /* c1 c0 --> mm0, int_32 */
+ "cvtps2pi %%xmm2, %%mm1\n" /* c3 c2 --> mm1, int_32 */
- "packssdw %%mm0, %%mm0\n" /* c1 c1 c0 c0 --> mm0, int_16 */
- "packssdw %%mm1, %%mm1\n" /* c3 c3 c2 c2 --> mm1, int_16 */
+ "packssdw %%mm0, %%mm0\n" /* c1 c1 c0 c0 --> mm0, int_16 */
+ "packssdw %%mm1, %%mm1\n" /* c3 c3 c2 c2 --> mm1, int_16 */
"movq %%mm0, (%%eax)\n"
"movq %%mm1, 8(%%eax)\n"
void _M( stream_sample_2ch_to_s16 ) (s16 *s16_samples, float *left, float *right)
{
-
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
- "movl $64, %%ebx\n"
+ "movl $64, %%ebx\n"
+ ".align 16\n"
".loop1:\n"
- "movups (%%ecx), %%xmm0\n" /* l3 | l2 | l1 | l0 */
- "movups (%%edx), %%xmm1\n" /* r3 | r2 | r1 | r0 */
- "movhlps %%xmm0, %%xmm2\n" /* l3 | l2 */
- "movhlps %%xmm1, %%xmm3\n" /* r3 | r2 */
- "unpcklps %%xmm1, %%xmm0\n" /* r1 | l1 | r0 | l0 */
- "unpcklps %%xmm3, %%xmm2\n" /* r3 | l3 | r2 | l2 */
+ "movaps (%%ecx), %%xmm0\n" /* l3 | l2 | l1 | l0 */
+ "movaps (%%edx), %%xmm1\n" /* r3 | r2 | r1 | r0 */
+ "movhlps %%xmm0, %%xmm2\n" /* l3 | l2 */
+ "movhlps %%xmm1, %%xmm3\n" /* r3 | r2 */
+ "unpcklps %%xmm1, %%xmm0\n" /* r1 | l1 | r0 | l0 */
+ "unpcklps %%xmm3, %%xmm2\n" /* r3 | l3 | r2 | l2 */
"cvtps2pi %%xmm0, %%mm0\n" /* r0 l0 --> mm0, int_32 */
"movhlps %%xmm0, %%xmm0\n"
"movhlps %%xmm2, %%xmm2\n"
"cvtps2pi %%xmm2, %%mm3\n" /* r3 l3 --> mm3, int_32 */
- "packssdw %%mm1, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */
- "packssdw %%mm3, %%mm2\n" /* r3 l3 r2 l2 --> mm2, int_16 */
+ "packssdw %%mm1, %%mm0\n" /* r1 l1 r0 l0 --> mm0, int_16 */
+ "packssdw %%mm3, %%mm2\n" /* r3 l3 r2 l2 --> mm2, int_16 */
"movq %%mm0, (%%eax)\n"
"movq %%mm2, 8(%%eax)\n"
* ac3_imdct_3dn.c: accelerated 3D Now! ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_3dn.c,v 1.4 2001/06/03 12:47:21 sam Exp $
+ * $Id: ac3_imdct_3dn.c,v 1.5 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
static void imdct512_pre_ifft_twiddle_3dn (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $128, -4(%%ebp)\n"
+ ".align 16\n"
".loop:\n"
"movl (%%eax), %%esi\n"
"movd (%%ecx, %%esi, 8), %%mm1\n" /* 2j */
static void imdct512_post_ifft_twiddle_3dn (complex_t *buf, float *xcos_sin_sse)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
"movl $64, %%ebx\n" /* loop counter */
+ ".align 16\n"
".loop1:\n"
"movq (%%eax), %%mm0\n" /* im0 | re0 */
"movq %%mm0, %%mm1\n" /* im0 | re0 */
static void imdct512_window_delay_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
+ ".align 16\n"
".first_128_samples:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
+ ".align 16\n"
".second_128_samples:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
+ ".align 16\n"
".first_128_delay:\n"
"movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
+ ".align 16\n"
".second_128_delay:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
static void imdct512_window_delay_nol_3dn (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
+ ".align 16\n"
".first_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $32, %%ecx\n" /* loop count */
+ ".align 16\n"
".second_128_samples2:\n"
"movd (%%esi), %%mm0\n" /* buf[i].re */
"movd 8(%%esi), %%mm2\n" /* re1 */
"movl $32, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
+ ".align 16\n"
".first_128_delays:\n"
"movd (%%esi), %%mm0\n" /* re0 */
"movd 8(%%esi), %%mm2\n" /* re1 */
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $32, %%ecx\n" /* loop count */
+ ".align 16\n"
".second_128_delays:\n"
"movd (%%esi), %%mm0\n" /* im0 */
"movd 8(%%esi), %%mm2\n" /* im1 */
* ac3_imdct_sse.c: accelerated SSE ac3 DCT
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_imdct_sse.c,v 1.3 2001/05/28 02:38:48 sam Exp $
+ * $Id: ac3_imdct_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
static void imdct512_pre_ifft_twiddle_sse (const int *pmt, complex_t *buf, float *data, float *xcos_sin_sse)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"addl $-4, %%esp\n" /* local variable, loop counter */
"pushl %%esi\n"
"movl 8(%%ebp), %%eax\n" /* pmt */
- "movl 12(%%ebp), %%ebx\n" /* buf */
- "movl 16(%%ebp), %%ecx\n" /* data */
+ "movl 12(%%ebp), %%ebx\n" /* buf */
+ "movl 16(%%ebp), %%ecx\n" /* data */
"movl 20(%%ebp), %%edx\n" /* xcos_sin_sse */
"movl $64, -4(%%ebp)\n"
+ ".align 16\n"
".loop:\n"
"movl (%%eax), %%esi\n"
"movl 4(%%eax), %%edi\n"
"shll $1, %%esi\n"
"shll $1, %%edi\n"
- "movups (%%edx, %%esi, 8), %%xmm0\n" /* -c_j | -s_j | -s_j | c_j */
- "movups (%%edx, %%edi, 8), %%xmm2\n" /* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
+ "movaps (%%edx, %%esi, 8), %%xmm0\n" /* -c_j | -s_j | -s_j | c_j */
+ "movaps (%%edx, %%edi, 8), %%xmm2\n" /* -c_j+1 | -s_j+1 | -s_j+1 | c_j+1 */
"negl %%esi\n"
"negl %%edi\n"
"movss 1020(%%ecx, %%esi, 4), %%xmm4\n" /* 255-2j */
- "addl $8, %%eax\n"
+ "addl $8, %%eax\n"
"movss 1020(%%ecx, %%edi, 4), %%xmm5\n" /* 255-2(j+1) */
- "shufps $0, %%xmm1, %%xmm4\n" /* 2j | 2j | 255-2j | 255-2j */
- "shufps $0, %%xmm3, %%xmm5\n" /* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */
+ "shufps $0, %%xmm1, %%xmm4\n" /* 2j | 2j | 255-2j | 255-2j */
+ "shufps $0, %%xmm3, %%xmm5\n" /* 2(j+1) | 2(j+1) | 255-2(j+1) | 255-2(j+1) */
"mulps %%xmm4, %%xmm0\n"
"mulps %%xmm5, %%xmm2\n"
"movhlps %%xmm0, %%xmm1\n"
"addps %%xmm3, %%xmm2\n"
"movlhps %%xmm2, %%xmm0\n"
- "movups %%xmm0, -16(%%ebx)\n"
- "decl -4(%%ebp)\n"
- "jnz .loop\n"
+ "movaps %%xmm0, -16(%%ebx)\n"
+ "decl -4(%%ebp)\n"
+ "jnz .loop\n"
"popl %%esi\n"
"popl %%edi\n"
static void imdct512_post_ifft_twiddle_sse (complex_t *buf, float *xcos_sin_sse)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebx\n"
- "movl $32, %%ebx\n" /* loop counter */
+ "movl $32, %%ebx\n" /* loop counter */
+ ".align 16\n"
".loop1:\n"
- "movups (%%eax), %%xmm0\n" /* im1 | re1 | im0 | re0 */
+ "movaps (%%eax), %%xmm0\n" /* im1 | re1 | im0 | re0 */
- "movups (%%ecx), %%xmm2\n" /* -c | -s | -s | c */
- "movhlps %%xmm0, %%xmm1\n" /* im1 | re1 */
- "movups 16(%%ecx), %%xmm3\n" /* -c1 | -s1 | -s1 | c1 */
+ "movaps (%%ecx), %%xmm2\n" /* -c | -s | -s | c */
+ "movhlps %%xmm0, %%xmm1\n" /* im1 | re1 */
+ "movaps 16(%%ecx), %%xmm3\n" /* -c1 | -s1 | -s1 | c1 */
- "shufps $0x50, %%xmm0, %%xmm0\n" /* im0 | im0 | re0 | re0 */
- "shufps $0x50, %%xmm1, %%xmm1\n" /* im1 | im1 | re1 | re1 */
+ "shufps $0x50, %%xmm0, %%xmm0\n" /* im0 | im0 | re0 | re0 */
+ "shufps $0x50, %%xmm1, %%xmm1\n" /* im1 | im1 | re1 | re1 */
- "movups 16(%%eax), %%xmm4\n" /* im3 | re3 | im2 | re2 */
+ "movaps 16(%%eax), %%xmm4\n" /* im3 | re3 | im2 | re2 */
- "shufps $0x27, %%xmm2, %%xmm2\n" /* c | -s | -s | -c */
- "movhlps %%xmm4, %%xmm5\n" /* im3 | re3 */
- "shufps $0x27, %%xmm3, %%xmm3\n" /* c1 | -s1 | -s1 | -c1 */
+ "shufps $0x27, %%xmm2, %%xmm2\n" /* c | -s | -s | -c */
+ "movhlps %%xmm4, %%xmm5\n" /* im3 | re3 */
+ "shufps $0x27, %%xmm3, %%xmm3\n" /* c1 | -s1 | -s1 | -c1 */
- "movups 32(%%ecx), %%xmm6\n" /* -c2 | -s2 | -s2 | c2 */
- "movups 48(%%ecx), %%xmm7\n" /* -c3 | -s3 | -s3 | c3 */
+ "movaps 32(%%ecx), %%xmm6\n" /* -c2 | -s2 | -s2 | c2 */
+ "movaps 48(%%ecx), %%xmm7\n" /* -c3 | -s3 | -s3 | c3 */
- "shufps $0x50, %%xmm4, %%xmm4\n" /* im2 | im2 | re2 | re2 */
- "shufps $0x50, %%xmm5, %%xmm5\n" /* im3 | im3 | re3 | re3 */
+ "shufps $0x50, %%xmm4, %%xmm4\n" /* im2 | im2 | re2 | re2 */
+ "shufps $0x50, %%xmm5, %%xmm5\n" /* im3 | im3 | re3 | re3 */
"mulps %%xmm2, %%xmm0\n"
"mulps %%xmm3, %%xmm1\n"
- "shufps $0x27, %%xmm6, %%xmm6\n" /* c2 | -s2 | -s2 | -c2 */
- "shufps $0x27, %%xmm7, %%xmm7\n" /* c3 | -s3 | -s3 | -c3 */
+ "shufps $0x27, %%xmm6, %%xmm6\n" /* c2 | -s2 | -s2 | -c2 */
+ "shufps $0x27, %%xmm7, %%xmm7\n" /* c3 | -s3 | -s3 | -c3 */
"movhlps %%xmm0, %%xmm2\n"
"movhlps %%xmm1, %%xmm3\n"
"movlhps %%xmm1, %%xmm0\n"
"movlhps %%xmm5, %%xmm4\n"
- "movups %%xmm0, (%%eax)\n"
- "movups %%xmm4, 16(%%eax)\n"
+ "movaps %%xmm0, (%%eax)\n"
+ "movaps %%xmm4, 16(%%eax)\n"
"addl $64, %%ecx\n"
"addl $32, %%eax\n"
"decl %%ebx\n"
static void imdct512_window_delay_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
+ ".align 16\n"
".first_128_samples:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
- "movups (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
+ "movaps (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
"mulps %%xmm4, %%xmm0\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
- "addps %%xmm5, %%xmm0\n"
+ "addps %%xmm5, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
- "movups 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
- "subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
- "addl $32, %%edx\n"
- "movups %%xmm0, (%%eax)\n"
- "addl $32, %%ebx\n"
- "mulps %%xmm4, %%xmm6\n"
- "addl $32, %%esi\n"
- "addl $32, %%eax\n"
- "addps %%xmm5, %%xmm6\n"
- "addl $-32, %%edi\n"
- "movups %%xmm6, -16(%%eax)\n"
- "decl %%ecx\n"
+ "movaps 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
+ "subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
+ "addl $32, %%edx\n"
+ "movaps %%xmm0, (%%eax)\n"
+ "addl $32, %%ebx\n"
+ "mulps %%xmm4, %%xmm6\n"
+ "addl $32, %%esi\n"
+ "addl $32, %%eax\n"
+ "addps %%xmm5, %%xmm6\n"
+ "addl $-32, %%edi\n"
+ "movaps %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
"jnz .first_128_samples\n"
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $16, %%ecx\n" /* loop count */
+ ".align 16\n"
".second_128_samples:\n"
"movss (%%esi), %%xmm0\n" /* buf[i].re */
"movss 8(%%esi), %%xmm2\n" /* re1 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
- "movups (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
+ "movaps (%%ebx), %%xmm5\n" /* d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
"mulps %%xmm4, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
- "addl $32, %%esi\n"
- "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
- "addps %%xmm5, %%xmm0\n"
- "mulps %%xmm4, %%xmm6\n"
- "addl $-32, %%edi\n"
- "movups 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
- "movups %%xmm0, (%%eax)\n"
- "addps %%xmm5, %%xmm6\n"
- "addl $32, %%edx\n"
- "addl $32, %%eax\n"
- "addl $32, %%ebx\n"
- "movups %%xmm6, -16(%%eax)\n"
- "decl %%ecx\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
+ "addps %%xmm5, %%xmm0\n"
+ "mulps %%xmm4, %%xmm6\n"
+ "addl $-32, %%edi\n"
+ "movaps 16(%%ebx), %%xmm5\n" /* d7 | d6 | d5 | d4 */
+ "movaps %%xmm0, (%%eax)\n"
+ "addps %%xmm5, %%xmm6\n"
+ "addl $32, %%edx\n"
+ "addl $32, %%eax\n"
+ "addl $32, %%ebx\n"
+ "movaps %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
"jnz .second_128_samples\n"
"movl 8(%%ebp), %%eax\n"
"movl $16, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
+ ".align 16\n"
".first_128_delay:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss 24(%%esi), %%xmm7\n" /* re3 */
"movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */
- "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
- "addl $-32, %%edx\n"
+ "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
+ "addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
"mulps %%xmm4, %%xmm0\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
- "movups %%xmm0, (%%eax)\n"
- "addl $32, %%esi\n"
- "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
- "addl $-32, %%edi\n"
- "mulps %%xmm5, %%xmm6\n"
- "addl $32, %%eax\n"
- "movups %%xmm6, -16(%%eax)\n"
- "decl %%ecx\n"
+ "movaps %%xmm0, (%%eax)\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
+ "addl $-32, %%edi\n"
+ "mulps %%xmm5, %%xmm6\n"
+ "addl $32, %%eax\n"
+ "movaps %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
"jnz .first_128_delay\n"
"movl 8(%%ebp), %%ebx\n"
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $16, %%ecx\n" /* loop count */
+ ".align 16\n"
".second_128_delay:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss 24(%%esi), %%xmm7\n" /* im3 */
"movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */
- "subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
- "addl $-32, %%edx\n"
+ "subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
+ "addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
"mulps %%xmm4, %%xmm1\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
- "movups %%xmm1, (%%eax)\n"
- "addl $32, %%esi\n"
- "subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */
- "addl $-32, %%edi\n"
- "mulps %%xmm5, %%xmm2\n"
- "addl $32, %%eax\n"
- "movups %%xmm2, -16(%%eax)\n"
- "decl %%ecx\n"
+ "movaps %%xmm1, (%%eax)\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */
+ "addl $-32, %%edi\n"
+ "mulps %%xmm5, %%xmm2\n"
+ "addl $32, %%eax\n"
+ "movaps %%xmm2, -16(%%eax)\n"
+ "decl %%ecx\n"
"jnz .second_128_delay\n"
"popl %%edi\n"
static void imdct512_window_delay_nol_sse (complex_t *buf, float *data_ptr, float *window_prt, float *delay_prt)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"leal 504(%%eax), %%edi\n" /* buf[63].re */
"movl 12(%%ebp), %%eax\n" /* data */
+ ".align 16\n"
".first_128_sample:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | re1 | 0.0 | re0 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
- /* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* re1 | 0.0 | re0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* im2 */
"subps %%xmm1, %%xmm0\n" /* -re1 | im1 | -re0 | im0 */
"movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */
- "mulps %%xmm4, %%xmm0\n"
+ "mulps %%xmm4, %%xmm0\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
- /* addps %%xmm5, %%xmm0 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
- /* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
- "subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
- "addl $32, %%edx\n"
- "movups %%xmm0, (%%eax)\n"
- /* addl $32, %%ebx */
- "mulps %%xmm4, %%xmm6\n"
- "addl $32, %%esi\n"
- "addl $32, %%eax\n"
- /* addps %%xmm5, %%xmm6 */
- "addl $-32, %%edi\n"
- "movups %%xmm6, -16(%%eax)\n"
- "decl %%ecx\n"
+ "subps %%xmm2, %%xmm6\n" /* -re3 | im3 | -re2 | im2 */
+ "addl $32, %%edx\n"
+ "movaps %%xmm0, (%%eax)\n"
+ "mulps %%xmm4, %%xmm6\n"
+ "addl $32, %%esi\n"
+ "addl $32, %%eax\n"
+ "addl $-32, %%edi\n"
+ "movaps %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
"jnz .first_128_sample\n"
"movl 8(%%ebp), %%esi\n" /* buf[0].re */
"leal 1020(%%esi), %%edi\n" /* buf[127].im */
"movl $16, %%ecx\n" /* loop count */
+ ".align 16\n"
".second_128_sample:\n"
"movss (%%esi), %%xmm0\n" /* buf[i].re */
"movss 8(%%esi), %%xmm2\n" /* re1 */
"movlhps %%xmm3, %%xmm1\n" /* 0.0 | im1 | 0.0 | im1 */
"movups (%%edx), %%xmm4\n" /* w3 | w2 | w1 | w0 */
- /* movups (%%ebx), %%xmm5 d3 | d2 | d1 | d0 */
"shufps $0xb1, %%xmm1, %%xmm1\n"/* im1 | 0.0 | im0 | 0.0 */
"movss 16(%%esi), %%xmm6\n" /* re2 */
"movss 24(%%esi), %%xmm7\n" /* re3 */
"movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */
- "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
+ "subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
- "mulps %%xmm4, %%xmm0\n"
+ "mulps %%xmm4, %%xmm0\n"
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
"movups 16(%%edx), %%xmm4\n" /* w7 | w6 | w5 | w4 */
- "addl $32, %%esi\n"
- "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
- /* addps %%xmm5, %%xmm0 */
- "mulps %%xmm4, %%xmm6\n"
- "addl $-32, %%edi\n"
- /* movups 16(%%ebx), %%xmm5 d7 | d6 | d5 | d4 */
- "movups %%xmm0, (%%eax)\n"
- /* addps %%xmm5, %%xmm6 */
- "addl $32, %%edx\n"
- "addl $32, %%eax\n"
- /* addl $32, %%ebx */
- "movups %%xmm6, -16(%%eax)\n"
- "decl %%ecx\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
+ "mulps %%xmm4, %%xmm6\n"
+ "addl $-32, %%edi\n"
+ "movaps %%xmm0, (%%eax)\n"
+ "addl $32, %%edx\n"
+ "addl $32, %%eax\n"
+ "movaps %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
"jnz .second_128_sample\n"
"movl 8(%%ebp), %%eax\n"
"movl $16, %%ecx\n" /* loop count */
"movl 20(%%ebp), %%eax\n" /* delay */
+ ".align 16\n"
".first_128_delays:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss -16(%%edi), %%xmm2\n" /* im2 */
"movss -24(%%edi), %%xmm3\n" /* im3 */
"subps %%xmm1, %%xmm0\n" /* -im1 | re1 | -im0 | re0 */
- "addl $-32, %%edx\n"
+ "addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | re3 | 0.0 | re2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | im3 | 0.0 | im2 */
- "mulps %%xmm4, %%xmm0\n"
+ "mulps %%xmm4, %%xmm0\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* im3 | 0.0 | im2 | 0.0 */
- "movups %%xmm0, (%%eax)\n"
- "addl $32, %%esi\n"
- "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
- "addl $-32, %%edi\n"
- "mulps %%xmm5, %%xmm6\n"
- "addl $32, %%eax\n"
- "movups %%xmm6, -16(%%eax)\n"
- "decl %%ecx\n"
+ "movaps %%xmm0, (%%eax)\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm2, %%xmm6\n" /* -im3 | re3 | -im2 | re2 */
+ "addl $-32, %%edi\n"
+ "mulps %%xmm5, %%xmm6\n"
+ "addl $32, %%eax\n"
+ "movaps %%xmm6, -16(%%eax)\n"
+ "decl %%ecx\n"
"jnz .first_128_delays\n"
"movl 8(%%ebp), %%ebx\n"
"leal 1016(%%ebx), %%edi\n" /* buf[127].re */
"movl $16, %%ecx\n" /* loop count */
+ ".align 16\n"
".second_128_delays:\n"
"movss (%%esi), %%xmm0\n"
"movss 8(%%esi), %%xmm2\n"
"movss 24(%%esi), %%xmm7\n" /* im3 */
"movss -16(%%edi), %%xmm2\n" /* re2 */
"movss -24(%%edi), %%xmm3\n" /* re3 */
- "subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
- "addl $-32, %%edx\n"
+ "subps %%xmm0, %%xmm1\n" /* re1 | -im1 | re0 | -im0 */
+ "addl $-32, %%edx\n"
"movlhps %%xmm7, %%xmm6\n" /* 0.0 | im3 | 0.0 | im2 */
"movlhps %%xmm3, %%xmm2\n" /* 0.0 | re3 | 0.0 | re2 */
- "mulps %%xmm4, %%xmm1\n"
+ "mulps %%xmm4, %%xmm1\n"
"movups (%%edx), %%xmm5\n" /* w7 | w6 | w5 | w4 */
"shufps $0xb1, %%xmm2, %%xmm2\n"/* re3 | 0.0 | re2 | 0.0 */
- "movups %%xmm1, (%%eax)\n"
- "addl $32, %%esi\n"
- "subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */
- "addl $-32, %%edi\n"
- "mulps %%xmm5, %%xmm2\n"
- "addl $32, %%eax\n"
- "movups %%xmm2, -16(%%eax)\n"
- "decl %%ecx\n"
+ "movaps %%xmm1, (%%eax)\n"
+ "addl $32, %%esi\n"
+ "subps %%xmm6, %%xmm2\n" /* re | -im3 | re | -im2 */
+ "addl $-32, %%edi\n"
+ "mulps %%xmm5, %%xmm2\n"
+ "addl $32, %%eax\n"
+ "movaps %%xmm2, -16(%%eax)\n"
+ "decl %%ecx\n"
"jnz .second_128_delays\n"
"popl %%edi\n"
* ac3_srfft_3dn.c: accelerated 3D Now! ac3 fft functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_srfft_3dn.c,v 1.1 2001/05/16 14:51:29 reno Exp $
+ * $Id: ac3_srfft_3dn.c,v 1.2 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
*
static void fft_4_3dn (complex_t *x)
{
__asm__ __volatile__ (
+ ".align 16\n"
"movq (%%eax), %%mm0\n" /* x[0] */
"movq 8(%%eax), %%mm1\n" /* x[1] */
"movq 16(%%eax), %%mm2\n" /* x[2] */
* ac3_srfft_sse.c: accelerated SSE ac3 fft functions
*****************************************************************************
* Copyright (C) 1999, 2000, 2001 VideoLAN
- * $Id: ac3_srfft_sse.c,v 1.3 2001/07/01 08:49:09 gbazin Exp $
+ * $Id: ac3_srfft_sse.c,v 1.4 2001/07/08 23:15:11 reno Exp $
*
* Authors: Renaud Dartus <reno@videolan.org>
* Aaron Holtzman <aholtzma@engr.uvic.ca>
/* NOTE(review): hsqrt2_sse is never meant to be executed.  Its asm body
 * assembles four .float constants ( 1/sqrt2 | 1/sqrt2 | -1/sqrt2 | -1/sqrt2 )
 * directly into the text section at the function's address; the FFT code
 * below loads them as one 16-byte SSE vector via "movl $hsqrt2_sse, %%ebx"
 * followed by "movups (%%ebx), %%xmmN".  The patch adds __volatile__ so the
 * optimizer cannot discard the asm as dead code.  A function symbol carries
 * no 16-byte alignment guarantee, which is presumably why callers keep
 * using movups (not movaps) on this blob -- verify before "optimizing". */
void hsqrt2_sse (void)
{
-    __asm__ (
-    ".float 0f0.707106781188\n"
-    ".float 0f0.707106781188\n"
-    ".float 0f-0.707106781188\n"
-    ".float 0f-0.707106781188\n"
-    );
+    __asm__ __volatile__ (
+    ".float 0f0.707106781188\n"
+    ".float 0f0.707106781188\n"
+    ".float 0f-0.707106781188\n"
+    ".float 0f-0.707106781188\n"
+    );
}
/* NOTE(review): C_1_sse, like hsqrt2_sse, exists only to embed the sign
 * mask ( -1 | 1 | -1 | 1 ) as a 16-byte constant addressable through the
 * function's symbol ("movl $C_1_sse, reg"); it must never be called.
 * Elsewhere in this patch some loads from $C_1_sse were changed from
 * movups to movaps -- movaps faults unless the blob happens to land on a
 * 16-byte boundary, and a function symbol guarantees no such alignment.
 * TODO confirm the section alignment before keeping those movaps. */
void C_1_sse (void)
{
-    __asm__ (
-    ".float 0f-1.0\n"
-    ".float 0f1.0\n"
-    ".float 0f-1.0\n"
-    ".float 0f1.0\n"
-    );
+    __asm__ __volatile__ (
+    ".float 0f-1.0\n"
+    ".float 0f1.0\n"
+    ".float 0f-1.0\n"
+    ".float 0f1.0\n"
+    );
}
/* fft_4_sse: in-place 4-point complex FFT of x[0..3], two radix-2
 * butterfly stages held entirely in SSE registers (x passed in %eax).
 * The patch replaces every movups on x and every reg-to-reg movups with
 * movaps, so x MUST now be 16-byte aligned -- this is what the new
 * aligned(16) attributes on the decoder structures are meant to provide.
 * The instruction order encodes the data flow; do not reorder. */
static void fft_4_sse (complex_t *x)
{
    __asm__ __volatile__ (
+    ".align 16\n"
-    "movups (%%eax), %%xmm0\n"      /* x[1] | x[0] */
-    "movups 16(%%eax), %%xmm2\n"    /* x[3] | x[2] */
-    "movups %%xmm0, %%xmm1\n"       /* x[1] | x[0] */
-    "addps %%xmm2, %%xmm0\n"        /* x[1] + x[3] | x[0] + x[2] */
-    "subps %%xmm2, %%xmm1\n"        /* x[1] - x[3] | x[0] - x[2] */
+    "movaps (%%eax), %%xmm0\n"      /* x[1] | x[0] */
+    "movaps 16(%%eax), %%xmm2\n"    /* x[3] | x[2] */
+    "movaps %%xmm0, %%xmm1\n"       /* x[1] | x[0] */
+    "addps %%xmm2, %%xmm0\n"        /* x[1] + x[3] | x[0] + x[2] */
+    "subps %%xmm2, %%xmm1\n"        /* x[1] - x[3] | x[0] - x[2] */
    "xorps %%xmm6, %%xmm6\n"
-    "movhlps %%xmm1, %%xmm4\n"      /* ? | x[1] - x[3] */
-    "movhlps %%xmm0, %%xmm3\n"      /* ? | x[1] + x[3] */
-    "subss %%xmm4, %%xmm6\n"        /* 0 | -(x[1] - x[3]).re */
-    "movlhps %%xmm1, %%xmm0\n"      /* x[0] - x[2] | x[0] + x[2] */
-    "movlhps %%xmm6, %%xmm4\n"      /* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[3]-x[1]).re */
-    "movups %%xmm0, %%xmm2\n"       /* x[0] - x[2] | x[0] + x[2] */
+    "movhlps %%xmm1, %%xmm4\n"      /* ? | x[1] - x[3] */
+    "movhlps %%xmm0, %%xmm3\n"      /* ? | x[1] + x[3] */
+    "subss %%xmm4, %%xmm6\n"        /* 0 | -(x[1] - x[3]).re */
+    "movlhps %%xmm1, %%xmm0\n"      /* x[0] - x[2] | x[0] + x[2] */
+    "movlhps %%xmm6, %%xmm4\n"      /* 0 | -(x[1] - x[3]).re | (x[1] - x[3]).im | (x[3]-x[1]).re */
+    "movaps %%xmm0, %%xmm2\n"       /* x[0] - x[2] | x[0] + x[2] */
    "shufps $0x94, %%xmm4, %%xmm3\n" /* i*(x[1] - x[3]) | x[1] + x[3] */
    "addps %%xmm3, %%xmm0\n"
    "subps %%xmm3, %%xmm2\n"
-    "movups %%xmm0, (%%eax)\n"
-    "movups %%xmm2, 16(%%eax)\n"
+    "movaps %%xmm0, (%%eax)\n"
+    "movaps %%xmm2, 16(%%eax)\n"
    : "=a" (x)
    : "a" (x) );
}
/* fft_8_sse: in-place 8-point complex FFT over x[0..7] (x in %eax),
 * combining two 4-point halves split-radix style; %ebx is borrowed to
 * address the constant blobs and is saved/restored around the asm.
 * The patch changes movups -> movaps for all x[] and reg-to-reg moves,
 * so x must now be 16-byte aligned.  The 1/sqrt2 ($hsqrt2_sse) and sign
 * mask ($C_1_sse) constants are still fetched with movups here, since
 * their alignment is not guaranteed.  NOTE(review): this diff hunk is
 * truncated -- the asm's remaining operand lines and closing brace fall
 * outside the visible chunk; code below is annotated byte-identically. */
static void fft_8_sse (complex_t *x)
{
    __asm__ __volatile__ (
+    ".align 16\n"
    "pushl %%ebx\n"
    "movlps (%%eax), %%xmm0\n"      /* x[0] */
    "movlps 32(%%eax), %%xmm1\n"    /* x[4] */
    "movhps 16(%%eax), %%xmm0\n"    /* x[2] | x[0] */
    "movhps 48(%%eax), %%xmm1\n"    /* x[6] | x[4] */
-    "movups %%xmm0, %%xmm2\n"       /* x[2] | x[0] */
+    "movaps %%xmm0, %%xmm2\n"       /* x[2] | x[0] */
    "xorps %%xmm3, %%xmm3\n"
-    "addps %%xmm1, %%xmm0\n"        /* x[2] + x[6] | x[0] + x[4] */
-    "subps %%xmm1, %%xmm2\n"        /* x[2] - x[6] | x[0] - x[4] */
-    "movhlps %%xmm0, %%xmm5\n"      /* x[2] + x[6] */
+    "addps %%xmm1, %%xmm0\n"        /* x[2] + x[6] | x[0] + x[4] */
+    "subps %%xmm1, %%xmm2\n"        /* x[2] - x[6] | x[0] - x[4] */
+    "movhlps %%xmm0, %%xmm5\n"      /* x[2] + x[6] */
    "movhlps %%xmm2, %%xmm4\n"      /* x[2] - x[6] */
-    "movlhps %%xmm2, %%xmm0\n"      /* x[0] - x[4] | x[0] + x[4] */
-    "subss %%xmm4, %%xmm3\n"        /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
-    "movups %%xmm0, %%xmm7\n"       /* x[0] - x[4] | x[0] + x[4] */
-    "movups %%xmm3, %%xmm4\n"       /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
-    "movlps 8(%%eax), %%xmm1\n"     /* x[1] */
+    "movlhps %%xmm2, %%xmm0\n"      /* x[0] - x[4] | x[0] + x[4] */
+    "subss %%xmm4, %%xmm3\n"        /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
+    "movaps %%xmm0, %%xmm7\n"       /* x[0] - x[4] | x[0] + x[4] */
+    "movaps %%xmm3, %%xmm4\n"       /* (x[2]-x[6]).im | -(x[2]-x[6]).re */
+    "movlps 8(%%eax), %%xmm1\n"     /* x[1] */
    "shufps $0x14, %%xmm4, %%xmm5\n" /* i*(x[2] - x[6]) | x[2] + x[6] */
-    "addps %%xmm5, %%xmm0\n"        /* yt = i*(x2-x6)+x0-x4 | x2+x6+x0+x4 */
-    "subps %%xmm5, %%xmm7\n"        /* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */
+    "addps %%xmm5, %%xmm0\n"        /* yt = i*(x2-x6)+x0-x4 | x2+x6+x0+x4 */
+    "subps %%xmm5, %%xmm7\n"        /* yb = i*(x6-x2)+x0-x4 | -x6-x2+x0+x4 */
    "movhps 24(%%eax), %%xmm1\n"    /* x[3] | x[1] */
    "movl $hsqrt2_sse, %%ebx\n"
    "movlps 40(%%eax), %%xmm2\n"    /* x[5] */
    "movhps 56(%%eax), %%xmm2\n"    /* x[7] | x[5] */
-    "movups %%xmm1, %%xmm3\n"       /* x[3] | x[1] */
-    "addps %%xmm2, %%xmm1\n"        /* x[3] + x[7] | x[1] + x[5] */
-    "subps %%xmm2, %%xmm3\n"        /* x[3] - x[7] | x[1] - x[5] */
-    "movups (%%ebx), %%xmm4\n"      /* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
-    "movups %%xmm3, %%xmm6\n"       /* x[3] - x[7] | x[1] - x[5] */
+    "movaps %%xmm1, %%xmm3\n"       /* x[3] | x[1] */
+    "addps %%xmm2, %%xmm1\n"        /* x[3] + x[7] | x[1] + x[5] */
+    "subps %%xmm2, %%xmm3\n"        /* x[3] - x[7] | x[1] - x[5] */
+    "movups (%%ebx), %%xmm4\n"      /* -1/sqrt2 | -1/sqrt2 | 1/sqrt2 | 1/sqrt2 */
+    "movaps %%xmm3, %%xmm6\n"       /* x[3] - x[7] | x[1] - x[5] */
    "mulps %%xmm4, %%xmm3\n"        /* -1/s2*(x[3] - x[7]) | 1/s2*(x[1] - x[5]) */
    "shufps $0xc8, %%xmm4, %%xmm4\n" /* -1/sqrt2 | 1/sqrt2 | -1/sqrt2 | 1/sqrt2 */
    "shufps $0xb1, %%xmm6, %%xmm6\n" /* (x3-x7).re|(x3-x7).im|(x1-x5).re|(x1-x5).im */
    "mulps %%xmm4, %%xmm6\n"        /* (x7-x3).re/s2|(x3-x7).im/s2|(x5-x1).re/s2|(x1-x5).im/s2 */
-    "addps %%xmm3, %%xmm6\n"        /* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5]) */
-    "movhlps %%xmm1, %%xmm5\n"      /* x[3] + x[7] */
-    "movlhps %%xmm6, %%xmm1\n"      /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
-    "shufps $0xe4, %%xmm6, %%xmm5\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
-    "movups %%xmm1, %%xmm3\n"       /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+    "addps %%xmm3, %%xmm6\n"        /* (-1-i)/sqrt2 * (x[3]-x[7]) | (1-i)/sqrt2 * (x[1] - x[5]) */
+    "movhlps %%xmm1, %%xmm5\n"      /* x[3] + x[7] */
+    "movlhps %%xmm6, %%xmm1\n"      /* (1+i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
+    "shufps $0xe4, %%xmm6, %%xmm5\n" /* (-1-i)/sqrt2 * (x[3]-x[7]) | x[3]+x[7] */
+    "movaps %%xmm1, %%xmm3\n"       /* (1-i)/sqrt2 * (x[1]-x[5]) | x[1]+x[5] */
    "movl $C_1_sse, %%ebx\n"
-    "addps %%xmm5, %%xmm1\n"        /* u */
-    "subps %%xmm5, %%xmm3\n"        /* v */
-    "movups %%xmm0, %%xmm2\n"       /* yb */
-    "movups %%xmm7, %%xmm4\n"       /* yt */
+    "addps %%xmm5, %%xmm1\n"        /* u */
+    "subps %%xmm5, %%xmm3\n"        /* v */
+    "movaps %%xmm0, %%xmm2\n"       /* yb */
+    "movaps %%xmm7, %%xmm4\n"       /* yt */
    "movups (%%ebx), %%xmm5\n"
    "mulps %%xmm5, %%xmm3\n"
-    "addps %%xmm1, %%xmm0\n"        /* yt + u */
-    "subps %%xmm1, %%xmm2\n"        /* yt - u */
+    "addps %%xmm1, %%xmm0\n"        /* yt + u */
+    "subps %%xmm1, %%xmm2\n"        /* yt - u */
    "shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */
-    "movups %%xmm0, (%%eax)\n"
-    "movups %%xmm2, 32(%%eax)\n"
-    "addps %%xmm3, %%xmm4\n"        /* yb - i*v */
-    "subps %%xmm3, %%xmm7\n"        /* yb + i*v */
-    "movups %%xmm4, 16(%%eax)\n"
-    "movups %%xmm7, 48(%%eax)\n"
+    "movaps %%xmm0, (%%eax)\n"
+    "movaps %%xmm2, 32(%%eax)\n"
+    "addps %%xmm3, %%xmm4\n"        /* yb - i*v */
+    "subps %%xmm3, %%xmm7\n"        /* yb + i*v */
+    "movaps %%xmm4, 16(%%eax)\n"
+    "movaps %%xmm7, 48(%%eax)\n"
    "popl %%ebx\n"
    : "=a" (x)
const complex_t *d, const complex_t *d_3)
{
__asm__ __volatile__ (
+ ".align 16\n"
"pushl %%ebp\n"
"movl %%esp, %%ebp\n"
"pushl %%eax\n"
"pushl %%ebx\n"
- "pushl %%ecx\n"
+ "pushl %%ecx\n" //
"pushl %%edx\n"
"pushl %%esi\n"
- "pushl %%edi\n"
+// "movl %%edi, %%ecx\n" /* k */
+ "pushl %%edi\n" //
"movl 8(%%ebp), %%ecx\n" /* k */
"movl 12(%%ebp), %%eax\n" /* x */
"movl 16(%%ebp), %%ebx\n" /* wT */
"movl 20(%%ebp), %%edx\n" /* d */
"movl 24(%%ebp), %%esi\n" /* d3 */
- "shll $4, %%ecx\n" /* 16k */
- "addl $8, %%edx\n"
+ "shll $4, %%ecx\n" /* 16k */ ///
+ "addl $8, %%edx\n"
"leal (%%eax, %%ecx, 2), %%edi\n"
"addl $8, %%esi\n"
-
+
/* TRANSZERO and TRANS */
- "movups (%%eax), %%xmm0\n" /* x[1] | x[0] */
- "movups (%%ebx), %%xmm1\n" /* wT[1] | wT[0] */
- "movups (%%ebx, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */
- "movlps (%%edx), %%xmm3\n" /* d */
- "movlps (%%esi), %%xmm4\n" /* d3 */
- "movhlps %%xmm1, %%xmm5\n" /* wT[1] */
- "movhlps %%xmm2, %%xmm6\n" /* wB[1] */
+ ".align 16\n"
+ "movaps (%%eax), %%xmm0\n" /* x[1] | x[0] */
+ "movaps (%%ebx), %%xmm1\n" /* wT[1] | wT[0] */
+ "movaps (%%ebx, %%ecx), %%xmm2\n" /* wB[1] | wB[0] */
+ "movlps (%%edx), %%xmm3\n" /* d */
+ "movlps (%%esi), %%xmm4\n" /* d3 */
+ "movhlps %%xmm1, %%xmm5\n" /* wT[1] */
+ "movhlps %%xmm2, %%xmm6\n" /* wB[1] */
"shufps $0x50, %%xmm3, %%xmm3\n" /* d[1].im | d[1].im | d[1].re | d[1].re */
"shufps $0x50, %%xmm4, %%xmm4\n" /* d3[1].im | d3[1].im | d3[i].re | d3[i].re */
"movlhps %%xmm5, %%xmm5\n" /* wT[1] | wT[1] */
"movlhps %%xmm6, %%xmm5\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wT[1].im * d[1].re | wT[1].re * d[1].re */
"shufps $0xb1, %%xmm6, %%xmm7\n" /* wB[1].re * d3[1].im | wB[i].im * d3[1].im | wT[1].re * d[1].im | wT[1].im * d[1].im */
"movl $C_1_sse, %%edi\n"
- "movups (%%edi), %%xmm4\n"
+ "movaps (%%edi), %%xmm4\n"
"mulps %%xmm4, %%xmm7\n"
"addps %%xmm7, %%xmm5\n" /* wB[1] * d3[1] | wT[1] * d[1] */
"movlhps %%xmm5, %%xmm1\n" /* d[1] * wT[1] | wT[0] */
"shufps $0xe4, %%xmm5, %%xmm2\n" /* d3[1] * wB[1] | wB[0] */
- "movups %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */
+ "movaps %%xmm1, %%xmm3\n" /* d[1] * wT[1] | wT[0] */
"leal (%%eax, %%ecx, 2), %%edi\n"
"addps %%xmm2, %%xmm1\n" /* u */
"subps %%xmm2, %%xmm3\n" /* v */
"mulps %%xmm4, %%xmm3\n"
- "movups (%%eax, %%ecx), %%xmm5\n" /* xk[1] | xk[0] */
+ "movaps (%%eax, %%ecx), %%xmm5\n" /* xk[1] | xk[0] */
"shufps $0xb1, %%xmm3, %%xmm3\n" /* -i * v */
- "movups %%xmm0, %%xmm2\n" /* x[1] | x[0] */
- "movups %%xmm5, %%xmm6\n" /* xk[1] | xk[0] */
+ "movaps %%xmm0, %%xmm2\n" /* x[1] | x[0] */
+ "movaps %%xmm5, %%xmm6\n" /* xk[1] | xk[0] */
"addps %%xmm1, %%xmm0\n"
"subps %%xmm1, %%xmm2\n"
"addps %%xmm3, %%xmm5\n"
"subps %%xmm3, %%xmm6\n"
- "movups %%xmm0, (%%eax)\n"
- "movups %%xmm2, (%%edi)\n"
- "movups %%xmm5, (%%eax, %%ecx)\n"
- "movups %%xmm6, (%%edi, %%ecx)\n"
+ "movaps %%xmm0, (%%eax)\n"
+ "movaps %%xmm2, (%%edi)\n"
+ "movaps %%xmm5, (%%eax, %%ecx)\n"
+ "movaps %%xmm6, (%%edi, %%ecx)\n"
"addl $16, %%eax\n"
"addl $16, %%ebx\n"
"addl $8, %%edx\n"
"addl $8, %%esi\n"
"decl -4(%%ebp)\n"
+ ".align 16\n"
".loop:\n"
- "movups (%%ebx), %%xmm0\n" /* wT[1] | wT[0] */
- "movups (%%edx), %%xmm1\n" /* d[1] | d[0] */
+ "movaps (%%ebx), %%xmm0\n" /* wT[1] | wT[0] */
+ "movaps (%%edx), %%xmm1\n" /* d[1] | d[0] */
- "movups (%%ebx, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */
- "movups (%%esi), %%xmm5\n" /* d3[1] | d3[0] */
+ "movaps (%%ebx, %%ecx), %%xmm4\n" /* wB[1] | wB[0] */
+ "movaps (%%esi), %%xmm5\n" /* d3[1] | d3[0] */
"movhlps %%xmm0, %%xmm2\n" /* wT[1] */
"movhlps %%xmm1, %%xmm3\n" /* d[1] */
"movlhps %%xmm2, %%xmm0\n" /* d[1].re * wT[1].im | d[1].re * wT[1].re | d[0].re * wT[0].im | d[0].re * wT[0].re */
"mulps %%xmm5, %%xmm4\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im | wB[0].im * d3[0].re | wB[0].re * d3[0].re */
"mulps %%xmm7, %%xmm6\n" /* wB[1].im * d3[1].im | wB[1].re * d3[1].im | wB[1].im * d3[1].re | wB[1].re * d3[1].re */
- "shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
+ "shufps $0xb1, %%xmm2, %%xmm1\n" /* d[1].im * wT[1].re | d[1].im * wT[1].im | d[0].im * wT[0].re | d[0].im * wT[0].im */
"movl $C_1_sse, %%edi\n"
- "movups (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
+ "movaps (%%edi), %%xmm3\n" /* 1.0 | -1.0 | 1.0 | -1.0 */
"movhlps %%xmm4, %%xmm5\n" /* wB[0].im * d3[0].im | wB[0].re * d3[0].im */
"mulps %%xmm3, %%xmm1\n" /* d[1].im * wT[1].re | -d[1].im * wT[1].im | d[0].im * wT[0].re | -d[0].im * wT[0].im */
"movlhps %%xmm6, %%xmm4\n" /* wB[1].im * d3[1].re | wB[1].re * d3[1].re | wB[0].im * d3[0].re | wB[0].im * d3[0].re */
"addps %%xmm1, %%xmm0\n" /* wT[1] * d[1] | wT[0] * d[0] */
- "shufps $0xb1, %%xmm6, %%xmm5\n" /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
+ "shufps $0xb1, %%xmm6, %%xmm5\n" /* wB[1].re * d3[1].im | wB[1].im * d3[1].im | wB[0].re * d3[0].im | wB[0].im * d3[0].im */
"mulps %%xmm3, %%xmm5\n" /* wB[1].re * d3[1].im | -wB[1].im * d3[1].im | wB[0].re * d3[0].im | -wB[0].im * d3[0].im */
"addps %%xmm5, %%xmm4\n" /* wB[1] * d3[1] | wB[0] * d3[0] */
- "movups %%xmm0, %%xmm1\n" /* wT[1] * d[1] | wT[0] * d[0] */
+ "movaps %%xmm0, %%xmm1\n" /* wT[1] * d[1] | wT[0] * d[0] */
"addps %%xmm4, %%xmm0\n" /* u */
"subps %%xmm4, %%xmm1\n" /* v */
- "movups (%%eax), %%xmm6\n" /* x[1] | x[0] */
+ "movaps (%%eax), %%xmm6\n" /* x[1] | x[0] */
"leal (%%eax, %%ecx, 2), %%edi\n"
"mulps %%xmm3, %%xmm1\n"
"addl $16, %%ebx\n"
"addl $16, %%esi\n"
"shufps $0xb1, %%xmm1, %%xmm1\n" /* -i * v */
- "movups (%%eax, %%ecx), %%xmm7\n" /* xk[1] | xk[0] */
- "movups %%xmm6, %%xmm2\n"
- "movups %%xmm7, %%xmm4\n"
+ "movaps (%%eax, %%ecx), %%xmm7\n" /* xk[1] | xk[0] */
+ "movaps %%xmm6, %%xmm2\n"
+ "movaps %%xmm7, %%xmm4\n"
"addps %%xmm0, %%xmm6\n"
"subps %%xmm0, %%xmm2\n"
- "movups %%xmm6, (%%eax)\n"
- "movups %%xmm2, (%%edi)\n"
+ "movaps %%xmm6, (%%eax)\n"
+ "movaps %%xmm2, (%%edi)\n"
"addps %%xmm1, %%xmm7\n"
"subps %%xmm1, %%xmm4\n"
"addl $16, %%edx\n"
- "movups %%xmm7, (%%eax, %%ecx)\n"
- "movups %%xmm4, (%%edi, %%ecx)\n"
+ "movaps %%xmm7, (%%eax, %%ecx)\n"
+ "movaps %%xmm4, (%%edi, %%ecx)\n"
"addl $16, %%eax\n"
"decl -4(%%ebp)\n"
"jnz .loop\n"
+ ".align 16\n"
".end:\n"
- "popl %%edi\n"
+ "popl %%edi\n" //
"popl %%esi\n"
"popl %%edx\n"
- "popl %%ecx\n"
+ "popl %%ecx\n" //
"popl %%ebx\n"
"popl %%eax\n"
* ac3_decoder.h : ac3 decoder interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder.h,v 1.10 2001/06/12 00:30:41 reno Exp $
+ * $Id: ac3_decoder.h,v 1.11 2001/07/08 23:15:11 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
* Renaud Dartus <reno@videolan.org>
struct ac3dec_s
{
+    /* NOTE(review): samples and imdct are hoisted to the front of the
+     * struct with aligned(16) so that the SSE movaps code can touch
+     * them -- but GCC's aligned(16) on a member only fixes its offset
+     * relative to the struct base.  The containing object must itself
+     * sit on a 16-byte boundary (hence the pointer-rounding hack in
+     * ac3_decoder_thread.c); a misaligned container silently breaks
+     * this and movaps will fault. */
+    float samples[6][256] __attribute__ ((aligned(16)));
+    imdct_t imdct __attribute__ ((aligned(16)));
+
    /*
     * Input properties
     */
    bsi_t bsi;
    audblk_t audblk;
-    float samples[6][256] __attribute__ ((aligned(16)));
    dm_par_t dm_par;
    bit_allocate_t bit_allocate;
    mantissa_t mantissa;
-    imdct_t imdct;
    downmix_t downmix;
};
* ac3_decoder_thread.c: ac3 decoder thread
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder_thread.c,v 1.34 2001/05/31 01:37:08 sam Exp $
+ * $Id: ac3_decoder_thread.c,v 1.35 2001/07/08 23:15:11 reno Exp $
*
* Authors: Michel Lespinasse <walken@zoy.org>
*
intf_DbgMsg( "ac3dec debug: creating ac3 decoder thread" );
    /* Allocate the memory needed to store the thread's structure */
-    if((p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t)))==NULL)
+    p_ac3thread = (ac3dec_thread_t *)malloc(sizeof(ac3dec_thread_t));
+
+    /* We need to be 16 bytes aligned */
+    /* NOTE(review): this alignment hack has several defects:
+     *  1. p_ac3thread is dereferenced on the next line BEFORE the NULL
+     *     check below, so a failed malloc() now crashes instead of
+     *     being reported.
+     *  2. (-15) is 0x...FFF1 -- NOT the 16-byte alignment mask.  Align-
+     *     down is "& ~15" (i.e. & -16); masking with -15 mangles the
+     *     address for most inputs.
+     *  3. Even with the correct mask, rounding DOWN can land up to 15
+     *     bytes BEFORE the block malloc() returned, and no slack bytes
+     *     were allocated -- writes through the struct then corrupt the
+     *     heap.  The standard idiom is malloc(size + 15), round UP, and
+     *     keep the original pointer for free() (or use memalign()).
+     *  4. Round-tripping the pointer through the int field ac3thread
+     *     truncates it on 64-bit targets. */
+    p_ac3thread->ac3thread = (int)p_ac3thread & (-15);
+    p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
+
+    if(p_ac3thread == NULL)
    {
        intf_ErrMsg ( "ac3dec error: not enough memory "
                "for ac3dec_CreateThread() to create the new thread");
    /* Destroy descriptor */
    free( p_ac3thread->p_config );
+    /* NOTE(review): ac3thread was assigned the ROUNDED (re-aligned)
+     * address in ac3dec_CreateThread(), not the pointer malloc()
+     * actually returned -- so this free() is handed an address malloc
+     * never gave out: undefined behavior / heap corruption.  Worse, the
+     * field is read back through the already-realigned struct, which is
+     * a different object than the one it was written through.  The
+     * original malloc() return value must be preserved untouched for
+     * free(). */
+    p_ac3thread = (ac3dec_thread_t *)p_ac3thread->ac3thread;
    free( p_ac3thread );
    intf_DbgMsg ("ac3dec debug: ac3 decoder thread %p destroyed", p_ac3thread);
* ac3_decoder_thread.h : ac3 decoder thread interface
*****************************************************************************
* Copyright (C) 1999, 2000 VideoLAN
- * $Id: ac3_decoder_thread.h,v 1.7 2001/05/14 15:58:03 reno Exp $
+ * $Id: ac3_decoder_thread.h,v 1.8 2001/07/08 23:15:11 reno Exp $
*
* Authors: Michel Kaempf <maxx@via.ecp.fr>
*
/*****************************************************************************
* ac3dec_thread_t : ac3 decoder thread descriptor
*****************************************************************************/
+
typedef struct ac3dec_thread_s
{
+    /*
+     * Decoder properties
+     */
+    /* NOTE(review): the two floats below pad the start of the struct by
+     * 8 bytes so that, combined with the pointer-rounding performed in
+     * ac3dec_CreateThread(), ac3_decoder is supposed to land on a
+     * 16-byte boundary for the SSE code.  This only holds if the
+     * (rounded) struct base is itself 8-byte aligned -- fragile; an
+     * aligned allocator is the portable fix.  ("alignement" is a typo
+     * for "alignment" but is an identifier, so it is kept as-is.) */
+    float used_for_alignement1;
+    float used_for_alignement2;
+    ac3dec_t ac3_decoder __attribute__ ((aligned(16)));
+
    /*
     * Thread properties
     */
    int sync_ptr; /* sync ptr from ac3 magic header */
    adec_config_t * p_config;

-    /*
-     * Decoder properties
-     */
-    ac3dec_t ac3_decoder;
-
    /*
     * Output properties
     */
    aout_fifo_t * p_aout_fifo; /* stores the decompressed audio frames */
-
+    int ac3thread; /* NOTE(review): intended to save the pre-alignment
+                    * pointer for free(), but CreateThread actually
+                    * stores the rounded address, and int truncates
+                    * pointers on 64-bit -- should be a pointer-sized
+                    * field holding the original malloc() result */
+
} ac3dec_thread_t;
/*****************************************************************************