git.sesse.net Git - ffmpeg/blob - libavcodec/x86/mpegaudiodsp.c

   1 /*
   2  * SIMD-optimized MP3 decoding functions
   3  * Copyright (c) 2010 Vitor Sessak
   4  *
   5  * This file is part of Libav.
   6  *
   7  * Libav is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * Libav is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with Libav; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "libavutil/attributes.h"
  23 #include "libavutil/cpu.h"
  24 #include "libavutil/internal.h"
  25 #include "libavutil/x86/asm.h"
  26 #include "libavutil/x86/cpu.h"
  27 #include "libavcodec/mpegaudiodsp.h"
  28
  29 void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
  30 void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
  31 void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
  32 void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
  33 void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
  34 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
  35                                float *tmpbuf);
  36 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
  37                                float *tmpbuf);
  38
  39 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
  40
  41 #if HAVE_SSE2_INLINE
  42
  43 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
  44 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
  45
  46 #define SUM8(op, sum, w, p)               \
  47 {                                         \
  48     op(sum, (w)[0 * 64], (p)[0 * 64]);    \
  49     op(sum, (w)[1 * 64], (p)[1 * 64]);    \
  50     op(sum, (w)[2 * 64], (p)[2 * 64]);    \
  51     op(sum, (w)[3 * 64], (p)[3 * 64]);    \
  52     op(sum, (w)[4 * 64], (p)[4 * 64]);    \
  53     op(sum, (w)[5 * 64], (p)[5 * 64]);    \
  54     op(sum, (w)[6 * 64], (p)[6 * 64]);    \
  55     op(sum, (w)[7 * 64], (p)[7 * 64]);    \
  56 }
  57
  58 static void apply_window(const float *buf, const float *win1,
  59                          const float *win2, float *sum1, float *sum2, int len)
  60 {
  61     x86_reg count = - 4*len;
  62     const float *win1a = win1+len;
  63     const float *win2a = win2+len;
  64     const float *bufa  = buf+len;
  65     float *sum1a = sum1+len;
  66     float *sum2a = sum2+len;
  67
  68
  69 #define MULT(a, b)                                 \
  70     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
  71     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
  72     "mulps         %%xmm2, %%xmm1           \n\t"  \
  73     "subps         %%xmm1, %%xmm0           \n\t"  \
  74     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
  75     "subps         %%xmm2, %%xmm4           \n\t"  \
  76
  77     __asm__ volatile(
  78             "1:                                   \n\t"
  79             "xorps       %%xmm0, %%xmm0           \n\t"
  80             "xorps       %%xmm4, %%xmm4           \n\t"
  81
  82             MULT(   0,   0)
  83             MULT( 256,  64)
  84             MULT( 512, 128)
  85             MULT( 768, 192)
  86             MULT(1024, 256)
  87             MULT(1280, 320)
  88             MULT(1536, 384)
  89             MULT(1792, 448)
  90
  91             "movaps      %%xmm0, (%4,%0)          \n\t"
  92             "movaps      %%xmm4, (%5,%0)          \n\t"
  93             "add            $16,  %0              \n\t"
  94             "jl              1b                   \n\t"
  95             :"+&r"(count)
  96             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
  97             );
  98
  99 #undef MULT
 100 }
 101
 102 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
 103                              int incr)
 104 {
 105     LOCAL_ALIGNED_16(float, suma, [17]);
 106     LOCAL_ALIGNED_16(float, sumb, [17]);
 107     LOCAL_ALIGNED_16(float, sumc, [17]);
 108     LOCAL_ALIGNED_16(float, sumd, [17]);
 109
 110     float sum;
 111
 112     /* copy to avoid wrap */
 113     __asm__ volatile(
 114             "movaps    0(%0), %%xmm0   \n\t" \
 115             "movaps   16(%0), %%xmm1   \n\t" \
 116             "movaps   32(%0), %%xmm2   \n\t" \
 117             "movaps   48(%0), %%xmm3   \n\t" \
 118             "movaps   %%xmm0,   0(%1) \n\t" \
 119             "movaps   %%xmm1,  16(%1) \n\t" \
 120             "movaps   %%xmm2,  32(%1) \n\t" \
 121             "movaps   %%xmm3,  48(%1) \n\t" \
 122             "movaps   64(%0), %%xmm0   \n\t" \
 123             "movaps   80(%0), %%xmm1   \n\t" \
 124             "movaps   96(%0), %%xmm2   \n\t" \
 125             "movaps  112(%0), %%xmm3   \n\t" \
 126             "movaps   %%xmm0,  64(%1) \n\t" \
 127             "movaps   %%xmm1,  80(%1) \n\t" \
 128             "movaps   %%xmm2,  96(%1) \n\t" \
 129             "movaps   %%xmm3, 112(%1) \n\t"
 130             ::"r"(in), "r"(in+512)
 131             :"memory"
 132             );
 133
 134     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
 135     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
 136
 137     SUM8(MACS, suma[0], win + 32, in + 48);
 138
 139     sumc[ 0] = 0;
 140     sumb[16] = 0;
 141     sumd[16] = 0;
 142
 143 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
 144             "movups " #sumd "(%4),       %%xmm0          \n\t" \
 145             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
 146             "subps  " #suma "(%1),       %%xmm0          \n\t" \
 147             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
 148 \
 149             "movups " #sumc "(%3),       %%xmm0          \n\t" \
 150             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
 151             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
 152             "movaps        %%xmm0," #out2 "(%0)          \n\t"
 153
 154     if (incr == 1) {
 155         __asm__ volatile(
 156             SUMS( 0, 48,  4, 52,  0, 112)
 157             SUMS(16, 32, 20, 36, 16,  96)
 158             SUMS(32, 16, 36, 20, 32,  80)
 159             SUMS(48,  0, 52,  4, 48,  64)
 160
 161             :"+&r"(out)
 162             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
 163             :"memory"
 164             );
 165         out += 16*incr;
 166     } else {
 167         int j;
 168         float *out2 = out + 32 * incr;
 169         out[0  ]  = -suma[   0];
 170         out += incr;
 171         out2 -= incr;
 172         for(j=1;j<16;j++) {
 173             *out  = -suma[   j] + sumd[16-j];
 174             *out2 =  sumb[16-j] + sumc[   j];
 175             out  += incr;
 176             out2 -= incr;
 177         }
 178     }
 179
 180     sum = 0;
 181     SUM8(MLSS, sum, win + 16 + 32, in + 32);
 182     *out = sum;
 183 }
 184
 185 #endif /* HAVE_SSE2_INLINE */
 186
 187 #if HAVE_YASM
 188 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
 189 static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
 190                                int count, int switch_point, int block_type) \
 191 {                                                                           \
 192     int align_end = count - (count & 3);                                \
 193     int j;                                                              \
 194     for (j = 0; j < align_end; j+= 4) {                                 \
 195         LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
 196         float *win = mdct_win_sse[switch_point && j < 4][block_type];   \
 197         /* apply window & overlap with previous buffer */               \
 198                                                                         \
 199         /* select window */                                             \
 200         ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
 201         in      += 4*18;                                                \
 202         buf     += 4*18;                                                \
 203         out     += 4;                                                   \
 204     }                                                                   \
 205     for (; j < count; j++) {                                            \
 206         /* apply window & overlap with previous buffer */               \
 207                                                                         \
 208         /* select window */                                             \
 209         int win_idx = (switch_point && j < 2) ? 0 : block_type;         \
 210         float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];       \
 211                                                                         \
 212         ff_imdct36_float_ ## CPU1(out, buf, in, win);                   \
 213                                                                         \
 214         in  += 18;                                                      \
 215         buf++;                                                          \
 216         out++;                                                          \
 217     }                                                                   \
 218 }
 219
 220 DECL_IMDCT_BLOCKS(sse,sse)
 221 DECL_IMDCT_BLOCKS(sse2,sse)
 222 DECL_IMDCT_BLOCKS(sse3,sse)
 223 DECL_IMDCT_BLOCKS(ssse3,sse)
 224 DECL_IMDCT_BLOCKS(avx,avx)
 225 #endif /* HAVE_YASM */
 226
 227 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
 228 {
 229     int cpu_flags = av_get_cpu_flags();
 230
 231     int i, j;
 232     for (j = 0; j < 4; j++) {
 233         for (i = 0; i < 40; i ++) {
 234             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
 235             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
 236             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
 237             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
 238             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
 239             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
 240             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
 241             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
 242         }
 243     }
 244
 245 #if HAVE_SSE2_INLINE
 246     if (INLINE_SSE2(cpu_flags)) {
 247         s->apply_window_float = apply_window_mp3;
 248     }
 249 #endif /* HAVE_SSE2_INLINE */
 250
 251 #if HAVE_YASM
 252     if (EXTERNAL_SSE(cpu_flags)) {
 253         s->imdct36_blocks_float = imdct36_blocks_sse;
 254     }
 255     if (EXTERNAL_SSE2(cpu_flags)) {
 256         s->imdct36_blocks_float = imdct36_blocks_sse2;
 257     }
 258     if (EXTERNAL_SSE3(cpu_flags)) {
 259         s->imdct36_blocks_float = imdct36_blocks_sse3;
 260     }
 261     if (EXTERNAL_SSSE3(cpu_flags)) {
 262         s->imdct36_blocks_float = imdct36_blocks_ssse3;
 263     }
 264     if (EXTERNAL_AVX(cpu_flags)) {
 265         s->imdct36_blocks_float = imdct36_blocks_avx;
 266     }
 267 #endif /* HAVE_YASM */
 268 }