git.sesse.net Git - ffmpeg/blob - libavcodec/x86/mpegaudiodsp.c

   1 /*
   2  * SIMD-optimized MP3 decoding functions
   3  * Copyright (c) 2010 Vitor Sessak
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "libavutil/attributes.h"
  23 #include "libavutil/cpu.h"
  24 #include "libavutil/internal.h"
  25 #include "libavutil/mem_internal.h"
  26 #include "libavutil/x86/asm.h"
  27 #include "libavutil/x86/cpu.h"
  28 #include "libavcodec/mpegaudiodsp.h"
  29
  30 #define DECL(CPU)\
  31 static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
  32 void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
  33
  34 #if HAVE_X86ASM
  35 #if ARCH_X86_32
  36 DECL(sse)
  37 #endif
  38 DECL(sse2)
  39 DECL(sse3)
  40 DECL(ssse3)
  41 DECL(avx)
  42 #endif /* HAVE_X86ASM */
  43
  44 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
  45                                float *tmpbuf);
  46 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
  47                                float *tmpbuf);
  48
  49 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
  50
  51 #if HAVE_6REGS && HAVE_SSE_INLINE
  52
  53 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
  54 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
  55
  56 #define SUM8(op, sum, w, p)               \
  57 {                                         \
  58     op(sum, (w)[0 * 64], (p)[0 * 64]);    \
  59     op(sum, (w)[1 * 64], (p)[1 * 64]);    \
  60     op(sum, (w)[2 * 64], (p)[2 * 64]);    \
  61     op(sum, (w)[3 * 64], (p)[3 * 64]);    \
  62     op(sum, (w)[4 * 64], (p)[4 * 64]);    \
  63     op(sum, (w)[5 * 64], (p)[5 * 64]);    \
  64     op(sum, (w)[6 * 64], (p)[6 * 64]);    \
  65     op(sum, (w)[7 * 64], (p)[7 * 64]);    \
  66 }
  67
  68 static void apply_window(const float *buf, const float *win1,
  69                          const float *win2, float *sum1, float *sum2, int len)
  70 {
  71     x86_reg count = - 4*len;
  72     const float *win1a = win1+len;
  73     const float *win2a = win2+len;
  74     const float *bufa  = buf+len;
  75     float *sum1a = sum1+len;
  76     float *sum2a = sum2+len;
  77
  78
  79 #define MULT(a, b)                                 \
  80     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
  81     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
  82     "mulps         %%xmm2, %%xmm1           \n\t"  \
  83     "subps         %%xmm1, %%xmm0           \n\t"  \
  84     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
  85     "subps         %%xmm2, %%xmm4           \n\t"  \
  86
  87     __asm__ volatile(
  88             "1:                                   \n\t"
  89             "xorps       %%xmm0, %%xmm0           \n\t"
  90             "xorps       %%xmm4, %%xmm4           \n\t"
  91
  92             MULT(   0,   0)
  93             MULT( 256,  64)
  94             MULT( 512, 128)
  95             MULT( 768, 192)
  96             MULT(1024, 256)
  97             MULT(1280, 320)
  98             MULT(1536, 384)
  99             MULT(1792, 448)
 100
 101             "movaps      %%xmm0, (%4,%0)          \n\t"
 102             "movaps      %%xmm4, (%5,%0)          \n\t"
 103             "add            $16,  %0              \n\t"
 104             "jl              1b                   \n\t"
 105             :"+&r"(count)
 106             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
 107             );
 108
 109 #undef MULT
 110 }
 111
 112 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
 113                              ptrdiff_t incr)
 114 {
 115     LOCAL_ALIGNED_16(float, suma, [17]);
 116     LOCAL_ALIGNED_16(float, sumb, [17]);
 117     LOCAL_ALIGNED_16(float, sumc, [17]);
 118     LOCAL_ALIGNED_16(float, sumd, [17]);
 119
 120     float sum;
 121
 122     /* copy to avoid wrap */
 123     __asm__ volatile(
 124             "movaps    0(%0), %%xmm0   \n\t" \
 125             "movaps   16(%0), %%xmm1   \n\t" \
 126             "movaps   32(%0), %%xmm2   \n\t" \
 127             "movaps   48(%0), %%xmm3   \n\t" \
 128             "movaps   %%xmm0,   0(%1) \n\t" \
 129             "movaps   %%xmm1,  16(%1) \n\t" \
 130             "movaps   %%xmm2,  32(%1) \n\t" \
 131             "movaps   %%xmm3,  48(%1) \n\t" \
 132             "movaps   64(%0), %%xmm0   \n\t" \
 133             "movaps   80(%0), %%xmm1   \n\t" \
 134             "movaps   96(%0), %%xmm2   \n\t" \
 135             "movaps  112(%0), %%xmm3   \n\t" \
 136             "movaps   %%xmm0,  64(%1) \n\t" \
 137             "movaps   %%xmm1,  80(%1) \n\t" \
 138             "movaps   %%xmm2,  96(%1) \n\t" \
 139             "movaps   %%xmm3, 112(%1) \n\t"
 140             ::"r"(in), "r"(in+512)
 141             :"memory"
 142             );
 143
 144     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
 145     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
 146
 147     SUM8(MACS, suma[0], win + 32, in + 48);
 148
 149     sumc[ 0] = 0;
 150     sumb[16] = 0;
 151     sumd[16] = 0;
 152
 153 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
 154             "movups " #sumd "(%4),       %%xmm0          \n\t" \
 155             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
 156             "subps  " #suma "(%1),       %%xmm0          \n\t" \
 157             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
 158 \
 159             "movups " #sumc "(%3),       %%xmm0          \n\t" \
 160             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
 161             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
 162             "movaps        %%xmm0," #out2 "(%0)          \n\t"
 163
 164     if (incr == 1) {
 165         __asm__ volatile(
 166             SUMS( 0, 48,  4, 52,  0, 112)
 167             SUMS(16, 32, 20, 36, 16,  96)
 168             SUMS(32, 16, 36, 20, 32,  80)
 169             SUMS(48,  0, 52,  4, 48,  64)
 170
 171             :"+&r"(out)
 172             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
 173             :"memory"
 174             );
 175         out += 16*incr;
 176     } else {
 177         int j;
 178         float *out2 = out + 32 * incr;
 179         out[0  ]  = -suma[   0];
 180         out += incr;
 181         out2 -= incr;
 182         for(j=1;j<16;j++) {
 183             *out  = -suma[   j] + sumd[16-j];
 184             *out2 =  sumb[16-j] + sumc[   j];
 185             out  += incr;
 186             out2 -= incr;
 187         }
 188     }
 189
 190     sum = 0;
 191     SUM8(MLSS, sum, win + 16 + 32, in + 32);
 192     *out = sum;
 193 }
 194
 195 #endif /* HAVE_6REGS && HAVE_SSE_INLINE */
 196
 197 #if HAVE_X86ASM
 198 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
 199 static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
 200                                int count, int switch_point, int block_type) \
 201 {                                                                           \
 202     int align_end = count - (count & 3);                                \
 203     int j;                                                              \
 204     for (j = 0; j < align_end; j+= 4) {                                 \
 205         LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
 206         float *win = mdct_win_sse[switch_point && j < 4][block_type];   \
 207         /* apply window & overlap with previous buffer */               \
 208                                                                         \
 209         /* select window */                                             \
 210         ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
 211         in      += 4*18;                                                \
 212         buf     += 4*18;                                                \
 213         out     += 4;                                                   \
 214     }                                                                   \
 215     for (; j < count; j++) {                                            \
 216         /* apply window & overlap with previous buffer */               \
 217                                                                         \
 218         /* select window */                                             \
 219         int win_idx = (switch_point && j < 2) ? 0 : block_type;         \
 220         float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];       \
 221                                                                         \
 222         ff_imdct36_float_ ## CPU1(out, buf, in, win);                   \
 223                                                                         \
 224         in  += 18;                                                      \
 225         buf++;                                                          \
 226         out++;                                                          \
 227     }                                                                   \
 228 }
 229
 230 #if HAVE_SSE
 231 #if ARCH_X86_32
 232 DECL_IMDCT_BLOCKS(sse,sse)
 233 #endif
 234 DECL_IMDCT_BLOCKS(sse2,sse)
 235 DECL_IMDCT_BLOCKS(sse3,sse)
 236 DECL_IMDCT_BLOCKS(ssse3,sse)
 237 #endif
 238 #if HAVE_AVX_EXTERNAL
 239 DECL_IMDCT_BLOCKS(avx,avx)
 240 #endif
 241 #endif /* HAVE_X86ASM */
 242
 243 av_cold void ff_mpadsp_init_x86_tabs(void)
 244 {
 245     int i, j;
 246     for (j = 0; j < 4; j++) {
 247         for (i = 0; i < 40; i ++) {
 248             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
 249             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
 250             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
 251             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
 252             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
 253             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
 254             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
 255             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
 256         }
 257     }
 258 }
 259
 260 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
 261 {
 262     av_unused int cpu_flags = av_get_cpu_flags();
 263
 264 #if HAVE_6REGS && HAVE_SSE_INLINE
 265     if (INLINE_SSE(cpu_flags)) {
 266         s->apply_window_float = apply_window_mp3;
 267     }
 268 #endif /* HAVE_SSE_INLINE */
 269
 270 #if HAVE_X86ASM
 271 #if HAVE_SSE
 272 #if ARCH_X86_32
 273     if (EXTERNAL_SSE(cpu_flags)) {
 274         s->imdct36_blocks_float = imdct36_blocks_sse;
 275     }
 276 #endif
 277     if (EXTERNAL_SSE2(cpu_flags)) {
 278         s->imdct36_blocks_float = imdct36_blocks_sse2;
 279     }
 280     if (EXTERNAL_SSE3(cpu_flags)) {
 281         s->imdct36_blocks_float = imdct36_blocks_sse3;
 282     }
 283     if (EXTERNAL_SSSE3(cpu_flags)) {
 284         s->imdct36_blocks_float = imdct36_blocks_ssse3;
 285     }
 286 #endif
 287 #if HAVE_AVX_EXTERNAL
 288     if (EXTERNAL_AVX(cpu_flags)) {
 289         s->imdct36_blocks_float = imdct36_blocks_avx;
 290     }
 291 #endif
 292 #endif /* HAVE_X86ASM */
 293 }