]> git.sesse.net Git - ffmpeg/blob - libavcodec/x86/mpegaudiodec.c
Merge commit '88bd7fdc821aaa0cbcf44cf075c62aaa42121e3f'
[ffmpeg] / libavcodec / x86 / mpegaudiodec.c
1 /*
2  * MMX optimized MP3 decoding functions
3  * Copyright (c) 2010 Vitor Sessak
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/cpu.h"
23 #include "libavutil/x86/asm.h"
24 #include "libavutil/x86/cpu.h"
25 #include "libavcodec/dsputil.h"
26 #include "libavcodec/mpegaudiodsp.h"
27
/*
 * For one instruction-set suffix, declare the file-local block driver
 * imdct36_blocks_<CPU>() together with the non-static single-block routine
 * ff_imdct36_float_<CPU>() that the driver calls.
 */
#define DECL(CPU)                                                         \
    static void imdct36_blocks_ ## CPU(float *out, float *buf,           \
                                       float *in, int count,             \
                                       int switch_point,                 \
                                       int block_type);                  \
    void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in,     \
                                  float *win);

DECL(sse)
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
37
38 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
39                                float *tmpbuf);
40 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
41                                float *tmpbuf);
42
43 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
44
45 #if HAVE_SSE2_INLINE
46
/* Multiply-accumulate: rt += ra * rb. Usable as an expression. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. Usable as an expression. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op(sum, w[k * 64], p[k * 64]) for k = 0..7, i.e. fold eight products
 * taken at a stride of 64 elements into sum.
 *
 * op is MACS or MLSS. The body is wrapped in do { } while (0) so that
 * "SUM8(...);" is a single statement and can be used safely as the body of
 * an unbraced if/else.
 */
#define SUM8(op, sum, w, p)                   \
    do {                                      \
        op(sum, (w)[0 * 64], (p)[0 * 64]);    \
        op(sum, (w)[1 * 64], (p)[1 * 64]);    \
        op(sum, (w)[2 * 64], (p)[2 * 64]);    \
        op(sum, (w)[3 * 64], (p)[3 * 64]);    \
        op(sum, (w)[4 * 64], (p)[4 * 64]);    \
        op(sum, (w)[5 * 64], (p)[5 * 64]);    \
        op(sum, (w)[6 * 64], (p)[6 * 64]);    \
        op(sum, (w)[7 * 64], (p)[7 * 64]);    \
    } while (0)
61
62 static void apply_window(const float *buf, const float *win1,
63                          const float *win2, float *sum1, float *sum2, int len)
64 {
65     x86_reg count = - 4*len;
66     const float *win1a = win1+len;
67     const float *win2a = win2+len;
68     const float *bufa  = buf+len;
69     float *sum1a = sum1+len;
70     float *sum2a = sum2+len;
71
72
73 #define MULT(a, b)                                 \
74     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
75     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
76     "mulps         %%xmm2, %%xmm1           \n\t"  \
77     "subps         %%xmm1, %%xmm0           \n\t"  \
78     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
79     "subps         %%xmm2, %%xmm4           \n\t"  \
80
81     __asm__ volatile(
82             "1:                                   \n\t"
83             "xorps       %%xmm0, %%xmm0           \n\t"
84             "xorps       %%xmm4, %%xmm4           \n\t"
85
86             MULT(   0,   0)
87             MULT( 256,  64)
88             MULT( 512, 128)
89             MULT( 768, 192)
90             MULT(1024, 256)
91             MULT(1280, 320)
92             MULT(1536, 384)
93             MULT(1792, 448)
94
95             "movaps      %%xmm0, (%4,%0)          \n\t"
96             "movaps      %%xmm4, (%5,%0)          \n\t"
97             "add            $16,  %0              \n\t"
98             "jl              1b                   \n\t"
99             :"+&r"(count)
100             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
101             );
102
103 #undef MULT
104 }
105
106 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
107                              int incr)
108 {
109     LOCAL_ALIGNED_16(float, suma, [17]);
110     LOCAL_ALIGNED_16(float, sumb, [17]);
111     LOCAL_ALIGNED_16(float, sumc, [17]);
112     LOCAL_ALIGNED_16(float, sumd, [17]);
113
114     float sum;
115
116     /* copy to avoid wrap */
117     __asm__ volatile(
118             "movaps    0(%0), %%xmm0   \n\t" \
119             "movaps   16(%0), %%xmm1   \n\t" \
120             "movaps   32(%0), %%xmm2   \n\t" \
121             "movaps   48(%0), %%xmm3   \n\t" \
122             "movaps   %%xmm0,   0(%1) \n\t" \
123             "movaps   %%xmm1,  16(%1) \n\t" \
124             "movaps   %%xmm2,  32(%1) \n\t" \
125             "movaps   %%xmm3,  48(%1) \n\t" \
126             "movaps   64(%0), %%xmm0   \n\t" \
127             "movaps   80(%0), %%xmm1   \n\t" \
128             "movaps   96(%0), %%xmm2   \n\t" \
129             "movaps  112(%0), %%xmm3   \n\t" \
130             "movaps   %%xmm0,  64(%1) \n\t" \
131             "movaps   %%xmm1,  80(%1) \n\t" \
132             "movaps   %%xmm2,  96(%1) \n\t" \
133             "movaps   %%xmm3, 112(%1) \n\t"
134             ::"r"(in), "r"(in+512)
135             :"memory"
136             );
137
138     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
139     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
140
141     SUM8(MACS, suma[0], win + 32, in + 48);
142
143     sumc[ 0] = 0;
144     sumb[16] = 0;
145     sumd[16] = 0;
146
147 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
148             "movups " #sumd "(%4),       %%xmm0          \n\t" \
149             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
150             "subps  " #suma "(%1),       %%xmm0          \n\t" \
151             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
152 \
153             "movups " #sumc "(%3),       %%xmm0          \n\t" \
154             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
155             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
156             "movaps        %%xmm0," #out2 "(%0)          \n\t"
157
158     if (incr == 1) {
159         __asm__ volatile(
160             SUMS( 0, 48,  4, 52,  0, 112)
161             SUMS(16, 32, 20, 36, 16,  96)
162             SUMS(32, 16, 36, 20, 32,  80)
163             SUMS(48,  0, 52,  4, 48,  64)
164
165             :"+&r"(out)
166             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
167             :"memory"
168             );
169         out += 16*incr;
170     } else {
171         int j;
172         float *out2 = out + 32 * incr;
173         out[0  ]  = -suma[   0];
174         out += incr;
175         out2 -= incr;
176         for(j=1;j<16;j++) {
177             *out  = -suma[   j] + sumd[16-j];
178             *out2 =  sumb[16-j] + sumc[   j];
179             out  += incr;
180             out2 -= incr;
181         }
182     }
183
184     sum = 0;
185     SUM8(MLSS, sum, win + 16 + 32, in + 32);
186     *out = sum;
187 }
188
189 #endif /* HAVE_SSE2_INLINE */
190
191 #if HAVE_YASM
/*
 * Define imdct36_blocks_<CPU1>(): run the 36-point IMDCT over count blocks.
 *
 * Blocks are processed four at a time with ff_four_imdct36_float_<CPU2>()
 * using the interleaved mdct_win_sse tables; the remaining (count & 3)
 * blocks are processed one by one with ff_imdct36_float_<CPU1>() using
 * ff_mdct_win_float directly.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    const int leftover = count & 3;                                         \
    const int quad_end = count - leftover;                                  \
    int j = 0;                                                              \
                                                                            \
    /* groups of four blocks */                                             \
    while (j < quad_end) {                                                  \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        /* with switch_point set, the first group takes table variant 1 */  \
        const int mixed = switch_point && j < 4;                            \
        float *win = mdct_win_sse[mixed][block_type];                       \
                                                                            \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
                                                                            \
        in  += 4 * 18;                                                      \
        buf += 4 * 18;                                                      \
        out += 4;                                                           \
        j   += 4;                                                           \
    }                                                                       \
                                                                            \
    /* remaining blocks, one at a time */                                   \
    while (j < count) {                                                     \
        /* with switch_point set, blocks 0 and 1 use window type 0 */       \
        const int base = (switch_point && j < 2) ? 0 : block_type;          \
        /* odd-numbered blocks use the window four entries further on */    \
        float *win = ff_mdct_win_float[base + ((j & 1) ? 4 : 0)];           \
                                                                            \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in += 18;                                                           \
        buf++;                                                              \
        out++;                                                              \
        j++;                                                                \
    }                                                                       \
}

/* Every SSE-family driver shares the SSE four-at-a-time routine. */
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
233 #endif /* HAVE_YASM */
234
235 void ff_mpadsp_init_x86(MPADSPContext *s)
236 {
237     int mm_flags = av_get_cpu_flags();
238
239     int i, j;
240     for (j = 0; j < 4; j++) {
241         for (i = 0; i < 40; i ++) {
242             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
243             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
244             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
245             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
246             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
247             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
248             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
249             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
250         }
251     }
252
253 #if HAVE_SSE2_INLINE
254     if (mm_flags & AV_CPU_FLAG_SSE2) {
255         s->apply_window_float = apply_window_mp3;
256     }
257 #endif /* HAVE_SSE2_INLINE */
258
259 #if HAVE_YASM
260     if (EXTERNAL_AVX(mm_flags)) {
261         s->imdct36_blocks_float = imdct36_blocks_avx;
262     } else if (EXTERNAL_SSSE3(mm_flags)) {
263         s->imdct36_blocks_float = imdct36_blocks_ssse3;
264     } else if (EXTERNAL_SSE3(mm_flags)) {
265         s->imdct36_blocks_float = imdct36_blocks_sse3;
266     } else if (EXTERNAL_SSE2(mm_flags)) {
267         s->imdct36_blocks_float = imdct36_blocks_sse2;
268     } else if (EXTERNAL_SSE(mm_flags)) {
269         s->imdct36_blocks_float = imdct36_blocks_sse;
270     }
271 #endif /* HAVE_YASM */
272 }