]> git.sesse.net Git - ffmpeg/blob - libavcodec/x86/mpegaudiodsp.c
lavc: VP9 decoder
[ffmpeg] / libavcodec / x86 / mpegaudiodsp.c
1 /*
2  * SSE/AVX optimized MP3 decoding functions
3  * Copyright (c) 2010 Vitor Sessak
4  *
5  * This file is part of Libav.
6  *
7  * Libav is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * Libav is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with Libav; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/attributes.h"
23 #include "libavutil/cpu.h"
24 #include "libavutil/internal.h"
25 #include "libavutil/x86/asm.h"
26 #include "libavutil/x86/cpu.h"
27 #include "libavcodec/mpegaudiodsp.h"
28
29 void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
30 void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
31 void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
32 void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
33 void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
34 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
35                                float *tmpbuf);
36 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
37                                float *tmpbuf);
38
39 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
40
41 #if HAVE_SSE2_INLINE
42
43 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
44 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
45
46 #define SUM8(op, sum, w, p)               \
47 {                                         \
48     op(sum, (w)[0 * 64], (p)[0 * 64]);    \
49     op(sum, (w)[1 * 64], (p)[1 * 64]);    \
50     op(sum, (w)[2 * 64], (p)[2 * 64]);    \
51     op(sum, (w)[3 * 64], (p)[3 * 64]);    \
52     op(sum, (w)[4 * 64], (p)[4 * 64]);    \
53     op(sum, (w)[5 * 64], (p)[5 * 64]);    \
54     op(sum, (w)[6 * 64], (p)[6 * 64]);    \
55     op(sum, (w)[7 * 64], (p)[7 * 64]);    \
56 }
57
58 static void apply_window(const float *buf, const float *win1,
59                          const float *win2, float *sum1, float *sum2, int len)
60 {
61     x86_reg count = - 4*len;
62     const float *win1a = win1+len;
63     const float *win2a = win2+len;
64     const float *bufa  = buf+len;
65     float *sum1a = sum1+len;
66     float *sum2a = sum2+len;
67
68
69 #define MULT(a, b)                                 \
70     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
71     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
72     "mulps         %%xmm2, %%xmm1           \n\t"  \
73     "subps         %%xmm1, %%xmm0           \n\t"  \
74     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
75     "subps         %%xmm2, %%xmm4           \n\t"  \
76
77     __asm__ volatile(
78             "1:                                   \n\t"
79             "xorps       %%xmm0, %%xmm0           \n\t"
80             "xorps       %%xmm4, %%xmm4           \n\t"
81
82             MULT(   0,   0)
83             MULT( 256,  64)
84             MULT( 512, 128)
85             MULT( 768, 192)
86             MULT(1024, 256)
87             MULT(1280, 320)
88             MULT(1536, 384)
89             MULT(1792, 448)
90
91             "movaps      %%xmm0, (%4,%0)          \n\t"
92             "movaps      %%xmm4, (%5,%0)          \n\t"
93             "add            $16,  %0              \n\t"
94             "jl              1b                   \n\t"
95             :"+&r"(count)
96             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
97             );
98
99 #undef MULT
100 }
101
102 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
103                              int incr)
104 {
105     LOCAL_ALIGNED_16(float, suma, [17]);
106     LOCAL_ALIGNED_16(float, sumb, [17]);
107     LOCAL_ALIGNED_16(float, sumc, [17]);
108     LOCAL_ALIGNED_16(float, sumd, [17]);
109
110     float sum;
111
112     /* copy to avoid wrap */
113     __asm__ volatile(
114             "movaps    0(%0), %%xmm0   \n\t" \
115             "movaps   16(%0), %%xmm1   \n\t" \
116             "movaps   32(%0), %%xmm2   \n\t" \
117             "movaps   48(%0), %%xmm3   \n\t" \
118             "movaps   %%xmm0,   0(%1) \n\t" \
119             "movaps   %%xmm1,  16(%1) \n\t" \
120             "movaps   %%xmm2,  32(%1) \n\t" \
121             "movaps   %%xmm3,  48(%1) \n\t" \
122             "movaps   64(%0), %%xmm0   \n\t" \
123             "movaps   80(%0), %%xmm1   \n\t" \
124             "movaps   96(%0), %%xmm2   \n\t" \
125             "movaps  112(%0), %%xmm3   \n\t" \
126             "movaps   %%xmm0,  64(%1) \n\t" \
127             "movaps   %%xmm1,  80(%1) \n\t" \
128             "movaps   %%xmm2,  96(%1) \n\t" \
129             "movaps   %%xmm3, 112(%1) \n\t"
130             ::"r"(in), "r"(in+512)
131             :"memory"
132             );
133
134     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
135     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
136
137     SUM8(MACS, suma[0], win + 32, in + 48);
138
139     sumc[ 0] = 0;
140     sumb[16] = 0;
141     sumd[16] = 0;
142
143 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
144             "movups " #sumd "(%4),       %%xmm0          \n\t" \
145             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
146             "subps  " #suma "(%1),       %%xmm0          \n\t" \
147             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
148 \
149             "movups " #sumc "(%3),       %%xmm0          \n\t" \
150             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
151             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
152             "movaps        %%xmm0," #out2 "(%0)          \n\t"
153
154     if (incr == 1) {
155         __asm__ volatile(
156             SUMS( 0, 48,  4, 52,  0, 112)
157             SUMS(16, 32, 20, 36, 16,  96)
158             SUMS(32, 16, 36, 20, 32,  80)
159             SUMS(48,  0, 52,  4, 48,  64)
160
161             :"+&r"(out)
162             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
163             :"memory"
164             );
165         out += 16*incr;
166     } else {
167         int j;
168         float *out2 = out + 32 * incr;
169         out[0  ]  = -suma[   0];
170         out += incr;
171         out2 -= incr;
172         for(j=1;j<16;j++) {
173             *out  = -suma[   j] + sumd[16-j];
174             *out2 =  sumb[16-j] + sumc[   j];
175             out  += incr;
176             out2 -= incr;
177         }
178     }
179
180     sum = 0;
181     SUM8(MLSS, sum, win + 16 + 32, in + 32);
182     *out = sum;
183 }
184
185 #endif /* HAVE_SSE2_INLINE */
186
187 #if HAVE_YASM
188 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
189 static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
190                                int count, int switch_point, int block_type) \
191 {                                                                           \
192     int align_end = count - (count & 3);                                \
193     int j;                                                              \
194     for (j = 0; j < align_end; j+= 4) {                                 \
195         LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
196         float *win = mdct_win_sse[switch_point && j < 4][block_type];   \
197         /* apply window & overlap with previous buffer */               \
198                                                                         \
199         /* select window */                                             \
200         ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
201         in      += 4*18;                                                \
202         buf     += 4*18;                                                \
203         out     += 4;                                                   \
204     }                                                                   \
205     for (; j < count; j++) {                                            \
206         /* apply window & overlap with previous buffer */               \
207                                                                         \
208         /* select window */                                             \
209         int win_idx = (switch_point && j < 2) ? 0 : block_type;         \
210         float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];       \
211                                                                         \
212         ff_imdct36_float_ ## CPU1(out, buf, in, win);                   \
213                                                                         \
214         in  += 18;                                                      \
215         buf++;                                                          \
216         out++;                                                          \
217     }                                                                   \
218 }
219
220 DECL_IMDCT_BLOCKS(sse,sse)
221 DECL_IMDCT_BLOCKS(sse2,sse)
222 DECL_IMDCT_BLOCKS(sse3,sse)
223 DECL_IMDCT_BLOCKS(ssse3,sse)
224 DECL_IMDCT_BLOCKS(avx,avx)
225 #endif /* HAVE_YASM */
226
227 av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
228 {
229     int cpu_flags = av_get_cpu_flags();
230
231     int i, j;
232     for (j = 0; j < 4; j++) {
233         for (i = 0; i < 40; i ++) {
234             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
235             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
236             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
237             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
238             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
239             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
240             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
241             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
242         }
243     }
244
245 #if HAVE_SSE2_INLINE
246     if (cpu_flags & AV_CPU_FLAG_SSE2) {
247         s->apply_window_float = apply_window_mp3;
248     }
249 #endif /* HAVE_SSE2_INLINE */
250
251 #if HAVE_YASM
252     if (EXTERNAL_SSE(cpu_flags)) {
253         s->imdct36_blocks_float = imdct36_blocks_sse;
254     }
255     if (EXTERNAL_SSE2(cpu_flags)) {
256         s->imdct36_blocks_float = imdct36_blocks_sse2;
257     }
258     if (EXTERNAL_SSE3(cpu_flags)) {
259         s->imdct36_blocks_float = imdct36_blocks_sse3;
260     }
261     if (EXTERNAL_SSSE3(cpu_flags)) {
262         s->imdct36_blocks_float = imdct36_blocks_ssse3;
263     }
264     if (EXTERNAL_AVX(cpu_flags)) {
265         s->imdct36_blocks_float = imdct36_blocks_avx;
266     }
267 #endif /* HAVE_YASM */
268 }