git.sesse.net Git - ffmpeg/blob - libavcodec/x86/mpegaudiodec_mmx.c
mpegaudiodec: optimized iMDCT transform
[ffmpeg] / libavcodec / x86 / mpegaudiodec_mmx.c
1 /*
2  * MMX optimized MP3 decoding functions
3  * Copyright (c) 2010 Vitor Sessak
4  *
5  * This file is part of Libav.
6  *
7  * Libav is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * Libav is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with Libav; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/cpu.h"
23 #include "libavutil/x86_cpu.h"
24 #include "libavcodec/dsputil.h"
25 #include "libavcodec/mpegaudiodsp.h"
26
/*
 * One-block "imdct36" kernels, one per instruction-set level. They are only
 * declared here; imdct36_blocks_<cpu>() below calls them for the blocks left
 * over after the four-at-a-time loop.
 */
void ff_imdct36_float_sse  (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse2 (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse3 (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_avx  (float *out, float *buf, float *in, float *win);

/*
 * Four-block kernels: same leading arguments plus a caller-provided scratch
 * buffer (the callers below pass a 16-byte aligned array of 1024 floats).
 */
void ff_four_imdct36_float_sse(float *out, float *buf, float *in,
                               float *win, float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in,
                               float *win, float *tmpbuf);
36
37 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
38
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op(sum, w[k * 64], p[k * 64]) for k = 0..7, i.e. fold eight taps
 * spaced 64 elements apart into sum. op is MACS or MLSS (or any macro or
 * function with the same three-argument shape).
 *
 * Wrapped in do { } while (0) so that "SUM8(...);" is a single statement and
 * stays correct inside an unbraced if/else.
 */
#define SUM8(op, sum, w, p)               \
do {                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
} while (0)
53
54 static void apply_window(const float *buf, const float *win1,
55                          const float *win2, float *sum1, float *sum2, int len)
56 {
57     x86_reg count = - 4*len;
58     const float *win1a = win1+len;
59     const float *win2a = win2+len;
60     const float *bufa  = buf+len;
61     float *sum1a = sum1+len;
62     float *sum2a = sum2+len;
63
64
65 #define MULT(a, b)                                 \
66     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
67     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
68     "mulps         %%xmm2, %%xmm1           \n\t"  \
69     "subps         %%xmm1, %%xmm0           \n\t"  \
70     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
71     "subps         %%xmm2, %%xmm4           \n\t"  \
72
73     __asm__ volatile(
74             "1:                                   \n\t"
75             "xorps       %%xmm0, %%xmm0           \n\t"
76             "xorps       %%xmm4, %%xmm4           \n\t"
77
78             MULT(   0,   0)
79             MULT( 256,  64)
80             MULT( 512, 128)
81             MULT( 768, 192)
82             MULT(1024, 256)
83             MULT(1280, 320)
84             MULT(1536, 384)
85             MULT(1792, 448)
86
87             "movaps      %%xmm0, (%4,%0)          \n\t"
88             "movaps      %%xmm4, (%5,%0)          \n\t"
89             "add            $16,  %0              \n\t"
90             "jl              1b                   \n\t"
91             :"+&r"(count)
92             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
93             );
94
95 #undef MULT
96 }
97
98 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
99                              int incr)
100 {
101     LOCAL_ALIGNED_16(float, suma, [17]);
102     LOCAL_ALIGNED_16(float, sumb, [17]);
103     LOCAL_ALIGNED_16(float, sumc, [17]);
104     LOCAL_ALIGNED_16(float, sumd, [17]);
105
106     float sum;
107
108     /* copy to avoid wrap */
109     memcpy(in + 512, in, 32 * sizeof(*in));
110
111     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
112     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
113
114     SUM8(MACS, suma[0], win + 32, in + 48);
115
116     sumc[ 0] = 0;
117     sumb[16] = 0;
118     sumd[16] = 0;
119
120 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
121             "movups " #sumd "(%4),       %%xmm0          \n\t" \
122             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
123             "subps  " #suma "(%1),       %%xmm0          \n\t" \
124             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
125 \
126             "movups " #sumc "(%3),       %%xmm0          \n\t" \
127             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
128             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
129             "movaps        %%xmm0," #out2 "(%0)          \n\t"
130
131     if (incr == 1) {
132         __asm__ volatile(
133             SUMS( 0, 48,  4, 52,  0, 112)
134             SUMS(16, 32, 20, 36, 16,  96)
135             SUMS(32, 16, 36, 20, 32,  80)
136             SUMS(48,  0, 52,  4, 48,  64)
137
138             :"+&r"(out)
139             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
140             :"memory"
141             );
142         out += 16*incr;
143     } else {
144         int j;
145         float *out2 = out + 32 * incr;
146         out[0  ]  = -suma[   0];
147         out += incr;
148         out2 -= incr;
149         for(j=1;j<16;j++) {
150             *out  = -suma[   j] + sumd[16-j];
151             *out2 =  sumb[16-j] + sumc[   j];
152             out  += incr;
153             out2 -= incr;
154         }
155     }
156
157     sum = 0;
158     SUM8(MLSS, sum, win + 16 + 32, in + 32);
159     *out = sum;
160 }
161
162
163 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
164 static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
165                                int count, int switch_point, int block_type) \
166 {                                                                           \
167     int align_end = count - (count & 3);                                \
168     int j;                                                              \
169     for (j = 0; j < align_end; j+= 4) {                                 \
170         LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                        \
171         float *win = mdct_win_sse[switch_point && j < 4][block_type];   \
172         /* apply window & overlap with previous buffer */               \
173                                                                         \
174         /* select window */                                             \
175         ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);      \
176         in      += 4*18;                                                \
177         buf     += 4*18;                                                \
178         out     += 4;                                                   \
179     }                                                                   \
180     for (; j < count; j++) {                                            \
181         /* apply window & overlap with previous buffer */               \
182                                                                         \
183         /* select window */                                             \
184         int win_idx = (switch_point && j < 2) ? 0 : block_type;         \
185         float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];       \
186                                                                         \
187         ff_imdct36_float_ ## CPU1(out, buf, in, win);                   \
188                                                                         \
189         in  += 18;                                                      \
190         buf++;                                                          \
191         out++;                                                          \
192     }                                                                   \
193 }
194
195 DECL_IMDCT_BLOCKS(sse,sse)
196 DECL_IMDCT_BLOCKS(sse2,sse)
197 DECL_IMDCT_BLOCKS(sse3,sse)
198 DECL_IMDCT_BLOCKS(ssse3,sse)
199 DECL_IMDCT_BLOCKS(avx,avx)
200
201 void ff_mpadsp_init_mmx(MPADSPContext *s)
202 {
203     int mm_flags = av_get_cpu_flags();
204
205     int i, j;
206     for (j = 0; j < 4; j++) {
207         for (i = 0; i < 40; i ++) {
208             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
209             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
210             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
211             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
212             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
213             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
214             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
215             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
216         }
217     }
218
219     if (mm_flags & AV_CPU_FLAG_SSE2) {
220         s->apply_window_float = apply_window_mp3;
221     }
222 #if HAVE_YASM
223     if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
224         s->imdct36_blocks_float = imdct36_blocks_avx;
225 #if HAVE_SSE
226     } else if (mm_flags & AV_CPU_FLAG_SSSE3) {
227         s->imdct36_blocks_float = imdct36_blocks_ssse3;
228     } else if (mm_flags & AV_CPU_FLAG_SSE3) {
229         s->imdct36_blocks_float = imdct36_blocks_sse3;
230     } else if (mm_flags & AV_CPU_FLAG_SSE2) {
231         s->imdct36_blocks_float = imdct36_blocks_sse2;
232     } else if (mm_flags & AV_CPU_FLAG_SSE) {
233         s->imdct36_blocks_float = imdct36_blocks_sse;
234 #endif /* HAVE_SSE */
235     }
236 #endif /* HAVE_YASM */
237 }