1 ;*****************************************************************************
2 ;* x86-optimized functions for showcqt filter
4 ;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
;-----------------------------------------------------------------------------
; EMULATE_FMADDPS dst, src1, src2, src3, tmp  --  dst = src1 * src2 + src3
; Emits a true fused multiply-add when the active cpuflags (selected by the
; caller's INIT_XMM/INIT_YMM <cpuflags>) include fma3 or fma4; otherwise
; emulates it with a separate multiply and add through tmp.
; Defect fixed: the visible excerpt had the %if with no %else branch and no
; %endif, leaving the conditional unbalanced and providing no non-FMA code
; path; the fallback below is restored to match upstream FFmpeg.
; Clobbers: tmp (%5). Callers pass a register that is dead afterwards, and
; dst (%1) may alias src3 (%4), the accumulator.
;-----------------------------------------------------------------------------
%macro EMULATE_FMADDPS 5 ; dst, src1, src2, src3, tmp
%if cpuflag(fma3) || cpuflag(fma4)
    fmaddps %1, %2, %3, %4
%else
    mulps   %5, %2, %3                  ; tmp = src1 * src2
    addps   %1, %4, %5                  ; dst = src3 + tmp (x86inc 3-operand form)
%endif
%endmacro ; EMULATE_FMADDPS
48 ; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
49 ; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
; NOTE(review): the "%macro CQT_CALC 9" header and the closing "%endmacro",
; plus several interior lines (i/fft_len index setup), are not visible in this
; sampled excerpt -- confirm against the full file before editing.
; Accumulates one coefficient entry's contribution: complex MAC of the FFT
; bins at i (forward walk) and their mirrored counterparts (backward walk)
; into the (a_re,a_im) and (b_re,b_im) accumulators.
51 add id, [coeffsq + Coeffs.start + %9] ; i += coeffs[].start (first FFT bin of this entry)
52 movaps m%5, [srcq + 8 * iq] ; load interleaved complex values (8 bytes each)
53 movaps m%7, [srcq + 8 * iq + mmsize] ; next mmsize worth of complex values
54 shufps m%6, m%5, m%7, q3131 ; m_im = deinterleaved imaginary parts
55 shufps m%5, m%5, m%7, q2020 ; m_re = deinterleaved real parts
57 EMULATE_FMADDPS m%2, m%6, m%8, m%2, m%6 ; a_im += m_im * coeffval
59 EMULATE_FMADDPS m%1, m%5, m%8, m%1, m%5 ; a_re += m_re * coeffval
; Mirrored bins: unaligned loads, walked backwards (hence movups and the
; -mmsize/+8 offsets); index recomputation is in lines not shown here.
60 movups m%5, [srcq + 8 * iq - mmsize + 8]
61 movups m%7, [srcq + 8 * iq - 2*mmsize + 8]
; NOTE(review): vperm2f128 is AVX/ymm-only; in the full file these two lines
; are presumably guarded by an "%if mmsize == 32" that this excerpt omits.
63 vperm2f128 m%5, m%5, m%5, 1 ; swap 128-bit lanes to reverse element order
64 vperm2f128 m%7, m%7, m%7, 1
66 shufps m%6, m%5, m%7, q1313 ; imaginary parts of mirrored bins
67 shufps m%5, m%5, m%7, q0202 ; real parts of mirrored bins
68 EMULATE_FMADDPS m%4, m%6, m%8, m%4, m%6 ; b_im += m_im * coeffval
69 EMULATE_FMADDPS m%3, m%5, m%8, m%3, m%5 ; b_re += m_re * coeffval
72 %macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
; Reduces the four accumulator vectors toward the final packed result.
; NOTE(review): the interior of this macro is almost entirely missing from
; this sampled excerpt (only one instruction survives) -- do not infer the
; full reduction sequence from this line alone.
81 vextractf128 xmm%2, m%1, 1 ; xmm%2 = upper 128-bit lane of ymm%1 (AVX-only; presumably under an mmsize==32 guard in the full file)
84 %endmacro ; CQT_SEPARATE
86 %macro DECLARE_CQT_CALC 0
; Emits one cpuflag-specialized copy of ff_showcqt_cqt_calc_*() for the
; instruction set selected by the caller's INIT_XMM/INIT_YMM.
; NOTE(review): this excerpt is sampled -- branch labels, jumps, ARCH_X86_64
; guards and several loop-control lines are not visible; the comments below
; describe only what is shown and hedge the rest.
87 ; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
; 64-bit-style variant: 10 GPRs / 12 vector regs; processes two Coeffs
; entries per iteration (accumulators m0-m3 for entry 0, m8-m11 for entry 1).
89 cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
92 mov xd, [coeffsq + Coeffs.len] ; x = coeffs[0].len
96 mov coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof] ; coeffs[1].len
103 cmova coeffs_lend, xd ; coeffs_len = larger of the two lengths -- presumably follows a cmp not shown here
105 test coeffs_lend, coeffs_lend ; zero-length pair check (the conditional jump is not visible in this excerpt)
107 mov coeffs_valq, [coeffsq + Coeffs.val] ; pointer to coeffs[0] coefficient values
108 mov coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof] ; pointer to coeffs[1] values
; Common loop body: accumulate both entries for the current x.
111 movaps m7, [coeffs_valq + 4 * xq] ; m7 = coeffs[0].val[x..] (floats)
112 CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
113 movaps m7, [coeffs_val2q + 4 * xq] ; m7 = coeffs[1].val[x..]
114 CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
119 cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof] ; loop while x below coeffs[1].len (jump not shown)
; Tail loop when coeffs[1] is the longer entry (coeffs[0]-only tail is in
; lines not visible here).
123 movaps m7, [coeffs_val2q + 4 * xq]
124 CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
126 cmp xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
; Reduce accumulators and store two complex results.
129 CQT_SEPARATE 0, 1, 2, 3, 4, 5
130 CQT_SEPARATE 8, 9, 10, 11, 4, 5
133 HADDPS xmm0, xmm8, xmm1 ; horizontal add merging both entries' results into xmm0
136 lea dstq, [dstq + 16] ; dst += 2 results (2 * 8 bytes)
137 lea coeffsq, [coeffsq + 2*Coeffs.sizeof] ; advance past the two entries just processed
; Odd trailing entry: process coeffs[0] alone.
142 cmp xd, [coeffsq + Coeffs.len]
146 movaps m7, [coeffs_valq + 4 * xq]
147 CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
149 cmp xd, [coeffsq + Coeffs.len]
; Reduced-register variant (presumably the 32-bit / non-X86_64 path): 7 GPRs,
; 8 vector regs, one Coeffs entry per iteration.
153 cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
157 mov xd, [coeffsq + Coeffs.len]
164 mov coeffs_valq, [coeffsq + Coeffs.val]
168 movaps m7, [coeffs_valq + 4 * xq]
169 CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
171 cmp xd, [coeffsq + Coeffs.len]
173 CQT_SEPARATE 0, 1, 2, 3, 4, 5
175 HADDPS xmm0, xmm0, xmm1 ; reduce single entry's accumulators
180 lea coeffsq, [coeffsq + Coeffs.sizeof] ; next Coeffs entry
184 %endmacro ; DECLARE_CQT_CALC
; Per-ISA instantiations of DECLARE_CQT_CALC.
; NOTE(review): the INIT_YMM/INIT_XMM + DECLARE_CQT_CALC bodies and the
; matching %endif for each guard below are not visible in this sampled
; excerpt; each %if must be balanced in the full file -- confirm there.
190 %if HAVE_AVX_EXTERNAL
194 %if HAVE_FMA3_EXTERNAL
198 %if HAVE_FMA4_EXTERNAL