;*****************************************************************************
;* x86-optimized functions for showcqt filter
;*
;* Copyright (C) 2016 Muhammad Faiz <mfcc64@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

; per-bin coefficient run: pointer to the coefficient values, first FFT bin
; covered, and number of bins covered
struc Coeffs
    .val:   pointer 1
    .start: resd 1
    .len:   resd 1
    .sizeof:
endstruc

%macro CQT_CALC 9
; %1 = a_re, %2 = a_im, %3 = b_re, %4 = b_im
; %5 = m_re, %6 = m_im, %7 = tmp, %8 = coeffval, %9 = coeffsq_offset
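; Each invocation accumulates one vector of coefficients against the FFT
; bins at (Coeffs.start + x) counting upward into the a_re/a_im pair, and
; against the mirrored bins at (fft_len - Coeffs.start - x) counting
; downward into b_re/b_im. src holds interleaved complex floats (8 bytes
; per bin), hence the 8 * iq addressing.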
    mov     id, xd
    add     id, [coeffsq + Coeffs.start + %9]
    movaps  m%5, [srcq + 8 * iq]
    movaps  m%7, [srcq + 8 * iq + mmsize]
    shufps  m%6, m%5, m%7, q3131                ; gather imaginary parts
    shufps  m%5, m%5, m%7, q2020                ; gather real parts
    sub     id, fft_lend
    FMULADD_PS m%2, m%6, m%8, m%2, m%6          ; a_im += coeff * im
    neg     id                                  ; id = fft_len - (start + x)
    FMULADD_PS m%1, m%5, m%8, m%1, m%5          ; a_re += coeff * re
    movups  m%5, [srcq + 8 * iq - mmsize + 8]
    movups  m%7, [srcq + 8 * iq - 2*mmsize + 8]
%if mmsize == 32
    vperm2f128 m%5, m%5, m%5, 1                 ; swap lanes for the reversed reads
    vperm2f128 m%7, m%7, m%7, 1
%endif
    shufps  m%6, m%5, m%7, q1313                ; imaginary parts, reversed order
    shufps  m%5, m%5, m%7, q0202                ; real parts, reversed order
    FMULADD_PS m%4, m%6, m%8, m%4, m%6          ; b_im += coeff * im
    FMULADD_PS m%3, m%5, m%8, m%3, m%5          ; b_re += coeff * re
%endmacro ; CQT_CALC
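
; The filter runs one FFT over both channels packed as complex (left in the
; real part, right in the imaginary part), so for a bin and its mirror
; Z[k] and Z[N-k]: left = (Z[k] + conj(Z[N-k])) / 2 and
; right = (Z[k] - conj(Z[N-k])) / (2i). CQT_SEPARATE applies these
; identities (up to a constant factor) to the accumulators and reduces each
; result to a single float per channel via horizontal adds.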
%macro CQT_SEPARATE 6 ; a_re, a_im, b_re, b_im, tmp, tmp2
    addps   m%5, m%4, m%2                       ; b_im + a_im -> right re
    subps   m%6, m%3, m%1                       ; b_re - a_re -> right im
    addps   m%1, m%3                            ; a_re + b_re -> left re
    subps   m%2, m%4                            ; a_im - b_im -> left im
    HADDPS  m%5, m%6, m%3
    HADDPS  m%1, m%2, m%3
    HADDPS  m%1, m%5, m%2                       ; [left re, left im, right re, right im]
%if mmsize == 32
    vextractf128 xmm%2, m%1, 1
    addps   xmm%1, xmm%2                        ; fold the upper lane into the lower
%endif
%endmacro ; CQT_SEPARATE

%macro DECLARE_CQT_CALC 0
; ff_showcqt_cqt_calc_*(dst, src, coeffs, len, fft_len)
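; From C, each instantiation is used through a function pointer; the
; prototype is roughly (see avf_showcqt.h for the authoritative typedef):
;   void ff_showcqt_cqt_calc_sse3(FFTComplex *dst, const FFTComplex *src,
;                                 const Coeffs *coeffs, int len, int fft_len);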
%if ARCH_X86_64
cglobal showcqt_cqt_calc, 5, 10, 12, dst, src, coeffs, len, fft_len, x, coeffs_val, coeffs_val2, i, coeffs_len
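; the 64-bit build keeps two output bins in flight per iteration: m0-m3
; accumulate the first coefficient run, m8-m11 the second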
    align   16
    .loop_k:
    mov     xd, [coeffsq + Coeffs.len]
    xorps   m0, m0, m0
    movaps  m1, m0
    movaps  m2, m0
    mov     coeffs_lend, [coeffsq + Coeffs.len + Coeffs.sizeof]
    movaps  m3, m0
    movaps  m8, m0
    movaps  m9, m0
    movaps  m10, m0
    movaps  m11, m0
    cmp     coeffs_lend, xd
    cmova   coeffs_lend, xd                     ; coeffs_len = min(len a, len b)
    xor     xd, xd
    test    coeffs_lend, coeffs_lend
    jz      .check_loop_b
    mov     coeffs_valq, [coeffsq + Coeffs.val]
    mov     coeffs_val2q, [coeffsq + Coeffs.val + Coeffs.sizeof]
    align   16
    .loop_ab:                                   ; both coefficient runs in parallel
        movaps  m7, [coeffs_valq + 4 * xq]
        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
        movaps  m7, [coeffs_val2q + 4 * xq]
        CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
        add     xd, mmsize/4
        cmp     xd, coeffs_lend
        jb      .loop_ab
    .check_loop_b:
    cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
    jae     .check_loop_a
    align   16
    .loop_b:                                    ; remainder of the second run
        movaps  m7, [coeffs_val2q + 4 * xq]
        CQT_CALC 8, 9, 10, 11, 4, 5, 6, 7, Coeffs.sizeof
        add     xd, mmsize/4
        cmp     xd, [coeffsq + Coeffs.len + Coeffs.sizeof]
        jb      .loop_b
    .store_ab:
    CQT_SEPARATE 0, 1, 2, 3, 4, 5
    CQT_SEPARATE 8, 9, 10, 11, 4, 5
    mulps   xmm0, xmm0                          ; re*re, im*im of both channels
    mulps   xmm8, xmm8
    HADDPS  xmm0, xmm8, xmm1                    ; re*re + im*im: two bins, two channels
    movaps  [dstq], xmm0
    sub     lend, 2
    lea     dstq, [dstq + 16]
    lea     coeffsq, [coeffsq + 2*Coeffs.sizeof]
    jnz     .loop_k
    RET
    align   16
    .check_loop_a:
    cmp     xd, [coeffsq + Coeffs.len]
    jae     .store_ab
    align   16
    .loop_a:                                    ; remainder of the first run
        movaps  m7, [coeffs_valq + 4 * xq]
        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
        add     xd, mmsize/4
        cmp     xd, [coeffsq + Coeffs.len]
        jb      .loop_a
        jmp     .store_ab
%else
cglobal showcqt_cqt_calc, 4, 7, 8, dst, src, coeffs, len, x, coeffs_val, i
%define fft_lend r4m                            ; the fifth argument lives on the stack
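; with only 8 SIMD registers available, the 32-bit build processes a single
; coefficient run (one output bin) per iteration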
    align   16
    .loop_k:
    mov     xd, [coeffsq + Coeffs.len]
    xorps   m0, m0, m0
    movaps  m1, m0
    movaps  m2, m0
    movaps  m3, m0
    test    xd, xd
    jz      .store_a
    mov     coeffs_valq, [coeffsq + Coeffs.val]
    xor     xd, xd
    align   16
    .loop_a:
        movaps  m7, [coeffs_valq + 4 * xq]
        CQT_CALC 0, 1, 2, 3, 4, 5, 6, 7, 0
        add     xd, mmsize/4
        cmp     xd, [coeffsq + Coeffs.len]
        jb      .loop_a
    .store_a:
    CQT_SEPARATE 0, 1, 2, 3, 4, 5
    mulps   xmm0, xmm0                          ; re*re, im*im of both channels
    HADDPS  xmm0, xmm0, xmm1                    ; re*re + im*im: one bin, two channels
    movlps  [dstq], xmm0
    sub     lend, 1
    lea     dstq, [dstq + 8]
    lea     coeffsq, [coeffsq + Coeffs.sizeof]
    jnz     .loop_k
    RET
%endif
%endmacro ; DECLARE_CQT_CALC
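
; instantiate the calculator for each SIMD level selected at configure time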
INIT_XMM sse3
DECLARE_CQT_CALC
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECLARE_CQT_CALC
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
DECLARE_CQT_CALC
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
DECLARE_CQT_CALC
%endif