1 ;******************************************************************************
2 ;* FLAC DSP SIMD optimizations
4 ;* Copyright (C) 2014 Loren Merritt
5 ;* Copyright (C) 2014 James Almer
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; pmacsdql form of the multiply-accumulate step: the first macro argument (%1)
; is passed as both the destination and the accumulator operand, while %2 and
; %3 are the two multiplicands.
; NOTE(review): the enclosing %macro header and any alternative code path for
; CPUs without this instruction are not visible in this excerpt -- confirm
; against the full file.
30 pmacsdql %1, %2, %3, %1
; flac_lpc_32: declared with the named arguments decoded, coeffs, pred_order,
; qlevel and len, plus one extra named register, j.
; NOTE(review): "5,6,5" is presumably the usual cglobal triple (arguments
; loaded, GPRs used, XMM registers used) -- confirm against x86inc.
; Only part of the function body is visible in this excerpt; labels, branches
; and the result store are not shown.
39 cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
; Move decoded forward by pred_order dwords, minus 8 bytes.
42 lea decodedq, [decodedq+pred_orderq*4-8]
; Move coeffs forward by pred_order dwords.
; NOTE(review): pred_order is presumably negated afterwards so that
; [ptr+pred_order*4] indexes back to the start of each array -- not visible here.
43 lea coeffsq, [coeffsq+pred_orderq*4]
; Load one dword from decoded into m0 ...
48 movd m0, [decodedq+pred_orderq*4+8]
; ... and one dword from coeffs into m1.
50 movd m1, [coeffsq+pred_orderq*4]
; j = pred_order + 1: index used by the following loads from both arrays.
53 lea jq, [pred_orderq+1]
; Two accumulators are kept: m2 and m3. The loads below alternate between
; refreshing m0 from decoded and m1 from coeffs, so each PMACSDQL pairs the
; most recently loaded dword of one array with the previous dword of the other.
; NOTE(review): presumably m2/m3 hold the sums for two consecutive output
; samples -- confirm against the full loop.
57 PMACSDQL m2, m0, m1, m2, m0
58 movd m0, [decodedq+jq*4]
59 PMACSDQL m3, m1, m0, m3, m1
60 movd m1, [coeffsq+jq*4]
; Multiply-accumulate into m2 once more with the current m0/m1 pair.
64 PMACSDQL m2, m0, m1, m2, m0
; Multiply-accumulate into m3 using m1 and m0.
; NOTE(review): m0 is presumably updated by lines not shown between these two
; macro calls -- confirm.
71 PMACSDQL m3, m1, m0, m3, m1
86 ;----------------------------------------------------------------------------------
87 ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
88 ; int len, int shift);
89 ;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_16: generates one flac_decorrelate_<%1>_16 function.
; Accepts 3 or 4 macro parameters:
;   %1 = name suffix spliced into the function name
;   %2 = index of the vector register that is stored to the output (m%2)
;   %3, %4 = not used in the lines visible here; see the full macro body
; NOTE(review): the loop, the arithmetic and the closing %endmacro are not
; visible in this excerpt.
90 %macro FLAC_DECORRELATE_16 3-4
; Named arguments/registers: out, in0, in1, len.
91 cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
; in1 = the pointer stored one GPR-size slot after the one in0 points at,
; i.e. the second entry of the pointer array passed as "in".
97 mov in1q, [in0q + gprsize]
; Load one vector from each of the two inputs at byte offset len.
107 mova m0, [in0q + lenq]
108 mova m1, [in1q + lenq]
; Store the selected result register to the output at the same offset.
120 mova [outq + lenq], m%2
; Instantiate the 16-bit decorrelation functions with name suffixes ls, rs, ms.
; The second argument is the register index the macro stores (m%2); the
; fourth argument is sub or add.
; NOTE(review): how the third and fourth arguments are consumed is in macro
; lines not visible in this excerpt.
127 FLAC_DECORRELATE_16 ls, 0, 2, sub
128 FLAC_DECORRELATE_16 rs, 2, 1, add
129 FLAC_DECORRELATE_16 ms, 2, 0, add
131 ;----------------------------------------------------------------------------------
132 ;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
133 ; int len, int shift);
134 ;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_32: generates one flac_decorrelate_<%1>_32 function.
; Takes exactly 5 macro parameters:
;   %1 = name suffix spliced into the function name
;   %2, %3, %4 = register indices handed to SBUTTERFLY; m%3 is also stored
;   %5 = not used in the lines visible here; see the full macro body
; NOTE(review): the loop, the arithmetic and the closing %endmacro are not
; visible in this excerpt.
135 %macro FLAC_DECORRELATE_32 5
; Named arguments/registers: out, in0, in1, len.
136 cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
; in1 = the pointer stored one GPR-size slot after the one in0 points at.
141 mov in1q, [in0q + gprsize]
; in1q is used here as an offset added to in0q.
; NOTE(review): presumably in1 was turned into (in1 - in0) by a line not
; shown, so a single advancing base pointer serves both inputs -- confirm.
149 mova m1, [in0q + in1q]
; Dword-granularity SBUTTERFLY on registers %2 and %3, with %4 as the third
; register argument.
; NOTE(review): presumably this interleaves the two channels for output and
; %4 is scratch -- confirm against x86util.
158 SBUTTERFLY dq, %2, %3, %4
; Store m%3 one vector width (mmsize bytes) past out.
161 mova [outq + mmsize], m%3
; Instantiate the 32-bit decorrelation functions with name suffixes ls, rs, ms.
; Arguments 2-4 are the register indices passed on to SBUTTERFLY inside the
; macro; argument 5 is sub or add.
; NOTE(review): how the fifth argument is consumed is in macro lines not
; visible in this excerpt.
171 FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
172 FLAC_DECORRELATE_32 rs, 2, 1, 0, add
173 FLAC_DECORRELATE_32 ms, 2, 0, 1, add
175 ;-----------------------------------------------------------------------------------------
176 ;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
177 ; int len, int shift);
178 ;-----------------------------------------------------------------------------------------
181 ;%3 = last xmm reg used
182 ;%4 = word/dword (shift instruction)
; FLAC_DECORRELATE_INDEP: generates one flac_decorrelate_indep<%2>_<%1>
; function. Going by the name pattern indep<ch>_<bps> in the prototype comment
; and by the REPCOUNT comment below, %1 is the bits per sample (16 or 32) and
; %2 is the channel count.
; NOTE(review): the %if/%rep structure that selects between the code
; sequences below, and the closing %endmacro, are not visible in this
; excerpt. The groups of instructions are presumably alternatives for
; different channel-count / bit-depth combinations, not one straight-line
; sequence -- confirm against the full file.
183 %macro FLAC_DECORRELATE_INDEP 4
184 %define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
; GPR count is %2+2 and XMM count is %3+1; up to eight input pointers
; (in0..in7) are named in addition to out and len.
185 cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
; Alternative register naming that has no len register; len is then accessed
; as the dword r3m.
; NOTE(review): presumably used when there are not enough GPRs, with r3m being
; the in-memory copy of the fourth argument -- confirm.
188 DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
189 %define lend dword r3m
; Load the pointer for channel %%i from the pointer array that in0q points at.
198 mov in %+ %%i %+ q, [in0q+%%i*gprsize]
; Turn that channel pointer into an offset relative to in0, so the loads below
; can address every channel as [in0q + offset].
207 sub in %+ %%i %+ q, in0q
; Load one vector of channel %%i into register m<%%i>.
217 mova m %+ %%i, [in0q + in %+ %%i %+ q]
; Dword transposes / interleaves of the loaded channel vectors.
; NOTE(review): presumably the 8-, 6-, 4- and 2-channel 32-bit variants,
; in that order -- confirm.
224 TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
226 SBUTTERFLY dq, 0, 1, 6
227 SBUTTERFLY dq, 2, 3, 6
228 SBUTTERFLY dq, 4, 5, 6
230 punpcklqdq m6, m0, m2
233 punpcklqdq m0, m1, m3
238 TRANSPOSE4x4D 0, 1, 2, 3, 4
240 SBUTTERFLY dq, 0, 1, 2
; Pack the dwords in m0..m3 to signed-saturated words, taking the second pack
; operand straight from memory at the in4..in7 offsets, then transpose words.
; NOTE(review): presumably the 8-channel 16-bit variant -- confirm.
246 packssdw m0, [in0q + in4q]
247 packssdw m1, [in0q + in5q]
248 packssdw m2, [in0q + in6q]
249 packssdw m3, [in0q + in7q]
250 TRANSPOSE2x4x4W 0, 1, 2, 3, 4
; Same packing pattern with memory operands at the in3..in5 offsets, followed
; by dword shuffles.
; NOTE(review): presumably the 6-channel 16-bit variant -- confirm.
252 packssdw m0, [in0q + in3q]
253 packssdw m1, [in0q + in4q]
254 packssdw m2, [in0q + in5q]
260 shufps m3, m0, m2, q2020
263 shufps m1, m2, m3, q3120
; Same packing pattern with memory operands at the in2/in3 offsets, followed
; by word and dword interleaves.
; NOTE(review): presumably the 4-channel 16-bit variant -- confirm.
268 packssdw m0, [in0q + in2q]
269 packssdw m1, [in0q + in3q]
270 SBUTTERFLY wd, 0, 1, 2
271 SBUTTERFLY dq, 0, 1, 2
; Store result register m<%%i> at vector slot %%i of the output.
284 mova [outq + %%i*mmsize], m %+ %%i
; Advance the output pointer past the REPCOUNT vectors written above.
289 add outq, mmsize*REPCOUNT
; Instantiate the independent-channel functions.
; FLAC_DECORRELATE_INDEP arguments: bits per sample, channel count, last xmm
; register used, and the word/dword suffix (w or d) for the shift instruction.
; The 2-channel 16-bit case is generated by FLAC_DECORRELATE_16 instead.
296 FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
297 FLAC_DECORRELATE_INDEP 32, 2, 3, d
298 FLAC_DECORRELATE_INDEP 16, 4, 3, w
299 FLAC_DECORRELATE_INDEP 32, 4, 5, d
300 FLAC_DECORRELATE_INDEP 16, 6, 4, w
301 FLAC_DECORRELATE_INDEP 32, 6, 7, d
303 FLAC_DECORRELATE_INDEP 16, 8, 5, w
304 FLAC_DECORRELATE_INDEP 32, 8, 9, d
; The invocations below repeat argument lists already used above.
; NOTE(review): presumably they are assembled under a different instruction-set
; selection (an INIT_XMM line not visible in this excerpt), which would give
; the generated functions a different name suffix -- confirm.
308 FLAC_DECORRELATE_INDEP 32, 4, 5, d
309 FLAC_DECORRELATE_INDEP 32, 6, 7, d
311 FLAC_DECORRELATE_INDEP 16, 8, 5, w
312 FLAC_DECORRELATE_INDEP 32, 8, 9, d