1 ;******************************************************************************
2 ;* FLAC DSP SIMD optimizations
4 ;* Copyright (C) 2014 Loren Merritt
5 ;* Copyright (C) 2014 James Almer
7 ;* This file is part of FFmpeg.
9 ;* FFmpeg is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* FFmpeg is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with FFmpeg; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; flac_lpc_32 -- only a fragment of this routine is visible in this excerpt:
; no labels, branches, accumulator initialisation, result store or return
; appear here, so the notes below describe the visible instructions only.
; NOTE(review): "5,6,5" presumably follows the x86inc cglobal convention
; (argument count, GPR count, SIMD register count) -- confirm in x86inc.asm.
; Named registers declared here: decoded, coeffs, pred_order, qlevel, len, j.
30 cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j
33 lea decodedq, [decodedq+pred_orderq*4-8] ; decoded += pred_order*4 - 8 (bytes)
34 lea coeffsq, [coeffsq+pred_orderq*4] ; coeffs += pred_order*4 (bytes)
39 movd m0, [decodedq+pred_orderq*4+8] ; m0 = one dword loaded relative to decoded
41 movd m1, [coeffsq+pred_orderq*4] ; m1 = one dword loaded relative to coeffs
44 lea jq, [pred_orderq+1] ; j = pred_order + 1 (lea leaves flags untouched)
48 PMACSDQL m2, m0, m1, m2, m0 ; PMACSDQL is not defined in this excerpt; presumably a dword multiply-accumulate into m2 -- confirm
49 movd m0, [decodedq+jq*4] ; m0 = dword at decoded + j*4
50 PMACSDQL m3, m1, m0, m3, m1 ; same macro with the m0/m1 roles swapped, targeting m3
51 movd m1, [coeffsq+jq*4] ; m1 = dword at coeffs + j*4
55 PMACSDQL m2, m0, m1, m2, m0 ; second visible m2 step; the instructions between these macro uses are not shown
62 PMACSDQL m3, m1, m0, m3, m1 ; second visible m3 step
77 ;----------------------------------------------------------------------------------
78 ;void ff_flac_decorrelate_[lrm]s_16_sse2(uint8_t **out, int32_t **in, int channels,
79 ; int len, int shift);
80 ;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_16 <suffix>, <reg>, <p3> [, <p4>]
; Emits flac_decorrelate_<suffix>_16. Accepts 3 or 4 macro parameters:
;   %1 = function-name suffix (used in the cglobal name below)
;   %2 = index of the SIMD register written to the output
;   %3, %4 = not used on any line visible in this excerpt
; NOTE(review): the macro body is only partially visible (no loop, arithmetic,
; return or %endmacro shown); comments cover the visible lines only.
81 %macro FLAC_DECORRELATE_16 3-4
82 cglobal flac_decorrelate_%1_16, 2, 4, 4, out, in0, in1, len
88 mov in1q, [in0q + gprsize] ; in1 = second pointer-sized entry of the array in0q points to
98 mova m0, [in0q + lenq] ; aligned load, in0q + lenq
99 mova m1, [in1q + lenq] ; aligned load, in1q + lenq
111 mova [outq + lenq], m%2 ; aligned store of m%2 to outq + lenq
; Instantiate the 16-bit macro for the ls, rs and ms name suffixes; each
; invocation produces flac_decorrelate_<suffix>_16 via the cglobal line of the macro.
; NOTE(review): how the register indices and the trailing sub/add argument are
; consumed is not visible in this excerpt -- check the full macro body.
118 FLAC_DECORRELATE_16 ls, 0, 2, sub
119 FLAC_DECORRELATE_16 rs, 2, 1, add
120 FLAC_DECORRELATE_16 ms, 2, 0, add
122 ;----------------------------------------------------------------------------------
123 ;void ff_flac_decorrelate_[lrm]s_32_sse2(uint8_t **out, int32_t **in, int channels,
124 ; int len, int shift);
125 ;----------------------------------------------------------------------------------
; FLAC_DECORRELATE_32 <suffix>, <a>, <b>, <tmp>, <p5>
; Emits flac_decorrelate_<suffix>_32. Takes exactly 5 macro parameters:
;   %1 = function-name suffix
;   %2, %3, %4 = SIMD register indices handed to SBUTTERFLY below
;   %5 = not used on any line visible in this excerpt
; NOTE(review): the macro body is only partially visible (no loop, arithmetic,
; return or %endmacro shown); comments cover the visible lines only.
126 %macro FLAC_DECORRELATE_32 5
127 cglobal flac_decorrelate_%1_32, 2, 4, 4, out, in0, in1, len
132 mov in1q, [in0q + gprsize] ; in1 = second pointer-sized entry of the array in0q points to
140 mova m1, [in0q + in1q] ; in1q is used as an offset from in0q here; the instruction that would rebase it is not visible -- confirm
149 SBUTTERFLY dq, %2, %3, %4 ; SBUTTERFLY is defined outside this excerpt; presumably interleaves dwords of m%2/m%3 using m%4 as scratch -- confirm
152 mova [outq + mmsize], m%3 ; aligned store of m%3 one vector (mmsize bytes) past outq
; Instantiate the 32-bit macro for the ls, rs and ms name suffixes; each
; invocation produces flac_decorrelate_<suffix>_32 via the cglobal line of the macro.
; Arguments 2-4 are the register indices passed on to SBUTTERFLY inside the macro.
; NOTE(review): the use of the trailing sub/add argument is not visible here.
162 FLAC_DECORRELATE_32 ls, 0, 2, 1, sub
163 FLAC_DECORRELATE_32 rs, 2, 1, 0, add
164 FLAC_DECORRELATE_32 ms, 2, 0, 1, add
166 ;-----------------------------------------------------------------------------------------
167 ;void ff_flac_decorrelate_indep<ch>_<bps>_<opt>(uint8_t **out, int32_t **in, int channels,
168 ; int len, int shift);
169 ;-----------------------------------------------------------------------------------------
172 ;%3 = last xmm reg used
173 ;%4 = word/dword (shift instruction)
; FLAC_DECORRELATE_INDEP <p1>, <p2>, <last_xmm>, <w|d>
; Emits flac_decorrelate_indep<%2>_<%1>.
;   %1 = sample width in bits (16 or 32, per the REPCOUNT comment below)
;   %2 = channel count (per the REPCOUNT comment below)
;   %3 = last xmm register used; cglobal is asked for %3+1 SIMD registers
;   %4 = word/dword selector; its use is not visible in this excerpt
; NOTE(review): the %if/%rep/%assign lines, loop labels and %endmacro are not
; visible here. The transpose/pack sequences below reuse the same registers
; with different operands, so they presumably sit in separate per-configuration
; branches -- confirm against the full file before editing.
174 %macro FLAC_DECORRELATE_INDEP 4
175 %define REPCOUNT %2/(32/%1) ; 16bits = channels / 2; 32bits = channels
176 cglobal flac_decorrelate_indep%2_%1, 2, %2+2, %3+1, out, in0, in1, len, in2, in3, in4, in5, in6, in7
; The next two lines rename the argument registers without a len entry and
; redefine lend as the dword memory operand r3m.
; NOTE(review): presumably a fallback for builds with too few GPRs; the guarding condition is not visible.
179 DEFINE_ARGS out, in0, in1, in2, in3, in4, in5
180 %define lend dword r3m
189 mov in %+ %%i %+ q, [in0q+%%i*gprsize] ; in<i> = entry i of the pointer array in0q points to (%%i is set outside the visible lines)
198 sub in %+ %%i %+ q, in0q ; turn in<i> into an offset relative to in0q
208 mova m %+ %%i, [in0q + in %+ %%i %+ q] ; m<i> = aligned vector load at in0q + offset in<i>
215 TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8 ; helper macro defined outside this excerpt
217 SBUTTERFLY dq, 0, 1, 6 ; helper macro defined outside this excerpt; m6 is the third (presumably scratch) argument
218 SBUTTERFLY dq, 2, 3, 6
219 SBUTTERFLY dq, 4, 5, 6
221 punpcklqdq m6, m0, m2 ; m6 = low qwords of m0 and m2 (3-operand form, expanded by the x86inc layer)
224 punpcklqdq m0, m1, m3 ; m0 = low qwords of m1 and m3
229 TRANSPOSE4x4D 0, 1, 2, 3, 4 ; helper macro defined outside this excerpt
231 SBUTTERFLY dq, 0, 1, 2
237 packssdw m0, [in0q + in4q] ; signed-saturating dword->word pack of m0 with the vector at offset in4
238 packssdw m1, [in0q + in5q]
239 packssdw m2, [in0q + in6q]
240 packssdw m3, [in0q + in7q]
241 TRANSPOSE2x4x4W 0, 1, 2, 3, 4 ; helper macro defined outside this excerpt
243 packssdw m0, [in0q + in3q] ; same pack pattern, pairing m0..m2 with offsets in3..in5
244 packssdw m1, [in0q + in4q]
245 packssdw m2, [in0q + in5q]
251 shufps m3, m0, m2, q2020 ; q2020 is a shuffle-order constant defined outside this excerpt
254 shufps m1, m2, m3, q3120 ; q3120 likewise
259 packssdw m0, [in0q + in2q] ; same pack pattern, pairing m0..m1 with offsets in2..in3
260 packssdw m1, [in0q + in3q]
261 SBUTTERFLY wd, 0, 1, 2
262 SBUTTERFLY dq, 0, 1, 2
275 mova [outq + %%i*mmsize], m %+ %%i ; store m<i> at out + i*mmsize
280 add outq, mmsize*REPCOUNT ; advance out by REPCOUNT vectors
; Instantiations of the independent-channel routines.
; FLAC_DECORRELATE_INDEP arguments: bits per sample, channel count,
; last xmm register used, w/d selector (see the macro definition).
; NOTE(review): the last four lines repeat earlier invocations verbatim;
; presumably they are emitted under a different instruction-set setting
; whose INIT/%if lines are not visible in this excerpt -- confirm.
287 FLAC_DECORRELATE_16 indep2, 0, 1 ; Reuse stereo 16bits macro
288 FLAC_DECORRELATE_INDEP 32, 2, 3, d
289 FLAC_DECORRELATE_INDEP 16, 4, 3, w
290 FLAC_DECORRELATE_INDEP 32, 4, 5, d
291 FLAC_DECORRELATE_INDEP 16, 6, 4, w
292 FLAC_DECORRELATE_INDEP 32, 6, 7, d
294 FLAC_DECORRELATE_INDEP 16, 8, 5, w
295 FLAC_DECORRELATE_INDEP 32, 8, 9, d
299 FLAC_DECORRELATE_INDEP 32, 4, 5, d
300 FLAC_DECORRELATE_INDEP 32, 6, 7, d
302 FLAC_DECORRELATE_INDEP 16, 8, 5, w
303 FLAC_DECORRELATE_INDEP 32, 8, 9, d