2 * This file is part of FFmpeg.
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 #include "libavutil/aarch64/asm.S"
// factors: four single-precision floats with alternating sign
// (+1, -1, +1, -1).
// NOTE(review): the consumer of this table is not visible in this excerpt;
// presumably it is loaded as a per-lane sign multiplier -- confirm.
21 const factors, align=4
22 .float 1.0, -1.0, 1.0, -1.0
// phi_noise_0: one row of four floats, (1, 0) repeated twice.
// Its address is taken by ff_sbr_hf_apply_noise_0_neon.
// NOTE(review): looks like two (re, im) pairs -- confirm against the caller.
25 const phi_noise_0, align=4
26 .float 1.0, 0.0, 1.0, 0.0
// phi_noise_1: two rows of four floats each. The second row is the first
// row with every non-zero entry negated.
// ff_sbr_hf_apply_noise_1_neon indexes this table with a 16-byte stride
// (x4, lsl #4), i.e. one row per index step.
29 const phi_noise_1, align=4
30 .float 0.0, 1.0, 0.0, -1.0
31 .float 0.0, -1.0, 0.0, 1.0
// phi_noise_2: one row of four floats, (-1, 0) repeated twice.
// Its address is taken by ff_sbr_hf_apply_noise_2_neon.
// NOTE(review): looks like two (re, im) pairs -- confirm against the caller.
34 const phi_noise_2, align=4
35 .float -1.0, 0.0, -1.0, 0.0
// phi_noise_3: two rows of four floats each. The second row is the first
// row with every non-zero entry negated.
// ff_sbr_hf_apply_noise_3_neon indexes this table with a 16-byte stride
// (x4, lsl #4), i.e. one row per index step.
38 const phi_noise_3, align=4
39 .float 0.0, -1.0, 0.0, 1.0
40 .float 0.0, 1.0, 0.0, -1.0
// ff_sbr_sum64x5_neon
// Visible portion: adds four 4-float vectors, read from [x1], [x2], [x3]
// and [x4], onto v0 and stores the result at [x0]. All five pointers
// advance by 16 bytes per pass.
// NOTE(review): the setup of x1-x4, the load of v0, the loop control and
// the return are not visible in this excerpt -- confirm against the full file.
43 function ff_sbr_sum64x5_neon, export=1
// v0 += next four floats of each of the four streams x1..x4.
50 ld1 {v1.4S}, [x1], #16
51 fadd v0.4S, v0.4S, v1.4S
52 ld1 {v2.4S}, [x2], #16
53 fadd v0.4S, v0.4S, v2.4S
54 ld1 {v3.4S}, [x3], #16
55 fadd v0.4S, v0.4S, v3.4S
56 ld1 {v4.4S}, [x4], #16
57 fadd v0.4S, v0.4S, v4.4S
// Write the accumulated vector to [x0] and step x0 to the next four floats.
58 st1 {v0.4S}, [x0], #16
// ff_sbr_sum_square_neon
// Visible portion: accumulates the squares of floats read from [x0] into
// the four lanes of v0, then reduces those lanes to a single sum.
// NOTE(review): the zeroing of v0, the loop counter/branch and the return
// are not visible in this excerpt -- confirm against the full file.
64 function ff_sbr_sum_square_neon, export=1
// Loop head: load four floats and add their squares lane-wise to v0.
66 1: ld1 {v1.4S}, [x0], #16
67 fmla v0.4S, v1.4S, v1.4S
// Two pairwise adds of v0 with itself leave the sum of all four lanes
// in every lane of v0.
70 faddp v0.4S, v0.4S, v0.4S
71 faddp v0.4S, v0.4S, v0.4S
// ff_sbr_neg_odd_64_neon
// Visible portion: reads floats from [x0] with de-interleaving loads,
// flips the sign of every odd-indexed float and writes the result back
// re-interleaved through x1. Each ld2/st2 handles 8 floats (32 bytes).
// NOTE(review): the initialisation of x1, any loop control and the return
// are not visible in this excerpt; presumably x1 aliases the input buffer
// so the update is in place -- confirm.
75 function ff_sbr_neg_odd_64_neon, export=1
// v5 = 0x80000000 in every 32-bit lane: the IEEE-754 single sign bit.
77 movi v5.4S, #1<<7, lsl #24
// ld2 splits the stream: v0/v2 get even-indexed floats, v1/v3 odd-indexed.
78 ld2 {v0.4S, v1.4S}, [x0], #32
// XOR with the sign mask negates the odd-indexed floats.
79 eor v1.16B, v1.16B, v5.16B
80 ld2 {v2.4S, v3.4S}, [x0], #32
// Loads, sign flips and stores of the two register pairs are interleaved.
82 st2 {v0.4S, v1.4S}, [x1], #32
83 eor v3.16B, v3.16B, v5.16B
84 ld2 {v0.4S, v1.4S}, [x0], #32
85 st2 {v2.4S, v3.4S}, [x1], #32
86 eor v1.16B, v1.16B, v5.16B
87 ld2 {v2.4S, v3.4S}, [x0], #32
89 eor v3.16B, v3.16B, v5.16B
90 st2 {v0.4S, v1.4S}, [x1], #32
91 st2 {v2.4S, v3.4S}, [x1], #32
// ff_sbr_qmf_pre_shuffle_neon
// Visible portion: copies two floats from [x0] to [x2], then interleaves
// sign-flipped data read through x1 with data read through x0 and stores
// the interleaved result through x2.
// NOTE(review): the setup of x0-x4, the loop label/branch, the return and
// some lane manipulation (the embedded line numbers skip 107, 114-115 and
// 117) are not visible in this excerpt -- confirm against the full file.
95 function ff_sbr_qmf_pre_shuffle_neon, export=1
// v6 = 0x80000000 in every 32-bit lane: the float sign-bit mask.
100 movi v6.4S, #1<<7, lsl #24
// Pass the first two floats through unchanged.
101 ld1 {v0.2S}, [x0], #8
102 st1 {v0.2S}, [x2], #8
// x1 is post-indexed by register x3 (stride set outside the visible lines);
// x0 advances forward by 16 bytes.
104 ld1 {v1.4S}, [x1], x3
105 ld1 {v2.4S}, [x0], #16
// Negate the four floats that came from x1.
106 eor v1.16B, v1.16B, v6.16B
// ext #8 of a register with itself swaps its two 64-bit halves.
108 ext v1.16B, v1.16B, v1.16B, #8
// Store interleaved: v1[0], v2[0], v1[1], v2[1], ...
109 st2 {v1.4S, v2.4S}, [x2], #32
// Remaining elements: two lanes from each source (x1 post-indexed by x4).
112 ld1 {v1.2S}, [x1], x4
113 ld1 {v2.2S}, [x0], #8
116 eor v1.16B, v1.16B, v6.16B
// Store lanes 0-1 interleaved, then lane 2 of each register as one pair.
118 st2 {v1.2S, v2.2S}, [x2], #16
119 st2 {v1.S, v2.S}[2], [x2]
// ff_sbr_qmf_post_shuffle_neon
// Visible portion: per pass, reads four floats through x2 (post-indexed by
// register x3) and four through x1 (advancing 16 bytes), negates the x2
// data, swaps its 64-bit halves and stores both vectors interleaved at [x0].
// NOTE(review): the setup of x1-x3, one instruction between the eor and the
// ext (embedded line 131), the loop counter/branch and the return are not
// visible in this excerpt -- confirm against the full file.
123 function ff_sbr_qmf_post_shuffle_neon, export=1
// v6 = 0x80000000 in every 32-bit lane: the float sign-bit mask.
127 movi v6.4S, #1<<7, lsl #24
128 1: ld1 {v0.4S}, [x2], x3
129 ld1 {v1.4S}, [x1], #16
// Flip the sign of all four floats read through x2.
130 eor v0.16B, v0.16B, v6.16B
// Swap the low and high 64-bit halves of v0.
132 ext v0.16B, v0.16B, v0.16B, #8
// Interleaved store: v0[0], v1[0], v0[1], v1[1], ...
133 st2 {v0.4S, v1.4S}, [x0], #32
// ff_sbr_qmf_deint_neg_neon
// Visible portion: per pass, de-interleaves eight floats read through x1
// (post-indexed by register x3) into even-indexed (v0) and odd-indexed (v1)
// elements, negates v0, swaps the 64-bit halves of v1 and stores v1 at [x0].
// NOTE(review): the setup of x1/x3, the store of v0, further lane
// reordering (embedded lines 147 and 149 are missing), the loop
// counter/branch and the return are not visible in this excerpt -- confirm.
139 function ff_sbr_qmf_deint_neg_neon, export=1
// v2 = 0x80000000 in every 32-bit lane: the float sign-bit mask.
144 movi v2.4S, #1<<7, lsl #24
145 1: ld2 {v0.4S, v1.4S}, [x1], x3
// Negate the even-indexed elements.
146 eor v0.16B, v0.16B, v2.16B
// Swap the low and high 64-bit halves of the odd-indexed elements.
148 ext v1.16B, v1.16B, v1.16B, #8
150 st1 {v1.4S}, [x0], #16
// ff_sbr_qmf_deint_bfly_neon
// Visible portion: per pass, loads v0 from [x1] (advancing 16 bytes) and v1
// from [x2] (post-indexed by register x5), then stores
//   v0 - v3 at [x0] (advancing 16 bytes), and
//   v1 + v2 at [x3] (post-indexed by x5).
// NOTE(review): the instructions that produce v2 and v3 (embedded lines 164
// and 166), the setup of x2/x3/x5, the loop counter/branch and the return
// are not visible in this excerpt; presumably v2/v3 are lane-reversed
// copies of v0/v1 -- confirm against the full file.
157 function ff_sbr_qmf_deint_bfly_neon, export=1
162 1: ld1 {v0.4S}, [x1], #16
163 ld1 {v1.4S}, [x2], x5
// Swap the 64-bit halves of v2 and v3.
165 ext v2.16B, v2.16B, v2.16B, #8
167 ext v3.16B, v3.16B, v3.16B, #8
// Butterfly: a sum for the x3 output and a difference for the x0 output.
168 fadd v1.4S, v1.4S, v2.4S
169 fsub v0.4S, v0.4S, v3.4S
170 st1 {v0.4S}, [x0], #16
171 st1 {v1.4S}, [x3], x5
// ff_sbr_hf_gen_neon
// Visible portion: derives two coefficient vectors (v0, v1) by lane-wise
// multiplication, offsets x0 and x1 by x4 * 8 bytes, then per pass loads
// four floats from [x1], forms the horizontal sums of (v2 * v1) and
// (v2 * v0), adds v3 to the low two lanes and stores two floats at [x0].
// NOTE(review): the initial contents of v0, v1, v2, v3 and v7, the
// instructions at embedded lines 197 and 204, the loop counter/branch and
// the return are not visible in this excerpt -- confirm against the full file.
177 function ff_sbr_hf_gen_neon, export=1
// Coefficient preparation: v1 *= v2; v0 *= v1; v1 = v0 * v7.
186 fmul v1.4S, v1.4S, v2.4S
189 fmul v0.4S, v0.4S, v1.4S
190 fmul v1.4S, v0.4S, v7.4S
// Advance both pointers by x4 elements of 8 bytes each.
193 add x0, x0, x4, lsl #3
194 add x1, x1, x4, lsl #3
196 1: ld1 {v2.4S}, [x1], #16
198 fmul v4.4S, v2.4S, v1.4S
199 fmul v5.4S, v2.4S, v0.4S
// Two pairwise adds each: every lane of v4/v5 ends up holding the sum of
// that register's four products.
200 faddp v4.4S, v4.4S, v4.4S
201 faddp v5.4S, v5.4S, v5.4S
202 faddp v4.4S, v4.4S, v4.4S
203 faddp v5.4S, v5.4S, v5.4S
// Add v3 to the low two lanes and emit 8 bytes of output.
205 fadd v4.2S, v4.2S, v3.2S
206 st1 {v4.2S}, [x0], #8
// ff_sbr_hf_g_filt_neon
// Visible portion: offsets x1 by x4 * 8 bytes, then per pass loads two
// floats from [x1] (post-indexed by register x5) and one float from [x2]
// (advancing 4 bytes), multiplies the pair by that scalar and stores the
// two products at [x0] (advancing 8 bytes).
// NOTE(review): the setup of x5, the loop counter/branch and the return are
// not visible in this excerpt -- confirm against the full file.
213 function ff_sbr_hf_g_filt_neon, export=1
217 add x1, x1, x4, lsl #3
218 1: ld1 {v0.2S}, [x1], x5
219 ld1 {v1.S}[0], [x2], #4
// By-element multiply over all four lanes; only the low two lanes, which
// came from the x1 load, are stored.
220 fmul v2.4S, v0.4S, v1.S[0]
221 st1 {v2.2S}, [x0], #8
// ff_sbr_autocorrelate_neon
// Visible portion: reads 8-byte (two-float) elements from [x0], accumulates
// products of the current element with itself (v1) and of neighbouring
// elements with each component of the current one (v2, v3), corrects the
// running sums with boundary terms and stores one 4-float result at [x1].
// NOTE(review): many instructions are missing from this excerpt (embedded
// lines 228-233, 243-246, 254, 261-263, 266-267 and everything after 268),
// including accumulator zeroing, the load of v0, register rotation inside
// the loop, loop control, the remaining stores and the return. The notes
// below cover only the visible instructions -- confirm against the full file.
227 function ff_sbr_autocorrelate_neon, export=1
// Load the first two elements and compute their products; v16-v18 are
// subtracted again further down.
234 ld1 {v4.2S}, [x0], #8
235 ld1 {v5.2S}, [x0], #8
236 fmul v16.2S, v4.2S, v4.2S
237 fmul v17.2S, v5.2S, v4.S[0]
238 fmul v18.2S, v5.2S, v4.S[1]
// Loop head: load the next element into the upper 64 bits of v5, then
// accumulate v4*v4 into v1 and v5 scaled by v4[0] / v4[1] into v2 / v3.
239 1: ld1 {v5.D}[1], [x0], #8
240 fmla v1.2S, v4.2S, v4.2S
241 fmla v2.4S, v5.4S, v4.S[0]
242 fmla v3.4S, v5.4S, v4.S[1]
// Products of the elements currently held in v4/v5.
247 fmul v19.2S, v4.2S, v4.2S
248 fmul v20.2S, v5.2S, v4.S[0]
249 fmul v21.2S, v5.2S, v4.S[1]
// v22 = v2 + v20 - v17 and v23 = v3 + v21 - v18: add the latest products
// and remove the initial ones.
250 fadd v22.4S, v2.4S, v20.4S
251 fsub v22.4S, v22.4S, v17.4S
252 fadd v23.4S, v3.4S, v21.4S
253 fsub v23.4S, v23.4S, v18.4S
// Scale v23 lane-wise by v0, fold it into v22 and store four floats.
// NOTE(review): v0 presumably holds the `factors` sign vector -- confirm.
255 fmul v23.4S, v23.4S, v0.4S
256 fadd v22.4S, v22.4S, v23.4S
257 st1 {v22.4S}, [x1], #16
// v23 = v1 + v19 - v16, then a pairwise add leaves the sum of its two
// lanes in both lanes.
258 fadd v23.2S, v1.2S, v19.2S
259 fsub v23.2S, v23.2S, v16.2S
260 faddp v23.2S, v23.2S, v23.2S
// Combine the low halves of the v2/v3 accumulators the same way as above.
264 fmul v3.2S, v3.2S, v0.2S
265 fadd v2.2S, v2.2S, v3.2S
// Horizontal sum of the two lanes of v1.
268 faddp v1.2S, v1.2S, v1.2S
// apply_noise_common
// Loop body used for noise application. Visible portion: per pass it wraps
// the index in x3 to 0..511, forms x8 = x7 + x3 * 8 (an 8-byte-stride
// address into ff_sbr_noise_table), loads two floats each from [x1] and
// [x2], and stores four floats at [x0]. For each output lane the stored
// value is v2 + v5 * v4 where the duplicated [x1] value is zero, and
// v6 + v1 * v3 where it is non-zero.
// NOTE(review): the loads that fill v1, v2, v5 and v6, the use of x8 and
// x9, the index increment, the loop counter/branch, the return and the
// closing .endm are not visible in this excerpt -- confirm against the full
// file. Which operand is the gain and which the noise scale cannot be
// determined from the visible lines.
273 .macro apply_noise_common
// x7 = address of the external noise table (movrel and X() are macros
// presumably provided by the included asm.S).
276 movrel x7, X(ff_sbr_noise_table)
// 0x1ff mask keeps the table index within 512 entries.
278 1: and x3, x3, #0x1ff
279 add x8, x7, x3, lsl #3
282 ld1 {v3.2S}, [x1], #8
283 ld1 {v4.2S}, [x2], #8
// zip1 of a register with itself duplicates each of the two loaded values
// into adjacent lanes: (a, b) -> (a, a, b, b).
286 zip1 v3.4S, v3.4S, v3.4S
287 zip1 v4.4S, v4.4S, v4.4S
// Compute both candidate results: v6 += v1 * v3 and v2 += v5 * v4.
288 fmla v6.4S, v1.4S, v3.4S
289 fmla v2.4S, v5.4S, v4.4S
// v7 = all-ones in lanes where v3 == 0.0.
290 fcmeq v7.4S, v3.4S, #0.0
// bif: where the mask is clear (v3 != 0) take v6, otherwise keep v2.
291 bif v2.16B, v6.16B, v7.16B
292 st1 {v2.4S}, [x0], #16
// ff_sbr_hf_apply_noise_0_neon
// Visible portion: points x9 at the phi_noise_0 table.
// NOTE(review): the rest of the body is not visible in this excerpt;
// presumably it expands apply_noise_common -- confirm against the full file.
297 function ff_sbr_hf_apply_noise_0_neon, export=1
298 movrel x9, phi_noise_0
// ff_sbr_hf_apply_noise_1_neon
// Visible portion: points x9 at phi_noise_1, then advances it by
// x4 * 16 bytes, i.e. by x4 rows of four floats.
// NOTE(review): the instruction at embedded line 306 and the rest of the
// body are not visible in this excerpt. phi_noise_1 has only two rows, so
// x4 is presumably reduced to 0 or 1 before the add -- confirm.
304 function ff_sbr_hf_apply_noise_1_neon, export=1
305 movrel x9, phi_noise_1
// Select the table row: 16 bytes per row.
307 add x9, x9, x4, lsl #4
// ff_sbr_hf_apply_noise_2_neon
// Visible portion: points x9 at the phi_noise_2 table.
// NOTE(review): the rest of the body is not visible in this excerpt;
// presumably it expands apply_noise_common -- confirm against the full file.
313 function ff_sbr_hf_apply_noise_2_neon, export=1
314 movrel x9, phi_noise_2
// ff_sbr_hf_apply_noise_3_neon
// Visible portion: points x9 at phi_noise_3, then advances it by
// x4 * 16 bytes, i.e. by x4 rows of four floats.
// NOTE(review): the instruction at embedded line 322 and the rest of the
// body are not visible in this excerpt. phi_noise_3 has only two rows, so
// x4 is presumably reduced to 0 or 1 before the add -- confirm.
320 function ff_sbr_hf_apply_noise_3_neon, export=1
321 movrel x9, phi_noise_3
// Select the table row: 16 bytes per row.
323 add x9, x9, x4, lsl #4