2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 #include "libavutil/aarch64/asm.S"
// ff_conv_flt_to_s16_neon: convert 32-bit float samples to signed 16-bit.
//
// Registers, as used by the instructions visible here:
//   x0     destination pointer; written by st1 in groups of 8 x 16-bit
//          lanes and post-incremented by 16 bytes
//   x1     source pointer; read by ld1 in groups of 4 x 32-bit float lanes
//          and post-incremented by 16 bytes
//   v0-v3  raw float input vectors
//   v4-v7  converted values (32-bit lanes, then narrowed to 16-bit lanes)
//
// Per-sample arithmetic: fcvtzs with 31 fractional bits converts the float
// to signed 32-bit fixed point (value * 2^31, rounded toward zero,
// saturating). sqrshrn/sqrshrn2 #16 then shift right by 16 with rounding
// and saturation, narrowing each lane to a signed 16-bit result.
//
// Loads and conversions of upcoming vectors are interleaved with the
// narrowing and stores of earlier ones, so statement order matters.
//
// NOTE(review): no sample counter, branch or return instruction is visible
// in this listing, and nothing visible jumps to the 2: and 3: labels, so the
// loop structure is not described here -- confirm against the complete file.
25 function ff_conv_flt_to_s16_neon, export=1
// Load and convert the first 8 floats into v4/v5.
27 ld1 {v0.4s}, [x1], #16
28 fcvtzs v4.4s, v0.4s, #31
29 ld1 {v1.4s}, [x1], #16
30 fcvtzs v5.4s, v1.4s, #31
// Narrow v4/v5 into 8 x s16 in v4 and store them, while the next 8 floats
// are loaded and converted into v6/v7.
35 sqrshrn v4.4h, v4.4s, #16
36 ld1 {v2.4s}, [x1], #16
37 fcvtzs v6.4s, v2.4s, #31
38 sqrshrn2 v4.8h, v5.4s, #16
39 ld1 {v3.4s}, [x1], #16
40 fcvtzs v7.4s, v3.4s, #31
41 sqrshrn v6.4h, v6.4s, #16
42 st1 {v4.8h}, [x0], #16
43 sqrshrn2 v6.8h, v7.4s, #16
// Refill v4/v5 with the following 8 floats, then store the 8 results in v6.
44 ld1 {v0.4s}, [x1], #16
45 fcvtzs v4.4s, v0.4s, #31
46 ld1 {v1.4s}, [x1], #16
47 fcvtzs v5.4s, v1.4s, #31
48 st1 {v6.8h}, [x0], #16
// 2: narrow and store the 8 values pending in v4/v5, and convert and narrow
// 8 more into v6.
// NOTE(review): the store of v6 after this sequence is not visible here.
52 2: ld1 {v2.4s}, [x1], #16
53 sqrshrn v4.4h, v4.4s, #16
54 fcvtzs v6.4s, v2.4s, #31
55 ld1 {v3.4s}, [x1], #16
56 sqrshrn2 v4.8h, v5.4s, #16
57 fcvtzs v7.4s, v3.4s, #31
58 sqrshrn v6.4h, v6.4s, #16
59 st1 {v4.8h}, [x0], #16
60 sqrshrn2 v6.8h, v7.4s, #16
// 3: narrow the 8 values pending in v4/v5 into v4.
// NOTE(review): the matching store is not visible here.
63 3: sqrshrn v4.4h, v4.4s, #16
64 sqrshrn2 v4.8h, v5.4s, #16
// ff_conv_fltp_to_s16_2ch_neon: convert two separate float input streams
// into one interleaved signed 16-bit output stream.
//
// Registers, as used by the instructions visible here:
//   x0              destination pointer (st1, post-incremented except for
//                   the final store)
//   x4              first input stream (ld1 of 4 floats, post-incremented)
//   x5              second input stream (ld1 of 4 floats, post-incremented)
//   v0-v7, v16-v23  scratch vectors
//
// Per-sample arithmetic: fcvtzs #31 converts each float to signed 32-bit
// fixed point (value * 2^31, rounded toward zero, saturating). sri #16 then
// shifts the x4-stream lane right by 16 and inserts it below the top 16 bits
// of the matching x5-stream lane. Each 32-bit lane therefore ends up with
// the x4 sample in bits 0-15 and the x5 sample in bits 16-31. This keeps the
// top 16 bits of each fixed-point value without rounding, unlike the
// sqrshrn narrowing used in ff_conv_flt_to_s16_neon.
//
// NOTE(review): x4 and x5 are not initialised by any instruction in this
// listing (presumably loaded from an array of plane pointers), and no
// counter, branch or return instruction is visible; nothing visible jumps to
// the 2: and 3: labels. Confirm against the complete file.
69 function ff_conv_fltp_to_s16_2ch_neon, export=1
// Load and convert 8 samples from each stream: x4 -> v4/v5, x5 -> v6/v7.
72 ld1 {v0.4s}, [x4], #16
73 fcvtzs v4.4s, v0.4s, #31
74 ld1 {v1.4s}, [x4], #16
75 fcvtzs v5.4s, v1.4s, #31
76 ld1 {v2.4s}, [x5], #16
77 fcvtzs v6.4s, v2.4s, #31
78 ld1 {v3.4s}, [x5], #16
79 fcvtzs v7.4s, v3.4s, #31
// Next 8 samples per stream go to v20/v21 (x4) and v22/v23 (x5), while the
// previous group in v6/v7 is written out.
84 ld1 {v16.4s}, [x4], #16
85 fcvtzs v20.4s, v16.4s, #31
87 ld1 {v17.4s}, [x4], #16
88 fcvtzs v21.4s, v17.4s, #31
89 ld1 {v18.4s}, [x5], #16
90 fcvtzs v22.4s, v18.4s, #31
91 ld1 {v19.4s}, [x5], #16
// NOTE(review): no sri merging v4 into v6 (or v5 into v7) is visible before
// these two stores, unlike the v22/v23 path below -- confirm against the
// complete file.
93 st1 {v6.4s}, [x0], #16
94 fcvtzs v23.4s, v19.4s, #31
95 st1 {v7.4s}, [x0], #16
96 sri v22.4s, v20.4s, #16
97 ld1 {v0.4s}, [x4], #16
98 sri v23.4s, v21.4s, #16
99 st1 {v22.4s}, [x0], #16
100 fcvtzs v4.4s, v0.4s, #31
101 ld1 {v1.4s}, [x4], #16
102 fcvtzs v5.4s, v1.4s, #31
103 ld1 {v2.4s}, [x5], #16
104 fcvtzs v6.4s, v2.4s, #31
105 ld1 {v3.4s}, [x5], #16
106 fcvtzs v7.4s, v3.4s, #31
107 st1 {v23.4s}, [x0], #16
// 2: merge and store the 8 frames pending in v4-v7, then load, convert,
// merge and store 8 further frames using v0-v3.
111 2: sri v6.4s, v4.4s, #16
112 ld1 {v0.4s}, [x4], #16
113 fcvtzs v0.4s, v0.4s, #31
114 ld1 {v1.4s}, [x4], #16
115 fcvtzs v1.4s, v1.4s, #31
116 ld1 {v2.4s}, [x5], #16
117 fcvtzs v2.4s, v2.4s, #31
118 sri v7.4s, v5.4s, #16
119 ld1 {v3.4s}, [x5], #16
120 fcvtzs v3.4s, v3.4s, #31
121 sri v2.4s, v0.4s, #16
122 st1 {v6.4s,v7.4s}, [x0], #32
123 sri v3.4s, v1.4s, #16
124 st1 {v2.4s,v3.4s}, [x0], #32
// 3: merge and store the 8 frames pending in v4-v7; x0 is not advanced by
// this final store.
126 3: sri v6.4s, v4.4s, #16
127 sri v7.4s, v5.4s, #16
128 st1 {v6.4s,v7.4s}, [x0]
// ff_conv_fltp_to_s16_neon: convert separate float input streams into
// signed 16-bit output written with a register stride.
//
// The visible code has four parts:
//   1. branches (without link) to ff_conv_fltp_to_s16_2ch_neon and
//      ff_conv_flt_to_s16_neon
//   2. a section handling four input streams at once (x4-x7)
//   3. a section handling two input streams at once (x4, x5)
//   4. a section handling a single input stream (x4), storing through x5
//
// Registers, as used by the instructions visible here:
//   x1      address of the 64-bit values that ldp loads into x4-x7; those
//           values are then used as ld1 source addresses
//   x4-x7   input stream pointers (ld1 of 4 floats, post-incremented by 16)
//   x8      destination pointer for the strided stores of parts 2 and 3
//   x5      destination pointer for the strided stores of part 4
//   x12     post-increment applied to the destination after every store
//   v0-v7, v16-v19  scratch vectors
//
// Per-sample arithmetic: fcvtzs #31 converts each float to signed 32-bit
// fixed point (value * 2^31, rounded toward zero, saturating). Only the top
// 16 bits of each 32-bit result reach memory, without rounding: parts 2 and
// 3 merge them with sri #16, part 4 stores the odd 16-bit lanes directly.
//
// NOTE(review): the flag-setting compare, the initialisation of x8 and x12
// (and of x4/x5 in part 4), all local labels, loop branches and the return
// are not visible in this listing. Section boundaries below are inferred
// from register usage only -- confirm against the complete file.
132 function ff_conv_fltp_to_s16_neon, export=1
// Branch (no link) to the dedicated two-stream routine when the flags say
// "equal"; the instruction that sets the flags is not visible here.
134 b.eq X(ff_conv_fltp_to_s16_2ch_neon)
// Unconditional branch (no link) to the plain float routine.
// NOTE(review): as listed, nothing skips this branch, so the code below
// would be unreachable; presumably a conditional branch precedes it.
137 b X(ff_conv_flt_to_s16_neon)
// ---- Four input streams ----
// Fetch four stream pointers from the array at x1 (x1 advances by 32 bytes).
144 ldp x4, x5, [x1], #16
145 ldp x6, x7, [x1], #16
// Load and convert 4 samples from each of the four streams into v4-v7.
148 ld1 {v4.4s}, [x4], #16
149 fcvtzs v4.4s, v4.4s, #31
150 ld1 {v5.4s}, [x5], #16
151 fcvtzs v5.4s, v5.4s, #31
152 ld1 {v6.4s}, [x6], #16
153 fcvtzs v6.4s, v6.4s, #31
154 ld1 {v7.4s}, [x7], #16
155 fcvtzs v7.4s, v7.4s, #31
// While the next 4 samples per stream are loaded into v0-v3, merge the
// previous ones: sri packs x4|x5 samples into v5 and x6|x7 samples into v7,
// then zip1/zip2 interleave the 32-bit lanes so that each 64-bit element of
// v16/v17 holds the four 16-bit samples (x4, x5, x6, x7 order) of one frame.
158 ld1 {v0.4s}, [x4], #16
159 fcvtzs v0.4s, v0.4s, #31
160 sri v5.4s, v4.4s, #16
161 ld1 {v1.4s}, [x5], #16
162 fcvtzs v1.4s, v1.4s, #31
163 sri v7.4s, v6.4s, #16
164 ld1 {v2.4s}, [x6], #16
165 fcvtzs v2.4s, v2.4s, #31
166 zip1 v16.4s, v5.4s, v7.4s
167 ld1 {v3.4s}, [x7], #16
168 fcvtzs v3.4s, v3.4s, #31
169 zip2 v17.4s, v5.4s, v7.4s
// Each st1 writes one 8-byte frame at x8 and then advances x8 by x12.
170 st1 {v16.d}[0], [x8], x12
171 sri v1.4s, v0.4s, #16
172 st1 {v16.d}[1], [x8], x12
173 sri v3.4s, v2.4s, #16
174 st1 {v17.d}[0], [x8], x12
175 zip1 v18.4s, v1.4s, v3.4s
176 st1 {v17.d}[1], [x8], x12
177 zip2 v19.4s, v1.4s, v3.4s
// Store the four frames held in v18/v19 while refilling v4-v7 with the
// next 4 samples per stream.
179 ld1 {v4.4s}, [x4], #16
180 fcvtzs v4.4s, v4.4s, #31
181 st1 {v18.d}[0], [x8], x12
182 ld1 {v5.4s}, [x5], #16
183 fcvtzs v5.4s, v5.4s, #31
184 st1 {v18.d}[1], [x8], x12
185 ld1 {v6.4s}, [x6], #16
186 fcvtzs v6.4s, v6.4s, #31
187 st1 {v19.d}[0], [x8], x12
188 ld1 {v7.4s}, [x7], #16
189 fcvtzs v7.4s, v7.4s, #31
190 st1 {v19.d}[1], [x8], x12
// Stores of v18/v19 with no further loads.
// NOTE(review): presumably an exit path reached through a label that is not
// visible here.
193 st1 {v18.d}[0], [x8], x12
194 st1 {v18.d}[1], [x8], x12
195 st1 {v19.d}[0], [x8], x12
196 st1 {v19.d}[1], [x8], x12
// ---- Two input streams ----
// Fetch two stream pointers from the array at x1.
206 ldp x4, x5, [x1], #16
// Load and convert 8 samples per stream: x4 -> v4, v6 and x5 -> v5, v7.
210 ld1 {v4.4s}, [x4], #16
211 fcvtzs v4.4s, v4.4s, #31
212 ld1 {v5.4s}, [x5], #16
213 fcvtzs v5.4s, v5.4s, #31
214 ld1 {v6.4s}, [x4], #16
215 fcvtzs v6.4s, v6.4s, #31
216 ld1 {v7.4s}, [x5], #16
217 fcvtzs v7.4s, v7.4s, #31
// sri packs the x4 sample into bits 0-15 and keeps the x5 sample in bits
// 16-31 of each lane; every 32-bit lane is then one two-sample frame, stored
// individually at x8 with stride x12. v4-v7 are reloaded as soon as their
// previous contents have been consumed.
221 sri v5.4s, v4.4s, #16
222 ld1 {v4.4s}, [x4], #16
223 fcvtzs v4.4s, v4.4s, #31
224 st1 {v5.s}[0], [x8], x12
225 sri v7.4s, v6.4s, #16
226 st1 {v5.s}[1], [x8], x12
227 ld1 {v6.4s}, [x4], #16
228 fcvtzs v6.4s, v6.4s, #31
229 st1 {v5.s}[2], [x8], x12
230 st1 {v5.s}[3], [x8], x12
231 st1 {v7.s}[0], [x8], x12
232 st1 {v7.s}[1], [x8], x12
233 ld1 {v5.4s}, [x5], #16
234 fcvtzs v5.4s, v5.4s, #31
235 st1 {v7.s}[2], [x8], x12
236 st1 {v7.s}[3], [x8], x12
237 ld1 {v7.4s}, [x5], #16
238 fcvtzs v7.4s, v7.4s, #31
// Merge and store the 8 frames pending in v4-v7 while the next 8 samples
// per stream are loaded and converted into v0-v3.
241 ld1 {v0.4s}, [x4], #16
242 sri v5.4s, v4.4s, #16
243 fcvtzs v0.4s, v0.4s, #31
244 ld1 {v1.4s}, [x5], #16
245 sri v7.4s, v6.4s, #16
246 st1 {v5.s}[0], [x8], x12
247 st1 {v5.s}[1], [x8], x12
248 fcvtzs v1.4s, v1.4s, #31
249 st1 {v5.s}[2], [x8], x12
250 st1 {v5.s}[3], [x8], x12
251 ld1 {v2.4s}, [x4], #16
252 st1 {v7.s}[0], [x8], x12
253 fcvtzs v2.4s, v2.4s, #31
254 st1 {v7.s}[1], [x8], x12
255 ld1 {v3.4s}, [x5], #16
256 st1 {v7.s}[2], [x8], x12
257 fcvtzs v3.4s, v3.4s, #31
258 st1 {v7.s}[3], [x8], x12
259 sri v1.4s, v0.4s, #16
260 sri v3.4s, v2.4s, #16
// Store the 8 frames held in v1/v3 while refilling v4-v7.
262 ld1 {v4.4s}, [x4], #16
263 st1 {v1.s}[0], [x8], x12
264 fcvtzs v4.4s, v4.4s, #31
265 st1 {v1.s}[1], [x8], x12
266 ld1 {v5.4s}, [x5], #16
267 st1 {v1.s}[2], [x8], x12
268 fcvtzs v5.4s, v5.4s, #31
269 st1 {v1.s}[3], [x8], x12
270 ld1 {v6.4s}, [x4], #16
271 st1 {v3.s}[0], [x8], x12
272 fcvtzs v6.4s, v6.4s, #31
273 st1 {v3.s}[1], [x8], x12
274 ld1 {v7.4s}, [x5], #16
275 st1 {v3.s}[2], [x8], x12
276 fcvtzs v7.4s, v7.4s, #31
277 st1 {v3.s}[3], [x8], x12
// Stores of v1/v3 with no further loads.
// NOTE(review): presumably an exit path reached through a label that is not
// visible here.
280 st1 {v1.s}[0], [x8], x12
281 st1 {v1.s}[1], [x8], x12
282 st1 {v1.s}[2], [x8], x12
283 st1 {v1.s}[3], [x8], x12
284 st1 {v3.s}[0], [x8], x12
285 st1 {v3.s}[1], [x8], x12
286 st1 {v3.s}[2], [x8], x12
287 st1 {v3.s}[3], [x8], x12
// Merge and store the 8 frames pending in v4-v7 with no further loads.
// NOTE(review): presumably another exit path; its label is not visible.
290 sri v5.4s, v4.4s, #16
291 sri v7.4s, v6.4s, #16
292 st1 {v5.s}[0], [x8], x12
293 st1 {v5.s}[1], [x8], x12
294 st1 {v5.s}[2], [x8], x12
295 st1 {v5.s}[3], [x8], x12
296 st1 {v7.s}[0], [x8], x12
297 st1 {v7.s}[1], [x8], x12
298 st1 {v7.s}[2], [x8], x12
299 st1 {v7.s}[3], [x8], x12
// ---- Single input stream ----
// Samples are read through x4; x5 is used as the destination pointer from
// here on. Storing the odd 16-bit lanes (h[1], h[3], h[5], h[7]) writes the
// upper half of each 32-bit fixed-point lane, one sample per store, with x5
// advanced by x12 after each.
310 ld1 {v0.4s}, [x4], #16
311 fcvtzs v0.4s, v0.4s, #31
312 ld1 {v1.4s}, [x4], #16
313 fcvtzs v1.4s, v1.4s, #31
// Load and convert the next 8 samples into v2/v3, then store the 8 samples
// held in v0/v1.
317 ld1 {v2.4s}, [x4], #16
318 fcvtzs v2.4s, v2.4s, #31
319 ld1 {v3.4s}, [x4], #16
320 fcvtzs v3.4s, v3.4s, #31
321 st1 {v0.h}[1], [x5], x12
322 st1 {v0.h}[3], [x5], x12
323 st1 {v0.h}[5], [x5], x12
324 st1 {v0.h}[7], [x5], x12
325 st1 {v1.h}[1], [x5], x12
326 st1 {v1.h}[3], [x5], x12
327 st1 {v1.h}[5], [x5], x12
328 st1 {v1.h}[7], [x5], x12
// Refill v0/v1, then store the 8 samples held in v2/v3.
330 ld1 {v0.4s}, [x4], #16
331 fcvtzs v0.4s, v0.4s, #31
332 ld1 {v1.4s}, [x4], #16
333 fcvtzs v1.4s, v1.4s, #31
335 st1 {v2.h}[1], [x5], x12
336 st1 {v2.h}[3], [x5], x12
337 st1 {v2.h}[5], [x5], x12
338 st1 {v2.h}[7], [x5], x12
339 st1 {v3.h}[1], [x5], x12
340 st1 {v3.h}[3], [x5], x12
341 st1 {v3.h}[5], [x5], x12
342 st1 {v3.h}[7], [x5], x12
// Store the 8 samples held in v0/v1, then refill v0/v1.
// NOTE(review): the consumer of this final reload, and the labels and
// branches tying these sequences together, are not visible in this listing.
347 st1 {v0.h}[1], [x5], x12
348 st1 {v0.h}[3], [x5], x12
349 st1 {v0.h}[5], [x5], x12
350 st1 {v0.h}[7], [x5], x12
351 st1 {v1.h}[1], [x5], x12
352 st1 {v1.h}[3], [x5], x12
353 st1 {v1.h}[5], [x5], x12
354 st1 {v1.h}[7], [x5], x12
356 ld1 {v0.4s}, [x4], #16
357 fcvtzs v0.4s, v0.4s, #31
358 ld1 {v1.4s}, [x4], #16
359 fcvtzs v1.4s, v1.4s, #31