/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/aarch64/asm.S"
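
// The pixels* macros below follow the usual hpeldsp op_pixels_func register
// convention (inferred from the register usage in this file):
//   x0 = destination block, x1 = source pixels, x2 = line size (stride),
//   w3 = height in rows.
// put_* variants overwrite the destination; avg_* variants (avg=1) average
// the interpolated result with the bytes already in the destination.
// pixels16 copies (or averages) plain 16-byte rows with no interpolation,
// four rows per loop iteration.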
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             x12, x0
  .endif
1:      ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
        ld1             {v2.16B}, [x1], x2
        ld1             {v3.16B}, [x1], x2
  .if \avg
        ld1             {v4.16B}, [x12], x2
        urhadd          v0.16B,  v0.16B,  v4.16B
        ld1             {v5.16B}, [x12], x2
        urhadd          v1.16B,  v1.16B,  v5.16B
        ld1             {v6.16B}, [x12], x2
        urhadd          v2.16B,  v2.16B,  v6.16B
        ld1             {v7.16B}, [x12], x2
        urhadd          v3.16B,  v3.16B,  v7.16B
  .endif
        subs            w3,  w3,  #4
        st1             {v0.16B}, [x0], x2
        st1             {v1.16B}, [x0], x2
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2
        b.ne            1b
        ret
.endm
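
// pixels16_x2: 16-byte rows with horizontal half-pel interpolation.  ext
// produces the source row shifted left by one byte; "avg" (bound to urhadd
// or uhadd by pixfunc below) then averages each byte with its right
// neighbour.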
.macro  pixels16_x2     rnd=1, avg=0
1:      ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v2.16B, v3.16B}, [x1], x2
        subs            w3,  w3,  #2
        ext             v1.16B,  v0.16B,  v1.16B,  #1
        avg             v0.16B,  v0.16B,  v1.16B
        ext             v3.16B,  v2.16B,  v3.16B,  #1
        avg             v2.16B,  v2.16B,  v3.16B
  .if \avg
        ld1             {v1.16B}, [x0], x2
        ld1             {v3.16B}, [x0]
        urhadd          v0.16B,  v0.16B,  v1.16B
        urhadd          v2.16B,  v2.16B,  v3.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v0.16B}, [x0], x2
        st1             {v2.16B}, [x0], x2
        b.ne            1b
        ret
.endm
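
// pixels16_y2: vertical half-pel interpolation.  Two source rows are kept
// live in v0/v1 and averaged pairwise; the loop produces two output rows per
// iteration, reloading one source row per output row, with the last two rows
// handled after the loop.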
.macro  pixels16_y2     rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
1:      subs            w3,  w3,  #2
        avg             v2.16B,  v0.16B,  v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B,  v0.16B,  v1.16B
        ld1             {v1.16B}, [x1], x2
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B,  v2.16B,  v4.16B
        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2
        b.ne            1b

        avg             v2.16B,  v0.16B,  v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B,  v0.16B,  v1.16B
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B,  v2.16B,  v4.16B
        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2

        ret
.endm
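
// pixels16_xy2: half-pel interpolation in both directions.  uaddl/uaddl2
// widen and sum each row with its one-byte-shifted copy (a+b); adding two
// such row sums gives a+b+c+d, which mshrn narrows by >>2.  With rnd=1,
// mshrn is a rounding rshrn, i.e. (a+b+c+d+2)>>2; with rnd=0 the
// NRND-guarded adds of v26 (each halfword = 1) plus a truncating shrn give
// (a+b+c+d+1)>>2.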
.macro  pixels16_xy2    rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v4.16B, v5.16B}, [x1], x2
NRND    movi            v26.8H, #1
        ext             v1.16B,  v0.16B,  v1.16B,  #1
        ext             v5.16B,  v4.16B,  v5.16B,  #1
        uaddl           v16.8H,  v0.8B,   v1.8B
        uaddl2          v20.8H,  v0.16B,  v1.16B
        uaddl           v18.8H,  v4.8B,   v5.8B
        uaddl2          v22.8H,  v4.16B,  v5.16B
1:      subs            w3,  w3,  #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16B, v0.16B,  v1.16B,  #1
        add             v1.8H,   v20.8H,  v22.8H
        mshrn           v28.8B,  v24.8H,  #2
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
        ld1             {v16.16B}, [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H,  v0.8B,   v30.8B
        ld1             {v2.16B, v3.16B}, [x1], x2
        uaddl2          v20.8H,  v0.16B,  v30.16B
        st1             {v28.16B}, [x0], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v3.16B,  v2.16B,  v3.16B,  #1
        add             v0.8H,   v20.8H,  v22.8H
        mshrn           v30.8B,  v24.8H,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
        ld1             {v18.16B}, [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        uaddl           v18.8H,  v2.8B,   v3.8B
        uaddl2          v22.8H,  v2.16B,  v3.16B
        st1             {v30.16B}, [x0], x2
        b.gt            1b

        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16B, v0.16B,  v1.16B,  #1
        add             v1.8H,   v20.8H,  v22.8H
        mshrn           v28.8B,  v24.8H,  #2
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
        ld1             {v16.16B}, [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H,  v0.8B,   v30.8B
        uaddl2          v20.8H,  v0.16B,  v30.16B
        st1             {v28.16B}, [x0], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        add             v0.8H,   v20.8H,  v22.8H
        mshrn           v30.8B,  v24.8H,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
        ld1             {v18.16B}, [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        st1             {v30.16B}, [x0], x2

        ret
.endm
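
// pixels8: 8-byte-wide variant of pixels16, four rows per iteration.  In the
// avg case the destination rows are read through x0 and the pointer is
// rewound by four strides before the stores.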
.macro  pixels8         rnd=1, avg=0
1:      ld1             {v0.8B}, [x1], x2
        ld1             {v1.8B}, [x1], x2
        ld1             {v2.8B}, [x1], x2
        ld1             {v3.8B}, [x1], x2
  .if \avg
        ld1             {v4.8B}, [x0], x2
        urhadd          v0.8B,  v0.8B,  v4.8B
        ld1             {v5.8B}, [x0], x2
        urhadd          v1.8B,  v1.8B,  v5.8B
        ld1             {v6.8B}, [x0], x2
        urhadd          v2.8B,  v2.8B,  v6.8B
        ld1             {v7.8B}, [x0], x2
        urhadd          v3.8B,  v3.8B,  v7.8B
        sub             x0,  x0,  x2,  lsl #2
  .endif
        subs            w3,  w3,  #4
        st1             {v0.8B}, [x0], x2
        st1             {v1.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        st1             {v3.8B}, [x0], x2
        b.ne            1b
        ret
.endm
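
// pixels8_x2: 8-byte rows with horizontal half-pel interpolation, two rows
// per iteration; same ext/avg scheme as pixels16_x2.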
.macro  pixels8_x2      rnd=1, avg=0
1:      ld1             {v0.8B, v1.8B}, [x1], x2
        ext             v1.8B,  v0.8B,  v1.8B,  #1
        ld1             {v2.8B, v3.8B}, [x1], x2
        ext             v3.8B,  v2.8B,  v3.8B,  #1
        subs            w3,  w3,  #2
        avg             v0.8B,  v0.8B,  v1.8B
        avg             v2.8B,  v2.8B,  v3.8B
  .if \avg
        ld1             {v4.8B}, [x0], x2
        ld1             {v5.8B}, [x0]
        urhadd          v0.8B,  v0.8B,  v4.8B
        urhadd          v2.8B,  v2.8B,  v5.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v0.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        b.ne            1b
        ret
.endm
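
// pixels8_y2: 8-byte rows with vertical half-pel interpolation; same row
// recycling and two-row epilogue as pixels16_y2.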
.macro  pixels8_y2      rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.8B}, [x1], x2
        ld1             {v1.8B}, [x1], x2
1:      subs            w3,  w3,  #2
        avg             v4.8B,  v0.8B,  v1.8B
        ld1             {v0.8B}, [x1], x2
        avg             v5.8B,  v0.8B,  v1.8B
        ld1             {v1.8B}, [x1], x2
  .if \avg
        ld1             {v2.8B}, [x0], x2
        ld1             {v3.8B}, [x0]
        urhadd          v4.8B,  v4.8B,  v2.8B
        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v4.8B}, [x0], x2
        st1             {v5.8B}, [x0], x2
        b.ne            1b

        avg             v4.8B,  v0.8B,  v1.8B
        ld1             {v0.8B}, [x1], x2
        avg             v5.8B,  v0.8B,  v1.8B
  .if \avg
        ld1             {v2.8B}, [x0], x2
        ld1             {v3.8B}, [x0]
        urhadd          v4.8B,  v4.8B,  v2.8B
        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v4.8B}, [x0], x2
        st1             {v5.8B}, [x0], x2

        ret
.endm
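
// pixels8_xy2: 8-byte rows, half-pel in both directions.  16-byte loads are
// used so that ext can supply the ninth source byte needed for the last
// horizontal pair; only the low 8 bytes of each result are stored.  v19
// carries the no-rnd "+1" bias, as v26 does in pixels16_xy2.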
.macro  pixels8_xy2     rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
NRND    movi            v19.8H, #1
        ext             v4.16B,  v0.16B,  v4.16B,  #1
        ext             v6.16B,  v1.16B,  v6.16B,  #1
        uaddl           v16.8H,  v0.8B,  v4.8B
        uaddl           v17.8H,  v1.8B,  v6.8B
1:      subs            w3,  w3,  #2
        ld1             {v0.16B}, [x1], x2
        add             v18.8H,  v16.8H,  v17.8H
        ext             v4.16B,  v0.16B,  v4.16B,  #1
NRND    add             v18.8H,  v18.8H,  v19.8H
        uaddl           v16.8H,  v0.8B,  v4.8B
        mshrn           v5.8B,  v18.8H,  #2
        ld1             {v1.16B}, [x1], x2
        add             v18.8H,  v16.8H,  v17.8H
  .if \avg
        ld1             {v7.8B}, [x0]
        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
NRND    add             v18.8H,  v18.8H,  v19.8H
        st1             {v5.8B}, [x0], x2
        mshrn           v7.8B,  v18.8H,  #2
  .if \avg
        ld1             {v5.8B}, [x0]
        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
        ext             v6.16B,  v1.16B,  v6.16B,  #1
        uaddl           v17.8H,  v1.8B,  v6.8B
        st1             {v7.8B}, [x0], x2
        b.gt            1b

        ld1             {v0.16B}, [x1], x2
        add             v18.8H,  v16.8H,  v17.8H
        ext             v4.16B,  v0.16B,  v4.16B,  #1
NRND    add             v18.8H,  v18.8H,  v19.8H
        uaddl           v16.8H,  v0.8B,  v4.8B
        mshrn           v5.8B,  v18.8H,  #2
        add             v18.8H,  v16.8H,  v17.8H
  .if \avg
        ld1             {v7.8B}, [x0]
        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
NRND    add             v18.8H,  v18.8H,  v19.8H
        st1             {v5.8B}, [x0], x2
        mshrn           v7.8B,  v18.8H,  #2
  .if \avg
        ld1             {v5.8B}, [x0]
        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
        st1             {v7.8B}, [x0], x2

        ret
.endm
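
// pixfunc emits one exported function per (pfx, name, rnd) combination.  It
// first binds the helpers used by the macros above: avg becomes urhadd
// (rounding) or uhadd (truncating), mshrn/mshrn2 become rshrn/rshrn2 or
// shrn/shrn2, and NRND emits its argument only in the no-rnd case.  The
// helpers are purged after each instantiation so the next one can rebind
// them.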
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg rd, rn, rm
        urhadd          \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        rshrn           \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        rshrn2          \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg rd, rn, rm
        uhadd           \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        shrn            \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        shrn2           \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         mshrn
        .purgem         mshrn2
        .purgem         NRND
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
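
// The qpel*_mc00 functions are plain copies/averages: each only sets the row
// count in w3 and falls through into the pixels function emitted directly
// below it (the mc00 body ends without a return).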
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1