/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

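// Half-pel motion-compensation primitives. The generated functions follow
// hpeldsp's op_pixels_func convention: x0 = dst block, x1 = src pixels,
// x2 = line stride, w3 = height.

// pixels16: plain 16-byte-wide copy, four rows per iteration; with avg=1
// each row is rounding-averaged (urhadd) into the existing destination.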
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             x12, x0
  .endif
1:      ld1             {v0.16B},  [x1], x2
        ld1             {v1.16B},  [x1], x2
        ld1             {v2.16B},  [x1], x2
        ld1             {v3.16B},  [x1], x2
  .if \avg
        ld1             {v4.16B},  [x12], x2
        urhadd          v0.16B,  v0.16B,  v4.16B
        ld1             {v5.16B},  [x12], x2
        urhadd          v1.16B,  v1.16B,  v5.16B
        ld1             {v6.16B},  [x12], x2
        urhadd          v2.16B,  v2.16B,  v6.16B
        ld1             {v7.16B},  [x12], x2
        urhadd          v3.16B,  v3.16B,  v7.16B
  .endif
        subs            w3,  w3,  #4
        st1             {v0.16B},  [x0], x2
        st1             {v1.16B},  [x0], x2
        st1             {v2.16B},  [x0], x2
        st1             {v3.16B},  [x0], x2
        b.ne            1b
        ret
.endm

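// pixels16_x2: horizontal half-pel; each output byte is the average of a
// source byte and its right-hand neighbour. "avg" expands to urhadd or
// uhadd depending on the rounding variant selected by pixfunc below.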
.macro  pixels16_x2     rnd=1, avg=0
1:      ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v2.16B, v3.16B}, [x1], x2
        subs            w3,  w3,  #2
        ext             v1.16B,  v0.16B,  v1.16B,  #1
        avg             v0.16B,  v0.16B,  v1.16B
        ext             v3.16B,  v2.16B,  v3.16B,  #1
        avg             v2.16B,  v2.16B,  v3.16B
  .if \avg
        ld1             {v1.16B}, [x0], x2
        ld1             {v3.16B}, [x0]
        urhadd          v0.16B,  v0.16B,  v1.16B
        urhadd          v2.16B,  v2.16B,  v3.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v0.16B}, [x0], x2
        st1             {v2.16B}, [x0], x2
        b.ne            1b
        ret
.endm

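// pixels16_y2: vertical half-pel; each output row is the average of the
// current source row and the row below it, two rows per iteration.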
.macro  pixels16_y2     rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
1:      subs            w3,  w3,  #2
        avg             v2.16B,  v0.16B,  v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B,  v0.16B,  v1.16B
        ld1             {v1.16B}, [x1], x2
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B,  v2.16B,  v4.16B
        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2
        b.ne            1b

        avg             v2.16B,  v0.16B,  v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B,  v0.16B,  v1.16B
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B,  v2.16B,  v4.16B
        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v2.16B},     [x0], x2
        st1             {v3.16B},     [x0], x2

        ret
.endm

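// pixels16_xy2: diagonal half-pel; each output byte is the average of a
// 2x2 block of source bytes. The sums are kept in 16-bit lanes and narrowed
// with mshrn/mshrn2 (rshrn for the rounding variant; shrn plus the
// NRND-guarded bias of 1 for the no_rnd variant).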
.macro  pixels16_xy2    rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v4.16B, v5.16B}, [x1], x2
NRND    movi            v26.8H, #1
        ext             v1.16B,  v0.16B,  v1.16B,  #1
        ext             v5.16B,  v4.16B,  v5.16B,  #1
        uaddl           v16.8H,  v0.8B,   v1.8B
        uaddl2          v20.8H,  v0.16B,  v1.16B
        uaddl           v18.8H,  v4.8B,   v5.8B
        uaddl2          v22.8H,  v4.16B,  v5.16B
1:      subs            w3,  w3,  #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16B, v0.16B,  v1.16B,  #1
        add             v1.8H,   v20.8H,  v22.8H
        mshrn           v28.8B,  v24.8H,  #2
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
        ld1             {v16.16B},        [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H,  v0.8B,   v30.8B
        ld1             {v2.16B, v3.16B}, [x1], x2
        uaddl2          v20.8H,  v0.16B,  v30.16B
        st1             {v28.16B},        [x0], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v3.16B,  v2.16B,  v3.16B,  #1
        add             v0.8H,   v20.8H,  v22.8H
        mshrn           v30.8B,  v24.8H,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
        ld1             {v18.16B},        [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        uaddl           v18.8H,   v2.8B,  v3.8B
        uaddl2          v22.8H,   v2.16B, v3.16B
        st1             {v30.16B},        [x0], x2
        b.gt            1b

        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16B, v0.16B,  v1.16B,  #1
        add             v1.8H,   v20.8H,  v22.8H
        mshrn           v28.8B,  v24.8H,  #2
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
        ld1             {v16.16B},        [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H,  v0.8B,   v30.8B
        uaddl2          v20.8H,  v0.16B,  v30.16B
        st1             {v28.16B},        [x0], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        add             v0.8H,   v20.8H,  v22.8H
        mshrn           v30.8B,  v24.8H,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
        ld1             {v18.16B},        [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        st1             {v30.16B},        [x0], x2

        ret
.endm

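// 8-byte-wide versions of the same four primitives.
// pixels8: plain copy, four rows per iteration; with avg=1 the rows are
// rounding-averaged into the existing destination.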
.macro  pixels8         rnd=1, avg=0
1:      ld1             {v0.8B}, [x1], x2
        ld1             {v1.8B}, [x1], x2
        ld1             {v2.8B}, [x1], x2
        ld1             {v3.8B}, [x1], x2
  .if \avg
        ld1             {v4.8B}, [x0], x2
        urhadd          v0.8B,  v0.8B,  v4.8B
        ld1             {v5.8B}, [x0], x2
        urhadd          v1.8B,  v1.8B,  v5.8B
        ld1             {v6.8B}, [x0], x2
        urhadd          v2.8B,  v2.8B,  v6.8B
        ld1             {v7.8B}, [x0], x2
        urhadd          v3.8B,  v3.8B,  v7.8B
        sub             x0,  x0,  x2,  lsl #2
  .endif
        subs            w3,  w3,  #4
        st1             {v0.8B}, [x0], x2
        st1             {v1.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        st1             {v3.8B}, [x0], x2
        b.ne            1b
        ret
.endm

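// pixels8_x2: horizontal half-pel for 8-byte-wide blocks.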
.macro  pixels8_x2      rnd=1, avg=0
1:      ld1             {v0.8B, v1.8B}, [x1], x2
        ext             v1.8B,  v0.8B,  v1.8B,  #1
        ld1             {v2.8B, v3.8B}, [x1], x2
        ext             v3.8B,  v2.8B,  v3.8B,  #1
        subs            w3,  w3,  #2
        avg             v0.8B,   v0.8B,   v1.8B
        avg             v2.8B,   v2.8B,   v3.8B
  .if \avg
        ld1             {v4.8B},     [x0], x2
        ld1             {v5.8B},     [x0]
        urhadd          v0.8B,   v0.8B,   v4.8B
        urhadd          v2.8B,   v2.8B,   v5.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v0.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        b.ne            1b
        ret
.endm

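// pixels8_y2: vertical half-pel for 8-byte-wide blocks.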
.macro  pixels8_y2      rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.8B},  [x1], x2
        ld1             {v1.8B},  [x1], x2
1:      subs            w3,  w3,  #2
        avg             v4.8B,  v0.8B,  v1.8B
        ld1             {v0.8B},  [x1], x2
        avg             v5.8B,  v0.8B,  v1.8B
        ld1             {v1.8B},  [x1], x2
  .if \avg
        ld1             {v2.8B},     [x0], x2
        ld1             {v3.8B},     [x0]
        urhadd          v4.8B,  v4.8B,  v2.8B
        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v4.8B},     [x0], x2
        st1             {v5.8B},     [x0], x2
        b.ne            1b

        avg             v4.8B,  v0.8B,  v1.8B
        ld1             {v0.8B},  [x1], x2
        avg             v5.8B,  v0.8B,  v1.8B
  .if \avg
        ld1             {v2.8B},     [x0], x2
        ld1             {v3.8B},     [x0]
        urhadd          v4.8B,  v4.8B,  v2.8B
        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v4.8B},     [x0], x2
        st1             {v5.8B},     [x0], x2

        ret
.endm

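// pixels8_xy2: diagonal half-pel for 8-byte-wide blocks, using the same
// accumulate-and-narrow scheme as pixels16_xy2.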
.macro  pixels8_xy2     rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B},     [x1], x2
        ld1             {v1.16B},     [x1], x2
NRND    movi            v19.8H, #1
        ext             v4.16B,  v0.16B,  v4.16B,  #1
        ext             v6.16B,  v1.16B,  v6.16B,  #1
        uaddl           v16.8H,  v0.8B,  v4.8B
        uaddl           v17.8H,  v1.8B,  v6.8B
1:      subs            w3,  w3,  #2
        ld1             {v0.16B},     [x1], x2
        add             v18.8H, v16.8H,  v17.8H
        ext             v4.16B,  v0.16B,  v4.16B,  #1
NRND    add             v18.8H, v18.8H, v19.8H
        uaddl           v16.8H,  v0.8B,  v4.8B
        mshrn           v5.8B,  v18.8H, #2
        ld1             {v1.16B},     [x1], x2
        add             v18.8H, v16.8H,  v17.8H
  .if \avg
        ld1             {v7.8B},     [x0]
        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
NRND    add             v18.8H, v18.8H, v19.8H
        st1             {v5.8B},     [x0], x2
        mshrn           v7.8B,  v18.8H, #2
  .if \avg
        ld1             {v5.8B},     [x0]
        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
        ext             v6.16B,  v1.16B,  v6.16B,  #1
        uaddl           v17.8H,  v1.8B,   v6.8B
        st1             {v7.8B},     [x0], x2
        b.gt            1b

        ld1             {v0.16B},     [x1], x2
        add             v18.8H, v16.8H, v17.8H
        ext             v4.16B, v0.16B, v4.16B,  #1
NRND    add             v18.8H, v18.8H, v19.8H
        uaddl           v16.8H,  v0.8B, v4.8B
        mshrn           v5.8B,  v18.8H, #2
        add             v18.8H, v16.8H, v17.8H
  .if \avg
        ld1             {v7.8B},     [x0]
        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
NRND    add             v18.8H, v18.8H, v19.8H
        st1             {v5.8B},     [x0], x2
        mshrn           v7.8B,  v18.8H, #2
  .if \avg
        ld1             {v5.8B},     [x0]
        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
        st1             {v7.8B},     [x0], x2

        ret
.endm

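// pixfunc: instantiates ff_<pfx><name><suf>_neon from one of the macros
// above. Depending on \rnd it binds the helpers avg/mshrn/mshrn2/NRND to
// the rounding instructions (urhadd/rshrn/rshrn2, NRND dropped) or the
// truncating ones (uhadd/shrn/shrn2, NRND emitted), then purges them.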
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        urhadd          \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        rshrn           \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        rshrn2          \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        uhadd           \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        shrn            \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        shrn2           \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         mshrn
        .purgem         mshrn2
        .purgem         NRND
.endm

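// pixfunc2: emits both the rounding and the _no_rnd variant of a function.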
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

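// The h264 qpel mc00 (full-pel) entry points only load the block height
// into w3 and then fall through into the put/avg pixels function emitted
// immediately after them (hence no ret before endfunc).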
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1