/*
 * Extracted from the gitweb blob view at git.sesse.net:
 *   ffmpeg / libavcodec / aarch64 / h264qpel_neon.S
 * (page header showed the commit subject "avcodec/adpcm_ima_apm: cosmetics")
 */
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22 #include "libavutil/aarch64/asm.S"
23 #include "neon.S"
24
25         /* H.264 qpel MC */
26
// lowpass_const r
// Loads the two filter multipliers into the low word of v6:
//   v6.H[0] = 5, v6.H[1] = 20
// r is a 32-bit general-purpose scratch register (callers in this file pass
// w3 or w12); its previous contents are lost. Only lane S[0] of v6 is written.
.macro  lowpass_const   r
        movz            \r, #20, lsl #16        // r = 20 << 16
        movk            \r, #5                  // insert 5 into bits 0..15, keep the rest
        mov             v6.S[0], \r
.endm
32
// lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
// Filters two rows of 8 outputs each with the 6-tap kernel
// (1, -5, 20, 20, -5, 1):
//   out[i] = s[i] + s[i+5] - 5*(s[i+1] + s[i+4]) + 20*(s[i+2] + s[i+3])
// Row A is bytes 0..12 of the 16-byte string r0:r1 (r0 = low 8 bytes) -> d0.
// Row B is bytes 0..12 of r2:r3                                        -> d1.
// Sums are accumulated in 16-bit lanes.
//   narrow != 0: each sum is rounded, shifted right by 5 and saturated to an
//                unsigned byte (result in d0.8B / d1.8B).
//   narrow == 0: the raw 16-bit sums stay in d0.8H / d1.8H.
// Requires v6.H[0] = 5 and v6.H[1] = 20 (as set up by lowpass_const).
// Aliasing: d0 may be the same register as r0 or r1 (it is first written
// after their last read) but not r2/r3; d1 may alias any input.
// trashes v0-v5
.macro  lowpass_8       r0,  r1,  r2,  r3,  d0,  d1,  narrow=1
        // row A: v2 = s[2..9] + s[3..10], v4 = s[1..8] + s[4..11]
        ext             v2.8B,      \r0\().8B, \r1\().8B, #2
        ext             v3.8B,      \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H,      v2.8B,     v3.8B
        ext             v4.8B,      \r0\().8B, \r1\().8B, #1
        ext             v5.8B,      \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H,      v4.8B,     v5.8B
        ext             v1.8B,      \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H,  \r0\().8B, v1.8B       // d0 = s[0..7] + s[5..12]
        // row B is interleaved with the multiply-accumulates of row A
        ext             v0.8B,      \r2\().8B, \r3\().8B, #2
        mla             \d0\().8H,  v2.8H,     v6.H[1]     // d0 += 20 * v2
        ext             v1.8B,      \r2\().8B, \r3\().8B, #3
        uaddl           v0.8H,      v0.8B,     v1.8B
        ext             v1.8B,      \r2\().8B, \r3\().8B, #1
        mls             \d0\().8H,  v4.8H,     v6.H[0]     // d0 -= 5 * v4
        ext             v3.8B,      \r2\().8B, \r3\().8B, #4
        uaddl           v1.8H,      v1.8B,     v3.8B
        ext             v2.8B,      \r2\().8B, \r3\().8B, #5
        uaddl           \d1\().8H,  \r2\().8B, v2.8B       // d1 = s[0..7] + s[5..12]
        mla             \d1\().8H,  v0.8H,     v6.H[1]     // d1 += 20 * v0
        mls             \d1\().8H,  v1.8H,     v6.H[0]     // d1 -= 5 * v1
  .if \narrow
        sqrshrun        \d0\().8B,  \d0\().8H, #5
        sqrshrun        \d1\().8B,  \d1\().8H, #5
  .endif
.endm
60
// lowpass_8H r0, r1
// Applies the 6-tap kernel
//   out[i] = s[i] + s[i+5] - 5*(s[i+1] + s[i+4]) + 20*(s[i+2] + s[i+3])
// to two rows. Each row is one 16-byte register; the low 8 lanes of every
// ext result are used, so only bytes 0..12 of the input contribute.
// The 8 unrounded 16-bit sums replace the input in place (r0.8H, r1.8H).
// Requires v6.H[0] = 5 and v6.H[1] = 20 (as set up by lowpass_const).
// trashes v0-v5, v7, v30-v31
.macro  lowpass_8H      r0,  r1
        ext             v0.16B,     \r0\().16B, \r0\().16B, #2
        ext             v1.16B,     \r0\().16B, \r0\().16B, #3
        uaddl           v0.8H,      v0.8B,      v1.8B       // s[i+2] + s[i+3]
        ext             v2.16B,     \r0\().16B, \r0\().16B, #1
        ext             v3.16B,     \r0\().16B, \r0\().16B, #4
        uaddl           v2.8H,      v2.8B,      v3.8B       // s[i+1] + s[i+4]
        ext             v30.16B,    \r0\().16B, \r0\().16B, #5
        uaddl           \r0\().8H,  \r0\().8B,  v30.8B      // s[i] + s[i+5], input row is gone now
        ext             v4.16B,     \r1\().16B, \r1\().16B, #2
        mla             \r0\().8H,  v0.8H,      v6.H[1]     // += 20 * ...
        ext             v5.16B,     \r1\().16B, \r1\().16B, #3
        uaddl           v4.8H,      v4.8B,      v5.8B
        ext             v7.16B,     \r1\().16B, \r1\().16B, #1
        mls             \r0\().8H,  v2.8H,      v6.H[0]     // -= 5 * ...
        ext             v0.16B,     \r1\().16B, \r1\().16B, #4
        uaddl           v7.8H,      v7.8B,      v0.8B
        ext             v31.16B,    \r1\().16B, \r1\().16B, #5
        uaddl           \r1\().8H,  \r1\().8B,  v31.8B
        mla             \r1\().8H,  v4.8H,      v6.H[1]
        mls             \r1\().8H,  v7.8H,      v6.H[0]
.endm
84
// lowpass_8_1 r0, r1, d0, narrow=1
// Single-row form of the 6-tap filter:
//   out[i] = s[i] + s[i+5] - 5*(s[i+1] + s[i+4]) + 20*(s[i+2] + s[i+3])
// The row is bytes 0..12 of the 16-byte string r0:r1 (r0 = low 8 bytes).
//   narrow != 0: d0.8B = sums rounded, shifted right by 5, saturated to u8.
//   narrow == 0: d0.8H = raw 16-bit sums.
// Requires v6.H[0] = 5 and v6.H[1] = 20 (as set up by lowpass_const).
// d0 may alias r0 or r1 (written only after their last read).
// trashes v2-v5, v30
.macro  lowpass_8_1     r0,  r1,  d0,  narrow=1
        ext             v2.8B,     \r0\().8B, \r1\().8B, #2
        ext             v3.8B,     \r0\().8B, \r1\().8B, #3
        uaddl           v2.8H,     v2.8B,     v3.8B         // s[i+2] + s[i+3]
        ext             v4.8B,     \r0\().8B, \r1\().8B, #1
        ext             v5.8B,     \r0\().8B, \r1\().8B, #4
        uaddl           v4.8H,     v4.8B,     v5.8B         // s[i+1] + s[i+4]
        ext             v30.8B,    \r0\().8B, \r1\().8B, #5
        uaddl           \d0\().8H, \r0\().8B, v30.8B        // s[i] + s[i+5]
        mla             \d0\().8H, v2.8H,     v6.H[1]       // += 20 * v2
        mls             \d0\().8H, v4.8H,     v6.H[0]       // -= 5 * v4
  .if \narrow
        sqrshrun        \d0\().8B, \d0\().8H, #5
  .endif
.endm
101
// lowpass_8.16 r0, r1, r2
// 6-tap filter over signed 16-bit samples, computed in 32-bit lanes:
//   sum[i] = e[i] + e[i+5] - 5*(e[i+1] + e[i+4]) + 20*(e[i+2] + e[i+3])
// e[] is the 16-element halfword string r0:r1 (r0 = elements 0..7); elements
// 0..12 are used. The multiplications by 20 and 5 are done with shifts and
// adds. Each sum is rounded and shifted right by 10, then saturated to an
// unsigned byte; the 8 results go to r2.8B. r2 may alias r0.
// trashes v0-v7 and r1. Because v6 is overwritten, the constants placed
// there by lowpass_const are no longer valid afterwards.
.macro  lowpass_8.16    r0,  r1,  r2
        ext             v1.16B,     \r0\().16B, \r1\().16B, #4      // e[2..9]
        ext             v0.16B,     \r0\().16B, \r1\().16B, #6      // e[3..10]
        saddl           v5.4S,      v1.4H,      v0.4H               // lo: e[i+2] + e[i+3]
        ext             v2.16B,     \r0\().16B, \r1\().16B, #2      // e[1..8]
        saddl2          v1.4S,      v1.8H,      v0.8H               // hi: e[i+2] + e[i+3]
        ext             v3.16B,     \r0\().16B, \r1\().16B, #8      // e[4..11]
        saddl           v6.4S,      v2.4H,      v3.4H               // lo: e[i+1] + e[i+4]
        ext             \r1\().16B, \r0\().16B, \r1\().16B, #10     // e[5..12], replaces r1
        saddl2          v2.4S,      v2.8H,      v3.8H               // hi: e[i+1] + e[i+4]
        saddl           v0.4S,      \r0\().4H,  \r1\().4H           // lo: e[i] + e[i+5]
        saddl2          v4.4S,      \r0\().8H,  \r1\().8H           // hi: e[i] + e[i+5]

        // low half: v5 = 16x + 4x = 20x, v6 = x + 4x = 5x
        shl             v3.4S,  v5.4S,  #4
        shl             v5.4S,  v5.4S,  #2
        shl             v7.4S,  v6.4S,  #2
        add             v5.4S,  v5.4S,  v3.4S
        add             v6.4S,  v6.4S,  v7.4S

        // high half: v1 = 20x, v2 = 5x
        shl             v3.4S,  v1.4S,  #4
        shl             v1.4S,  v1.4S,  #2
        shl             v7.4S,  v2.4S,  #2
        add             v1.4S,  v1.4S,  v3.4S
        add             v2.4S,  v2.4S,  v7.4S

        add             v5.4S,  v5.4S,  v0.4S
        sub             v5.4S,  v5.4S,  v6.4S

        add             v1.4S,  v1.4S,  v4.4S
        sub             v1.4S,  v1.4S,  v2.4S

        // round, >> 10, pack both halves into v5.8H
        rshrn           v5.4H,  v5.4S,  #10
        rshrn2          v5.8H,  v1.4S,  #10

        sqxtun          \r2\().8B,  v5.8H
.endm
139
// put_h264_qpel16_h_lowpass_neon_packed
// Horizontal 6-tap pass over a 16-pixel-wide, 16-row area, done as two
// 8-pixel-wide strips through put_h264_qpel8_h_lowpass_neon.
// In:  x0 = dst, x1 = src, x2 = src stride, v6 = filter constants
// The destination stride is forced to 8 and x0 is not rewound between the
// two calls, so the left strip (16 rows x 8 bytes) is immediately followed
// by the right strip in the destination buffer.
// Clobbers x3, x4 (holds the return address), x12, plus everything the
// 8-wide routine clobbers.
function put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x30                // keep return address across the bl
        mov             x12, #16                // rows for the left strip
        mov             x3,  #8                 // packed destination stride
        bl              put_h264_qpel8_h_lowpass_neon
        sub             x1,  x1,  x2, lsl #4    // src: back up 16 rows ...
        add             x1,  x1,  #8            // ... and move 8 pixels right
        mov             x12, #16                // rows for the right strip
        mov             x30, x4
        b               put_h264_qpel8_h_lowpass_neon   // tail call, returns to our caller
endfunc
151
// h264_qpel_h_lowpass type        (instantiated below for put and avg)
// Emits <type>_h264_qpel16_h_lowpass_neon and <type>_h264_qpel8_h_lowpass_neon.
//
// Register interface:
//   x0  = dst             x3 = dst stride
//   x1  = src             x2 = src stride
//   x12 = row count for the 8-wide function; two rows are produced per
//         iteration and the loop ends when x12 reaches exactly 0, so it is
//         expected to be even and non-zero (the 16-wide entry sets it to 16)
//   v6  = filter constants (lowpass_const)
// The avg variant additionally forms the rounding average (urhadd) of the
// filtered row and the bytes already stored at dst.
// On return x0 and x1 have advanced by the number of rows processed.
.macro  h264_qpel_h_lowpass type
// 16-wide: left 8 columns through bl, then rewind and continue into the
// 8-wide code for the right 8 columns.
function \type\()_h264_qpel16_h_lowpass_neon
        mov             x13, x30                // return address survives the bl in x13
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             x0,  x0,  x3, lsl #4    // dst back up 16 rows
        sub             x1,  x1,  x2, lsl #4    // src back up 16 rows
        add             x0,  x0,  #8            // right half
        add             x1,  x1,  #8
        mov             x12, #16
        mov             x30, x13
        // No branch here: execution continues into the 8-wide function that
        // is emitted directly below. This relies on function/endfunc (asm.S,
        // not visible in this file) placing nothing executable in between.
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      ld1             {v28.8B, v29.8B}, [x1], x2      // row n:   16 source bytes
        ld1             {v16.8B, v17.8B}, [x1], x2      // row n+1: 16 source bytes
        subs            x12, x12, #2                    // flags are consumed by b.ne below
        lowpass_8       v28, v29, v16, v17, v28, v16
  .ifc \type,avg
        ld1             {v2.8B},    [x0], x3
        urhadd          v28.8B, v28.8B,  v2.8B
        ld1             {v3.8B},    [x0]
        urhadd          v16.8B, v16.8B, v3.8B
        sub             x0,  x0,  x3                    // undo the post-increment before storing
  .endif
        st1             {v28.8B},    [x0], x3
        st1             {v16.8B},    [x0], x3
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg
186
// h264_qpel_h_lowpass_l2 type     (instantiated below for put and avg)
// Emits <type>_h264_qpel16_h_lowpass_l2_neon and
// <type>_h264_qpel8_h_lowpass_l2_neon: horizontal 6-tap filter whose result
// is averaged (urhadd, rounding up) with a second 8-byte-wide source.
//
// Register interface:
//   x0  = dst
//   x1  = src for the filter (16 bytes are loaded per row)
//   x3  = second source (8 bytes are loaded per row)
//   x2  = stride, used for dst, src and the second source alike
//   x12 = row count for the 8-wide function (even, non-zero; two rows per
//         iteration, loop ends when it reaches exactly 0)
//   v6  = filter constants (lowpass_const)
// The avg variant additionally averages with the bytes already at dst.
.macro  h264_qpel_h_lowpass_l2 type
// 16-wide: left 8 columns through bl, then rewind all three pointers and
// continue into the 8-wide code for the right 8 columns.
function \type\()_h264_qpel16_h_lowpass_l2_neon
        mov             x13, x30                // return address survives the bl in x13
        mov             x12, #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             x0,  x0,  x2, lsl #4    // back up 16 rows
        sub             x1,  x1,  x2, lsl #4
        sub             x3,  x3,  x2, lsl #4
        add             x0,  x0,  #8            // right half
        add             x1,  x1,  #8
        add             x3,  x3,  #8
        mov             x12, #16
        mov             x30, x13
        // No branch here: execution continues into the 8-wide function that
        // is emitted directly below. This relies on function/endfunc (asm.S,
        // not visible in this file) placing nothing executable in between.
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      ld1             {v26.8B, v27.8B}, [x1], x2      // filter input, row n
        ld1             {v16.8B, v17.8B}, [x1], x2      // filter input, row n+1
        ld1             {v28.8B},     [x3], x2          // second source, row n
        ld1             {v29.8B},     [x3], x2          // second source, row n+1
        subs            x12, x12, #2                    // flags are consumed by b.ne below
        lowpass_8       v26, v27, v16, v17, v26, v27
        urhadd          v26.8B, v26.8B, v28.8B
        urhadd          v27.8B, v27.8B, v29.8B
  .ifc \type,avg
        ld1             {v2.8B},      [x0], x2
        urhadd          v26.8B, v26.8B, v2.8B
        ld1             {v3.8B},      [x0]
        urhadd          v27.8B, v27.8B, v3.8B
        sub             x0,  x0,  x2                    // undo the post-increment before storing
  .endif
        st1             {v26.8B},     [x0], x2
        st1             {v27.8B},     [x0], x2
        b.ne            1b
        ret
endfunc
.endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg
227
// put_h264_qpel16_v_lowpass_neon_packed
// Vertical 6-tap pass over a 16x16 area as four 8x8 calls to
// put_h264_qpel8_v_lowpass_neon, in the order top-left, bottom-left,
// top-right, bottom-right.
// In:  x0 = dst, x1 = src (first of the rows the 8x8 routine loads),
//      x3 = src stride, v6 = filter constants
// The destination stride is forced to 8 and x0 is never rewound, so the
// four 8x8 results are stored back to back (64 bytes each) in call order.
// Each 8x8 call leaves x1 12 rows further down (13 loads, the last one
// without post-increment); the subtractions below compensate for that.
// Clobbers x2, x4 (holds the return address) and what the 8x8 routine uses.
function put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x30                // keep return address across the bl
        mov             x2,  #8                 // packed destination stride
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: src is now 8 rows below the start
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #4    // 20 rows back up in total ...
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8            // ... and 8 pixels right
        bl              put_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        b               put_h264_qpel8_v_lowpass_neon   // tail call, returns to our caller
endfunc
242
// h264_qpel_v_lowpass type        (instantiated below for put and avg)
// Emits <type>_h264_qpel16_v_lowpass_neon and <type>_h264_qpel8_v_lowpass_neon.
//
// Register interface:
//   x0 = dst              x2 = dst stride
//   x1 = src, pointing at the first of the 13 rows loaded per 8x8 block
//   x3 = src stride
//   v6 = filter constants (lowpass_const)
// The 8x8 function leaves x0 advanced by 8 rows and x1 advanced by 12 rows.
// The avg variant additionally averages (urhadd) with the bytes at dst.
.macro  h264_qpel_v_lowpass type
// 16x16 as four 8x8 blocks: top-left, bottom-left, top-right via bl, then
// bottom-right by continuing into the 8x8 code below.
function \type\()_h264_qpel16_v_lowpass_neon
        mov             x4,  x30                // return address survives the bl in x4
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: src is now 8 rows below the start
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x0,  x0,  x2, lsl #4    // dst back up 16 rows, right half
        add             x0,  x0,  #8
        sub             x1,  x1,  x3, lsl #4    // src back up 20 rows, right half
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // No branch here: execution continues into the 8x8 function that is
        // emitted directly below. This relies on function/endfunc (asm.S,
        // not visible in this file) placing nothing executable in between.
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        // rows 0..7 go to the even registers v16..v30
        ld1             {v16.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v28.8B}, [x1], x3
        ld1             {v30.8B}, [x1], x3
        // rows 8..12 go to the odd registers v17..v25
        ld1             {v17.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v25.8B}, [x1]

        // Turn columns into rows so the row filter lowpass_8 can be reused.
        // transpose_8x8B comes from neon.S (not visible here); it is assumed
        // to transpose the 8x8 byte matrix in its first eight operands using
        // the last two as scratch. v27, v29 and v31 are never loaded: under
        // that assumption their bytes end up at positions 13..15 of each
        // column, which lowpass_8 does not read.
        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        // back to row order: v16..v23 = output rows 0..7
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

  .ifc \type,avg
        ld1             {v24.8B},  [x0], x2
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v25.8B}, [x0], x2
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v26.8B}, [x0], x2
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v27.8B}, [x0], x2
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v28.8B}, [x0], x2
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v29.8B}, [x0], x2
        urhadd          v21.8B, v21.8B, v29.8B
        ld1             {v30.8B}, [x0], x2
        urhadd          v22.8B, v22.8B, v30.8B
        ld1             {v31.8B}, [x0], x2
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x2,  lsl #3   // rewind dst by the 8 rows just read
  .endif

        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        st1             {v18.8B}, [x0], x2
        st1             {v19.8B}, [x0], x2
        st1             {v20.8B}, [x0], x2
        st1             {v21.8B}, [x0], x2
        st1             {v22.8B}, [x0], x2
        st1             {v23.8B}, [x0], x2

        ret
endfunc
.endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg
317
// h264_qpel_v_lowpass_l2 type     (instantiated below for put and avg)
// Emits <type>_h264_qpel16_v_lowpass_l2_neon and
// <type>_h264_qpel8_v_lowpass_l2_neon: vertical 6-tap filter whose result is
// averaged (urhadd, rounding up) with a second 8x8 source.
//
// Register interface:
//   x0  = dst
//   x1  = src, pointing at the first of the 13 rows loaded per 8x8 block
//   x3  = stride, used for both src and dst
//   x12 = second source           x2 = stride of the second source
//   v6  = filter constants (lowpass_const)
// The 8x8 function leaves x0 and x12 advanced by 8 rows and x1 by 12 rows.
// The avg variant additionally averages with the bytes already at dst.
.macro  h264_qpel_v_lowpass_l2 type
// 16x16 as four 8x8 blocks: top-left, bottom-left, top-right via bl, then
// bottom-right by continuing into the 8x8 code below.
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             x4,  x30                // return address survives the bl in x4
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: src is now 8 rows below the start
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x0,  x0,  x3, lsl #4    // dst and second source: up 16 rows,
        sub             x12, x12, x2, lsl #4    // then over to the right half
        add             x0,  x0,  #8
        add             x12, x12, #8
        sub             x1,  x1,  x3, lsl #4    // src back up 20 rows, right half
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x4
        // No branch here: execution continues into the 8x8 function that is
        // emitted directly below. This relies on function/endfunc (asm.S,
        // not visible in this file) placing nothing executable in between.
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        // rows 0..7 go to the even registers v16..v30
        ld1             {v16.8B}, [x1], x3
        ld1             {v18.8B}, [x1], x3
        ld1             {v20.8B}, [x1], x3
        ld1             {v22.8B}, [x1], x3
        ld1             {v24.8B}, [x1], x3
        ld1             {v26.8B}, [x1], x3
        ld1             {v28.8B}, [x1], x3
        ld1             {v30.8B}, [x1], x3
        // rows 8..12 go to the odd registers v17..v25
        ld1             {v17.8B}, [x1], x3
        ld1             {v19.8B}, [x1], x3
        ld1             {v21.8B}, [x1], x3
        ld1             {v23.8B}, [x1], x3
        ld1             {v25.8B}, [x1]

        // Turn columns into rows so the row filter lowpass_8 can be reused.
        // transpose_8x8B comes from neon.S (not visible here); it is assumed
        // to transpose the 8x8 byte matrix in its first eight operands using
        // the last two as scratch. v27, v29 and v31 are never loaded: under
        // that assumption their bytes end up at positions 13..15 of each
        // column, which lowpass_8 does not read.
        transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1
        transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1
        lowpass_8       v16, v17, v18, v19, v16, v17
        lowpass_8       v20, v21, v22, v23, v18, v19
        lowpass_8       v24, v25, v26, v27, v20, v21
        lowpass_8       v28, v29, v30, v31, v22, v23
        // back to row order: v16..v23 = filtered rows 0..7
        transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        // average with the second source, loads interleaved with the math
        ld1             {v24.8B},  [x12], x2
        ld1             {v25.8B},  [x12], x2
        ld1             {v26.8B},  [x12], x2
        ld1             {v27.8B},  [x12], x2
        ld1             {v28.8B},  [x12], x2
        urhadd          v16.8B, v24.8B, v16.8B
        urhadd          v17.8B, v25.8B, v17.8B
        ld1             {v29.8B},  [x12], x2
        urhadd          v18.8B, v26.8B, v18.8B
        urhadd          v19.8B, v27.8B, v19.8B
        ld1             {v30.8B}, [x12], x2
        urhadd          v20.8B, v28.8B, v20.8B
        urhadd          v21.8B, v29.8B, v21.8B
        ld1             {v31.8B}, [x12], x2
        urhadd          v22.8B, v30.8B, v22.8B
        urhadd          v23.8B, v31.8B, v23.8B

  .ifc \type,avg
        ld1             {v24.8B}, [x0], x3
        urhadd          v16.8B, v16.8B, v24.8B
        ld1             {v25.8B}, [x0], x3
        urhadd          v17.8B, v17.8B, v25.8B
        ld1             {v26.8B}, [x0], x3
        urhadd          v18.8B, v18.8B, v26.8B
        ld1             {v27.8B}, [x0], x3
        urhadd          v19.8B, v19.8B, v27.8B
        ld1             {v28.8B}, [x0], x3
        urhadd          v20.8B, v20.8B, v28.8B
        ld1             {v29.8B}, [x0], x3
        urhadd          v21.8B, v21.8B, v29.8B
        ld1             {v30.8B}, [x0], x3
        urhadd          v22.8B, v22.8B, v30.8B
        ld1             {v31.8B}, [x0], x3
        urhadd          v23.8B, v23.8B, v31.8B
        sub             x0,  x0,  x3,  lsl #3   // rewind dst by the 8 rows just read
  .endif

        st1             {v16.8B}, [x0], x3
        st1             {v17.8B}, [x0], x3
        st1             {v18.8B}, [x0], x3
        st1             {v19.8B}, [x0], x3
        st1             {v20.8B}, [x0], x3
        st1             {v21.8B}, [x0], x3
        st1             {v22.8B}, [x0], x3
        st1             {v23.8B}, [x0], x3

        ret
endfunc
.endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg
411
// put_h264_qpel8_hv_lowpass_neon_top
// Shared core of the 8x8 two-dimensional (horizontal, then vertical) filter.
// In:   x1 = src, pointing at the first of 13 rows; 16 bytes are loaded per
//       row. x3 = src stride.
// Out:  v16..v23 = the 8 output rows, 8 bytes each. Nothing is stored.
//       x1 is left advanced by 12 rows (the 13th load has no post-increment).
// Steps: lowpass_8H leaves unrounded 16-bit horizontal sums in place, the
// halfword transposes turn columns into vectors, lowpass_8.16 filters those
// vertically with rounding, >> 10 and saturation to bytes, and the byte
// transpose restores row order.
// Loads its own filter constants (through w12); they do not survive the
// call because lowpass_8.16 overwrites v6.
// Clobbers x12, v0-v7, v16-v31.
// transpose_8x8H / transpose_8x8B come from neon.S (not visible here) and
// are assumed to be plain 8x8 transposes of their first eight operands.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   w12
        ld1             {v16.8H}, [x1], x3
        ld1             {v17.8H}, [x1], x3
        ld1             {v18.8H}, [x1], x3
        ld1             {v19.8H}, [x1], x3
        ld1             {v20.8H}, [x1], x3
        ld1             {v21.8H}, [x1], x3
        ld1             {v22.8H}, [x1], x3
        ld1             {v23.8H}, [x1], x3
        ld1             {v24.8H}, [x1], x3
        ld1             {v25.8H}, [x1], x3
        ld1             {v26.8H}, [x1], x3
        ld1             {v27.8H}, [x1], x3
        ld1             {v28.8H}, [x1]
        lowpass_8H      v16, v17
        lowpass_8H      v18, v19
        lowpass_8H      v20, v21
        lowpass_8H      v22, v23
        lowpass_8H      v24, v25
        lowpass_8H      v26, v27
        // v29 was never loaded; it is filtered only because the macro works
        // on pairs. Under the transpose assumption above, rows 13..15
        // (v29..v31) land at elements 13..15 of each column, which
        // lowpass_8.16 does not read.
        lowpass_8H      v28, v29

        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1

        // column j: rows 0..7 in v(16+j), rows 8..15 in v(24+j)
        lowpass_8.16    v16, v24, v16
        lowpass_8.16    v17, v25, v17

        lowpass_8.16    v18, v26, v18
        lowpass_8.16    v19, v27, v19

        lowpass_8.16    v20, v28, v20
        lowpass_8.16    v21, v29, v21

        lowpass_8.16    v22, v30, v22
        lowpass_8.16    v23, v31, v23

        transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1

        ret
endfunc
454
// h264_qpel8_hv_lowpass type      (instantiated below for put and avg)
// Emits <type>_h264_qpel8_hv_lowpass_neon: runs the shared 8x8 2-D filter
// core and stores its 8 result rows.
//   x0 = dst              x2 = dst stride
//   x1 = src, x3 = src stride (consumed by put_h264_qpel8_hv_lowpass_neon_top)
// The avg variant averages (urhadd, rounding up) with the bytes at dst first.
// x10 holds the return address; x0 ends up advanced by 8 rows.
.macro  h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             x10, x30                // bl below overwrites x30
        bl              put_h264_qpel8_hv_lowpass_neon_top
  .ifc \type,avg
        ld1             {v0.8B},      [x0], x2
        urhadd          v16.8B, v16.8B, v0.8B
        ld1             {v1.8B},      [x0], x2
        urhadd          v17.8B, v17.8B, v1.8B
        ld1             {v2.8B},      [x0], x2
        urhadd          v18.8B, v18.8B, v2.8B
        ld1             {v3.8B},      [x0], x2
        urhadd          v19.8B, v19.8B, v3.8B
        ld1             {v4.8B},      [x0], x2
        urhadd          v20.8B, v20.8B, v4.8B
        ld1             {v5.8B},      [x0], x2
        urhadd          v21.8B, v21.8B, v5.8B
        ld1             {v6.8B},      [x0], x2
        urhadd          v22.8B, v22.8B, v6.8B
        ld1             {v7.8B},      [x0], x2
        urhadd          v23.8B, v23.8B, v7.8B
        sub             x0,  x0,  x2,  lsl #3   // rewind dst by the 8 rows just read
  .endif

        st1             {v16.8B},     [x0], x2
        st1             {v17.8B},     [x0], x2
        st1             {v18.8B},     [x0], x2
        st1             {v19.8B},     [x0], x2
        st1             {v20.8B},     [x0], x2
        st1             {v21.8B},     [x0], x2
        st1             {v22.8B},     [x0], x2
        st1             {v23.8B},     [x0], x2

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg
494
// h264_qpel8_hv_lowpass_l2 type   (instantiated below for put and avg)
// Emits <type>_h264_qpel8_hv_lowpass_l2_neon: runs the shared 8x8 2-D filter
// core and averages (urhadd, rounding up) its result with a second 8x8
// source before storing.
//   x0 = dst              x3 = stride, used for both src and dst
//   x1 = src (consumed by put_h264_qpel8_hv_lowpass_neon_top)
//   x2 = second source: 8 rows of 8 bytes stored contiguously (64 bytes);
//        x2 is left advanced by 64
// The avg variant additionally averages with the bytes already at dst.
// x10 holds the return address; x0 ends up advanced by 8 rows.
.macro  h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             x10, x30                // bl below overwrites x30
        bl              put_h264_qpel8_hv_lowpass_neon_top

        // two packed rows per load
        ld1             {v0.8B, v1.8B},  [x2], #16
        ld1             {v2.8B, v3.8B},  [x2], #16
        urhadd          v0.8B,  v0.8B,  v16.8B
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v4.8B, v5.8B},  [x2], #16
        urhadd          v2.8B,  v2.8B,  v18.8B
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v6.8B, v7.8B},  [x2], #16
        urhadd          v4.8B,  v4.8B,  v20.8B
        urhadd          v5.8B,  v5.8B,  v21.8B
        urhadd          v6.8B,  v6.8B,  v22.8B
        urhadd          v7.8B,  v7.8B,  v23.8B
  .ifc \type,avg
        ld1             {v16.8B},     [x0], x3
        urhadd          v0.8B,  v0.8B,  v16.8B
        ld1             {v17.8B},     [x0], x3
        urhadd          v1.8B,  v1.8B,  v17.8B
        ld1             {v18.8B},     [x0], x3
        urhadd          v2.8B,  v2.8B,  v18.8B
        ld1             {v19.8B},     [x0], x3
        urhadd          v3.8B,  v3.8B,  v19.8B
        ld1             {v20.8B},     [x0], x3
        urhadd          v4.8B,  v4.8B,  v20.8B
        ld1             {v21.8B},     [x0], x3
        urhadd          v5.8B,  v5.8B,  v21.8B
        ld1             {v22.8B},     [x0], x3
        urhadd          v6.8B,  v6.8B,  v22.8B
        ld1             {v23.8B},     [x0], x3
        urhadd          v7.8B,  v7.8B,  v23.8B
        sub             x0,  x0,  x3,  lsl #3   // rewind dst by the 8 rows just read
  .endif
        st1             {v0.8B},      [x0], x3
        st1             {v1.8B},      [x0], x3
        st1             {v2.8B},      [x0], x3
        st1             {v3.8B},      [x0], x3
        st1             {v4.8B},      [x0], x3
        st1             {v5.8B},      [x0], x3
        st1             {v6.8B},      [x0], x3
        st1             {v7.8B},      [x0], x3

        ret             x10
endfunc
.endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg
546
// h264_qpel16_hv type             (instantiated below for put and avg)
// Emits the 16x16 wrappers around the 8x8 2-D filter functions. Both process
// the four 8x8 quadrants in the order top-left, bottom-left, top-right,
// bottom-right; the last quadrant is a tail call. x13 holds the return
// address. Every 8x8 call leaves x1 advanced by 12 rows and x0 by 8 rows,
// which the pointer arithmetic between the calls compensates for.
//
// <type>_h264_qpel16_hv_lowpass_neon:
//   x0 = dst, x2 = dst stride, x1 = src, x3 = src stride
// <type>_h264_qpel16_hv_lowpass_l2_neon:
//   x0 = dst, x1 = src, x3 = stride for both,
//   x4 = pointer 256 bytes past the start of the packed second source; the
//        8x8 routine reads that buffer sequentially, 64 bytes per quadrant
.macro  h264_qpel16_hv  type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             x13, x30
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: src is now 8 rows below the start
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #4    // src back up 20 rows, right half
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x2, lsl #4    // dst back up 16 rows, right half
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             x13, x30
        sub             x2,  x4,  #256          // start of the packed second source
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2    // 12 - 4: src is now 8 rows below the start
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #4    // src back up 20 rows, right half
        sub             x1,  x1,  x3, lsl #2
        add             x1,  x1,  #8
        sub             x0,  x0,  x3, lsl #4    // dst back up 16 rows, right half
        add             x0,  x0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             x1,  x1,  x3, lsl #2
        mov             x30, x13
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
.endm

        h264_qpel16_hv put
        h264_qpel16_hv avg
584
// h264_qpel8 type                 (instantiated below for put and avg)
// Emits the exported 8x8 entry points ff_<type>_h264_qpel8_mcXY_neon.
// As used by the code below, every entry point receives
//   x0 = dst, x1 = src, x2 = stride
// (the C prototype is not visible in this file - presumably
//  (uint8_t *dst, const uint8_t *src, ptrdiff_t stride); confirm in the
//  C-side init code).
// Conventions shared by the entry points:
//   - w3 is used as scratch for lowpass_const before x3 is given its real value
//   - x14 holds the return address of entry points that use bl
//   - x8 / x9 keep the original dst / a source pointer across the first pass
//   - x11 keeps the caller sp while a temporary buffer lives on the stack
//   - the local labels <type>_h264_qpel8_mc01/mc11/mc21/mc12 are shared
//     bodies that other entry points branch into after setting up x14, x8,
//     x9, x12 and x1 themselves
.macro  h264_qpel8      type
// mc10: horizontal filter averaged with the source pixels themselves.
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1                 // second source = src
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        mov             x12, #8                 // 8 rows
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// mc20: horizontal filter only.
function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2
        mov             x3,  x2                 // dst stride = src stride
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

// mc30: horizontal filter averaged with the source one pixel to the right.
function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1            // second source = src + 1
        sub             x1,  x1,  #2
        mov             x12, #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

// mc01: vertical filter averaged with the source pixels themselves.
// Shared body expects x14 = return address, x12 = second source.
function ff_\type\()_h264_qpel8_mc01_neon, export=1
        mov             x14, x30
        mov             x12, x1                 // second source = src
\type\()_h264_qpel8_mc01:
        lowpass_const   w3
        mov             x3,  x2
        sub             x1,  x1,  x2, lsl #1    // filter window starts 2 rows up
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        ret             x14
endfunc

// mc11: horizontal pass into a 64-byte stack buffer, then vertical pass
// averaged with that buffer.
// Shared body expects x14 = return address, x8 = dst, x1 = source for the
// horizontal pass, x9 = source for the vertical pass.
function ff_\type\()_h264_qpel8_mc11_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc11:
        lowpass_const   w3
        mov             x11, sp
        sub             sp,  sp,  #64           // 8x8 temporary, stride 8
        mov             x0,  sp
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x0,  x8                 // real destination
        mov             x3,  x2                 // src/dst stride for the vertical pass
        mov             x12, sp                 // second source = horizontal result
        sub             x1,  x9,  x2, lsl #1
        mov             x2,  #8                 // stride of the temporary
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc21: horizontal pass into the stack buffer, then the 2-D filter averaged
// with it.
// Shared body expects x14 = return address, x8 = dst, x1 = source for the
// horizontal pass, x9 = source for the 2-D pass.
function ff_\type\()_h264_qpel8_mc21_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc21:
        lowpass_const   w3
        mov             x11, sp
        // the first 8*8 bytes receive the horizontal result
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  #2
        mov             x3,  #8
        mov             x0,  sp
        mov             x12, #8
        bl              put_h264_qpel8_h_lowpass_neon
        mov             x4,  x0                 // x0 is now 64 bytes past the buffer start
        mov             x0,  x8
        sub             x1,  x9,  x2, lsl #1    // 2 rows up and 2 pixels left
        sub             x1,  x1,  #2
        mov             x3,  x2
        sub             x2,  x4,  #64           // second source = start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc31: like mc11, but the vertical pass reads from src + 1.
function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1                 // vertical pass source = src + 1
        sub             x1,  x1,  #1            // horizontal pass source = src
        b               \type\()_h264_qpel8_mc11
endfunc

// mc02: vertical filter only.
function ff_\type\()_h264_qpel8_mc02_neon, export=1
        mov             x14, x30
        lowpass_const   w3
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        ret             x14
endfunc

// mc12: vertical pass into the stack buffer, then the 2-D filter averaged
// with it.
// Shared body expects x14 = return address, x8 = dst, x1 = source for the
// vertical pass, x9 = source for the 2-D pass.
function ff_\type\()_h264_qpel8_mc12_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1
\type\()_h264_qpel8_mc12:
        lowpass_const   w3
        mov             x11, sp
        // the first 8*8 bytes receive the vertical result
        sub             sp,  sp,  #(8*8+16*12)
        sub             x1,  x1,  x2, lsl #1
        mov             x3,  x2                 // src stride
        mov             x2,  #8                 // stride of the temporary
        mov             x0,  sp
        bl              put_h264_qpel8_v_lowpass_neon
        mov             x4,  x0                 // x0 is now 64 bytes past the buffer start
        mov             x0,  x8
        sub             x1,  x9,  x3, lsl #1    // 2 rows up and 2 pixels left
        sub             x1,  x1,  #2
        sub             x2,  x4,  #64           // second source = start of the buffer
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc22: 2-D filter only. sp is saved and restored although nothing visible
// on this path changes it.
function ff_\type\()_h264_qpel8_mc22_neon, export=1
        mov             x14, x30
        mov             x11, sp
        sub             x1,  x1,  x2, lsl #1
        sub             x1,  x1,  #2
        mov             x3,  x2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        mov             sp,  x11
        ret             x14
endfunc

// mc32: like mc12, but the vertical pass reads from src + 1.
function ff_\type\()_h264_qpel8_mc32_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1                 // 2-D pass source = src
        add             x1,  x1,  #1            // vertical pass source = src + 1
        b               \type\()_h264_qpel8_mc12
endfunc

// mc03: like mc01, but averaged with the source one row down.
function ff_\type\()_h264_qpel8_mc03_neon, export=1
        mov             x14, x30
        add             x12, x1,  x2            // second source = src + stride
        b               \type\()_h264_qpel8_mc01
endfunc

// mc13: like mc11, but the horizontal pass reads from src + stride.
function ff_\type\()_h264_qpel8_mc13_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1                 // vertical pass source = src
        add             x1,  x1,  x2            // horizontal pass source = src + stride
        b               \type\()_h264_qpel8_mc11
endfunc

// mc23: like mc21, but the horizontal pass reads from src + stride.
function ff_\type\()_h264_qpel8_mc23_neon, export=1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1                 // 2-D pass source = src
        add             x1,  x1,  x2            // horizontal pass source = src + stride
        b               \type\()_h264_qpel8_mc21
endfunc

// mc33: like mc11 with the horizontal pass at src + stride and the vertical
// pass at src + 1.
function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             x1,  x1,  #1
        mov             x14, x30
        mov             x8,  x0
        mov             x9,  x1                 // vertical pass source = src + 1
        add             x1,  x1,  x2
        sub             x1,  x1,  #1            // horizontal pass source = src + stride
        b               \type\()_h264_qpel8_mc11
endfunc
.endm

        h264_qpel8 put
        h264_qpel8 avg
763
764 .macro  h264_qpel16     type
// mc10 (16x16): horizontal filter averaged with the source pixels themselves.
// In: x0 = dst, x1 = src, x2 = stride. The row count is set by the callee.
function ff_\type\()_h264_qpel16_mc10_neon, export=1
        lowpass_const   w3
        mov             x3,  x1                 // second source = src
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
771
// mc20 (16x16): horizontal filter only.
// In: x0 = dst, x1 = src, x2 = stride. The row count is set by the callee.
function ff_\type\()_h264_qpel16_mc20_neon, export=1
        lowpass_const   w3
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        mov             x3,  x2                 // dst stride = src stride
        b               \type\()_h264_qpel16_h_lowpass_neon
endfunc
778
// mc30 (16x16): horizontal filter averaged with the source one pixel to the
// right.
// In: x0 = dst, x1 = src, x2 = stride. The row count is set by the callee.
function ff_\type\()_h264_qpel16_mc30_neon, export=1
        lowpass_const   w3
        add             x3,  x1,  #1            // second source = src + 1
        sub             x1,  x1,  #2            // filter window starts 2 pixels left
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc
785
// 16x16 entry point mc01. The code after the local label is shared with the
// mc03 entry point, which branches to it with x12 and x14 already set.
//   Before the label: x12 = incoming x1, x14 = copy of the link register.
//   After the label:  x1 = x1 - 2 * x2, x3 = x2, v6.S[0] = filter constants,
//                     then the v_lowpass_l2 routine is called and control
//                     returns through x14.
function ff_\type\()_h264_qpel16_mc01_neon, export=1
        mov             x12, x1                 // x12 = incoming x1
        mov             x14, x30                // x30 is overwritten by the bl below
\type\()_h264_qpel16_mc01:
        sub             x1,  x1,  x2, lsl #1    // x1 = x1 - 2 * x2
        lowpass_const   w3                      // uses w3 as scratch, so x3 is loaded afterwards
        mov             x3,  x2                 // x3 = x2
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        ret             x14
endfunc
796
// 16x16 entry point mc11. The code after the local label is shared with the
// mc13, mc31 and mc33 entry points, which branch to it with x1, x8, x9 and
// x14 already set.
//   At the label: x1 = base pointer for the first call, x8 = value restored
//   into x0 for the second call, x9 = base pointer for the second call,
//   x14 = return address.
// A 256 byte scratch area is carved out of the stack, handed to
// put_h264_qpel16_h_lowpass_neon in x0, and then passed in x12 to the
// v_lowpass_l2 routine. x11 holds the original sp across both calls.
function ff_\type\()_h264_qpel16_mc11_neon, export=1
        mov             x9,  x1                 // x9 = incoming x1
        mov             x8,  x0                 // x8 = incoming x0
        mov             x14, x30                // x30 is overwritten by the bl below
\type\()_h264_qpel16_mc11:
        mov             x11, sp                 // remember sp so the scratch area can be released
        sub             x1,  x1,  #2            // x1 = x1 - 2
        lowpass_const   w3                      // uses w3 as scratch, so x3 is loaded afterwards
        sub             sp,  sp,  #(16*16)      // reserve the 256 byte scratch area
        mov             x0,  sp                 // x0 = scratch area
        mov             x3,  #16
        bl              put_h264_qpel16_h_lowpass_neon
        sub             x1,  x9,  x2, lsl #1    // x1 = x9 - 2 * x2, must read x2 before it changes
        mov             x3,  x2                 // x3 = previous x2
        mov             x12, sp                 // x12 = scratch area
        mov             x2,  #16
        mov             x0,  x8                 // x0 = value saved in x8
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             sp,  x11                // release the scratch area
        ret             x14
endfunc
818
// 16x16 entry point mc21. The code after the local label is shared with the
// mc23 entry point, which branches to it with x1, x8, x9 and x14 already set.
//   At the label: x1 = base pointer for the first call, x8 = value restored
//   into x0 for the second call, x9 = base pointer for the second call,
//   x14 = return address.
// A scratch area of 16*16+16*12 bytes is reserved on the stack and handed to
// put_h264_qpel16_h_lowpass_neon_packed in x0. Whatever that routine leaves
// in x0 is forwarded in x4 to the hv_lowpass_l2 routine. x11 holds the
// original sp across both calls.
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        mov             x9,  x1                 // x9 = incoming x1
        mov             x8,  x0                 // x8 = incoming x0
        mov             x14, x30                // x30 is overwritten by the bl below
\type\()_h264_qpel16_mc21:
        sub             x1,  x1,  #2            // x1 = x1 - 2
        mov             x11, sp                 // remember sp so the scratch area can be released
        lowpass_const   w3                      // loads v6.S[0]; w3 keeps the constant afterwards
        sub             sp,  sp,  #(16*16+16*12)
        mov             x0,  sp                 // x0 = scratch area
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             x4,  x0                 // forward x0 from the first call before reloading it
        sub             x1,  x9,  #2
        sub             x1,  x1,  x2, lsl #1    // x1 = x9 - 2 * x2 - 2
        mov             x3,  x2                 // x3 = x2
        mov             x0,  x8                 // x0 = value saved in x8
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11                // release the scratch area
        ret             x14
endfunc
839
// 16x16 entry point mc31: like mc11, but the pointer saved in x9 is advanced
// by one byte. Tail-branches to the local h264_qpel16_mc11 label.
//   x8  = incoming x0
//   x9  = incoming x1 + 1
//   x1  = incoming x1 (unchanged)
//   x14 = copy of the link register x30
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             x9,  x1,  #1            // x9 = incoming x1 + 1
        mov             x8,  x0                 // x8 = incoming x0
        mov             x14, x30                // park the return address in x14
        b               \type\()_h264_qpel16_mc11
endfunc
848
// 16x16 entry point mc02: calls the v_lowpass routine of the same type.
//   x1 = incoming x1 - 2 * x2
//   x3 = x2
//   v6.S[0] = constants written by lowpass_const
//   x14 = copy of the link register, used for the final return
// x0 and x2 are passed through unchanged.
function ff_\type\()_h264_qpel16_mc02_neon, export=1
        sub             x1,  x1,  x2, lsl #1    // x1 = incoming x1 - 2 * x2
        lowpass_const   w3                      // uses w3 as scratch, so x3 is loaded afterwards
        mov             x3,  x2                 // x3 = x2
        mov             x14, x30                // x30 is overwritten by the bl below
        bl              \type\()_h264_qpel16_v_lowpass_neon
        ret             x14
endfunc
857
// 16x16 entry point mc12. The code after the local label is shared with the
// mc32 entry point, which branches to it with x1, x8, x9 and x14 already set.
//   At the label: x1 = base pointer for the first call, x8 = value restored
//   into x0 for the second call, x9 = base pointer for the second call,
//   x14 = return address.
// A scratch area of 16*16+16*12 bytes is reserved on the stack and handed to
// put_h264_qpel16_v_lowpass_neon_packed in x0. Whatever that routine leaves
// in x0 is forwarded in x4 to the hv_lowpass_l2 routine. The second half
// reads x3 and therefore relies on the first routine leaving it intact.
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        mov             x9,  x1                 // x9 = incoming x1
        mov             x8,  x0                 // x8 = incoming x0
        mov             x14, x30                // x30 is overwritten by the bl below
\type\()_h264_qpel16_mc12:
        sub             x1,  x1,  x2, lsl #1    // x1 = x1 - 2 * x2
        mov             x11, sp                 // remember sp so the scratch area can be released
        lowpass_const   w3                      // uses w3 as scratch, so x3 is loaded afterwards
        mov             x3,  x2                 // x3 = x2
        sub             sp,  sp,  #(16*16+16*12)
        mov             x0,  sp                 // x0 = scratch area
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             x4,  x0                 // forward x0 from the first call before reloading it
        sub             x1,  x9,  #2
        sub             x1,  x1,  x3, lsl #1    // x1 = x9 - 2 * x3 - 2
        mov             x2,  x3                 // x2 = x3
        mov             x0,  x8                 // x0 = value saved in x8
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             sp,  x11                // release the scratch area
        ret             x14
endfunc
879
// 16x16 entry point mc22: calls the hv_lowpass routine of the same type.
//   x1 = incoming x1 - 2 * x2 - 2
//   x3 = x2
//   v6.S[0] = constants written by lowpass_const
//   x11 = sp on entry, written back to sp after the call
//   x14 = copy of the link register, used for the final return
// sp is not modified here. Restoring it from x11 implies that the callee
// moves sp and does not undo that itself; confirm against the hv_lowpass
// implementation.
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        mov             x11, sp                 // sp snapshot for the restore below
        mov             x14, x30                // x30 is overwritten by the bl below
        sub             x1,  x1,  #2
        sub             x1,  x1,  x2, lsl #1    // x1 = incoming x1 - 2 * x2 - 2
        lowpass_const   w3                      // uses w3 as scratch, so x3 is loaded afterwards
        mov             x3,  x2                 // x3 = x2
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        mov             sp,  x11                // put sp back to its entry value
        ret             x14
endfunc
891
// 16x16 entry point mc32: like mc12, but x1 is advanced by one byte while
// x9 keeps the incoming value. Tail-branches to the local
// h264_qpel16_mc12 label.
//   x8  = incoming x0
//   x9  = incoming x1
//   x1  = incoming x1 + 1
//   x14 = copy of the link register x30
function ff_\type\()_h264_qpel16_mc32_neon, export=1
        mov             x9,  x1                 // x9 = incoming x1
        add             x1,  x9,  #1            // x1 = incoming x1 + 1
        mov             x8,  x0                 // x8 = incoming x0
        mov             x14, x30                // park the return address in x14
        b               \type\()_h264_qpel16_mc12
endfunc
899
// 16x16 entry point mc03: reuses the body behind the local
// h264_qpel16_mc01 label with x12 advanced by x2.
//   x12 = incoming x1 + x2
//   x14 = copy of the link register x30 (the shared body returns through it)
// x0, x1 and x2 are passed through unchanged.
function ff_\type\()_h264_qpel16_mc03_neon, export=1
        add             x12, x1,  x2            // x12 = x1 + x2
        mov             x14, x30                // park the return address in x14
        b               \type\()_h264_qpel16_mc01
endfunc
905
// 16x16 entry point mc13: saves the incoming arguments, advances x1 by x2
// and tail-branches to the local h264_qpel16_mc11 label.
//   x8  = incoming x0
//   x9  = incoming x1
//   x1  = incoming x1 + x2
//   x14 = copy of the link register x30 (the shared body returns through it)
function ff_\type\()_h264_qpel16_mc13_neon, export=1
        mov             x9,  x1                 // x9 = incoming x1
        mov             x8,  x0                 // x8 = incoming x0
        add             x1,  x9,  x2            // x1 = incoming x1 + x2
        mov             x14, x30                // park the return address in x14
        b               \type\()_h264_qpel16_mc11
endfunc
913
// 16x16 entry point mc23: saves the incoming arguments, advances x1 by x2
// and tail-branches to the local h264_qpel16_mc21 label.
//   x8  = incoming x0
//   x9  = incoming x1
//   x1  = incoming x1 + x2
//   x14 = copy of the link register x30 (the shared body returns through it)
function ff_\type\()_h264_qpel16_mc23_neon, export=1
        mov             x9,  x1                 // x9 = incoming x1
        mov             x8,  x0                 // x8 = incoming x0
        add             x1,  x9,  x2            // x1 = incoming x1 + x2
        mov             x14, x30                // park the return address in x14
        b               \type\()_h264_qpel16_mc21
endfunc
921
// 16x16 entry point mc33: like mc13, but the pointer saved in x9 is advanced
// by one byte. Tail-branches to the local h264_qpel16_mc11 label.
//   x8  = incoming x0
//   x9  = incoming x1 + 1
//   x1  = incoming x1 + x2
//   x14 = copy of the link register x30 (the shared body returns through it)
function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             x9,  x1,  #1            // x9 = incoming x1 + 1
        add             x1,  x1,  x2            // x1 = incoming x1 + x2
        mov             x8,  x0                 // x8 = incoming x0
        mov             x14, x30                // park the return address in x14
        b               \type\()_h264_qpel16_mc11
endfunc
931 .endm
932
933         h264_qpel16 put                     // expand the 16x16 entry points with type = put (ff_put_h264_qpel16_*)
934         h264_qpel16 avg                     // expand the 16x16 entry points with type = avg (ff_avg_h264_qpel16_*)