]> git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/h264pred_neon.S
avfilter/formats: Remove avfilter_make_format64_list()
[ffmpeg] / libavcodec / aarch64 / h264pred_neon.S
1 /*
2  * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/aarch64/asm.S"
22
// ldcol.8 rd, rs, rt, n=8, hi=0
// Gather single bytes into consecutive byte lanes of vector \rd.  Every load
// reads one byte from [\rs] and then post-increments \rs by \rt.
//   n >= 8          : lanes 0-7 are loaded (lanes 8-15 too when n == 16)
//   n <  8, hi == 0 : only lanes 0-3 are loaded
//   n <  8, hi == 1 : only lanes 4-7 are loaded
// Lanes that are not loaded are left as they were; \rs ends up advanced by
// \rt once per byte loaded.
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n >= 8 || \hi == 0
.irp lane, 0, 1, 2, 3
        ld1             {\rd\().b}[\lane],  [\rs], \rt
.endr
.endif
.if \n >= 8 || \hi == 1
.irp lane, 4, 5, 6, 7
        ld1             {\rd\().b}[\lane],  [\rs], \rt
.endr
.endif
.if \n == 16
.irp lane, 8, 9, 10, 11, 12, 13, 14, 15
        ld1             {\rd\().b}[\lane],  [\rs], \rt
.endr
.endif
.endm
47
// ff_pred16x16_128_dc_neon
// Sets every byte of v0 to 128 and jumps to the shared 16x16 store loop at
// .L_pred16x16_dc_end, which writes v0 to 16 rows starting at x0 with row
// step x1.
function ff_pred16x16_128_dc_neon, export=1
        movi            v0.16b,  #0x80
        b               .L_pred16x16_dc_end
endfunc
52
// ff_pred16x16_top_dc_neon
// In: x0 = block address, x1 = byte step between rows.
// Averages (with rounding) the 16 bytes of the row at x0 - x1, broadcasts
// the result into v0 and jumps to the shared store loop.
function ff_pred16x16_top_dc_neon, export=1
        sub             x4,  x0,  x1            // x4 = row above the block
        ld1             {v1.16b}, [x4]
        uaddlv          h1,  v1.16b             // h1 = sum of the 16 bytes
        rshrn           v1.8b,  v1.8h,  #4      // (sum + 8) >> 4
        dup             v0.16b, v1.b[0]
        b               .L_pred16x16_dc_end
endfunc
61
// ff_pred16x16_left_dc_neon
// In: x0 = block address, x1 = byte step between rows.
// Averages (with rounding) the 16 bytes found at x0 - 1 in each of the 16
// rows, broadcasts the result into v0 and jumps to the shared store loop.
function ff_pred16x16_left_dc_neon, export=1
        sub             x4,  x0,  #1            // x4 = byte left of row 0
        ldcol.8         v1,  x4,  x1, 16        // gather the 16 left bytes
        uaddlv          h1,  v1.16b             // h1 = their sum
        rshrn           v1.8b,  v1.8h,  #4      // (sum + 8) >> 4
        dup             v0.16b, v1.b[0]
        b               .L_pred16x16_dc_end
endfunc
70
// ff_pred16x16_dc_neon
// In: x0 = block address, x1 = byte step between rows.
// Sums the 16 bytes of the row at x0 - x1 and the 16 bytes at x0 - 1 of each
// row, forms (sum + 16) >> 5, and fills the 16x16 block with that byte.
//
// .L_pred16x16_dc_end is also the entry point used by the other 16x16 DC
// variants: it expects the fill pattern in v0 and writes it to 16 rows
// starting at x0, advancing by x1 per row.
function ff_pred16x16_dc_neon, export=1
        sub             x4,  x0,  #1            // x4 = byte left of row 0
        sub             x5,  x0,  x1            // x5 = row above the block
        ldcol.8         v3,  x4,  x1, 16        // left column
        ld1             {v2.16b}, [x5]          // top row
        uaddlv          h3,  v3.16b
        uaddlv          h2,  v2.16b
        add             v2.4h,  v2.4h,  v3.4h   // lane 0 = top sum + left sum
        rshrn           v2.8b,  v2.8h,  #5
        dup             v0.16b, v2.b[0]
.L_pred16x16_dc_end:
        mov             w4,  #8                 // 8 iterations x 2 rows
2:      st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            2b
        ret
endfunc
89
// ff_pred16x16_hor_neon
// In: x0 = block address, x1 = byte step between rows.
// For each of 16 rows, replicates the byte at (row start - 1) across the
// 16 bytes of that row.
function ff_pred16x16_hor_neon, export=1
        mov             w4,  #16                // rows remaining
        sub             x5,  x0,  #1            // x5 = byte left of row 0
2:      ld1r            {v1.16b}, [x5], x1      // broadcast left byte, next row
        st1             {v1.16b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            2b
        ret
endfunc
99
// ff_pred16x16_vert_neon
// In: x0 = block address, x1 = byte step between rows (doubled in place).
// Copies the 16 bytes of the row at x0 - x1 into each of the 16 rows.
// Two pointers are used, each stepping two rows at a time: x0 covers the
// even rows and x4 the odd rows.
function ff_pred16x16_vert_neon, export=1
        sub             x4,  x0,  x1            // x4 = row above the block
        lsl             x1,  x1,  #1            // step = 2 rows
        mov             w5,  #8
        ld1             {v1.16b}, [x4], x1      // load top row; x4 -> row 1
2:      st1             {v1.16b}, [x0], x1
        st1             {v1.16b}, [x4], x1
        subs            w5,  w5,  #1
        b.ne            2b
        ret
endfunc
111
// ff_pred16x16_plane_neon
// In:  x0 = address of the 16x16 block to fill, x1 = byte step between rows.
// Reads bytes -1..6 and 8..15 of the row at x0 - x1, and the byte at column
// -1 of rows -1..6 and 8..15.  From these it derives a start value, a
// per-column increment ("b") and a per-row increment ("c"), then writes 16
// rows of 16 saturated bytes.  Presumably this is the H.264 16x16 "plane"
// intra predictor -- confirm against the C reference.
// Scratch: x2-x4, v0-v7, flags.
112 function ff_pred16x16_plane_neon, export=1
113         sub             x3,  x0,  x1
114         movrel          x4,  p16weight
115         add             x2,  x3,  #8
116         sub             x3,  x3,  #1
            // v0 = above[-1..6], v2 = above[8..15]
117         ld1             {v0.8b},  [x3]
118         ld1             {v2.8b},  [x2], x1
            // v1 = left column, rows -1..6; then skip row 7; v3 = rows 8..15
119         ldcol.8         v1,  x3,  x1
120         add             x3,  x3,  x1
121         ldcol.8         v3,  x3,  x1
            // reverse v0/v1 so lane i holds sample 6-i, pairing with sample 8+i
122         rev64           v0.8b,  v0.8b
123         rev64           v1.8b,  v1.8b
            // v7 lane i = above[8+i] + left[8+i] (lane 7 is used further down)
124         uaddl           v7.8h,  v2.8b,  v3.8b
            // signed 16-bit differences sample[8+i] - sample[6-i]
125         usubl           v2.8h,  v2.8b,  v0.8b
126         usubl           v3.8h,  v3.8b,  v1.8b
            // weight lane i by (i + 1); v0 now holds the weights 1..8
127         ld1             {v0.8h},     [x4]
128         mul             v2.8h,  v2.8h,  v0.8h
129         mul             v3.8h,  v3.8h,  v0.8h
            // three pairwise adds reduce to v2.4h = {H, V, H, V}
            // (H = weighted sum from the top row, V = from the left column)
130         addp            v2.8h,  v2.8h,  v3.8h
131         addp            v2.8h,  v2.8h,  v2.8h
132         addp            v2.4h,  v2.4h,  v2.4h
            // v4 = (5 * {H, V} + 32) >> 6  ->  v4.4h = {b, c, b, c}
133         sshll           v3.4s,  v2.4h,  #2
134         saddw           v2.4s,  v3.4s,  v2.4h
135         rshrn           v4.4h,  v2.4s,  #6
            // v5 = odd lanes of v4 -> every lane is c; v2 lane 0 = b + c
136         trn2            v5.4h,  v4.4h,  v4.4h
137         add             v2.4h,  v4.4h,  v5.4h
138         shl             v3.4h,  v2.4h,  #3
            // rotate v7 so lane 0 = above[15] + left[15]
139         ext             v7.16b, v7.16b, v7.16b, #14
140         sub             v3.4h,  v3.4h,  v2.4h   // 7 * (b + c)
            // lane 0 of v0 is the weight 1, so this adds 1 to lane 0 of v7
141         add             v7.4h,  v7.4h,  v0.4h
            // v2 lane 0 = 16 * (above[15] + left[15] + 1) - 7 * (b + c)
142         shl             v2.4h,  v7.4h,  #4
143         sub             v2.4h,  v2.4h,  v3.4h
144         shl             v3.4h,  v4.4h,  #4
            // turn the weights {1..8} into column indices {0..7}
145         ext             v0.16b, v0.16b, v0.16b, #14
            // v6 lane 0 = c - 16 * b
146         sub             v6.4h,  v5.4h,  v3.4h
147         mov             v0.h[0],  wzr
            // v0 = {0..7} * b
148         mul             v0.8h,  v0.8h,  v4.h[0]
149         dup             v1.8h,  v2.h[0]
150         dup             v2.8h,  v4.h[0]
151         dup             v3.8h,  v6.h[0]
            // v2 = 8 * b (step from columns 0-7 to columns 8-15)
152         shl             v2.8h,  v2.8h,  #3
            // v1 = row 0 accumulators for columns 0-7
153         add             v1.8h,  v1.8h,  v0.8h
            // v3 = c - 8 * b (step from columns 8-15 back to columns 0-7 of the next row)
154         add             v3.8h,  v3.8h,  v2.8h
155         mov             w3,  #16
            // per row: saturate (acc >> 5) to bytes for columns 0-7, then 8-15, then store
156 1:
157         sqshrun         v0.8b,  v1.8h,  #5
158         add             v1.8h,  v1.8h,  v2.8h
159         sqshrun2        v0.16b, v1.8h,  #5
160         add             v1.8h,  v1.8h,  v3.8h
161         subs            w3,  w3,  #1
162         st1             {v0.16b}, [x0], x1
163         b.ne            1b
164         ret
165 endfunc
166
// 16-bit per-lane multipliers 1..8.  Loaded by ff_pred16x16_plane_neon (as the
// weights for its eight differences) and by ff_pred8x8_plane_neon (which
// turns them into the column indices 0..7).
167 const   p16weight, align=4
168         .short          1,2,3,4,5,6,7,8
169 endconst
// 16-bit multipliers 1..4, repeated once for each 4-lane half of a vector.
// Loaded by ff_pred8x8_plane_neon.
170 const   p8weight, align=4
171         .short          1,2,3,4,1,2,3,4
172 endconst
173
// ff_pred8x8_hor_neon
// In: x0 = block address, x1 = byte step between rows.
// For each of 8 rows, replicates the byte at (row start - 1) across the
// 8 bytes of that row.
function ff_pred8x8_hor_neon, export=1
        mov             w4,  #8                 // rows remaining
        sub             x5,  x0,  #1            // x5 = byte left of row 0
2:      ld1r            {v1.8b},  [x5], x1      // broadcast left byte, next row
        st1             {v1.8b},  [x0], x1
        subs            w4,  w4,  #1
        b.ne            2b
        ret
endfunc
183
// ff_pred8x8_vert_neon
// In: x0 = block address, x1 = byte step between rows (doubled in place).
// Copies the 8 bytes of the row at x0 - x1 into each of the 8 rows, using
// two pointers that each step two rows at a time (x0: even rows, x4: odd).
function ff_pred8x8_vert_neon, export=1
        sub             x4,  x0,  x1            // x4 = row above the block
        add             x1,  x1,  x1            // step = 2 rows
        mov             w5,  #4
        ld1             {v1.8b},  [x4], x1      // load top row; x4 -> row 1
2:      st1             {v1.8b},  [x0], x1
        st1             {v1.8b},  [x4], x1
        subs            w5,  w5,  #1
        b.ne            2b
        ret
endfunc
195
// ff_pred8x8_plane_neon
// In:  x0 = address of the 8x8 block to fill, x1 = byte step between rows.
// Reads bytes -1..2 and 4..7 of the row at x0 - x1, and the byte at column
// -1 of rows -1..2 and 4..7.  Derives a start value, a per-column increment
// ("b") and a per-row increment ("c"), then writes 8 rows of 8 saturated
// bytes.  Presumably the H.264 chroma "plane" intra predictor -- confirm
// against the C reference.
// Scratch: x2-x5, v0-v7, flags.
196 function ff_pred8x8_plane_neon, export=1
197         sub             x3,  x0,  x1
198         movrel          x4,  p8weight
199         movrel          x5,  p16weight
200         add             x2,  x3,  #4
201         sub             x3,  x3,  #1
            // v0 bytes 0-3 = above[-1..2], v2 bytes 0-3 = above[4..7]
202         ld1             {v0.s}[0],  [x3]
203         ld1             {v2.s}[0],  [x2], x1
            // v0 bytes 4-7 = left column, rows -1..2
204         ldcol.8         v0,  x3,  x1,  4,  hi=1
            // skip row 3; v3 bytes 0-3 = left column, rows 4..7
205         add             x3,  x3,  x1
206         ldcol.8         v3,  x3,  x1,  4
            // v7 lane i (i = 0..3) = above[4+i] + left[4+i]; only lanes 0-3 are meaningful
207         uaddl           v7.8h,  v2.8b,  v3.8b
            // reverse each 4-byte group so byte i holds sample 2-i
208         rev32           v0.8b,  v0.8b
            // v2 = { above[4..7], left[4..7] }
209         trn1            v2.2s,  v2.2s,  v3.2s
            // signed differences sample[4+i] - sample[2-i], top in lanes 0-3, left in lanes 4-7
210         usubl           v2.8h,  v2.8b,  v0.8b
211         ld1             {v6.8h},  [x4]
            // weight by 1..4 within each half
212         mul             v2.8h,  v2.8h,  v6.8h
            // v0 = weights 1..8 from p16weight (used below for the +1 and the column indices)
213         ld1             {v0.8h},  [x5]
            // reduce to v2.4s = {H, V, H, V}
214         saddlp          v2.4s,  v2.8h
215         addp            v2.4s,  v2.4s,  v2.4s
            // v5 = (17 * {H, V} + 16) >> 5  ->  v5.4h = {b, c, b, c}
216         shl             v3.4s,  v2.4s,  #4
217         add             v2.4s,  v3.4s,  v2.4s
218         rshrn           v5.4h,  v2.4s,  #5
            // v2 lane 0 = b + c, then v3 = 3 * (b + c)
219         addp            v2.4h,  v5.4h,  v5.4h
220         shl             v3.4h,  v2.4h,  #1
221         add             v3.4h,  v3.4h,  v2.4h
            // bring above[7] + left[7] into lane 0 and add the weight 1 to it
222         rev64           v7.4h,  v7.4h
223         add             v7.4h,  v7.4h,  v0.4h
            // v2 lane 0 = 16 * (above[7] + left[7] + 1) - 3 * (b + c)
224         shl             v2.4h,  v7.4h,  #4
225         sub             v2.4h,  v2.4h,  v3.4h
            // turn the weights {1..8} into column indices {0..7}, then scale by b
226         ext             v0.16b, v0.16b, v0.16b, #14
227         mov             v0.h[0],  wzr
228         mul             v0.8h,  v0.8h,  v5.h[0]
            // v1 = row 0 accumulators, v2 = per-row increment c
229         dup             v1.8h,  v2.h[0]
230         dup             v2.8h,  v5.h[1]
231         add             v1.8h,  v1.8h,  v0.8h
232         mov             w3,  #8
            // per row: saturate (acc >> 5) to bytes, advance by c, store 8 bytes
233 1:
234         sqshrun         v0.8b,  v1.8h,  #5
235         subs            w3,  w3,  #1
236         add             v1.8h,  v1.8h,  v2.8h
237         st1             {v0.8b},  [x0], x1
238         b.ne            1b
239         ret
240 endfunc
241
// ff_pred8x8_128_dc_neon
// Sets every byte of v0 (pattern for rows 0-3) and v1 (pattern for rows 4-7)
// to 128 and jumps to the shared 8x8 store loop at .L_pred8x8_dc_end.
function ff_pred8x8_128_dc_neon, export=1
        movi            v1.8b,  #0x80
        movi            v0.8b,  #0x80
        b               .L_pred8x8_dc_end
endfunc
247
// ff_pred8x8_top_dc_neon
// In: x0 = block address, x1 = byte step between rows.
// Computes the rounded average of bytes 0-3 and of bytes 4-7 of the row at
// x0 - x1.  Both v0 (rows 0-3) and v1 (rows 4-7) receive the pattern
// {avg0 x4, avg1 x4}; the shared store loop writes them out.
function ff_pred8x8_top_dc_neon, export=1
        sub             x4,  x0,  x1            // x4 = row above the block
        ld1             {v4.8b},  [x4]
        uaddlp          v4.4h,  v4.8b           // sums of adjacent byte pairs
        addp            v4.4h,  v4.4h,  v4.4h   // {s0123, s4567, s0123, s4567}
        zip1            v4.8h,  v4.8h,  v4.8h   // duplicate each halfword
        rshrn           v5.8b,  v4.8h,  #2      // (sum + 2) >> 2 per lane
        zip1            v0.8b,  v5.8b,  v5.8b   // {avg0 x4, avg1 x4}
        mov             v1.8b,  v0.8b           // lower rows use the same pattern
        b               .L_pred8x8_dc_end
endfunc
259
// ff_pred8x8_left_dc_neon
// In: x0 = block address, x1 = byte step between rows.
// Gathers the byte at column -1 of rows 0-7.  v0 (rows 0-3) is filled with
// the rounded average of the first four, v1 (rows 4-7) with the rounded
// average of the last four; the shared store loop writes them out.
function ff_pred8x8_left_dc_neon, export=1
        sub             x4,  x0,  #1            // x4 = byte left of row 0
        ldcol.8         v4,  x4,  x1            // 8 left bytes
        uaddlp          v4.4h,  v4.8b           // sums of adjacent byte pairs
        addp            v4.4h,  v4.4h,  v4.4h   // lane 0 = rows 0-3, lane 1 = rows 4-7
        rshrn           v5.8b,  v4.8h,  #2      // (sum + 2) >> 2
        dup             v0.8b,  v5.b[0]
        dup             v1.8b,  v5.b[1]
        b               .L_pred8x8_dc_end
endfunc
270
// ff_pred8x8_dc_neon
// In:  x0 = address of the 8x8 block to fill, x1 = byte step between rows.
// Reads the 8 bytes of the row at x0 - x1 (t0..t7) and the byte at column -1
// of rows 0-7 (l0..l7), and fills the four 4x4 quadrants with:
//   rows 0-3, cols 0-3 : (t0..t3 + l0..l3 + 4) >> 3
//   rows 0-3, cols 4-7 : (t4..t7 + 2) >> 2
//   rows 4-7, cols 0-3 : (l4..l7 + 2) >> 2
//   rows 4-7, cols 4-7 : (t4..t7 + l4..l7 + 4) >> 3
// .L_pred8x8_dc_end is shared with the other 8x8 DC variants: it expects the
// 8-byte pattern for rows 0-3 in v0 and for rows 4-7 in v1.
271 function ff_pred8x8_dc_neon, export=1
272         sub             x2,  x0,  x1
273         sub             x3,  x0,  #1
274         ld1             {v0.8b}, [x2]
275         ldcol.8         v1,  x3,  x1
            // pair sums: v0 = {t01, t23, t45, t67}, v1 = {l01, l23, l45, l67}
276         uaddlp          v0.4h,  v0.8b
277         uaddlp          v1.4h,  v1.8b
            // v2 = {t01, t23, l01, l23}, v3 = {t45, t67, l45, l67}
278         trn1            v2.2s,  v0.2s,  v1.2s
279         trn2            v3.2s,  v0.2s,  v1.2s
            // v4 = {t0123, l0123, t4567, l4567}
280         addp            v4.4h,  v2.4h,  v3.4h
            // v5 lane 0 = t0123 + l0123, lane 1 = t4567 + l4567
281         addp            v5.4h,  v4.4h,  v4.4h
            // v6 = 8-sample rounded averages, v7 = 4-sample rounded averages
282         rshrn           v6.8b,  v5.8h,  #3
283         rshrn           v7.8b,  v4.8h,  #2
284         dup             v0.8b,  v6.b[0]
285         dup             v2.8b,  v7.b[2]
286         dup             v1.8b,  v7.b[3]
287         dup             v3.8b,  v6.b[1]
            // combine left-half and right-half values into one 8-byte row pattern each
288         zip1            v0.2s,  v0.2s,  v2.2s
289         zip1            v1.2s,  v1.2s,  v3.2s
290 .L_pred8x8_dc_end:
291         mov             w3,  #4
            // x2 = row 4; each iteration stores one upper row (v0) and one lower row (v1)
292         add             x2,  x0,  x1,  lsl #2
293 6:      subs            w3,  w3,  #1
294         st1             {v0.8b},  [x0], x1
295         st1             {v1.8b},  [x2], x1
296         b.ne            6b
297         ret
298 endfunc
299
// ff_pred8x8_l0t_dc_neon
// In:  x0 = address of the 8x8 block to fill, x1 = byte step between rows.
// Reads the 8 bytes of the row at x0 - x1 (t0..t7) and the byte at column -1
// of rows 0-3 only (l0..l3).  Resulting quadrants:
//   rows 0-3, cols 0-3 : (t0..t3 + l0..l3 + 4) >> 3
//   rows 0-3, cols 4-7 : (t4..t7 + 2) >> 2
//   rows 4-7, cols 0-3 : (t0..t3 + 2) >> 2
//   rows 4-7, cols 4-7 : (t4..t7 + 2) >> 2
// The name presumably encodes which neighbours are available -- confirm
// against the caller.
300 function ff_pred8x8_l0t_dc_neon, export=1
301         sub             x2,  x0,  x1
302         sub             x3,  x0,  #1
303         ld1             {v0.8b},  [x2]
            // only bytes 0-3 of v1 are loaded; the rest of v1 is whatever it held before
304         ldcol.8         v1,  x3,  x1,  4
            // v0 words = {t0-3, l0-3, t4-7, <stale>}
305         zip1            v0.4s,  v0.4s,  v1.4s
306         uaddlp          v0.8h,  v0.16b
            // v0 lanes 0-2 = {t0123, l0123, t4567}; lane 3 is derived from the stale word
307         addp            v0.8h,  v0.8h,  v0.8h
            // v1 lane 0 = t0123 + l0123 (lane 1 is not used)
308         addp            v1.4h,  v0.4h,  v0.4h
309         rshrn           v2.8b,  v0.8h,  #2
310         rshrn           v3.8b,  v1.8h,  #3
311         dup             v4.8b,  v3.b[0]
312         dup             v6.8b,  v2.b[2]
313         dup             v5.8b,  v2.b[0]
            // v0 = pattern for rows 0-3, v1 = pattern for rows 4-7
314         zip1            v0.2s,  v4.2s,  v6.2s
315         zip1            v1.2s,  v5.2s,  v6.2s
316         b               .L_pred8x8_dc_end
317 endfunc
318
// ff_pred8x8_l00_dc_neon
// In: x0 = block address, x1 = byte step between rows.
// Gathers the byte at column -1 of rows 0-3 and fills v0 (rows 0-3) with
// their rounded average; v1 (rows 4-7) is filled with the constant 128.
// The shared store loop writes both patterns out.
function ff_pred8x8_l00_dc_neon, export=1
        sub             x4,  x0,  #1            // x4 = byte left of row 0
        movi            v1.8b,  #0x80           // lower half: constant
        ldcol.8         v4,  x4,  x1,  4        // bytes 0-3 = left rows 0-3
        uaddlp          v4.4h,  v4.8b
        addp            v4.4h,  v4.4h,  v4.4h   // lane 0 = sum of the 4 bytes
        rshrn           v4.8b,  v4.8h,  #2      // (sum + 2) >> 2
        dup             v0.8b,  v4.b[0]
        b               .L_pred8x8_dc_end
endfunc
329
// ff_pred8x8_0lt_dc_neon
// In:  x0 = address of the 8x8 block to fill, x1 = byte step between rows.
// Reads the 8 bytes of the row at x0 - x1 (t0..t7) and the byte at column -1
// of rows 4-7 only (l4..l7).  Resulting quadrants:
//   rows 0-3, cols 0-3 : (t0..t3 + 2) >> 2
//   rows 0-3, cols 4-7 : (t4..t7 + 2) >> 2
//   rows 4-7, cols 0-3 : (l4..l7 + 2) >> 2
//   rows 4-7, cols 4-7 : (t4..t7 + l4..l7 + 4) >> 3
// The name presumably encodes which neighbours are available -- confirm
// against the caller.
330 function ff_pred8x8_0lt_dc_neon, export=1
            // x3 = byte left of row 4
331         add             x3,  x0,  x1,  lsl #2
332         sub             x2,  x0,  x1
333         sub             x3,  x3,  #1
334         ld1             {v0.8b},  [x2]
            // only bytes 4-7 of v1 are loaded; bytes 0-3 are whatever v1 held before
335         ldcol.8         v1,  x3,  x1,  4,  hi=1
            // v0 words = {t0-3, <stale>, t4-7, l4-7}
336         zip1            v0.4s,  v0.4s,  v1.4s
337         uaddlp          v0.8h,  v0.16b
            // v0 lanes = {t0123, <stale>, t4567, l4567}
338         addp            v0.8h,  v0.8h,  v0.8h
            // v1 lane 1 = t4567 + l4567 (lane 0 is not used)
339         addp            v1.4h,  v0.4h,  v0.4h
340         rshrn           v2.8b,  v0.8h,  #2
341         rshrn           v3.8b,  v1.8h,  #3
342         dup             v4.8b,  v2.b[0]
343         dup             v5.8b,  v2.b[3]
344         dup             v6.8b,  v2.b[2]
345         dup             v7.8b,  v3.b[1]
            // v0 = pattern for rows 0-3, v1 = pattern for rows 4-7
346         zip1            v0.2s,  v4.2s,  v6.2s
347         zip1            v1.2s,  v5.2s,  v7.2s
348         b               .L_pred8x8_dc_end
349 endfunc
350
// ff_pred8x8_0l0_dc_neon
// In: x0 = block address, x1 = byte step between rows.
// Gathers the byte at column -1 of rows 4-7 and fills v1 (rows 4-7) with
// their rounded average; v0 (rows 0-3) is filled with the constant 128.
// The shared store loop writes both patterns out.
function ff_pred8x8_0l0_dc_neon, export=1
        movi            v0.8b,  #0x80           // upper half: constant
        add             x4,  x0,  x1,  lsl #2   // x4 = start of row 4
        sub             x4,  x4,  #1            // ... one byte to the left
        ldcol.8         v4,  x4,  x1,  4        // bytes 0-3 = left rows 4-7
        uaddlp          v5.4h,  v4.8b
        addp            v5.4h,  v5.4h,  v5.4h   // lane 0 = sum of the 4 bytes
        rshrn           v4.8b,  v5.8h,  #2      // (sum + 2) >> 2
        dup             v1.8b,  v4.b[0]
        b               .L_pred8x8_dc_end
endfunc
362
// ldcol.16 rd, rs, rt, n=4, hi=0
// Gather single 16-bit elements into consecutive halfword lanes of vector
// \rd.  Every load reads one halfword from [\rs] and then post-increments
// \rs by \rt.
//   n >= 4          : lanes 0-3 are loaded (lanes 4-7 too when n == 8)
//   n <  4, hi == 0 : only lanes 0-1 are loaded
//   n <  4, hi == 1 : only lanes 2-3 are loaded
// Lanes that are not loaded are left as they were.
.macro ldcol.16  rd,  rs,  rt,  n=4,  hi=0
.if \n >= 4 || \hi == 0
.irp lane, 0, 1
        ld1             {\rd\().h}[\lane],  [\rs], \rt
.endr
.endif
.if \n >= 4 || \hi == 1
.irp lane, 2, 3
        ld1             {\rd\().h}[\lane],  [\rs], \rt
.endr
.endif
.if \n == 8
.irp lane, 4, 5, 6, 7
        ld1             {\rd\().h}[\lane],  [\rs], \rt
.endr
.endif
.endm
379
380 // slower than C
381 /*
382 function ff_pred16x16_128_dc_neon_10, export=1
383         movi            v0.8h, #2, lsl #8 // 512, 1 << (bit_depth - 1)
384
385         b               .L_pred16x16_dc_10_end
386 endfunc
387 */
388
// ff_pred16x16_top_dc_neon_10
// In: x0 = block address, x1 = byte step between rows.
// Operates on 16-bit samples (the _10 suffix presumably means 10-bit depth).
// Sums the 16 halfwords of the row at x0 - x1, forms (sum + 8) >> 4,
// broadcasts it into v0 and jumps to the shared store loop at
// .L_pred16x16_dc_10_end.
function ff_pred16x16_top_dc_neon_10, export=1
        sub             x4,  x0,  x1            // x4 = row above the block
        ld1             {v2.8h, v3.8h}, [x4]    // 16 samples of the top row
        add             v2.8h,  v2.8h,  v3.8h
        addv            h2,  v2.8h              // h2 = sum of all 16 samples
        urshr           v2.4h,  v2.4h,  #4
        dup             v0.8h,  v2.h[0]
        b               .L_pred16x16_dc_10_end
endfunc
401
402 // slower than C
403 /*
404 function ff_pred16x16_left_dc_neon_10, export=1
405         sub             x2,  x0,  #2 // access to the "left" column
406         ldcol.16        v0,  x2,  x1,  8
407         ldcol.16        v1,  x2,  x1,  8 // load "left" column
408
409         add             v0.8h, v0.8h, v1.8h
410         addv            h0,  v0.8h
411
412         urshr           v0.4h,  v0.4h,  #4
413         dup             v0.8h, v0.h[0]
414         b               .L_pred16x16_dc_10_end
415 endfunc
416 */
417
// ff_pred16x16_dc_neon_10
// In: x0 = block address, x1 = byte step between rows.
// Operates on 16-bit samples (the _10 suffix presumably means 10-bit depth).
// Sums the 16 halfwords of the row at x0 - x1 and the halfword at x0 - 2 of
// each of the 16 rows, forms (sum + 16) >> 5, and fills the block with it.
//
// .L_pred16x16_dc_10_end is also entered from ff_pred16x16_top_dc_neon_10:
// it expects the fill value replicated in v0.8h and writes 16 rows of 16
// halfwords starting at x0, advancing by x1 per row.
function ff_pred16x16_dc_neon_10, export=1
        sub             x4,  x0,  #2            // x4 = sample left of row 0
        sub             x5,  x0,  x1            // x5 = row above the block
        ld1             {v4.8h, v5.8h}, [x5]    // top row, 16 samples
        ldcol.16        v6,  x4,  x1,  8        // left column, rows 0-7
        ldcol.16        v7,  x4,  x1,  8        // left column, rows 8-15
        add             v4.8h,  v4.8h,  v5.8h
        add             v6.8h,  v6.8h,  v7.8h
        add             v4.8h,  v4.8h,  v6.8h
        addv            h4,  v4.8h              // h4 = sum of all 32 samples
        urshr           v4.4h,  v4.4h,  #5
        dup             v0.8h,  v4.h[0]
.L_pred16x16_dc_10_end:
        mov             v1.16b,  v0.16b         // second half of each row
        mov             w4,  #8                 // 8 iterations x 2 rows
2:      st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            2b
        ret
endfunc
442
// ff_pred16x16_hor_neon_10
// In: x0 = block address, x1 = byte step between rows.
// Operates on 16-bit samples.  For each of 16 rows, replicates the halfword
// at (row start - 2) across the 16 halfwords of the row; the row is written
// as two 8-halfword stores, one at x0 and one 16 bytes further on.
function ff_pred16x16_hor_neon_10, export=1
        mov             w6,  #16                // rows remaining
        sub             x4,  x0,  #2            // x4 = sample left of row 0
        add             x5,  x0,  #16           // x5 = second half of row 0
2:      ld1r            {v1.8h},  [x4],  x1     // broadcast left sample, next row
        st1             {v1.8h},  [x0],  x1
        st1             {v1.8h},  [x5],  x1
        subs            w6,  w6,  #1
        b.ne            2b
        ret
endfunc
455
// ff_pred16x16_vert_neon_10
// In: x0 = block address, x1 = byte step between rows (doubled in place).
// Operates on 16-bit samples.  Copies the 16 halfwords of the row at
// x0 - x1 into each of the 16 rows, using two pointers that each step two
// rows at a time (x0: even rows, x4: odd rows).
function ff_pred16x16_vert_neon_10, export=1
        sub             x4,  x0,  x1            // x4 = row above the block
        lsl             x1,  x1,  #1            // step = 2 rows
        mov             w5,  #8
        ld1             {v2.8h, v3.8h},  [x4],  x1      // load top row; x4 -> row 1
2:      st1             {v2.8h, v3.8h},  [x0],  x1
        st1             {v2.8h, v3.8h},  [x4],  x1
        subs            w5,  w5,  #1
        b.ne            2b
        ret
endfunc