/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
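/* Each entry point computes the standard bilinear chroma blend; as a
 * reading aid, the per-pixel computation is roughly
 *     dst[i] = (A*src[i] + B*src[i+1] + C*src[i+s] + D*src[i+s+1] + r) >> 6
 * (s being the line stride) with A = (8-x)*(8-y), B = x*(8-y),
 * C = (8-x)*y, D = x*y, so that A+B+C+D = 64.  H.264 rounds with
 * r = 32 (rshrn #6); RV40 and VC-1 instead add a codec-specific bias
 * before a truncating shift. */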
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        sxtw            x2,  w2
  .ifc \type,avg
        mov             x8,  x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]          // RV40: table-driven rounding bias
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28           // VC-1: constant rounding bias
  .endif
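        // With x in w4 and y in w5, the code below computes the blend
        // weights:  w7 = x*y (D),  w6 = 8*y - x*y (C),
        // w12 = 8*x - x*y (B),  w4 = 64 - 8*x - 8*y + x*y (A).
        // The flags set by "cmp w7, #0" survive the following sub
        // instructions, so b.eq takes the 1-D/copy paths at 2:
        // whenever x or y is zero.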
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f

        dup             v0.8B,  w4
        dup             v1.8B,  w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6
        dup             v3.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B
        umull           v17.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        umlal           v17.8H, v7.8B, v1.8B
        umlal           v17.8H, v4.8B, v2.8B
        umlal           v17.8H, v5.8B, v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

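        // Reached when x*y == 0: the blend is 1-D (or a plain copy).
        // Fall through to 3: for vertical-only filtering (x == 0), branch
        // to 4: for horizontal-only (y == 0), or to 5: when x == y == 0.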
2:      adds            w12, w12, w6
        dup             v0.8B, w4
        b.eq            5f
        tst             w6,  w6
        dup             v1.8B, w12
        b.eq            4f

        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umlal           v16.8H, v5.8B, v1.8B
        umull           v17.8H, v6.8B, v0.8B
        umlal           v17.8H, v7.8B, v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

5:      ld1             {v4.8B}, [x1], x2
        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B, v0.8B
        umull           v17.8H, v5.8B, v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
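// Same blend as mc8, but on 4-pixel rows: TRN1 packs a row and its
// one-pixel-shifted copy (or two consecutive rows) into one 8-byte
// register so a single UMULL/UMLAL applies two taps at once, and the
// TRN1/TRN2/ADD sequence folds the two partial sums back together.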
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        sxtw            x2,  w2
  .ifc \type,avg
        mov             x8,  x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f

        dup             v24.8B,  w4
        dup             v25.8B,  w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B,  w6
        dup             v27.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v0.2S,  v24.2S, v25.2S
        trn1            v2.2S,  v26.2S, v27.2S
        trn1            v4.2S,  v4.2S,  v5.2S
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umlal           v18.8H, v6.8B,  v2.8B
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B
        umlal           v19.8H, v4.8B,  v2.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

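        // As in mc8: 3: is vertical-only, 4: horizontal-only, 5: copy.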
2:      adds            w12, w12, w6
        dup             v30.8B, w4
        b.eq            5f
        tst             w6,  w6
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f

        ext             v1.8B,  v0.8B,  v1.8B, #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B,  v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

5:      ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
endfunc
.endm

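/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
 *
 * 2-pixel-wide variant: two input rows are gathered into one register,
 * REV64/EXT line up each pixel's right and lower neighbours, and a
 * final REV64/ADD reduces the partial sums so one 2-pixel result per
 * row ends up in halfword lanes 0 and 2. */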
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        sxtw            x2,  w2
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f

        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H
        trn1            v1.4H,  v1.4H,  v3.4H
1:
        ld1             {v4.S}[0],  [x1], x2
        ld1             {v4.S}[1],  [x1], x2
        rev64           v5.2S,  v4.2S
        ld1             {v5.S}[1],  [x1]
        ext             v6.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H
        trn1            v5.4H,  v5.4H,  v7.4H
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2
  .endif
        rev64           v17.4S, v16.4S
        add             v16.8H, v16.8H, v17.8H
        rshrn           v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

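        // x == y == 0: unfiltered copy (plus averaging for the avg variant).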
2:
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
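// Rounding bias for RV40 chroma, one 4-entry row per y>>1; the mc
// macros index it as rv40bias[(y>>1)*4 + (x>>1)].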
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif