/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
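/*
 * All block sizes implement the bilinear chroma interpolation
 *
 *     dst[i] = (A*a + B*b + C*c + D*d + 32) >> 6
 *
 * where a..d are the four source pixels around the fractional position
 * (x, y), 0 <= x, y <= 7, and A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y,
 * D = x*y.  The prologue below expands exactly these products:
 *
 *     w7  = x*y                        = D
 *     w6  = 8*y - x*y  = (8-x)*y       = C
 *     w12 = 8*x - x*y  = x*(8-y)       = B
 *     w4  = x*y - 8*x - 8*y + 64
 *         = (8-x)*(8-y)                = A
 *
 * A minimal C reference for the H.264 "put" 8-pixel-wide case
 * (illustrative sketch only, not part of the original source; the
 * function name is made up):
 *
 *     static void chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
 *                                int stride, int h, int x, int y)
 *     {
 *         const int A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y;
 *         for (int j = 0; j < h; j++, dst += stride, src += stride)
 *             for (int i = 0; i < 8; i++)
 *                 dst[i] = (A*src[i]        + B*src[i+1] +
 *                           C*src[i+stride] + D*src[i+stride+1] + 32) >> 6;
 *     }
 *
 * RV40 and VC-1 reuse the same weights but add a per-call bias (v22)
 * before a truncating shift, instead of using a rounding shift.
 */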
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
        sxtw            x2,  w2
  .ifc \type,avg
        mov             x8,  x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f

        dup             v0.8B,  w4
        dup             v1.8B,  w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6
        dup             v3.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B
        umull           v17.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        umlal           v17.8H, v7.8B,  v1.8B
        umlal           v17.8H, v4.8B,  v2.8B
        umlal           v17.8H, v5.8B,  v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

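/* x*y == 0: at most one of x and y is nonzero, so 1-D filtering is
 * enough.  At this point w6 == 8*y, selecting the vertical path (3:)
 * when y != 0 and the horizontal/copy path (4:) otherwise. */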
2:      tst             w6,  w6
        add             w12, w12, w6
        dup             v0.8B,  w4
        dup             v1.8B,  w12
        b.eq            4f

        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v7.8B,  v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
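/* Same filter as mc8, but two 4-pixel rows share each 8-lane vector:
 * the A/B weights are packed into v0 and C/D into v2 (trn1 on .2S), so
 * one umull/umlal pair leaves A+C partial sums in the low half and B+D
 * partial sums in the high half; trn1/trn2 on .2D plus an add then fold
 * the halves into two complete output rows. */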
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
        sxtw            x2,  w2
  .ifc \type,avg
        mov             x8,  x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f

        dup             v24.8B, w4
        dup             v25.8B, w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B, w6
        dup             v27.8B, w7
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v0.2S,  v24.2S, v25.2S
        trn1            v2.2S,  v26.2S, v27.2S
        trn1            v4.2S,  v4.2S,  v5.2S
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umlal           v18.8H, v6.8B,  v2.8B
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B
        umlal           v19.8H, v4.8B,  v2.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

2:      tst             w6,  w6
        add             w12, w12, w6
        dup             v30.8B, w4
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f

        ext             v1.8B,  v0.8B,  v1.8B, #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B,  v1.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B, #1
        ext             v7.8B,  v6.8B,  v7.8B, #1
        trn1            v4.2S,  v4.2S,  v5.2S
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret
endfunc
.endm

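/* chroma_mc2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
 *
 * Two 2-pixel rows per iteration: the weights are interleaved as
 * A,A,B,B (v0) and C,C,D,D (v1), the taps of both rows are gathered
 * with rev64/ext/trn1, and a final rev64 + add folds the A/B and C/D
 * cross terms into complete pixels. */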
.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        sxtw            x2,  w2
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f

        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H
        trn1            v1.4H,  v1.4H,  v3.4H
1:
        ld1             {v4.S}[0],  [x1], x2
        ld1             {v4.S}[1],  [x1], x2
        rev64           v5.2S,  v4.2S
        ld1             {v5.S}[1],  [x1]
        ext             v6.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H
        trn1            v5.4H,  v5.4H,  v7.4H
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2
  .endif
        rev64           v17.4S, v16.4S
        add             v16.8H, v16.8H, v17.8H
        rshrn           v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

2:
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
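/* Per-position rounding bias, loaded in the mc8/mc4 prologue above at
 * byte offset (y >> 1) * 8 + (x >> 1) * 2, i.e. row y>>1, column x>>1. */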
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

        h264_chroma_mc8 put, rv40
        h264_chroma_mc8 avg, rv40
        h264_chroma_mc4 put, rv40
        h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1_DECODER
        h264_chroma_mc8 put, vc1
        h264_chroma_mc8 avg, vc1
        h264_chroma_mc4 put, vc1
        h264_chroma_mc4 avg, vc1
#endif