/* libavcodec/aarch64/h264dsp_neon.S — FFmpeg AArch64 NEON H.264 DSP routines */
1 /*
2  * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3  * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "libavutil/aarch64/asm.S"
23 #include "neon.S"
24
25 .macro  h264_loop_filter_start
26         cmp             w2,  #0
27         ldr             w6,  [x4]
28         ccmp            w3,  #0, #0, ne
29         mov             v24.S[0], w6
30         and             w6,  w6,  w6,  lsl #16
31         b.eq            1f
32         ands            w6,  w6,  w6,  lsl #8
33         b.ge            2f
34 1:
35         ret
36 2:
37 .endm
38
39 .macro  h264_loop_filter_luma
40         dup             v22.16B, w2                     // alpha
41         uxtl            v24.8H,  v24.8B
42         uabd            v21.16B, v16.16B, v0.16B        // abs(p0 - q0)
43         uxtl            v24.4S,  v24.4H
44         uabd            v28.16B, v18.16B, v16.16B       // abs(p1 - p0)
45         sli             v24.8H,  v24.8H,  #8
46         uabd            v30.16B, v2.16B,  v0.16B        // abs(q1 - q0)
47         sli             v24.4S,  v24.4S,  #16
48         cmhi            v21.16B, v22.16B, v21.16B       // < alpha
49         dup             v22.16B, w3                     // beta
50         cmlt            v23.16B, v24.16B, #0
51         cmhi            v28.16B, v22.16B, v28.16B       // < beta
52         cmhi            v30.16B, v22.16B, v30.16B       // < beta
53         bic             v21.16B, v21.16B, v23.16B
54         uabd            v17.16B, v20.16B, v16.16B       // abs(p2 - p0)
55         and             v21.16B, v21.16B, v28.16B
56         uabd            v19.16B,  v4.16B,  v0.16B       // abs(q2 - q0)
57         cmhi            v17.16B, v22.16B, v17.16B       // < beta
58         and             v21.16B, v21.16B, v30.16B
59         cmhi            v19.16B, v22.16B, v19.16B       // < beta
60         and             v17.16B, v17.16B, v21.16B
61         and             v19.16B, v19.16B, v21.16B
62         and             v24.16B, v24.16B, v21.16B
63         urhadd          v28.16B, v16.16B,  v0.16B
64         sub             v21.16B, v24.16B, v17.16B
65         uqadd           v23.16B, v18.16B, v24.16B
66         uhadd           v20.16B, v20.16B, v28.16B
67         sub             v21.16B, v21.16B, v19.16B
68         uhadd           v28.16B,  v4.16B, v28.16B
69         umin            v23.16B, v23.16B, v20.16B
70         uqsub           v22.16B, v18.16B, v24.16B
71         uqadd           v4.16B,   v2.16B, v24.16B
72         umax            v23.16B, v23.16B, v22.16B
73         uqsub           v22.16B,  v2.16B, v24.16B
74         umin            v28.16B,  v4.16B, v28.16B
75         uxtl            v4.8H,    v0.8B
76         umax            v28.16B, v28.16B, v22.16B
77         uxtl2           v20.8H,   v0.16B
78         usubw           v4.8H,    v4.8H,  v16.8B
79         usubw2          v20.8H,  v20.8H,  v16.16B
80         shl             v4.8H,    v4.8H,  #2
81         shl             v20.8H,  v20.8H,  #2
82         uaddw           v4.8H,    v4.8H,  v18.8B
83         uaddw2          v20.8H,  v20.8H,  v18.16B
84         usubw           v4.8H,    v4.8H,   v2.8B
85         usubw2          v20.8H,  v20.8H,   v2.16B
86         rshrn           v4.8B,    v4.8H,  #3
87         rshrn2          v4.16B,  v20.8H,  #3
88         bsl             v17.16B, v23.16B, v18.16B
89         bsl             v19.16B, v28.16B,  v2.16B
90         neg             v23.16B, v21.16B
91         uxtl            v28.8H,  v16.8B
92         smin            v4.16B,   v4.16B, v21.16B
93         uxtl2           v21.8H,  v16.16B
94         smax            v4.16B,   v4.16B, v23.16B
95         uxtl            v22.8H,   v0.8B
96         uxtl2           v24.8H,   v0.16B
97         saddw           v28.8H,  v28.8H,  v4.8B
98         saddw2          v21.8H,  v21.8H,  v4.16B
99         ssubw           v22.8H,  v22.8H,  v4.8B
100         ssubw2          v24.8H,  v24.8H,  v4.16B
101         sqxtun          v16.8B,  v28.8H
102         sqxtun2         v16.16B, v21.8H
103         sqxtun          v0.8B,   v22.8H
104         sqxtun2         v0.16B,  v24.8H
105 .endm
106
107 function ff_h264_v_loop_filter_luma_neon, export=1
108         h264_loop_filter_start
109         sxtw            x1,  w1
110
111         ld1             {v0.16B},  [x0], x1
112         ld1             {v2.16B},  [x0], x1
113         ld1             {v4.16B},  [x0], x1
114         sub             x0,  x0,  x1, lsl #2
115         sub             x0,  x0,  x1, lsl #1
116         ld1             {v20.16B},  [x0], x1
117         ld1             {v18.16B},  [x0], x1
118         ld1             {v16.16B},  [x0], x1
119
120         h264_loop_filter_luma
121
122         sub             x0,  x0,  x1, lsl #1
123         st1             {v17.16B},  [x0], x1
124         st1             {v16.16B}, [x0], x1
125         st1             {v0.16B},  [x0], x1
126         st1             {v19.16B}, [x0]
127
128         ret
129 endfunc
130
131 function ff_h264_h_loop_filter_luma_neon, export=1
132         h264_loop_filter_start
133
134         sub             x0,  x0,  #4
135         ld1             {v6.8B},  [x0], x1
136         ld1             {v20.8B}, [x0], x1
137         ld1             {v18.8B}, [x0], x1
138         ld1             {v16.8B}, [x0], x1
139         ld1             {v0.8B},  [x0], x1
140         ld1             {v2.8B},  [x0], x1
141         ld1             {v4.8B},  [x0], x1
142         ld1             {v26.8B}, [x0], x1
143         ld1             {v6.D}[1],  [x0], x1
144         ld1             {v20.D}[1], [x0], x1
145         ld1             {v18.D}[1], [x0], x1
146         ld1             {v16.D}[1], [x0], x1
147         ld1             {v0.D}[1],  [x0], x1
148         ld1             {v2.D}[1],  [x0], x1
149         ld1             {v4.D}[1],  [x0], x1
150         ld1             {v26.D}[1], [x0], x1
151
152         transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
153
154         h264_loop_filter_luma
155
156         transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
157
158         sub             x0,  x0,  x1, lsl #4
159         add             x0,  x0,  #2
160         st1             {v17.S}[0],  [x0], x1
161         st1             {v16.S}[0], [x0], x1
162         st1             {v0.S}[0],  [x0], x1
163         st1             {v19.S}[0], [x0], x1
164         st1             {v17.S}[1],  [x0], x1
165         st1             {v16.S}[1], [x0], x1
166         st1             {v0.S}[1],  [x0], x1
167         st1             {v19.S}[1], [x0], x1
168         st1             {v17.S}[2],  [x0], x1
169         st1             {v16.S}[2], [x0], x1
170         st1             {v0.S}[2],  [x0], x1
171         st1             {v19.S}[2], [x0], x1
172         st1             {v17.S}[3],  [x0], x1
173         st1             {v16.S}[3], [x0], x1
174         st1             {v0.S}[3],  [x0], x1
175         st1             {v19.S}[3], [x0], x1
176
177         ret
178 endfunc
179
180 .macro  h264_loop_filter_chroma
181         dup             v22.8B, w2              // alpha
182         uxtl            v24.8H, v24.8B
183         uabd            v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
184         uxtl            v4.8H,  v0.8B
185         uabd            v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
186         usubw           v4.8H,  v4.8H,  v16.8B
187         sli             v24.8H, v24.8H, #8
188         shl             v4.8H,  v4.8H,  #2
189         uabd            v30.8B, v2.8B,  v0.8B   // abs(q1 - q0)
190         uaddw           v4.8H,  v4.8H,  v18.8B
191         cmhi            v26.8B, v22.8B, v26.8B  // < alpha
192         usubw           v4.8H,  v4.8H,  v2.8B
193         dup             v22.8B, w3              // beta
194         rshrn           v4.8B,  v4.8H,  #3
195         cmhi            v28.8B, v22.8B, v28.8B  // < beta
196         cmhi            v30.8B, v22.8B, v30.8B  // < beta
197         smin            v4.8B,  v4.8B,  v24.8B
198         neg             v25.8B, v24.8B
199         and             v26.8B, v26.8B, v28.8B
200         smax            v4.8B,  v4.8B,  v25.8B
201         and             v26.8B, v26.8B, v30.8B
202         uxtl            v22.8H, v0.8B
203         and             v4.8B,  v4.8B,  v26.8B
204         uxtl            v28.8H, v16.8B
205         saddw           v28.8H, v28.8H, v4.8B
206         ssubw           v22.8H, v22.8H, v4.8B
207         sqxtun          v16.8B, v28.8H
208         sqxtun          v0.8B,  v22.8H
209 .endm
210
211 function ff_h264_v_loop_filter_chroma_neon, export=1
212         h264_loop_filter_start
213
214         sub             x0,  x0,  x1, lsl #1
215         ld1             {v18.8B}, [x0], x1
216         ld1             {v16.8B}, [x0], x1
217         ld1             {v0.8B},  [x0], x1
218         ld1             {v2.8B},  [x0]
219
220         h264_loop_filter_chroma
221
222         sub             x0,  x0,  x1, lsl #1
223         st1             {v16.8B}, [x0], x1
224         st1             {v0.8B},  [x0], x1
225
226         ret
227 endfunc
228
229 function ff_h264_h_loop_filter_chroma_neon, export=1
230         h264_loop_filter_start
231
232         sub             x0,  x0,  #2
233         ld1             {v18.S}[0], [x0], x1
234         ld1             {v16.S}[0], [x0], x1
235         ld1             {v0.S}[0],  [x0], x1
236         ld1             {v2.S}[0],  [x0], x1
237         ld1             {v18.S}[1], [x0], x1
238         ld1             {v16.S}[1], [x0], x1
239         ld1             {v0.S}[1],  [x0], x1
240         ld1             {v2.S}[1],  [x0], x1
241
242         transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
243
244         h264_loop_filter_chroma
245
246         transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
247
248         sub             x0,  x0,  x1, lsl #3
249         st1             {v18.S}[0], [x0], x1
250         st1             {v16.S}[0], [x0], x1
251         st1             {v0.S}[0],  [x0], x1
252         st1             {v2.S}[0],  [x0], x1
253         st1             {v18.S}[1], [x0], x1
254         st1             {v16.S}[1], [x0], x1
255         st1             {v0.S}[1],  [x0], x1
256         st1             {v2.S}[1],  [x0], x1
257
258         ret
259 endfunc
260
261 .macro  biweight_16     macs, macd
262         dup             v0.16B,  w5
263         dup             v1.16B,  w6
264         mov             v4.16B,  v16.16B
265         mov             v6.16B,  v16.16B
266 1:      subs            w3,  w3,  #2
267         ld1             {v20.16B}, [x0], x2
268         \macd           v4.8H,   v0.8B,  v20.8B
269         \macd\()2       v6.8H,   v0.16B, v20.16B
270         ld1             {v22.16B}, [x1], x2
271         \macs           v4.8H,   v1.8B,  v22.8B
272         \macs\()2       v6.8H,   v1.16B, v22.16B
273         mov             v24.16B, v16.16B
274         ld1             {v28.16B}, [x0], x2
275         mov             v26.16B, v16.16B
276         \macd           v24.8H,  v0.8B,  v28.8B
277         \macd\()2       v26.8H,  v0.16B, v28.16B
278         ld1             {v30.16B}, [x1], x2
279         \macs           v24.8H,  v1.8B,  v30.8B
280         \macs\()2       v26.8H,  v1.16B, v30.16B
281         sshl            v4.8H,   v4.8H,  v18.8H
282         sshl            v6.8H,   v6.8H,  v18.8H
283         sqxtun          v4.8B,   v4.8H
284         sqxtun2         v4.16B,  v6.8H
285         sshl            v24.8H,  v24.8H, v18.8H
286         sshl            v26.8H,  v26.8H, v18.8H
287         sqxtun          v24.8B,  v24.8H
288         sqxtun2         v24.16B, v26.8H
289         mov             v6.16B,  v16.16B
290         st1             {v4.16B},  [x7], x2
291         mov             v4.16B,  v16.16B
292         st1             {v24.16B}, [x7], x2
293         b.ne            1b
294         ret
295 .endm
296
297 .macro  biweight_8      macs, macd
298         dup             v0.8B,  w5
299         dup             v1.8B,  w6
300         mov             v2.16B,  v16.16B
301         mov             v20.16B, v16.16B
302 1:      subs            w3,  w3,  #2
303         ld1             {v4.8B}, [x0], x2
304         \macd           v2.8H,  v0.8B,  v4.8B
305         ld1             {v5.8B}, [x1], x2
306         \macs           v2.8H,  v1.8B,  v5.8B
307         ld1             {v6.8B}, [x0], x2
308         \macd           v20.8H, v0.8B,  v6.8B
309         ld1             {v7.8B}, [x1], x2
310         \macs           v20.8H, v1.8B,  v7.8B
311         sshl            v2.8H,  v2.8H,  v18.8H
312         sqxtun          v2.8B,  v2.8H
313         sshl            v20.8H, v20.8H, v18.8H
314         sqxtun          v4.8B,  v20.8H
315         mov             v20.16B, v16.16B
316         st1             {v2.8B}, [x7], x2
317         mov             v2.16B,  v16.16B
318         st1             {v4.8B}, [x7], x2
319         b.ne            1b
320         ret
321 .endm
322
323 .macro  biweight_4      macs, macd
324         dup             v0.8B,  w5
325         dup             v1.8B,  w6
326         mov             v2.16B, v16.16B
327         mov             v20.16B,v16.16B
328 1:      subs            w3,  w3,  #4
329         ld1             {v4.S}[0], [x0], x2
330         ld1             {v4.S}[1], [x0], x2
331         \macd           v2.8H,  v0.8B,  v4.8B
332         ld1             {v5.S}[0], [x1], x2
333         ld1             {v5.S}[1], [x1], x2
334         \macs           v2.8H,  v1.8B,  v5.8B
335         b.lt            2f
336         ld1             {v6.S}[0], [x0], x2
337         ld1             {v6.S}[1], [x0], x2
338         \macd           v20.8H, v0.8B,  v6.8B
339         ld1             {v7.S}[0], [x1], x2
340         ld1             {v7.S}[1], [x1], x2
341         \macs           v20.8H, v1.8B,  v7.8B
342         sshl            v2.8H,  v2.8H,  v18.8H
343         sqxtun          v2.8B,  v2.8H
344         sshl            v20.8H, v20.8H, v18.8H
345         sqxtun          v4.8B,  v20.8H
346         mov             v20.16B, v16.16B
347         st1             {v2.S}[0], [x7], x2
348         st1             {v2.S}[1], [x7], x2
349         mov             v2.16B,  v16.16B
350         st1             {v4.S}[0], [x7], x2
351         st1             {v4.S}[1], [x7], x2
352         b.ne            1b
353         ret
354 2:      sshl            v2.8H,  v2.8H,  v18.8H
355         sqxtun          v2.8B,  v2.8H
356         st1             {v2.S}[0], [x7], x2
357         st1             {v2.S}[1], [x7], x2
358         ret
359 .endm
360
361 .macro  biweight_func   w
362 function ff_biweight_h264_pixels_\w\()_neon, export=1
363         sxtw            x2,  w2
364         lsr             w8,  w5,  #31
365         add             w7,  w7,  #1
366         eor             w8,  w8,  w6,  lsr #30
367         orr             w7,  w7,  #1
368         dup             v18.8H,   w4
369         lsl             w7,  w7,  w4
370         not             v18.16B,  v18.16B
371         dup             v16.8H,   w7
372         mov             x7,  x0
373         cbz             w8,  10f
374         subs            w8,  w8,  #1
375         b.eq            20f
376         subs            w8,  w8,  #1
377         b.eq            30f
378         b               40f
379 10:     biweight_\w     umlal, umlal
380 20:     neg             w5, w5
381         biweight_\w     umlal, umlsl
382 30:     neg             w5, w5
383         neg             w6, w6
384         biweight_\w     umlsl, umlsl
385 40:     neg             w6, w6
386         biweight_\w     umlsl, umlal
387 endfunc
388 .endm
389
390         biweight_func   16
391         biweight_func   8
392         biweight_func   4
393
394 .macro  weight_16       add
395         dup             v0.16B,  w4
396 1:      subs            w2,  w2,  #2
397         ld1             {v20.16B}, [x0], x1
398         umull           v4.8H,   v0.8B,  v20.8B
399         umull2          v6.8H,   v0.16B, v20.16B
400         ld1             {v28.16B}, [x0], x1
401         umull           v24.8H,  v0.8B,  v28.8B
402         umull2          v26.8H,  v0.16B, v28.16B
403         \add            v4.8H,   v16.8H, v4.8H
404         srshl           v4.8H,   v4.8H,  v18.8H
405         \add            v6.8H,   v16.8H, v6.8H
406         srshl           v6.8H,   v6.8H,  v18.8H
407         sqxtun          v4.8B,   v4.8H
408         sqxtun2         v4.16B,  v6.8H
409         \add            v24.8H,  v16.8H, v24.8H
410         srshl           v24.8H,  v24.8H, v18.8H
411         \add            v26.8H,  v16.8H, v26.8H
412         srshl           v26.8H,  v26.8H, v18.8H
413         sqxtun          v24.8B,  v24.8H
414         sqxtun2         v24.16B, v26.8H
415         st1             {v4.16B},  [x5], x1
416         st1             {v24.16B}, [x5], x1
417         b.ne            1b
418         ret
419 .endm
420
421 .macro  weight_8        add
422         dup             v0.8B,  w4
423 1:      subs            w2,  w2,  #2
424         ld1             {v4.8B}, [x0], x1
425         umull           v2.8H,  v0.8B,  v4.8B
426         ld1             {v6.8B}, [x0], x1
427         umull           v20.8H, v0.8B,  v6.8B
428         \add            v2.8H,  v16.8H,  v2.8H
429         srshl           v2.8H,  v2.8H,  v18.8H
430         sqxtun          v2.8B,  v2.8H
431         \add            v20.8H, v16.8H,  v20.8H
432         srshl           v20.8H, v20.8H, v18.8H
433         sqxtun          v4.8B,  v20.8H
434         st1             {v2.8B}, [x5], x1
435         st1             {v4.8B}, [x5], x1
436         b.ne            1b
437         ret
438 .endm
439
440 .macro  weight_4        add
441         dup             v0.8B,  w4
442 1:      subs            w2,  w2,  #4
443         ld1             {v4.S}[0], [x0], x1
444         ld1             {v4.S}[1], [x0], x1
445         umull           v2.8H,  v0.8B,  v4.8B
446         b.lt            2f
447         ld1             {v6.S}[0], [x0], x1
448         ld1             {v6.S}[1], [x0], x1
449         umull           v20.8H, v0.8B,  v6.8B
450         \add            v2.8H,  v16.8H,  v2.8H
451         srshl           v2.8H,  v2.8H,  v18.8H
452         sqxtun          v2.8B,  v2.8H
453         \add            v20.8H, v16.8H,  v20.8H
454         srshl           v20.8H, v20.8h, v18.8H
455         sqxtun          v4.8B,  v20.8H
456         st1             {v2.S}[0], [x5], x1
457         st1             {v2.S}[1], [x5], x1
458         st1             {v4.S}[0], [x5], x1
459         st1             {v4.S}[1], [x5], x1
460         b.ne            1b
461         ret
462 2:      \add            v2.8H,  v16.8H,  v2.8H
463         srshl           v2.8H,  v2.8H,  v18.8H
464         sqxtun          v2.8B,  v2.8H
465         st1             {v2.S}[0], [x5], x1
466         st1             {v2.S}[1], [x5], x1
467         ret
468 .endm
469
470 .macro  weight_func     w
471 function ff_weight_h264_pixels_\w\()_neon, export=1
472         sxtw            x1,  w1
473         cmp             w3,  #1
474         mov             w6,  #1
475         lsl             w5,  w5,  w3
476         dup             v16.8H,  w5
477         mov             x5,  x0
478         b.le            20f
479         sub             w6,  w6,  w3
480         dup             v18.8H,  w6
481         cmp             w4, #0
482         b.lt            10f
483         weight_\w       shadd
484 10:     neg             w4,  w4
485         weight_\w       shsub
486 20:     neg             w6,  w3
487         dup             v18.8H,  w6
488         cmp             w4,  #0
489         b.lt            10f
490         weight_\w       add
491 10:     neg             w4,  w4
492         weight_\w       sub
493 endfunc
494 .endm
495
496         weight_func     16
497         weight_func     8
498         weight_func     4