/*
 * ARM NEON optimised IDCT functions for HEVC decoding
 * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 * Copyright (c) 2017 Alexandra Hájková
 *
 * Ported from arm/hevcdsp_idct_neon.S by
 * Copyright (c) 2020 Reimar Döffinger
 * Copyright (c) 2020 Josh Dekker
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

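// HEVC transform coefficient constants: the first eight values (loaded as v0)
// feed the 4- and 8-point butterflies below, the next eight (loaded as v1)
// are the odd-part factors of the 16-point transform, and the remaining rows
// hold the odd-part factors of the larger transforms.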
const trans, align=4
        .short 64, 83, 64, 36
        .short 89, 75, 50, 18
        .short 90, 87, 80, 70
        .short 57, 43, 25, 9
        .short 90, 90, 88, 85
        .short 82, 78, 73, 67
        .short 61, 54, 46, 38
        .short 31, 22, 13, 4
endconst

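// clamp the 16-bit lanes of \in1 and \in2 to the range [\c1, \c2]
// (for 10 bit content: [0, 0x3FF])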
.macro clip10 in1, in2, c1, c2
        smax        \in1, \in1, \c1
        smax        \in2, \in2, \c1
        smin        \in1, \in1, \c2
        smin        \in2, \in2, \c2
.endm

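// ff_hevc_add_residual_NxN_<depth>_neon:
//   x0: destination pixels, x1: int16_t residual coefficients, x2: dst stride
// Each function adds the residual block to the destination pixels, saturating
// to the pixel range of the given bit depth.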
function ff_hevc_add_residual_4x4_8_neon, export=1
        ld1             {v0.8h-v1.8h}, [x1]
        ld1             {v2.s}[0], [x0], x2
        ld1             {v2.s}[1], [x0], x2
        ld1             {v2.s}[2], [x0], x2
        ld1             {v2.s}[3], [x0], x2
        sub              x0,  x0,  x2, lsl #2
        uxtl             v6.8h,  v2.8b
        uxtl2            v7.8h,  v2.16b
        sqadd            v0.8h,  v0.8h, v6.8h
        sqadd            v1.8h,  v1.8h, v7.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2          v0.16b, v1.8h
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v0.s}[2], [x0], x2
        st1             {v0.s}[3], [x0], x2
        ret
endfunc

function ff_hevc_add_residual_4x4_10_neon, export=1
        mov             x12,  x0
        ld1             {v0.8h-v1.8h}, [x1]
        ld1             {v2.d}[0], [x12], x2
        ld1             {v2.d}[1], [x12], x2
        ld1             {v3.d}[0], [x12], x2
        sqadd            v0.8h, v0.8h, v2.8h
        ld1             {v3.d}[1], [x12], x2
        movi             v4.8h, #0
        sqadd            v1.8h, v1.8h, v3.8h
        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
        clip10           v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.d}[0],  [x0], x2
        st1             {v0.d}[1],  [x0], x2
        st1             {v1.d}[0],  [x0], x2
        st1             {v1.d}[1],  [x0], x2
        ret
endfunc

function ff_hevc_add_residual_8x8_8_neon, export=1
        add             x12,  x0, x2
        add              x2,  x2, x2
        mov              x3,  #8
1:      subs             x3,  x3, #2
        ld1             {v2.d}[0],     [x0]
        ld1             {v2.d}[1],    [x12]
        uxtl             v3.8h,  v2.8b
        ld1             {v0.8h-v1.8h}, [x1], #32
        uxtl2            v2.8h,  v2.16b
        sqadd            v0.8h,  v0.8h,   v3.8h
        sqadd            v1.8h,  v1.8h,   v2.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2          v0.16b, v1.8h
        st1             {v0.d}[0],     [x0], x2
        st1             {v0.d}[1],    [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_8x8_10_neon, export=1
        add             x12,  x0, x2
        add              x2,  x2, x2
        mov              x3,  #8
        movi             v4.8h, #0
        mvni             v5.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs             x3,  x3, #2
        ld1             {v0.8h-v1.8h}, [x1], #32
        ld1             {v2.8h},       [x0]
        sqadd            v0.8h, v0.8h, v2.8h
        ld1             {v3.8h},      [x12]
        sqadd            v1.8h, v1.8h, v3.8h
        clip10           v0.8h, v1.8h, v4.8h, v5.8h
        st1             {v0.8h},       [x0], x2
        st1             {v1.8h},      [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_16x16_8_neon, export=1
        mov              x3,  #16
        add             x12, x0, x2
        add              x2,  x2, x2
1:      subs             x3,  x3, #2
        ld1             {v16.16b},     [x0]
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v19.16b},    [x12]
        uxtl            v17.8h, v16.8b
        uxtl2           v18.8h, v16.16b
        uxtl            v20.8h, v19.8b
        uxtl2           v21.8h, v19.16b
        sqadd            v0.8h,  v0.8h, v17.8h
        sqadd            v1.8h,  v1.8h, v18.8h
        sqadd            v2.8h,  v2.8h, v20.8h
        sqadd            v3.8h,  v3.8h, v21.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2         v0.16b,  v1.8h
        sqxtun           v1.8b,  v2.8h
        sqxtun2         v1.16b,  v3.8h
        st1             {v0.16b},     [x0], x2
        st1             {v1.16b},    [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_16x16_10_neon, export=1
        mov              x3,  #16
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
        add             x12,  x0, x2
        add              x2,  x2, x2
1:      subs             x3,  x3, #2
        ld1             {v16.8h-v17.8h}, [x0]
        ld1             {v0.8h-v3.8h},  [x1], #64
        sqadd            v0.8h, v0.8h, v16.8h
        ld1             {v18.8h-v19.8h}, [x12]
        sqadd            v1.8h, v1.8h, v17.8h
        sqadd            v2.8h, v2.8h, v18.8h
        sqadd            v3.8h, v3.8h, v19.8h
        clip10           v0.8h, v1.8h, v20.8h, v21.8h
        clip10           v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v1.8h},   [x0], x2
        st1             {v2.8h-v3.8h},  [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_32x32_8_neon, export=1
        add             x12,  x0, x2
        add              x2,  x2, x2
        mov              x3,  #32
1:      subs             x3,  x3, #2
        ld1             {v20.16b, v21.16b}, [x0]
        uxtl            v16.8h,  v20.8b
        uxtl2           v17.8h,  v20.16b
        ld1             {v22.16b, v23.16b}, [x12]
        uxtl            v18.8h,  v21.8b
        uxtl2           v19.8h,  v21.16b
        uxtl            v20.8h,  v22.8b
        ld1             {v0.8h-v3.8h}, [x1], #64
        ld1             {v4.8h-v7.8h}, [x1], #64
        uxtl2           v21.8h,  v22.16b
        uxtl            v22.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
        sqadd            v0.8h,  v0.8h,  v16.8h
        sqadd            v1.8h,  v1.8h,  v17.8h
        sqadd            v2.8h,  v2.8h,  v18.8h
        sqadd            v3.8h,  v3.8h,  v19.8h
        sqadd            v4.8h,  v4.8h,  v20.8h
        sqadd            v5.8h,  v5.8h,  v21.8h
        sqadd            v6.8h,  v6.8h,  v22.8h
        sqadd            v7.8h,  v7.8h,  v23.8h
        sqxtun           v0.8b,  v0.8h
        sqxtun2         v0.16b,  v1.8h
        sqxtun           v1.8b,  v2.8h
        sqxtun2         v1.16b,  v3.8h
        sqxtun           v2.8b,  v4.8h
        sqxtun2         v2.16b,  v5.8h
        st1             {v0.16b, v1.16b},  [x0], x2
        sqxtun           v3.8b,  v6.8h
        sqxtun2         v3.16b,  v7.8h
        st1             {v2.16b, v3.16b}, [x12], x2
        bne              1b
        ret
endfunc

function ff_hevc_add_residual_32x32_10_neon, export=1
        mov              x3,  #32
        movi            v20.8h, #0
        mvni            v21.8h, #0xFC, lsl #8 // movi #0x3FF
1:      subs             x3,  x3, #1
        ld1             {v0.8h-v3.8h},   [x1], #64
        ld1             {v16.8h-v19.8h}, [x0]
        sqadd            v0.8h, v0.8h, v16.8h
        sqadd            v1.8h, v1.8h, v17.8h
        sqadd            v2.8h, v2.8h, v18.8h
        sqadd            v3.8h, v3.8h, v19.8h
        clip10           v0.8h, v1.8h, v20.8h, v21.8h
        clip10           v2.8h, v3.8h, v20.8h, v21.8h
        st1             {v0.8h-v3.8h},   [x0], x2
        bne              1b
        ret
endfunc

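// multiply-accumulate helper: \out += \in * \c when \op is "+",
// \out -= \in * \c otherwise; \p selects the smlal2/smlsl2 variants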
.macro sum_sub out, in, c, op, p
  .ifc \op, +
        smlal\p         \out, \in, \c
  .else
        smlsl\p         \out, \in, \c
  .endif
.endm

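// saturating rounding narrow of \n from 32 to 16 bit: the result goes into
// the upper half of \d when \dt is .8h, otherwise into its lower 64 bits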
.macro fixsqrshrn d, dt, n, m
  .ifc \dt, .8h
        sqrshrn2        \d\dt, \n\().4s, \m
  .else
        sqrshrn         \n\().4h, \n\().4s, \m
        mov             \d\().d[0], \n\().d[0]
  .endif
.endm

// uses and clobbers v28-v31 as temp registers
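// partial 4-point transform: even part e0/e1 from \in0/\in2, odd part o0/o1
// from \in1/\in3, followed by the output butterfly (no shift is applied)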
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
         sshll\p1       v28.4s, \in0, #6
         mov            v29.16b, v28.16b
         smull\p1       v30.4s, \in1, v0.h[1]
         smull\p1       v31.4s, \in1, v0.h[3]
         smlal\p2       v28.4s, \in2, v0.h[0] //e0
         smlsl\p2       v29.4s, \in2, v0.h[0] //e1
         smlal\p2       v30.4s, \in3, v0.h[3] //o0
         smlsl\p2       v31.4s, \in3, v0.h[1] //o1

         add            \out0, v28.4s, v30.4s
         add            \out1, v29.4s, v31.4s
         sub            \out2, v29.4s, v31.4s
         sub            \out3, v28.4s, v30.4s
.endm

.macro transpose8_4x4 r0, r1, r2, r3
        trn1            v2.8h, \r0\().8h, \r1\().8h
        trn2            v3.8h, \r0\().8h, \r1\().8h
        trn1            v4.8h, \r2\().8h, \r3\().8h
        trn2            v5.8h, \r2\().8h, \r3\().8h
        trn1            \r0\().4s, v2.4s, v4.4s
        trn2            \r2\().4s, v2.4s, v4.4s
        trn1            \r1\().4s, v3.4s, v5.4s
        trn2            \r3\().4s, v3.4s, v5.4s
.endm

.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
        transpose8_4x4  \r0, \r1, \r2, \r3
        transpose8_4x4  \r4, \r5, \r6, \r7
.endm

.macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, in5,in5t, in6,in6t, in7,in7t, p1, p2
        tr_4x4_8        \in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4s, v25.4s, v26.4s, v27.4s, \p1, \p2

        smull\p1        v30.4s, \in1\in1t, v0.h[6]
        smull\p1        v28.4s, \in1\in1t, v0.h[4]
        smull\p1        v29.4s, \in1\in1t, v0.h[5]
        sum_sub         v30.4s, \in3\in3t, v0.h[4], -, \p1
        sum_sub         v28.4s, \in3\in3t, v0.h[5], +, \p1
        sum_sub         v29.4s, \in3\in3t, v0.h[7], -, \p1

        sum_sub         v30.4s, \in5\in5t, v0.h[7], +, \p2
        sum_sub         v28.4s, \in5\in5t, v0.h[6], +, \p2
        sum_sub         v29.4s, \in5\in5t, v0.h[4], -, \p2

        sum_sub         v30.4s, \in7\in7t, v0.h[5], +, \p2
        sum_sub         v28.4s, \in7\in7t, v0.h[7], +, \p2
        sum_sub         v29.4s, \in7\in7t, v0.h[6], -, \p2

        add             v31.4s, v26.4s, v30.4s
        sub             v26.4s, v26.4s, v30.4s
        fixsqrshrn      \in2,\in2t, v31, \shift


        smull\p1        v31.4s, \in1\in1t, v0.h[7]
        sum_sub         v31.4s, \in3\in3t, v0.h[6], -, \p1
        sum_sub         v31.4s, \in5\in5t, v0.h[5], +, \p2
        sum_sub         v31.4s, \in7\in7t, v0.h[4], -, \p2
        fixsqrshrn      \in5,\in5t, v26, \shift


        add             v26.4s, v24.4s, v28.4s
        sub             v24.4s, v24.4s, v28.4s
        add             v28.4s, v25.4s, v29.4s
        sub             v25.4s, v25.4s, v29.4s
        add             v30.4s, v27.4s, v31.4s
        sub             v27.4s, v27.4s, v31.4s

        fixsqrshrn      \in0,\in0t, v26, \shift
        fixsqrshrn      \in7,\in7t, v24, \shift
        fixsqrshrn      \in1,\in1t, v28, \shift
        fixsqrshrn      \in6,\in6t, v25, \shift
        fixsqrshrn      \in3,\in3t, v30, \shift
        fixsqrshrn      \in4,\in4t, v27, \shift
.endm

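// in-place 8x8 IDCT on the coefficient block in x0: a first pass of tr_8x4
// with shift 7, a transpose, a second pass with shift 20 - \bitdepth, and a
// final transpose back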
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
//x0 - coeffs
        mov              x1,  x0
        ld1             {v16.8h-v19.8h}, [x1], #64
        ld1             {v20.8h-v23.8h}, [x1]

        movrel           x1, trans
        ld1             {v0.8h}, [x1]

        tr_8x4          7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
        tr_8x4          7, v16,.8h, v17,.8h, v18,.8h, v19,.8h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, 2, 2

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

        tr_8x4          20 - \bitdepth, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v16,.8h, v17,.8h, v18,.8h, v19,.8h, , 2
        tr_8x4          20 - \bitdepth, v20,.4h, v21,.4h, v22,.4h, v23,.4h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, , 2

        transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23

        mov              x1,  x0
        st1             {v16.8h-v19.8h}, [x1], #64
        st1             {v20.8h-v23.8h}, [x1]

        ret
endfunc
.endm

.macro butterfly e, o, tmp_p, tmp_m
        add        \tmp_p, \e, \o
        sub        \tmp_m, \e, \o
.endm

.macro tr16_8x4 in0, in1, in2, in3, offset
        tr_4x4_8        \in0\().4h, \in1\().4h, \in2\().4h, \in3\().4h, v24.4s, v25.4s, v26.4s, v27.4s

        smull2          v28.4s, \in0\().8h, v0.h[4]
        smull2          v29.4s, \in0\().8h, v0.h[5]
        smull2          v30.4s, \in0\().8h, v0.h[6]
        smull2          v31.4s, \in0\().8h, v0.h[7]
        sum_sub         v28.4s, \in1\().8h, v0.h[5], +, 2
        sum_sub         v29.4s, \in1\().8h, v0.h[7], -, 2
        sum_sub         v30.4s, \in1\().8h, v0.h[4], -, 2
        sum_sub         v31.4s, \in1\().8h, v0.h[6], -, 2

        sum_sub         v28.4s, \in2\().8h, v0.h[6], +, 2
        sum_sub         v29.4s, \in2\().8h, v0.h[4], -, 2
        sum_sub         v30.4s, \in2\().8h, v0.h[7], +, 2
        sum_sub         v31.4s, \in2\().8h, v0.h[5], +, 2

        sum_sub         v28.4s, \in3\().8h, v0.h[7], +, 2
        sum_sub         v29.4s, \in3\().8h, v0.h[6], -, 2
        sum_sub         v30.4s, \in3\().8h, v0.h[5], +, 2
        sum_sub         v31.4s, \in3\().8h, v0.h[4], -, 2

        butterfly       v24.4s, v28.4s, v16.4s, v23.4s
        butterfly       v25.4s, v29.4s, v17.4s, v22.4s
        butterfly       v26.4s, v30.4s, v18.4s, v21.4s
        butterfly       v27.4s, v31.4s, v19.4s, v20.4s
        add              x4,  sp,  #\offset
        st1             {v16.4s-v19.4s}, [x4], #64
        st1             {v20.4s-v23.4s}, [x4]
.endm

.macro load16 in0, in1, in2, in3
        ld1             {\in0}[0], [x1], x2
        ld1             {\in0}[1], [x3], x2
        ld1             {\in1}[0], [x1], x2
        ld1             {\in1}[1], [x3], x2
        ld1             {\in2}[0], [x1], x2
        ld1             {\in2}[1], [x3], x2
        ld1             {\in3}[0], [x1], x2
        ld1             {\in3}[1], [x3], x2
.endm

.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
        sum_sub v21.4s, \in, \t0, \op0, \p
        sum_sub v22.4s, \in, \t1, \op1, \p
        sum_sub v23.4s, \in, \t2, \op2, \p
        sum_sub v24.4s, \in, \t3, \op3, \p
        sum_sub v25.4s, \in, \t4, \op4, \p
        sum_sub v26.4s, \in, \t5, \op5, \p
        sum_sub v27.4s, \in, \t6, \op6, \p
        sum_sub v28.4s, \in, \t7, \op7, \p
.endm

.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
        add             v20.4s, \in0, \in1
        sub             \in0, \in0, \in1
        add             \in1, \in2, \in3
        sub             \in2, \in2, \in3
        add             \in3, \in4, \in5
        sub             \in4, \in4, \in5
        add             \in5, \in6, \in7
        sub             \in6, \in6, \in7
.endm

.macro store16 in0, in1, in2, in3, rx
        st1             {\in0}[0], [x1], x2
        st1             {\in0}[1], [x3], \rx
        st1             {\in1}[0], [x1], x2
        st1             {\in1}[1], [x3], \rx
        st1             {\in2}[0], [x1], x2
        st1             {\in2}[1], [x3], \rx
        st1             {\in3}[0], [x1], x2
        st1             {\in3}[1], [x3], \rx
.endm

.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
        sqrshrn         \out0\().4h, \in0, \shift
        sqrshrn2        \out0\().8h, \in1, \shift
        sqrshrn         \out1\().4h, \in2, \shift
        sqrshrn2        \out1\().8h, \in3, \shift
        sqrshrn         \out2\().4h, \in4, \shift
        sqrshrn2        \out2\().8h, \in5, \shift
        sqrshrn         \out3\().4h, \in6, \shift
        sqrshrn2        \out3\().8h, \in7, \shift
.endm

.macro transpose16_4x4_2 r0, r1, r2, r3
        // lower halves
        trn1            v2.4h, \r0\().4h, \r1\().4h
        trn2            v3.4h, \r0\().4h, \r1\().4h
        trn1            v4.4h, \r2\().4h, \r3\().4h
        trn2            v5.4h, \r2\().4h, \r3\().4h
        trn1            v6.2s, v2.2s, v4.2s
        trn2            v7.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v4.2s, v3.2s, v5.2s
        mov             \r0\().d[0], v6.d[0]
        mov             \r2\().d[0], v7.d[0]
        mov             \r1\().d[0], v2.d[0]
        mov             \r3\().d[0], v4.d[0]

        // upper halves in reverse order
        trn1            v2.8h, \r3\().8h, \r2\().8h
        trn2            v3.8h, \r3\().8h, \r2\().8h
        trn1            v4.8h, \r1\().8h, \r0\().8h
        trn2            v5.8h, \r1\().8h, \r0\().8h
        trn1            v6.4s, v2.4s, v4.4s
        trn2            v7.4s, v2.4s, v4.4s
        trn1            v2.4s, v3.4s, v5.4s
        trn2            v4.4s, v3.4s, v5.4s
        mov             \r3\().d[1], v6.d[1]
        mov             \r1\().d[1], v7.d[1]
        mov             \r2\().d[1], v2.d[1]
        mov             \r0\().d[1], v4.d[1]
.endm

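// one pass of the 16-point transform over 4 columns (source in x5,
// destination in x6); instantiated below with shift 7 for the first pass and
// shift 20 - bitdepth for the per-bitdepth second passes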
.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
        mov              x1,  x5
        add              x3,  x5, #(\step * 64)
        mov              x2,  #(\step * 128)
        load16          v16.d, v17.d, v18.d, v19.d
        movrel           x1,  trans
        ld1             {v0.8h}, [x1]

        tr16_8x4        v16, v17, v18, v19, \offset

        add              x1,  x5, #(\step * 32)
        add              x3,  x5, #(\step * 3 *32)
        mov              x2,  #(\step * 128)
        load16          v20.d, v17.d, v18.d, v19.d
        movrel           x1, trans, 16
        ld1             {v1.8h}, [x1]
        smull           v21.4s, v20.4h, v1.h[0]
        smull           v22.4s, v20.4h, v1.h[1]
        smull           v23.4s, v20.4h, v1.h[2]
        smull           v24.4s, v20.4h, v1.h[3]
        smull           v25.4s, v20.4h, v1.h[4]
        smull           v26.4s, v20.4h, v1.h[5]
        smull           v27.4s, v20.4h, v1.h[6]
        smull           v28.4s, v20.4h, v1.h[7]

        add_member      v20.8h, v1.h[1], v1.h[4], v1.h[7], v1.h[5], v1.h[2], v1.h[0], v1.h[3], v1.h[6], +, +, +, -, -, -, -, -, 2
        add_member      v17.4h, v1.h[2], v1.h[7], v1.h[3], v1.h[1], v1.h[6], v1.h[4], v1.h[0], v1.h[5], +, +, -, -, -, +, +, +
        add_member      v17.8h, v1.h[3], v1.h[5], v1.h[1], v1.h[7], v1.h[0], v1.h[6], v1.h[2], v1.h[4], +, -, -, +, +, +, -, -, 2
        add_member      v18.4h, v1.h[4], v1.h[2], v1.h[6], v1.h[0], v1.h[7], v1.h[1], v1.h[5], v1.h[3], +, -, -, +, -, -, +, +
        add_member      v18.8h, v1.h[5], v1.h[0], v1.h[4], v1.h[6], v1.h[1], v1.h[3], v1.h[7], v1.h[2], +, -, +, +, -, +, +, -, 2
        add_member      v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
        add_member      v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2

        add              x4, sp, #\offset
        ld1             {v16.4s-v19.4s}, [x4], #64

        butterfly16     v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
        scale           v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v24
        mov              x1,  x6
        add              x3,  x6, #(24 +3*32)
        mov              x2, #32
        mov              x4, #-32
        store16         v29.d, v30.d, v31.d, v24.d, x4

        add             x4, sp, #(\offset + 64)
        ld1             {v16.4s-v19.4s}, [x4]
        butterfly16     v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, v28.4s
        scale           v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
        transpose16_4x4_2 v29, v30, v31, v20

        add              x1,  x6, #8
        add              x3,  x6, #(16 + 3 * 32)
        mov              x2, #32
        mov              x4, #-32
        store16         v29.d, v30.d, v31.d, v20.d, x4

        ret
endfunc
.endm

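// 16x16 IDCT: the first pass transforms 4-column strips of the coefficient
// block into a 640-byte temporary buffer on the stack, the second pass
// transforms those strips back into the coefficient buffer in x0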
.macro idct_16x16 bitdepth
function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
//x0 - coeffs
        mov             x15, x30

        // allocate a temp buffer
        sub              sp,  sp,  #640

.irp i, 0, 1, 2, 3
        add              x5,  x0, #(8 * \i)
        add              x6,  sp, #(8 * \i * 16)
        bl              func_tr_16x4_firstpass
.endr

.irp i, 0, 1, 2, 3
        add              x5,  sp, #(8 * \i)
        add              x6,  x0, #(8 * \i * 16)
        bl              func_tr_16x4_secondpass_\bitdepth
.endr

        add              sp,  sp,  #640

        mov             x30, x15
        ret
endfunc
.endm

idct_8x8 8
idct_8x8 10

tr_16x4 firstpass, 7, 512, 1
tr_16x4 secondpass_8, 20 - 8, 512, 1
tr_16x4 secondpass_10, 20 - 10, 512, 1

idct_16x16 8
idct_16x16 10

// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
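// fills the whole NxN coefficient block with the DC value scaled as
// (dc + 1 + (1 << (14 - bitdepth))) >> (15 - bitdepth)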
.macro idct_dc size, bitdepth
function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
        movi          v1.8h,  #((1 << (14 - \bitdepth))+1)
        ld1r         {v4.8h}, [x0]
        add           v4.8h,  v4.8h,  v1.8h
        sshr          v0.8h,  v4.8h,  #(15 - \bitdepth)
        sshr          v1.8h,  v4.8h,  #(15 - \bitdepth)
.if \size > 4
        sshr          v2.8h,  v4.8h,  #(15 - \bitdepth)
        sshr          v3.8h,  v4.8h,  #(15 - \bitdepth)
.if \size > 16 /* dc 32x32 */
        mov              x2,  #4
1:
        subs             x2,  x2, #1
.endif
        add             x12,  x0, #64
        mov             x13,  #128
.if \size > 8 /* dc 16x16 */
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
.endif /* dc 8x8 */
        st1            {v0.8h-v3.8h},  [x0], x13
        st1            {v0.8h-v3.8h}, [x12], x13
.if \size > 16 /* dc 32x32 */
        bne             1b
.endif
.else /* dc 4x4 */
        st1            {v0.8h-v1.8h},  [x0]
.endif
        ret
endfunc
.endm

idct_dc 4, 8
idct_dc 4, 10

idct_dc 8, 8
idct_dc 8, 10

idct_dc 16, 8
idct_dc 16, 10

idct_dc 32, 8
idct_dc 32, 10