]> git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/vp9itxfm_16bpp_neon.S
aarch64: vp8: Remove superfluous includes
[ffmpeg] / libavcodec / aarch64 / vp9itxfm_16bpp_neon.S
1 /*
2  * Copyright (c) 2017 Google Inc.
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavutil/aarch64/asm.S"
22 #include "neon.S"
23
24 const itxfm4_coeffs, align=4
25         .short  11585, 0, 6270, 15137
26 iadst4_coeffs:
27         .short  5283, 15212, 9929, 13377
28 endconst
29
30 const iadst8_coeffs, align=4
31         .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
32 idct_coeffs:
33         .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
34         .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
35         .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
36         .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
37 endconst
38
39 const iadst16_coeffs, align=4
40         .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
41         .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
42 endconst
43
44 .macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
45         trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
46         trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
47         trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
48         trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
49         trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
50         trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
51         trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
52         trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
53 .endm
54
55 // Transpose a 8x8 matrix of 32 bit elements, where each row is spread out
56 // over two registers.
57 .macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
58         transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
59         transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3
60
61         // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
62         // while swapping the two 4x4 matrices between each other
63
64         // First step of the 4x4 transpose of r1-r7, into t0-t3
65         trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
66         trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
67         trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
68         trn2            \t3\().4s,  \r5\().4s,  \r7\().4s
69
70         // First step of the 4x4 transpose of r8-r12, into r1-r7
71         trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
72         trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
73         trn1            \r5\().4s,  \r12\().4s, \r14\().4s
74         trn2            \r7\().4s,  \r12\().4s, \r14\().4s
75
76         // Second step of the 4x4 transpose of r1-r7 (now in t0-r3), into r8-r12
77         trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
78         trn2            \r12\().2d, \t0\().2d,  \t2\().2d
79         trn1            \r10\().2d, \t1\().2d,  \t3\().2d
80         trn2            \r14\().2d, \t1\().2d,  \t3\().2d
81
82         // Second step of the 4x4 transpose of r8-r12 (now in r1-r7), in place as far as possible
83         trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
84         trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
85         trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
86         trn2            \r7\().2d,  \r3\().2d,  \r7\().2d
87
88         // Move the outputs of trn1 back in place
89         mov             \r1\().16b,  \t0\().16b
90         mov             \r3\().16b,  \t1\().16b
91 .endm
92
93 // out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
94 // out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
95 // in/out are .4s registers; this can do with 4 temp registers, but is
96 // more efficient if 6 temp registers are available.
97 .macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
98 .if \neg > 0
99         neg             \tmp4\().4s, v0.4s
100 .endif
101         add             \tmp1\().4s, \in1\().4s,  \in2\().4s
102         sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
103 .if \neg > 0
104         smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
105         smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
106 .else
107         smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
108         smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
109 .endif
110 .ifb \tmp5
111         rshrn           \out1\().2s, \tmp3\().2d, #14
112         rshrn2          \out1\().4s, \tmp4\().2d, #14
113         smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
114         smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
115         rshrn           \out2\().2s, \tmp3\().2d, #14
116         rshrn2          \out2\().4s, \tmp4\().2d, #14
117 .else
118         smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
119         smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
120         rshrn           \out1\().2s, \tmp3\().2d, #14
121         rshrn2          \out1\().4s, \tmp4\().2d, #14
122         rshrn           \out2\().2s, \tmp5\().2d, #14
123         rshrn2          \out2\().4s, \tmp6\().2d, #14
124 .endif
125 .endm
126
127 // Same as dmbutterfly0 above, but treating the input in in2 as zero,
128 // writing the same output into both out1 and out2.
129 .macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
130         smull           \tmp1\().2d, \in1\().2s,  v0.s[0]
131         smull2          \tmp2\().2d, \in1\().4s,  v0.s[0]
132         rshrn           \out1\().2s, \tmp1\().2d, #14
133         rshrn2          \out1\().4s, \tmp2\().2d, #14
134         rshrn           \out2\().2s, \tmp1\().2d, #14
135         rshrn2          \out2\().4s, \tmp2\().2d, #14
136 .endm
137
138 // out1,out2 = in1 * coef1 - in2 * coef2
139 // out3,out4 = in1 * coef2 + in2 * coef1
140 // out are 4 x .2d registers, in are 2 x .4s registers
141 .macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
142         smull           \out1\().2d, \in1\().2s, \coef1
143         smull2          \out2\().2d, \in1\().4s, \coef1
144         smull           \out3\().2d, \in1\().2s, \coef2
145         smull2          \out4\().2d, \in1\().4s, \coef2
146         smlsl           \out1\().2d, \in2\().2s, \coef2
147         smlsl2          \out2\().2d, \in2\().4s, \coef2
148         smlal           \out3\().2d, \in2\().2s, \coef1
149         smlal2          \out4\().2d, \in2\().4s, \coef1
150 .endm
151
152 // inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
153 // inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
154 // inout are 2 x .4s registers
155 .macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
156         dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
157 .if \neg > 0
158         neg             \tmp3\().2d, \tmp3\().2d
159         neg             \tmp4\().2d, \tmp4\().2d
160 .endif
161         rshrn           \inout1\().2s, \tmp1\().2d,  #14
162         rshrn2          \inout1\().4s, \tmp2\().2d,  #14
163         rshrn           \inout2\().2s, \tmp3\().2d,  #14
164         rshrn2          \inout2\().4s, \tmp4\().2d,  #14
165 .endm
166
167 // Same as dmbutterfly above, but treating the input in inout2 as zero
168 .macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
169         smull           \tmp1\().2d, \inout1\().2s, \coef1
170         smull2          \tmp2\().2d, \inout1\().4s, \coef1
171         smull           \tmp3\().2d, \inout1\().2s, \coef2
172         smull2          \tmp4\().2d, \inout1\().4s, \coef2
173         rshrn           \inout1\().2s, \tmp1\().2d, #14
174         rshrn2          \inout1\().4s, \tmp2\().2d, #14
175         rshrn           \inout2\().2s, \tmp3\().2d, #14
176         rshrn2          \inout2\().4s, \tmp4\().2d, #14
177 .endm
178
179 // Same as dmbutterfly above, but treating the input in inout1 as zero
180 .macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
181         smull           \tmp1\().2d, \inout2\().2s, \coef2
182         smull2          \tmp2\().2d, \inout2\().4s, \coef2
183         smull           \tmp3\().2d, \inout2\().2s, \coef1
184         smull2          \tmp4\().2d, \inout2\().4s, \coef1
185         neg             \tmp1\().2d, \tmp1\().2d
186         neg             \tmp2\().2d, \tmp2\().2d
187         rshrn           \inout2\().2s, \tmp3\().2d, #14
188         rshrn2          \inout2\().4s, \tmp4\().2d, #14
189         rshrn           \inout1\().2s, \tmp1\().2d, #14
190         rshrn2          \inout1\().4s, \tmp2\().2d, #14
191 .endm
192
193 .macro dsmull_h out1, out2, in, coef
194         smull           \out1\().2d, \in\().2s, \coef
195         smull2          \out2\().2d, \in\().4s, \coef
196 .endm
197
198 .macro drshrn_h out, in1, in2, shift
199         rshrn           \out\().2s, \in1\().2d, \shift
200         rshrn2          \out\().4s, \in2\().2d, \shift
201 .endm
202
203
204 // out1 = in1 + in2
205 // out2 = in1 - in2
206 .macro butterfly_4s out1, out2, in1, in2
207         add             \out1\().4s, \in1\().4s, \in2\().4s
208         sub             \out2\().4s, \in1\().4s, \in2\().4s
209 .endm
210
211 // out1 = in1 - in2
212 // out2 = in1 + in2
213 .macro butterfly_4s_r out1, out2, in1, in2
214         sub             \out1\().4s, \in1\().4s, \in2\().4s
215         add             \out2\().4s, \in1\().4s, \in2\().4s
216 .endm
217
218 // out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
219 // out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
220 // out are 2 x .4s registers, in are 4 x .2d registers
221 .macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
222         add             \tmp1\().2d, \in1\().2d, \in3\().2d
223         add             \tmp2\().2d, \in2\().2d, \in4\().2d
224         sub             \tmp3\().2d, \in1\().2d, \in3\().2d
225         sub             \tmp4\().2d, \in2\().2d, \in4\().2d
226         rshrn           \out1\().2s, \tmp1\().2d,  #14
227         rshrn2          \out1\().4s, \tmp2\().2d,  #14
228         rshrn           \out2\().2s, \tmp3\().2d,  #14
229         rshrn2          \out2\().4s, \tmp4\().2d,  #14
230 .endm
231
232 .macro iwht4_10 c0, c1, c2, c3
233         add             \c0\().4s, \c0\().4s, \c1\().4s
234         sub             v17.4s,    \c2\().4s, \c3\().4s
235         sub             v16.4s,    \c0\().4s, v17.4s
236         sshr            v16.4s,    v16.4s,    #1
237         sub             \c2\().4s, v16.4s,    \c1\().4s
238         sub             \c1\().4s, v16.4s,    \c3\().4s
239         add             \c3\().4s, v17.4s,    \c2\().4s
240         sub             \c0\().4s, \c0\().4s, \c1\().4s
241 .endm
242
243 .macro iwht4_12 c0, c1, c2, c3
244         iwht4_10        \c0, \c1, \c2, \c3
245 .endm
246
247 .macro idct4_10 c0, c1, c2, c3
248         mul             v22.4s,    \c1\().4s, v0.s[3]
249         mul             v20.4s,    \c1\().4s, v0.s[2]
250         add             v16.4s,    \c0\().4s, \c2\().4s
251         sub             v17.4s,    \c0\().4s, \c2\().4s
252         mla             v22.4s,    \c3\().4s, v0.s[2]
253         mul             v18.4s,    v16.4s,    v0.s[0]
254         mul             v24.4s,    v17.4s,    v0.s[0]
255         mls             v20.4s,    \c3\().4s, v0.s[3]
256         srshr           v22.4s,    v22.4s,    #14
257         srshr           v18.4s,    v18.4s,    #14
258         srshr           v24.4s,    v24.4s,    #14
259         srshr           v20.4s,    v20.4s,    #14
260         add             \c0\().4s, v18.4s,    v22.4s
261         sub             \c3\().4s, v18.4s,    v22.4s
262         add             \c1\().4s, v24.4s,    v20.4s
263         sub             \c2\().4s, v24.4s,    v20.4s
264 .endm
265
266 .macro idct4_12 c0, c1, c2, c3
267         smull           v22.2d,    \c1\().2s, v0.s[3]
268         smull2          v23.2d,    \c1\().4s, v0.s[3]
269         smull           v20.2d,    \c1\().2s, v0.s[2]
270         smull2          v21.2d,    \c1\().4s, v0.s[2]
271         add             v16.4s,    \c0\().4s, \c2\().4s
272         sub             v17.4s,    \c0\().4s, \c2\().4s
273         smlal           v22.2d,    \c3\().2s, v0.s[2]
274         smlal2          v23.2d,    \c3\().4s, v0.s[2]
275         smull           v18.2d,    v16.2s,    v0.s[0]
276         smull2          v19.2d,    v16.4s,    v0.s[0]
277         smull           v24.2d,    v17.2s,    v0.s[0]
278         smull2          v25.2d,    v17.4s,    v0.s[0]
279         smlsl           v20.2d,    \c3\().2s, v0.s[3]
280         smlsl2          v21.2d,    \c3\().4s, v0.s[3]
281         rshrn           v22.2s,    v22.2d,    #14
282         rshrn2          v22.4s,    v23.2d,    #14
283         rshrn           v18.2s,    v18.2d,    #14
284         rshrn2          v18.4s,    v19.2d,    #14
285         rshrn           v24.2s,    v24.2d,    #14
286         rshrn2          v24.4s,    v25.2d,    #14
287         rshrn           v20.2s,    v20.2d,    #14
288         rshrn2          v20.4s,    v21.2d,    #14
289         add             \c0\().4s, v18.4s,    v22.4s
290         sub             \c3\().4s, v18.4s,    v22.4s
291         add             \c1\().4s, v24.4s,    v20.4s
292         sub             \c2\().4s, v24.4s,    v20.4s
293 .endm
294
295 .macro iadst4_10 c0, c1, c2, c3
296         mul             v16.4s,    \c0\().4s, v1.s[0]
297         mla             v16.4s,    \c2\().4s, v1.s[1]
298         mla             v16.4s,    \c3\().4s, v1.s[2]
299         mul             v18.4s,    \c0\().4s, v1.s[2]
300         mls             v18.4s,    \c2\().4s, v1.s[0]
301         sub             \c0\().4s, \c0\().4s, \c2\().4s
302         mls             v18.4s,    \c3\().4s, v1.s[1]
303         add             \c0\().4s, \c0\().4s, \c3\().4s
304         mul             v22.4s,    \c1\().4s, v1.s[3]
305         mul             v20.4s,    \c0\().4s, v1.s[3]
306         add             v24.4s,    v16.4s,    v22.4s
307         add             v26.4s,    v18.4s,    v22.4s
308         srshr           \c0\().4s, v24.4s,    #14
309         add             v16.4s,    v16.4s,    v18.4s
310         srshr           \c1\().4s, v26.4s,    #14
311         sub             v16.4s,    v16.4s,    v22.4s
312         srshr           \c2\().4s, v20.4s,    #14
313         srshr           \c3\().4s, v16.4s,    #14
314 .endm
315
316 .macro iadst4_12 c0, c1, c2, c3
317         smull           v16.2d,    \c0\().2s, v1.s[0]
318         smull2          v17.2d,    \c0\().4s, v1.s[0]
319         smlal           v16.2d,    \c2\().2s, v1.s[1]
320         smlal2          v17.2d,    \c2\().4s, v1.s[1]
321         smlal           v16.2d,    \c3\().2s, v1.s[2]
322         smlal2          v17.2d,    \c3\().4s, v1.s[2]
323         smull           v18.2d,    \c0\().2s, v1.s[2]
324         smull2          v19.2d,    \c0\().4s, v1.s[2]
325         smlsl           v18.2d,    \c2\().2s, v1.s[0]
326         smlsl2          v19.2d,    \c2\().4s, v1.s[0]
327         sub             \c0\().4s, \c0\().4s, \c2\().4s
328         smlsl           v18.2d,    \c3\().2s, v1.s[1]
329         smlsl2          v19.2d,    \c3\().4s, v1.s[1]
330         add             \c0\().4s, \c0\().4s, \c3\().4s
331         smull           v22.2d,    \c1\().2s, v1.s[3]
332         smull2          v23.2d,    \c1\().4s, v1.s[3]
333         smull           v20.2d,    \c0\().2s, v1.s[3]
334         smull2          v21.2d,    \c0\().4s, v1.s[3]
335         add             v24.2d,    v16.2d,    v22.2d
336         add             v25.2d,    v17.2d,    v23.2d
337         add             v26.2d,    v18.2d,    v22.2d
338         add             v27.2d,    v19.2d,    v23.2d
339         rshrn           \c0\().2s, v24.2d,    #14
340         rshrn2          \c0\().4s, v25.2d,    #14
341         add             v16.2d,    v16.2d,    v18.2d
342         add             v17.2d,    v17.2d,    v19.2d
343         rshrn           \c1\().2s, v26.2d,    #14
344         rshrn2          \c1\().4s, v27.2d,    #14
345         sub             v16.2d,    v16.2d,    v22.2d
346         sub             v17.2d,    v17.2d,    v23.2d
347         rshrn           \c2\().2s, v20.2d,    #14
348         rshrn2          \c2\().4s, v21.2d,    #14
349         rshrn           \c3\().2s, v16.2d,    #14
350         rshrn2          \c3\().4s, v17.2d,    #14
351 .endm
352
353 // The public functions in this file have got the following signature:
354 // void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
355
356 .macro itxfm_func4x4 txfm1, txfm2, bpp
357 function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
358 .ifc \txfm1,\txfm2
359 .ifc \txfm1,idct
360         movrel          x4,  itxfm4_coeffs
361         ld1             {v0.4h}, [x4]
362         sxtl            v0.4s,  v0.4h
363 .endif
364 .ifc \txfm1,iadst
365         movrel          x4,  iadst4_coeffs
366         ld1             {v0.d}[1], [x4]
367         sxtl2           v1.4s,  v0.8h
368 .endif
369 .else
370         movrel          x4,  itxfm4_coeffs
371         ld1             {v0.8h}, [x4]
372         sxtl2           v1.4s,  v0.8h
373         sxtl            v0.4s,  v0.4h
374 .endif
375
376         movi            v30.4s, #0
377         movi            v31.4s, #0
378 .ifc \txfm1\()_\txfm2,idct_idct
379         cmp             w3,  #1
380         b.ne            1f
381         // DC-only for idct/idct
382         ld1             {v2.s}[0],  [x2]
383         smull           v2.2d,  v2.2s, v0.s[0]
384         rshrn           v2.2s,  v2.2d, #14
385         smull           v2.2d,  v2.2s, v0.s[0]
386         rshrn           v2.2s,  v2.2d, #14
387         st1             {v31.s}[0], [x2]
388         dup             v4.4s,  v2.s[0]
389         mov             v5.16b, v4.16b
390         mov             v6.16b, v4.16b
391         mov             v7.16b, v4.16b
392         b               2f
393 .endif
394
395 1:
396         ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
397         st1             {v30.4s,v31.4s}, [x2], #32
398
399 .ifc \txfm1,iwht
400         sshr            v4.4s,  v4.4s,  #2
401         sshr            v5.4s,  v5.4s,  #2
402         sshr            v6.4s,  v6.4s,  #2
403         sshr            v7.4s,  v7.4s,  #2
404 .endif
405
406         \txfm1\()4_\bpp v4,  v5,  v6,  v7
407
408         st1             {v30.4s,v31.4s}, [x2], #32
409         // Transpose 4x4 with 32 bit elements
410         transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19
411
412         \txfm2\()4_\bpp v4,  v5,  v6,  v7
413 2:
414         mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
415         ld1             {v0.4h},   [x0], x1
416         ld1             {v1.4h},   [x0], x1
417 .ifnc \txfm1,iwht
418         srshr           v4.4s,  v4.4s,  #4
419         srshr           v5.4s,  v5.4s,  #4
420         srshr           v6.4s,  v6.4s,  #4
421         srshr           v7.4s,  v7.4s,  #4
422 .endif
423         uaddw           v4.4s,  v4.4s,  v0.4h
424         uaddw           v5.4s,  v5.4s,  v1.4h
425         ld1             {v2.4h},   [x0], x1
426         ld1             {v3.4h},   [x0], x1
427         sqxtun          v0.4h,  v4.4s
428         sqxtun2         v0.8h,  v5.4s
429         sub             x0,  x0,  x1, lsl #2
430
431         uaddw           v6.4s,  v6.4s,  v2.4h
432         umin            v0.8h,  v0.8h,  v31.8h
433         uaddw           v7.4s,  v7.4s,  v3.4h
434         st1             {v0.4h},   [x0], x1
435         sqxtun          v2.4h,  v6.4s
436         sqxtun2         v2.8h,  v7.4s
437         umin            v2.8h,  v2.8h,  v31.8h
438
439         st1             {v0.d}[1], [x0], x1
440         st1             {v2.4h},   [x0], x1
441         st1             {v2.d}[1], [x0], x1
442
443         ret
444 endfunc
445 .endm
446
447 .macro itxfm_funcs4x4 bpp
448 itxfm_func4x4 idct,  idct,  \bpp
449 itxfm_func4x4 iadst, idct,  \bpp
450 itxfm_func4x4 idct,  iadst, \bpp
451 itxfm_func4x4 iadst, iadst, \bpp
452 itxfm_func4x4 iwht,  iwht,  \bpp
453 .endm
454
455 itxfm_funcs4x4 10
456 itxfm_funcs4x4 12
457
458 function idct8x8_dc_add_neon
459         movrel          x4,  idct_coeffs
460         ld1             {v0.4h}, [x4]
461
462         movi            v1.4h,  #0
463         sxtl            v0.4s,  v0.4h
464
465         ld1             {v2.s}[0],  [x2]
466         smull           v2.2d,  v2.2s,  v0.s[0]
467         rshrn           v2.2s,  v2.2d,  #14
468         smull           v2.2d,  v2.2s,  v0.s[0]
469         rshrn           v2.2s,  v2.2d,  #14
470         st1             {v1.s}[0],  [x2]
471         dup             v2.4s,  v2.s[0]
472
473         srshr           v2.4s,  v2.4s,  #5
474
475         mov             x4,  #8
476         mov             x3,  x0
477         dup             v31.8h, w5
478 1:
479         // Loop to add the constant from v2 into all 8x8 outputs
480         subs            x4,  x4,  #2
481         ld1             {v3.8h},  [x0], x1
482         ld1             {v4.8h},  [x0], x1
483         uaddw           v16.4s, v2.4s,  v3.4h
484         uaddw2          v17.4s, v2.4s,  v3.8h
485         uaddw           v18.4s, v2.4s,  v4.4h
486         uaddw2          v19.4s, v2.4s,  v4.8h
487         sqxtun          v3.4h,  v16.4s
488         sqxtun2         v3.8h,  v17.4s
489         sqxtun          v4.4h,  v18.4s
490         sqxtun2         v4.8h,  v19.4s
491         umin            v3.8h,  v3.8h,  v31.8h
492         umin            v4.8h,  v4.8h,  v31.8h
493         st1             {v3.8h},  [x3], x1
494         st1             {v4.8h},  [x3], x1
495         b.ne            1b
496
497         ret
498 endfunc
499
500 .macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
501         dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
502         dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
503         dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
504         dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a
505
506         butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
507         butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
508         butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
509         butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2
510
511         dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5
512
513         butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
514         butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
515         butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
516         butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
517 .endm
518
519 .macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
520         dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
521         dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a
522
523         dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
524         dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5
525
526         dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
527         dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a
528
529         dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
530         dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7
531
532         butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
533         neg             \r7\().4s, \r7\().4s // r7 = out[7]
534         butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2
535
536         dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
537         dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a
538
539         dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7
540
541         dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
542         neg             \r3\().4s, \r3\().4s  // r3 = out[3]
543
544         dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
545         neg             \r1\().4s, \r1\().4s  // r1 = out[1]
546
547         dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
548         neg             \r5\().4s, \r5\().4s  // r5 = out[5]
549 .endm
550
551
552 .macro itxfm_func8x8 txfm1, txfm2
553 function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
554 .ifc \txfm1\()_\txfm2,idct_idct
555         cmp             w3,  #1
556         b.eq            idct8x8_dc_add_neon
557 .endif
558         // The iadst also uses a few coefficients from
559         // idct, so those always need to be loaded.
560 .ifc \txfm1\()_\txfm2,idct_idct
561         movrel          x4,  idct_coeffs
562 .else
563         movrel          x4,  iadst8_coeffs
564         ld1             {v1.8h}, [x4], #16
565         stp             d8,  d9,  [sp, #-0x10]!
566         sxtl2           v3.4s,  v1.8h
567         sxtl            v2.4s,  v1.4h
568 .endif
569         ld1             {v0.8h}, [x4]
570         sxtl2           v1.4s,  v0.8h
571         sxtl            v0.4s,  v0.4h
572
573         movi            v4.4s, #0
574         movi            v5.4s, #0
575         movi            v6.4s, #0
576         movi            v7.4s, #0
577
578 1:
579         ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
580         ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
581         ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
582         ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
583         sub             x2,  x2,  #256
584         st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
585         st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
586         st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
587         st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
588
589 .ifc \txfm1\()_\txfm2,idct_idct
590         idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
591         idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
592 .else
593         \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
594         \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
595 .endif
596
597         // Transpose 8x8 with 16 bit elements
598         transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
599
600 .ifc \txfm1\()_\txfm2,idct_idct
601         idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
602         idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
603 .else
604         \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
605         \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
606 .endif
607 2:
608         mov             x3,  x0
609         // Add into the destination
610         ld1             {v0.8h},  [x0], x1
611         srshr           v16.4s, v16.4s, #5
612         srshr           v17.4s, v17.4s, #5
613         ld1             {v1.8h},  [x0], x1
614         srshr           v18.4s, v18.4s, #5
615         srshr           v19.4s, v19.4s, #5
616         ld1             {v2.8h},  [x0], x1
617         srshr           v20.4s, v20.4s, #5
618         srshr           v21.4s, v21.4s, #5
619         uaddw           v16.4s, v16.4s, v0.4h
620         uaddw2          v17.4s, v17.4s, v0.8h
621         ld1             {v3.8h},  [x0], x1
622         srshr           v22.4s, v22.4s, #5
623         srshr           v23.4s, v23.4s, #5
624         uaddw           v18.4s, v18.4s, v1.4h
625         uaddw2          v19.4s, v19.4s, v1.8h
626         ld1             {v4.8h},  [x0], x1
627         srshr           v24.4s, v24.4s, #5
628         srshr           v25.4s, v25.4s, #5
629         uaddw           v20.4s, v20.4s, v2.4h
630         uaddw2          v21.4s, v21.4s, v2.8h
631         sqxtun          v0.4h,  v16.4s
632         sqxtun2         v0.8h,  v17.4s
633         dup             v16.8h, w5
634         ld1             {v5.8h},  [x0], x1
635         srshr           v26.4s, v26.4s, #5
636         srshr           v27.4s, v27.4s, #5
637         uaddw           v22.4s, v22.4s, v3.4h
638         uaddw2          v23.4s, v23.4s, v3.8h
639         sqxtun          v1.4h,  v18.4s
640         sqxtun2         v1.8h,  v19.4s
641         umin            v0.8h,  v0.8h,  v16.8h
642         ld1             {v6.8h},  [x0], x1
643         srshr           v28.4s, v28.4s, #5
644         srshr           v29.4s, v29.4s, #5
645         uaddw           v24.4s, v24.4s, v4.4h
646         uaddw2          v25.4s, v25.4s, v4.8h
647         sqxtun          v2.4h,  v20.4s
648         sqxtun2         v2.8h,  v21.4s
649         umin            v1.8h,  v1.8h,  v16.8h
650         ld1             {v7.8h},  [x0], x1
651         srshr           v30.4s, v30.4s, #5
652         srshr           v31.4s, v31.4s, #5
653         uaddw           v26.4s, v26.4s, v5.4h
654         uaddw2          v27.4s, v27.4s, v5.8h
655         sqxtun          v3.4h,  v22.4s
656         sqxtun2         v3.8h,  v23.4s
657         umin            v2.8h,  v2.8h,  v16.8h
658
659         st1             {v0.8h},  [x3], x1
660         uaddw           v28.4s, v28.4s, v6.4h
661         uaddw2          v29.4s, v29.4s, v6.8h
662         st1             {v1.8h},  [x3], x1
663         sqxtun          v4.4h,  v24.4s
664         sqxtun2         v4.8h,  v25.4s
665         umin            v3.8h,  v3.8h,  v16.8h
666         st1             {v2.8h},  [x3], x1
667         uaddw           v30.4s, v30.4s, v7.4h
668         uaddw2          v31.4s, v31.4s, v7.8h
669         st1             {v3.8h},  [x3], x1
670         sqxtun          v5.4h,  v26.4s
671         sqxtun2         v5.8h,  v27.4s
672         umin            v4.8h,  v4.8h,  v16.8h
673         st1             {v4.8h},  [x3], x1
674         sqxtun          v6.4h,  v28.4s
675         sqxtun2         v6.8h,  v29.4s
676         umin            v5.8h,  v5.8h,  v16.8h
677         st1             {v5.8h},  [x3], x1
678         sqxtun          v7.4h,  v30.4s
679         sqxtun2         v7.8h,  v31.4s
680         umin            v6.8h,  v6.8h,  v16.8h
681
682         st1             {v6.8h},  [x3], x1
683         umin            v7.8h,  v7.8h,  v16.8h
684         st1             {v7.8h},  [x3], x1
685
686 .ifnc \txfm1\()_\txfm2,idct_idct
687         ldp             d8,  d9,  [sp], 0x10
688 .endif
689         ret
690 endfunc
691
692 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
693         mov             x5,  #0x03ff
694         b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
695 endfunc
696
697 function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
698         mov             x5,  #0x0fff
699         b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
700 endfunc
701 .endm
702
703 itxfm_func8x8 idct,  idct
704 itxfm_func8x8 iadst, idct
705 itxfm_func8x8 idct,  iadst
706 itxfm_func8x8 iadst, iadst
707
708
709 function idct16x16_dc_add_neon
710         movrel          x4,  idct_coeffs
711         ld1             {v0.4h}, [x4]
712         sxtl            v0.4s,  v0.4h
713
714         movi            v1.4h,  #0
715
716         ld1             {v2.s}[0],  [x2]
717         smull           v2.2d,  v2.2s,  v0.s[0]
718         rshrn           v2.2s,  v2.2d,  #14
719         smull           v2.2d,  v2.2s,  v0.s[0]
720         rshrn           v2.2s,  v2.2d,  #14
721         st1             {v1.s}[0],  [x2]
722         dup             v2.4s,  v2.s[0]
723
724         srshr           v0.4s,  v2.4s,  #6
725
726         mov             x3, x0
727         mov             x4, #16
728         dup             v31.8h, w13
729 1:
730         // Loop to add the constant from v2 into all 16x16 outputs
731         subs            x4,  x4,  #2
732         ld1             {v1.8h,v2.8h},  [x0], x1
733         uaddw           v16.4s, v0.4s,  v1.4h
734         uaddw2          v17.4s, v0.4s,  v1.8h
735         ld1             {v3.8h,v4.8h},  [x0], x1
736         uaddw           v18.4s, v0.4s,  v2.4h
737         uaddw2          v19.4s, v0.4s,  v2.8h
738         uaddw           v20.4s, v0.4s,  v3.4h
739         uaddw2          v21.4s, v0.4s,  v3.8h
740         uaddw           v22.4s, v0.4s,  v4.4h
741         uaddw2          v23.4s, v0.4s,  v4.8h
742         sqxtun          v1.4h,  v16.4s
743         sqxtun2         v1.8h,  v17.4s
744         sqxtun          v2.4h,  v18.4s
745         sqxtun2         v2.8h,  v19.4s
746         sqxtun          v3.4h,  v20.4s
747         sqxtun2         v3.8h,  v21.4s
748         sqxtun          v4.4h,  v22.4s
749         sqxtun2         v4.8h,  v23.4s
750         umin            v1.8h,  v1.8h,  v31.8h
751         umin            v2.8h,  v2.8h,  v31.8h
752         st1             {v1.8h,v2.8h},  [x3], x1
753         umin            v3.8h,  v3.8h,  v31.8h
754         umin            v4.8h,  v4.8h,  v31.8h
755         st1             {v3.8h,v4.8h},  [x3], x1
756         b.ne            1b
757
758         ret
759 endfunc
760
761 .macro idct16_end
762         butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
763         butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
764         butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
765         butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
766         butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
767         butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
768         butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
769         butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a
770
771         dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
772         dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11
773
774         butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
775         butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
776         butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
777         butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
778         butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
779         butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
780         butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
781         butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
782         ret
783 .endm
784
785 function idct16
786         dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
787         dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
788         dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
789         dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
790         dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
791         dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
792         dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
793         dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
794
795         butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
796         butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
797         butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
798         butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
799         butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
800         butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
801         butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
802         butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
803
804         dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
805         dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
806         dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
807         idct16_end
808 endfunc
809
810 function idct16_half
811         dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
812         dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
813         dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
814         dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
815         dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
816         dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
817         dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
818         dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a
819
820         butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
821         butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
822         butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
823         butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
824         butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
825         butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
826         butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
827         butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14
828
829         dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
830         dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
831         dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
832         idct16_end
833 endfunc
834
835 function idct16_quarter
836         dsmull_h        v24, v25, v19, v3.s[3]
837         dsmull_h        v4,  v5,  v17, v2.s[0]
838         dsmull_h        v7,  v6,  v18, v1.s[1]
839         dsmull_h        v30, v31, v18, v1.s[0]
840         neg             v24.2d,  v24.2d
841         neg             v25.2d,  v25.2d
842         dsmull_h        v29, v28, v17, v2.s[1]
843         dsmull_h        v26, v27, v19, v3.s[2]
844         dsmull_h        v22, v23, v16, v0.s[0]
845         drshrn_h        v24, v24, v25, #14
846         drshrn_h        v16, v4,  v5,  #14
847         drshrn_h        v7,  v7,  v6,  #14
848         drshrn_h        v6,  v30, v31, #14
849         drshrn_h        v29, v29, v28, #14
850         drshrn_h        v17, v26, v27, #14
851         drshrn_h        v28, v22, v23, #14
852
853         dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
854         dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
855         neg             v22.2d,  v22.2d
856         neg             v23.2d,  v23.2d
857         drshrn_h        v27, v20, v21, #14
858         drshrn_h        v21, v22, v23, #14
859         drshrn_h        v23, v18, v19, #14
860         drshrn_h        v25, v30, v31, #14
861         mov             v4.16b,  v28.16b
862         mov             v5.16b,  v28.16b
863         dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
864         mov             v20.16b, v28.16b
865         idct16_end
866 endfunc
867
868 function iadst16
869         ld1             {v0.8h,v1.8h}, [x11]
870         sxtl            v2.4s,  v1.4h
871         sxtl2           v3.4s,  v1.8h
872         sxtl2           v1.4s,  v0.8h
873         sxtl            v0.4s,  v0.4h
874
875         dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
876         dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
877         dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
878         dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
879         dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a
880
881         dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
882         dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
883         dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
884         dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a
885
886         dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
887         dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
888         dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
889         dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a
890
891         dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
892         ld1             {v0.8h}, [x10]
893         dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
894         sxtl2           v1.4s,  v0.8h
895         sxtl            v0.4s,  v0.4h
896         dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
897         dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a
898
899         dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
900         dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
901         dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
902         butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
903         dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a
904
905         dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
906         butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
907         dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
908         dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a
909
910         butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
911         butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3
912
913         dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
914         dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15
915
916         dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
917         dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
918         neg             v29.4s, v29.4s                   // v29 = out[13]
919
920         dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
921         dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a
922
923         butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
924         butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10
925
926         dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
927         neg             v19.4s, v19.4s                   // v19 = out[3]
928         dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7
929
930         butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
931         butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11
932
933         dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
934         dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
935         dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
936         dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]
937
938         neg             v31.4s,  v5.4s                    // v31 = out[15]
939         neg             v17.4s,  v3.4s                    // v17 = out[1]
940
941         mov             v16.16b, v2.16b
942         mov             v30.16b, v4.16b
943         ret
944 endfunc
945
946 // Helper macros; we can't use these expressions directly within
947 // e.g. .irp due to the extra concatenation \(). Therefore wrap
948 // them in macros to allow using .irp below.
949 .macro load i, src, inc
950         ld1             {v\i\().4s},  [\src], \inc
951 .endm
952 .macro store i, dst, inc
953         st1             {v\i\().4s},  [\dst], \inc
954 .endm
955 .macro movi_v i, size, imm
956         movi            v\i\()\size,  \imm
957 .endm
958 .macro load_clear i, src, inc
959         ld1             {v\i\().4s}, [\src]
960         st1             {v4.4s},  [\src], \inc
961 .endm
962
963 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
964         srshr           \coef0, \coef0, #6
965         ld1             {v4.4h},   [x0], x1
966         srshr           \coef1, \coef1, #6
967         ld1             {v4.d}[1], [x3], x1
968         srshr           \coef2, \coef2, #6
969         ld1             {v5.4h},   [x0], x1
970         srshr           \coef3, \coef3, #6
971         uaddw           \coef0, \coef0, v4.4h
972         ld1             {v5.d}[1], [x3], x1
973         srshr           \coef4, \coef4, #6
974         uaddw2          \coef1, \coef1, v4.8h
975         ld1             {v6.4h},   [x0], x1
976         srshr           \coef5, \coef5, #6
977         uaddw           \coef2, \coef2, v5.4h
978         ld1             {v6.d}[1], [x3], x1
979         sqxtun          v4.4h,  \coef0
980         srshr           \coef6, \coef6, #6
981         uaddw2          \coef3, \coef3, v5.8h
982         ld1             {v7.4h},   [x0], x1
983         sqxtun2         v4.8h,  \coef1
984         srshr           \coef7, \coef7, #6
985         uaddw           \coef4, \coef4, v6.4h
986         ld1             {v7.d}[1], [x3], x1
987         umin            v4.8h,  v4.8h,  v8.8h
988         sub             x0,  x0,  x1, lsl #2
989         sub             x3,  x3,  x1, lsl #2
990         sqxtun          v5.4h,  \coef2
991         uaddw2          \coef5, \coef5, v6.8h
992         st1             {v4.4h},   [x0], x1
993         sqxtun2         v5.8h,  \coef3
994         uaddw           \coef6, \coef6, v7.4h
995         st1             {v4.d}[1], [x3], x1
996         umin            v5.8h,  v5.8h,  v8.8h
997         sqxtun          v6.4h,  \coef4
998         uaddw2          \coef7, \coef7, v7.8h
999         st1             {v5.4h},   [x0], x1
1000         sqxtun2         v6.8h,  \coef5
1001         st1             {v5.d}[1], [x3], x1
1002         umin            v6.8h,  v6.8h,  v8.8h
1003         sqxtun          v7.4h,  \coef6
1004         st1             {v6.4h},   [x0], x1
1005         sqxtun2         v7.8h,  \coef7
1006         st1             {v6.d}[1], [x3], x1
1007         umin            v7.8h,  v7.8h,  v8.8h
1008         st1             {v7.4h},   [x0], x1
1009         st1             {v7.d}[1], [x3], x1
1010 .endm
1011
1012 // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
1013 // transpose into a horizontal 16x4 slice and store.
1014 // x0 = dst (temp buffer)
1015 // x1 = slice offset
1016 // x2 = src
1017 // x9 = input stride
1018 .macro itxfm16_1d_funcs txfm
1019 function \txfm\()16_1d_4x16_pass1_neon
1020         mov             x14, x30
1021
1022         movi            v4.4s, #0
1023 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1024         load_clear      \i,  x2,  x9
1025 .endr
1026
1027         bl              \txfm\()16
1028
1029         // Do four 4x4 transposes. Originally, v16-v31 contain the
1030         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1031         // contain the four transposed 4x4 blocks.
1032         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1033         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1034         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1035         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1036
1037         // Store the transposed 4x4 blocks horizontally.
1038         cmp             x1,  #12
1039         b.eq            1f
1040 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
1041         store           \i,  x0,  #16
1042 .endr
1043         br              x14
1044 1:
1045         // Special case: For the last input column (x1 == 12),
1046         // which would be stored as the last row in the temp buffer,
1047         // don't store the first 4x4 block, but keep it in registers
1048         // for the first slice of the second pass (where it is the
1049         // last 4x4 block).
1050         add             x0,  x0,  #16
1051         st1             {v20.4s},  [x0], #16
1052         st1             {v24.4s},  [x0], #16
1053         st1             {v28.4s},  [x0], #16
1054         add             x0,  x0,  #16
1055         st1             {v21.4s},  [x0], #16
1056         st1             {v25.4s},  [x0], #16
1057         st1             {v29.4s},  [x0], #16
1058         add             x0,  x0,  #16
1059         st1             {v22.4s},  [x0], #16
1060         st1             {v26.4s},  [x0], #16
1061         st1             {v30.4s},  [x0], #16
1062         add             x0,  x0,  #16
1063         st1             {v23.4s},  [x0], #16
1064         st1             {v27.4s},  [x0], #16
1065         st1             {v31.4s},  [x0], #16
1066
1067         mov             v28.16b, v16.16b
1068         mov             v29.16b, v17.16b
1069         mov             v30.16b, v18.16b
1070         mov             v31.16b, v19.16b
1071         br              x14
1072 endfunc
1073
1074 // Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
1075 // load the destination pixels (from a similar 4x16 slice), add and store back.
1076 // x0 = dst
1077 // x1 = dst stride
1078 // x2 = src (temp buffer)
1079 // x3 = slice offset
1080 // x9 = temp buffer stride
1081 function \txfm\()16_1d_4x16_pass2_neon
1082         mov             x14, x30
1083
1084 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
1085         load            \i,  x2,  x9
1086 .endr
1087         cbz             x3,  1f
1088 .irp i, 28, 29, 30, 31
1089         load            \i,  x2,  x9
1090 .endr
1091 1:
1092
1093         add             x3,  x0,  x1
1094         lsl             x1,  x1,  #1
1095         bl              \txfm\()16
1096
1097         dup             v8.8h, w13
1098         load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
1099         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
1100
1101         br              x14
1102 endfunc
1103 .endm
1104
1105 itxfm16_1d_funcs idct
1106 itxfm16_1d_funcs iadst
1107
1108 // This is the minimum eob value for each subpartition, in increments of 4
1109 const min_eob_idct_idct_16, align=4
1110         .short  0, 10, 38, 89
1111 endconst
1112
1113 .macro itxfm_func16x16 txfm1, txfm2
1114 function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1115 .ifc \txfm1\()_\txfm2,idct_idct
1116         cmp             w3,  #1
1117         b.eq            idct16x16_dc_add_neon
1118 .endif
1119         mov             x15, x30
1120         // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
1121 .ifnc \txfm1\()_\txfm2,idct_idct
1122         stp             d14, d15, [sp, #-0x10]!
1123         stp             d12, d13, [sp, #-0x10]!
1124         stp             d10, d11, [sp, #-0x10]!
1125 .endif
1126         stp             d8,  d9,  [sp, #-0x10]!
1127
1128         sub             sp,  sp,  #1024
1129
1130         mov             x4,  x0
1131         mov             x5,  x1
1132         mov             x6,  x2
1133
1134         movrel          x10, idct_coeffs
1135 .ifnc \txfm1\()_\txfm2,idct_idct
1136         movrel          x11, iadst16_coeffs
1137 .endif
1138 .ifc \txfm1,idct
1139         ld1             {v0.8h,v1.8h}, [x10]
1140         sxtl            v2.4s,  v1.4h
1141         sxtl2           v3.4s,  v1.8h
1142         sxtl2           v1.4s,  v0.8h
1143         sxtl            v0.4s,  v0.4h
1144 .endif
1145         mov             x9,  #64
1146
1147 .ifc \txfm1\()_\txfm2,idct_idct
1148         cmp             w3,  #10
1149         b.le            idct16x16_quarter_add_16_neon
1150         cmp             w3,  #38
1151         b.le            idct16x16_half_add_16_neon
1152
1153         movrel          x12, min_eob_idct_idct_16, 2
1154 .endif
1155
1156 .irp i, 0, 4, 8, 12
1157         add             x0,  sp,  #(\i*64)
1158 .ifc \txfm1\()_\txfm2,idct_idct
1159 .if \i > 0
1160         ldrh            w1,  [x12], #2
1161         cmp             w3,  w1
1162         mov             x1,  #(16 - \i)/4
1163         b.le            1f
1164 .endif
1165 .endif
1166         mov             x1,  #\i
1167         add             x2,  x6,  #(\i*4)
1168         bl              \txfm1\()16_1d_4x16_pass1_neon
1169 .endr
1170 .ifc \txfm1\()_\txfm2,iadst_idct
1171         ld1             {v0.8h,v1.8h}, [x10]
1172         sxtl            v2.4s,  v1.4h
1173         sxtl2           v3.4s,  v1.8h
1174         sxtl2           v1.4s,  v0.8h
1175         sxtl            v0.4s,  v0.4h
1176 .endif
1177
1178 .ifc \txfm1\()_\txfm2,idct_idct
1179         b               3f
1180 1:
1181         // Set v28-v31 to zero, for the in-register passthrough of
1182         // coefficients to pass 2.
1183         movi            v28.4s,  #0
1184         movi            v29.4s,  #0
1185         movi            v30.4s,  #0
1186         movi            v31.4s,  #0
1187 2:
1188         subs            x1,  x1,  #1
1189 .rept 4
1190         st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
1191 .endr
1192         b.ne            2b
1193 3:
1194 .endif
1195
1196 .irp i, 0, 4, 8, 12
1197         add             x0,  x4,  #(\i*2)
1198         mov             x1,  x5
1199         add             x2,  sp,  #(\i*4)
1200         mov             x3,  #\i
1201         bl              \txfm2\()16_1d_4x16_pass2_neon
1202 .endr
1203
1204         add             sp,  sp,  #1024
1205         ldp             d8,  d9,  [sp], 0x10
1206 .ifnc \txfm1\()_\txfm2,idct_idct
1207         ldp             d10, d11, [sp], 0x10
1208         ldp             d12, d13, [sp], 0x10
1209         ldp             d14, d15, [sp], 0x10
1210 .endif
1211         br              x15
1212 endfunc
1213
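// The bitdepth-specific entry points only differ in the maximum pixel value
// passed in w13, which the shared 16 bpp code uses for clipping the output.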
1214 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
1215         mov             x13, #0x03ff
1216         b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1217 endfunc
1218
1219 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
1220         mov             x13, #0x0fff
1221         b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
1222 endfunc
1223 .endm
1224
1225 itxfm_func16x16 idct,  idct
1226 itxfm_func16x16 iadst, idct
1227 itxfm_func16x16 idct,  iadst
1228 itxfm_func16x16 iadst, iadst
1229
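// Quarter/half variants of the 16x16 pass functions: with a small eob
// (checked by the caller), only the top-left 4x4 (quarter) or 8x8 (half)
// coefficients can be nonzero, so only those rows and columns are loaded
// and transformed in pass 1.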
1230 function idct16_1d_4x16_pass1_quarter_neon
1231         mov             x14, x30
1232
1233         movi            v4.4s, #0
1234 .irp i, 16, 17, 18, 19
1235         load_clear      \i,  x2,  x9
1236 .endr
1237
1238         bl              idct16_quarter
1239
1240         // Do four 4x4 transposes. Originally, v16-v31 contain the
1241         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1242         // contain the four transposed 4x4 blocks.
1243         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1244         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1245         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1246         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1247
1248         // Store the transposed 4x4 blocks horizontally.
1249         // The first 4x4 block is kept in registers for the second pass;
1250         // the rest is stored in the temp buffer.
1251         add             x0,  x0,  #16
1252         st1             {v20.4s},  [x0], #16
1253         st1             {v24.4s},  [x0], #16
1254         st1             {v28.4s},  [x0], #16
1255         add             x0,  x0,  #16
1256         st1             {v21.4s},  [x0], #16
1257         st1             {v25.4s},  [x0], #16
1258         st1             {v29.4s},  [x0], #16
1259         add             x0,  x0,  #16
1260         st1             {v22.4s},  [x0], #16
1261         st1             {v26.4s},  [x0], #16
1262         st1             {v30.4s},  [x0], #16
1263         add             x0,  x0,  #16
1264         st1             {v23.4s},  [x0], #16
1265         st1             {v27.4s},  [x0], #16
1266         st1             {v31.4s},  [x0], #16
1267         br              x14
1268 endfunc
1269
1270 function idct16_1d_4x16_pass2_quarter_neon
1271         mov             x14, x30
1272
1273         // Only load the top 4 lines, and only do it for the later slices.
1274         // For the first slice, v16-v19 are kept in registers from the first pass.
1275         cbz             x3,  1f
1276 .irp i, 16, 17, 18, 19
1277         load            \i,  x2,  x9
1278 .endr
1279 1:
1280
1281         add             x3,  x0,  x1
1282         lsl             x1,  x1,  #1
1283         bl              idct16_quarter
1284
1285         dup             v8.8h, w13
1286         load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
1287         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
1288
1289         br              x14
1290 endfunc
1291
1292 function idct16_1d_4x16_pass1_half_neon
1293         mov             x14, x30
1294
1295         movi            v4.4s, #0
1296 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1297         load_clear      \i,  x2,  x9
1298 .endr
1299
1300         bl              idct16_half
1301
1302         // Do four 4x4 transposes. Originally, v16-v31 contain the
1303         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1304         // contain the four transposed 4x4 blocks.
1305         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1306         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1307         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1308         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1309
1310         // Store the transposed 4x4 blocks horizontally.
1311         cmp             x1,  #4
1312         b.eq            1f
1313 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
1314         store           \i,  x0,  #16
1315 .endr
1316         br              x14
1317 1:
1318         // Special case: For the second input column (x1 == 4),
1319         // which would be stored as the second row in the temp buffer,
1320         // don't store the first 4x4 block, but keep it in registers
1321         // for the first slice of the second pass (where it is the
1322         // second 4x4 block).
1323         add             x0,  x0,  #16
1324         st1             {v20.4s},  [x0], #16
1325         st1             {v24.4s},  [x0], #16
1326         st1             {v28.4s},  [x0], #16
1327         add             x0,  x0,  #16
1328         st1             {v21.4s},  [x0], #16
1329         st1             {v25.4s},  [x0], #16
1330         st1             {v29.4s},  [x0], #16
1331         add             x0,  x0,  #16
1332         st1             {v22.4s},  [x0], #16
1333         st1             {v26.4s},  [x0], #16
1334         st1             {v30.4s},  [x0], #16
1335         add             x0,  x0,  #16
1336         st1             {v23.4s},  [x0], #16
1337         st1             {v27.4s},  [x0], #16
1338         st1             {v31.4s},  [x0], #16
1339
1340         mov             v20.16b, v16.16b
1341         mov             v21.16b, v17.16b
1342         mov             v22.16b, v18.16b
1343         mov             v23.16b, v19.16b
1344         br              x14
1345 endfunc
1346
1347 function idct16_1d_4x16_pass2_half_neon
1348         mov             x14, x30
1349
1350 .irp i, 16, 17, 18, 19
1351         load            \i,  x2,  x9
1352 .endr
1353         cbz             x3,  1f
1354 .irp i, 20, 21, 22, 23
1355         load            \i,  x2,  x9
1356 .endr
1357 1:
1358
1359         add             x3,  x0,  x1
1360         lsl             x1,  x1,  #1
1361         bl              idct16_half
1362
1363         dup             v8.8h, w13
1364         load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
1365         load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
1366
1367         br              x14
1368 endfunc
1369
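// Drivers for the partial 16x16 IDCTs. The eob thresholds (10 and 38) are
// checked by the caller before branching here.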
1370 .macro idct16_partial size
1371 function idct16x16_\size\()_add_16_neon
1372         add             x0,  sp,  #(0*64)
1373         mov             x1,  #0
1374         add             x2,  x6,  #(0*4)
1375         bl              idct16_1d_4x16_pass1_\size\()_neon
1376 .ifc \size,half
1377         add             x0,  sp,  #(4*64)
1378         mov             x1,  #4
1379         add             x2,  x6,  #(4*4)
1380         bl              idct16_1d_4x16_pass1_\size\()_neon
1381 .endif
1382
1383 .irp i, 0, 4, 8, 12
1384         add             x0,  x4,  #(\i*2)
1385         mov             x1,  x5
1386         add             x2,  sp,  #(\i*4)
1387         mov             x3,  #\i
1388         bl              idct16_1d_4x16_pass2_\size\()_neon
1389 .endr
1390
1391         add             sp,  sp,  #1024
1392         ldp             d8,  d9,  [sp], 0x10
1393         br              x15
1394 endfunc
1395 .endm
1396
1397 idct16_partial quarter
1398 idct16_partial half
1399
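// Special case for eob == 1: only the DC coefficient is nonzero.
// Conceptually (illustrative C, rounding matching rshrn/srshr below):
//
//     dc = (in[0] * 11585 + (1 << 13)) >> 14;   // row transform
//     dc = (dc    * 11585 + (1 << 13)) >> 14;   // column transform
//     for each pixel: out = clip(pixel + ((dc + 32) >> 6), 0, max)   // max = w13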
1400 function idct32x32_dc_add_neon
1401         movrel          x4,  idct_coeffs
1402         ld1             {v0.4h}, [x4]
1403         sxtl            v0.4s,  v0.4h
1404
1405         movi            v1.4h,  #0
1406
1407         ld1             {v2.s}[0],  [x2]
1408         smull           v2.2d,  v2.2s,  v0.s[0]
1409         rshrn           v2.2s,  v2.2d,  #14
1410         smull           v2.2d,  v2.2s,  v0.s[0]
1411         rshrn           v2.2s,  v2.2d,  #14
1412         st1             {v1.s}[0],  [x2]
1413         dup             v2.4s,  v2.s[0]
1414
1415         srshr           v0.4s,  v2.4s,  #6
1416
1417         mov             x3,  x0
1418         mov             x4,  #32
1419         sub             x1,  x1,  #32
1420         dup             v31.8h, w13
1421 1:
1422         // Loop to add the constant v0 into all 32x32 outputs
1423         subs            x4,  x4,  #1
1424         ld1             {v1.8h,v2.8h},  [x0], #32
1425         uaddw           v16.4s, v0.4s,  v1.4h
1426         uaddw2          v17.4s, v0.4s,  v1.8h
1427         ld1             {v3.8h,v4.8h},  [x0], x1
1428         uaddw           v18.4s, v0.4s,  v2.4h
1429         uaddw2          v19.4s, v0.4s,  v2.8h
1430         uaddw           v20.4s, v0.4s,  v3.4h
1431         uaddw2          v21.4s, v0.4s,  v3.8h
1432         uaddw           v22.4s, v0.4s,  v4.4h
1433         uaddw2          v23.4s, v0.4s,  v4.8h
1434         sqxtun          v1.4h,  v16.4s
1435         sqxtun2         v1.8h,  v17.4s
1436         sqxtun          v2.4h,  v18.4s
1437         sqxtun2         v2.8h,  v19.4s
1438         sqxtun          v3.4h,  v20.4s
1439         sqxtun2         v3.8h,  v21.4s
1440         sqxtun          v4.4h,  v22.4s
1441         sqxtun2         v4.8h,  v23.4s
1442         umin            v1.8h,  v1.8h,  v31.8h
1443         umin            v2.8h,  v2.8h,  v31.8h
1444         st1             {v1.8h,v2.8h},  [x3], #32
1445         umin            v3.8h,  v3.8h,  v31.8h
1446         umin            v4.8h,  v4.8h,  v31.8h
1447         st1             {v3.8h,v4.8h},  [x3], x1
1448         b.ne            1b
1449
1450         ret
1451 endfunc
1452
1453 .macro idct32_end
1454         butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
1455         butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
1456         butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
1457         butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
1458         butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
1459         butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
1460         butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
1461         butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29
1462
1463         dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
1464         dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
1465         dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
1466         dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a
1467
1468         butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
1469         butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
1470         butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
1471         butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
1472         butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
1473         butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
1474         butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
1475         butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20
1476
1477         dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
1478         dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
1479         dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
1480         dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
1481         ret
1482 .endm
1483
1484 function idct32_odd
1485         dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1486         dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1487         dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1488         dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1489         dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1490         dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1491         dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1492         dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1493
1494         butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1495         butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1496         butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1497         butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1498         butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
1499         butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
1500         butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
1501         butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
1502
1503         dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1504         dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1505         dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1506         dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1507         idct32_end
1508 endfunc
1509
1510 function idct32_odd_half
1511         dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
1512         dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
1513         dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
1514         dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
1515         dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
1516         dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
1517         dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
1518         dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
1519
1520         butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
1521         butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
1522         butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
1523         butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
1524         butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
1525         butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
1526         butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
1527         butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29
1528
1529         dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
1530         dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
1531         dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
1532         dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
1533         idct32_end
1534 endfunc
1535
1536 function idct32_odd_quarter
1537         dsmull_h        v4,  v5,  v16, v10.s[0]
1538         dsmull_h        v28, v29, v19, v11.s[3]
1539         dsmull_h        v30, v31, v16, v10.s[1]
1540         dsmull_h        v22, v23, v17, v13.s[2]
1541         dsmull_h        v7,  v6,  v17, v13.s[3]
1542         dsmull_h        v26, v27, v19, v11.s[2]
1543         dsmull_h        v20, v21, v18, v12.s[0]
1544         dsmull_h        v24, v25, v18, v12.s[1]
1545
1546         neg             v28.2d, v28.2d
1547         neg             v29.2d, v29.2d
1548         neg             v7.2d,  v7.2d
1549         neg             v6.2d,  v6.2d
1550
1551         drshrn_h        v4,  v4,  v5,  #14
1552         drshrn_h        v5,  v28, v29, #14
1553         drshrn_h        v29, v30, v31, #14
1554         drshrn_h        v28, v22, v23, #14
1555         drshrn_h        v7,  v7,  v6,  #14
1556         drshrn_h        v31, v26, v27, #14
1557         drshrn_h        v6,  v20, v21, #14
1558         drshrn_h        v30, v24, v25, #14
1559
1560         dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v1.s[0], v1.s[1]
1561         dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v1.s[0], v1.s[1]
1562         drshrn_h        v23, v16, v17, #14
1563         drshrn_h        v24, v18, v19, #14
1564         neg             v20.2d, v20.2d
1565         neg             v21.2d, v21.2d
1566         drshrn_h        v27, v27, v26, #14
1567         drshrn_h        v20, v20, v21, #14
1568         dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v1.s[2], v1.s[3]
1569         drshrn_h        v21, v16, v17, #14
1570         drshrn_h        v26, v18, v19, #14
1571         dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v1.s[2], v1.s[3]
1572         drshrn_h        v25, v16, v17, #14
1573         neg             v18.2d, v18.2d
1574         neg             v19.2d, v19.2d
1575         drshrn_h        v22, v18, v19, #14
1576
1577         idct32_end
1578 endfunc
1579
1580 .macro idct32_funcs suffix
1581 // Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
1582 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
1583 // a normal IDCT16 with every other input component (the even ones, with
1584 // each output written twice), followed by a separate 16-point IDCT
1585 // of the odd inputs, added/subtracted onto the outputs of the first idct16.
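// Conceptually (illustrative C-like pseudocode, not the literal code flow):
//
//     // even[j] = in[2*j], odd[j] = in[2*j+1], for j = 0..15
//     idct16(even);                  // produces t0..t15
//     idct16_of_odd_inputs(odd);     // produces t16..t31 (idct32_odd* above)
//     for (j = 0; j < 16; j++) {
//         out[j]      = even[j] + odd[15 - j];
//         out[31 - j] = even[j] - odd[15 - j];
//     }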
1586 // x0 = dst (temp buffer)
1587 // x1 = unused
1588 // x2 = src
1589 // x9 = double input stride
1590 function idct32_1d_4x32_pass1\suffix\()_neon
1591         mov             x14, x30
1592
1593         movi            v4.4s,  #0
1594
1595         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1596 .ifb \suffix
1597 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1598         load_clear      \i, x2, x9
1599 .endr
1600 .endif
1601 .ifc \suffix,_quarter
1602 .irp i, 16, 17, 18, 19
1603         load_clear      \i, x2, x9
1604 .endr
1605 .endif
1606 .ifc \suffix,_half
1607 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1608         load_clear      \i, x2, x9
1609 .endr
1610 .endif
1611
1612         bl              idct16\suffix
1613
1614         // Do four 4x4 transposes. Originally, v16-v31 contain the
1615         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1616         // contain the four transposed 4x4 blocks.
1617         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1618         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1619         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1620         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1621
1622         // Store the registers a, b, c, d horizontally, followed by the
1623         // same registers d, c, b, a mirrored.
1624 .macro store_rev a, b, c, d
1625         // There's no rev128 instruction, but we reverse each 64-bit half,
1626         // and then swap the halves using an ext with an 8 byte offset.
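        // For example, with .4s elements {a, b, c, d} (lowest first):
        // rev64 gives {b, a, d, c}, and ext #8 then gives {d, c, b, a}.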
1627         rev64           v7.4s, \d
1628         st1             {\a},  [x0], #16
1629         ext             v7.16b, v7.16b, v7.16b, #8
1630         st1             {\b},  [x0], #16
1631         rev64           v6.4s, \c
1632         st1             {\c},  [x0], #16
1633         ext             v6.16b, v6.16b, v6.16b, #8
1634         st1             {\d},  [x0], #16
1635         rev64           v5.4s, \b
1636         st1             {v7.4s},  [x0], #16
1637         ext             v5.16b, v5.16b, v5.16b, #8
1638         st1             {v6.4s},  [x0], #16
1639         rev64           v4.4s, \a
1640         st1             {v5.4s},  [x0], #16
1641         ext             v4.16b, v4.16b, v4.16b, #8
1642         st1             {v4.4s},  [x0], #16
1643 .endm
1644         store_rev       v16.4s, v20.4s, v24.4s, v28.4s
1645         store_rev       v17.4s, v21.4s, v25.4s, v29.4s
1646         store_rev       v18.4s, v22.4s, v26.4s, v30.4s
1647         store_rev       v19.4s, v23.4s, v27.4s, v31.4s
1648         sub             x0,  x0,  #512
1649 .purgem store_rev
1650
1651         // Move x2 back to the start of the input, and move
1652         // to the first odd row
1653 .ifb \suffix
1654         sub             x2,  x2,  x9, lsl #4
1655 .endif
1656 .ifc \suffix,_quarter
1657         sub             x2,  x2,  x9, lsl #2
1658 .endif
1659 .ifc \suffix,_half
1660         sub             x2,  x2,  x9, lsl #3
1661 .endif
1662         add             x2,  x2,  #128
1663
1664         movi            v4.4s,  #0
1665         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1666 .ifb \suffix
1667 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1668         load_clear      \i, x2, x9
1669 .endr
1670 .endif
1671 .ifc \suffix,_quarter
1672 .irp i, 16, 17, 18, 19
1673         load_clear      \i, x2, x9
1674 .endr
1675 .endif
1676 .ifc \suffix,_half
1677 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1678         load_clear      \i, x2, x9
1679 .endr
1680 .endif
1681
1682         bl              idct32_odd\suffix
1683
1684         transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
1685         transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
1686         transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
1687         transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
1688
1689         // Store the registers a, b, c, d horizontally,
1690         // adding into the output first, and then the mirrored values,
1691         // subtracted from the output.
1692 .macro store_rev a, b, c, d, a16b, b16b
1693         ld1             {v4.4s},  [x0]
1694         rev64           v9.4s, \d
1695         add             v4.4s, v4.4s, \a
1696         st1             {v4.4s},  [x0], #16
1697         rev64           v8.4s, \c
1698         ld1             {v4.4s},  [x0]
1699         ext             v9.16b, v9.16b, v9.16b, #8
1700         add             v4.4s, v4.4s, \b
1701         st1             {v4.4s},  [x0], #16
1702         ext             v8.16b, v8.16b, v8.16b, #8
1703         ld1             {v4.4s},  [x0]
1704         rev64           \b, \b
1705         add             v4.4s, v4.4s, \c
1706         st1             {v4.4s},  [x0], #16
1707         rev64           \a, \a
1708         ld1             {v4.4s},  [x0]
1709         ext             \b16b, \b16b, \b16b, #8
1710         add             v4.4s, v4.4s, \d
1711         st1             {v4.4s},  [x0], #16
1712         ext             \a16b, \a16b, \a16b, #8
1713         ld1             {v4.4s},  [x0]
1714         sub             v4.4s, v4.4s, v9.4s
1715         st1             {v4.4s},  [x0], #16
1716         ld1             {v4.4s},  [x0]
1717         sub             v4.4s, v4.4s, v8.4s
1718         st1             {v4.4s},  [x0], #16
1719         ld1             {v4.4s},  [x0]
1720         sub             v4.4s, v4.4s, \b
1721         st1             {v4.4s},  [x0], #16
1722         ld1             {v4.4s},  [x0]
1723         sub             v4.4s, v4.4s, \a
1724         st1             {v4.4s},  [x0], #16
1725 .endm
1726
1727         store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
1728         store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
1729         store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
1730         store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
1731 .purgem store_rev
1732         br              x14
1733 endfunc
1734
1735 // This is mostly the same as 4x32_pass1, but without the transpose;
1736 // it uses the source as a temp buffer between the two idct passes,
1737 // and adds into the destination.
1738 // x0 = dst
1739 // x1 = dst stride
1740 // x2 = src (temp buffer)
1741 // x7 = negative double temp buffer stride
1742 // x9 = double temp buffer stride
1743 function idct32_1d_4x32_pass2\suffix\()_neon
1744         mov             x14, x30
1745
1746         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1747 .ifb \suffix
1748 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1749         load            \i, x2, x9
1750 .endr
1751         sub             x2,  x2,  x9, lsl #4
1752 .endif
1753 .ifc \suffix,_quarter
1754 .irp i, 16, 17, 18, 19
1755         load            \i, x2, x9
1756 .endr
1757         sub             x2,  x2,  x9, lsl #2
1758 .endif
1759 .ifc \suffix,_half
1760 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1761         load            \i, x2, x9
1762 .endr
1763         sub             x2,  x2,  x9, lsl #3
1764 .endif
1765
1766         bl              idct16\suffix
1767
1768 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1769         store           \i, x2, x9
1770 .endr
1771
1772         sub             x2,  x2,  x9, lsl #4
1773         add             x2,  x2,  #128
1774
1775         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1776 .ifb \suffix
1777 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1778         load            \i, x2, x9
1779 .endr
1780         sub             x2,  x2,  x9, lsl #4
1781 .endif
1782 .ifc \suffix,_quarter
1783 .irp i, 16, 17, 18, 19
1784         load            \i, x2, x9
1785 .endr
1786         sub             x2,  x2,  x9, lsl #2
1787 .endif
1788 .ifc \suffix,_half
1789 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
1790         load            \i, x2, x9
1791 .endr
1792         sub             x2,  x2,  x9, lsl #3
1793 .endif
1794         sub             x2,  x2,  #128
1795
1796         bl              idct32_odd\suffix
1797
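// Load four rows of the even-IDCT output from the temp buffer, add the
// odd-IDCT outputs held in registers (or subtract them, for the mirrored
// bottom half when neg=1), round with >> 6, add the destination pixels,
// and clip the result to [0, max] (max in v15) before storing.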
1798 .macro load_acc_store a, b, c, d, neg=0
1799 .if \neg == 0
1800         ld1             {v4.4s},  [x2], x9
1801         ld1             {v5.4s},  [x2], x9
1802         add             v4.4s, v4.4s, \a
1803         ld1             {v6.4s},  [x2], x9
1804         add             v5.4s, v5.4s, \b
1805         ld1             {v7.4s},  [x2], x9
1806         add             v6.4s, v6.4s, \c
1807         add             v7.4s, v7.4s, \d
1808 .else
1809         ld1             {v4.4s},  [x2], x7
1810         ld1             {v5.4s},  [x2], x7
1811         sub             v4.4s, v4.4s, \a
1812         ld1             {v6.4s},  [x2], x7
1813         sub             v5.4s, v5.4s, \b
1814         ld1             {v7.4s},  [x2], x7
1815         sub             v6.4s, v6.4s, \c
1816         sub             v7.4s, v7.4s, \d
1817 .endif
1818         ld1             {v8.4h},   [x0], x1
1819         ld1             {v8.d}[1], [x0], x1
1820         srshr           v4.4s, v4.4s, #6
1821         ld1             {v9.4h},   [x0], x1
1822         srshr           v5.4s, v5.4s, #6
1823         uaddw           v4.4s, v4.4s, v8.4h
1824         ld1             {v9.d}[1], [x0], x1
1825         srshr           v6.4s, v6.4s, #6
1826         uaddw2          v5.4s, v5.4s, v8.8h
1827         srshr           v7.4s, v7.4s, #6
1828         sub             x0,  x0,  x1, lsl #2
1829         uaddw           v6.4s, v6.4s, v9.4h
1830         sqxtun          v4.4h, v4.4s
1831         uaddw2          v7.4s, v7.4s, v9.8h
1832         sqxtun2         v4.8h, v5.4s
1833         umin            v4.8h, v4.8h, v15.8h
1834         st1             {v4.4h},   [x0], x1
1835         sqxtun          v5.4h, v6.4s
1836         st1             {v4.d}[1], [x0], x1
1837         sqxtun2         v5.8h, v7.4s
1838         umin            v5.8h, v5.8h, v15.8h
1839         st1             {v5.4h},   [x0], x1
1840         st1             {v5.d}[1], [x0], x1
1841 .endm
1842         load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
1843         load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
1844         load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
1845         load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s
1846         sub             x2,  x2,  x9
1847         load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
1848         load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
1849         load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
1850         load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
1851 .purgem load_acc_store
1852         br              x14
1853 endfunc
1854 .endm
1855
1856 idct32_funcs
1857 idct32_funcs _quarter
1858 idct32_funcs _half
1859
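// Minimum eob value for each 4-column slice of the 32x32 transform,
// same scheme as min_eob_idct_idct_16 above.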
1860 const min_eob_idct_idct_32, align=4
1861         .short  0, 9, 34, 70, 135, 240, 336, 448
1862 endconst
1863
1864 function vp9_idct_idct_32x32_add_16_neon
1865         cmp             w3,  #1
1866         b.eq            idct32x32_dc_add_neon
1867
1868         movrel          x10, idct_coeffs
1869
1870         mov             x15, x30
1871         stp             d8,  d9,  [sp, #-0x10]!
1872         stp             d10, d11, [sp, #-0x10]!
1873         stp             d12, d13, [sp, #-0x10]!
1874         stp             d14, d15, [sp, #-0x10]!
1875
1876         sub             sp,  sp,  #4096
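        // 32x32 temp buffer of 32-bit coefficients: 32*32*4 = 4096 bytes.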
1877
1878         mov             x4,  x0
1879         mov             x5,  x1
1880         mov             x6,  x2
1881
1882         // Double stride of the input, since we only read every other line
1883         mov             x9,  #256
1884         neg             x7,  x9
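        // x9 = 2 * 32 * 4 bytes; x7 is the corresponding negative stride,
        // used when walking the temp buffer backwards for the mirrored
        // bottom half in pass 2.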
1885
1886         ld1             {v0.8h,v1.8h},   [x10], #32
1887         sxtl            v2.4s,  v1.4h
1888         sxtl2           v3.4s,  v1.8h
1889         sxtl2           v1.4s,  v0.8h
1890         sxtl            v0.4s,  v0.4h
1891         ld1             {v10.8h,v11.8h}, [x10]
1892         sxtl            v12.4s, v11.4h
1893         sxtl2           v13.4s, v11.8h
1894         sxtl2           v11.4s, v10.8h
1895         sxtl            v10.4s, v10.4h
1896
1897         dup             v15.8h, w13
1898
1899         cmp             w3,  #34
1900         b.le            idct32x32_quarter_add_16_neon
1901         cmp             w3,  #135
1902         b.le            idct32x32_half_add_16_neon
1903
1904         movrel          x12, min_eob_idct_idct_32, 2
1905
1906 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
1907         add             x0,  sp,  #(\i*128)
1908 .if \i > 0
1909         ldrh            w1,  [x12], #2
1910         cmp             w3,  w1
1911         mov             x1,  #(32 - \i)/4
1912         b.le            1f
1913 .endif
1914         add             x2,  x6,  #(\i*4)
1915         bl              idct32_1d_4x32_pass1_neon
1916 .endr
1917         b               3f
1918
1919 1:
1920         // Write zeros to the temp buffer for pass 2
1921         movi            v16.4s,  #0
1922         movi            v17.4s,  #0
1923         movi            v18.4s,  #0
1924         movi            v19.4s,  #0
1925 2:
1926         subs            x1,  x1,  #1
1927 .rept 4
1928         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1929         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1930 .endr
1931         b.ne            2b
1932 3:
1933 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
1934         add             x0,  x4,  #(\i*2)
1935         mov             x1,  x5
1936         add             x2,  sp,  #(\i*4)
1937         bl              idct32_1d_4x32_pass2_neon
1938 .endr
1939
1940         add             sp,  sp,  #4096
1941         ldp             d14, d15, [sp], 0x10
1942         ldp             d12, d13, [sp], 0x10
1943         ldp             d10, d11, [sp], 0x10
1944         ldp             d8,  d9,  [sp], 0x10
1945
1946         br              x15
1947 endfunc
1948
1949 function ff_vp9_idct_idct_32x32_add_10_neon, export=1
1950         mov             x13, #0x03ff
1951         b               vp9_idct_idct_32x32_add_16_neon
1952 endfunc
1953
1954 function ff_vp9_idct_idct_32x32_add_12_neon, export=1
1955         mov             x13, #0x0fff
1956         b               vp9_idct_idct_32x32_add_16_neon
1957 endfunc
1958
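// Drivers for the partial 32x32 IDCTs: the quarter case (eob <= 34) only
// needs pass 1 on the first 8 columns, the half case (eob <= 135) on the
// first 16; the eob thresholds are checked by the caller before branching here.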
1959 .macro idct32_partial size
1960 function idct32x32_\size\()_add_16_neon
1961 .irp i, 0, 4
1962         add             x0,  sp,  #(\i*128)
1963 .ifc \size,quarter
1964 .if \i == 4
1965         cmp             w3,  #9
1966         b.le            1f
1967 .endif
1968 .endif
1969         add             x2,  x6,  #(\i*4)
1970         bl              idct32_1d_4x32_pass1_\size\()_neon
1971 .endr
1972
1973 .ifc \size,half
1974 .irp i, 8, 12
1975         add             x0,  sp,  #(\i*128)
1976 .if \i == 12
1977         cmp             w3,  #70
1978         b.le            1f
1979 .endif
1980         add             x2,  x6,  #(\i*4)
1981         bl              idct32_1d_4x32_pass1_\size\()_neon
1982 .endr
1983 .endif
1984         b               3f
1985
1986 1:
1987         // Write zeros to the temp buffer for pass 2
1988         movi            v16.4s,  #0
1989         movi            v17.4s,  #0
1990         movi            v18.4s,  #0
1991         movi            v19.4s,  #0
1992
1993 .rept 4
1994         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1995         st1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x0], #64
1996 .endr
1997
1998 3:
1999 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
2000         add             x0,  x4,  #(\i*2)
2001         mov             x1,  x5
2002         add             x2,  sp,  #(\i*4)
2003         bl              idct32_1d_4x32_pass2_\size\()_neon
2004 .endr
2005
2006         add             sp,  sp,  #4096
2007         ldp             d14, d15, [sp], 0x10
2008         ldp             d12, d13, [sp], 0x10
2009         ldp             d10, d11, [sp], 0x10
2010         ldp             d8,  d9,  [sp], 0x10
2011
2012         br              x15
2013 endfunc
2014 .endm
2015
2016 idct32_partial quarter
2017 idct32_partial half