/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

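// The idct coefficient tables below hold round(16384 * cos(k*pi/64)) values
// in Q14 (e.g. 11585 ~ 16384*cos(pi/4), 15137 ~ 16384*cos(pi/8)); the iadst
// tables hold the corresponding sinpi/cospi constants from the VP9 C code.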
const itxfm4_coeffs, align=4
        .short  11585, 0, 6270, 15137
iadst4_coeffs:
        .short  5283, 15212, 9929, 13377
endconst

const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst

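// Transpose a 4x4 matrix of 32 bit elements in r0-r3, using r4-r7 as scratch.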
.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
        trn1            \r4\().4s,  \r0\().4s,  \r1\().4s
        trn2            \r5\().4s,  \r0\().4s,  \r1\().4s
        trn1            \r6\().4s,  \r2\().4s,  \r3\().4s
        trn2            \r7\().4s,  \r2\().4s,  \r3\().4s
        trn1            \r0\().2d,  \r4\().2d,  \r6\().2d
        trn2            \r2\().2d,  \r4\().2d,  \r6\().2d
        trn1            \r1\().2d,  \r5\().2d,  \r7\().2d
        trn2            \r3\().2d,  \r5\().2d,  \r7\().2d
.endm

// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
// over two registers.
.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
        transpose_4x4s  \r0,  \r2,  \r4,  \r6,  \t0, \t1, \t2, \t3
        transpose_4x4s  \r9,  \r11, \r13, \r15, \t0, \t1, \t2, \t3

        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
        // while swapping the two 4x4 matrices between each other

        // First step of the 4x4 transpose of r1-r7, into t0-t3
        trn1            \t0\().4s,  \r1\().4s,  \r3\().4s
        trn2            \t1\().4s,  \r1\().4s,  \r3\().4s
        trn1            \t2\().4s,  \r5\().4s,  \r7\().4s
        trn2            \t3\().4s,  \r5\().4s,  \r7\().4s

        // First step of the 4x4 transpose of r8-r14, into r1-r7
        trn1            \r1\().4s,  \r8\().4s,  \r10\().4s
        trn2            \r3\().4s,  \r8\().4s,  \r10\().4s
        trn1            \r5\().4s,  \r12\().4s, \r14\().4s
        trn2            \r7\().4s,  \r12\().4s, \r14\().4s

        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
        trn1            \r8\().2d,  \t0\().2d,  \t2\().2d
        trn2            \r12\().2d, \t0\().2d,  \t2\().2d
        trn1            \r10\().2d, \t1\().2d,  \t3\().2d
        trn2            \r14\().2d, \t1\().2d,  \t3\().2d

        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
        trn1            \t0\().2d,  \r1\().2d,  \r5\().2d
        trn2            \r5\().2d,  \r1\().2d,  \r5\().2d
        trn1            \t1\().2d,  \r3\().2d,  \r7\().2d
        trn2            \r7\().2d,  \r3\().2d,  \r7\().2d

        // Move the outputs of trn1 back in place
        mov             \r1\().16b,  \t0\().16b
        mov             \r3\().16b,  \t1\().16b
.endm

// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
// in/out are .4s registers; this can do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
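// If neg is nonzero, out1 is negated (it is multiplied by -v0.s[0]).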
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4s, v0.4s
.endif
        add             \tmp1\().4s, \in1\().4s,  \in2\().4s
        sub             \tmp2\().4s, \in1\().4s,  \in2\().4s
.if \neg > 0
        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
.else
        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
.endif
.ifb \tmp5
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.else
        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        rshrn           \out2\().2s, \tmp5\().2d, #14
        rshrn2          \out2\().4s, \tmp6\().2d, #14
.endif
.endm

// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .2d registers, in are 2 x .4s registers
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().2d, \in1\().2s, \coef1
        smull2          \out2\().2d, \in1\().4s, \coef1
        smull           \out3\().2d, \in1\().2s, \coef2
        smull2          \out4\().2d, \in1\().4s, \coef2
        smlsl           \out1\().2d, \in2\().2s, \coef2
        smlsl2          \out2\().2d, \in2\().4s, \coef2
        smlal           \out3\().2d, \in2\().2s, \coef1
        smlal2          \out4\().2d, \in2\().4s, \coef1
.endm

// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .4s registers
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().2d, \tmp3\().2d
        neg             \tmp4\().2d, \tmp4\().2d
.endif
        rshrn           \inout1\().2s, \tmp1\().2d,  #14
        rshrn2          \inout1\().4s, \tmp2\().2d,  #14
        rshrn           \inout2\().2s, \tmp3\().2d,  #14
        rshrn2          \inout2\().4s, \tmp4\().2d,  #14
.endm

// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_4s out1, out2, in1, in2
        add             \out1\().4s, \in1\().4s, \in2\().4s
        sub             \out2\().4s, \in1\().4s, \in2\().4s
.endm

// out1 = in1 - in2
// out2 = in1 + in2
.macro butterfly_4s_r out1, out2, in1, in2
        sub             \out1\().4s, \in1\().4s, \in2\().4s
        add             \out2\().4s, \in1\().4s, \in2\().4s
.endm

// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .4s registers, in are 4 x .2d registers
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().2d, \in1\().2d, \in3\().2d
        add             \tmp2\().2d, \in2\().2d, \in4\().2d
        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
        rshrn           \out1\().2s, \tmp1\().2d,  #14
        rshrn2          \out1\().4s, \tmp2\().2d,  #14
        rshrn           \out2\().2s, \tmp3\().2d,  #14
        rshrn2          \out2\().4s, \tmp4\().2d,  #14
.endm

.macro iwht4_10 c0, c1, c2, c3
        add             \c0\().4s, \c0\().4s, \c1\().4s
        sub             v17.4s,    \c2\().4s, \c3\().4s
        sub             v16.4s,    \c0\().4s, v17.4s
        sshr            v16.4s,    v16.4s,    #1
        sub             \c2\().4s, v16.4s,    \c1\().4s
        sub             \c1\().4s, v16.4s,    \c3\().4s
        add             \c3\().4s, v17.4s,    \c2\().4s
        sub             \c0\().4s, \c0\().4s, \c1\().4s
.endm

.macro iwht4_12 c0, c1, c2, c3
        iwht4_10        \c0, \c1, \c2, \c3
.endm

.macro idct4_10 c0, c1, c2, c3
        mul             v22.4s,    \c1\().4s, v0.s[3]
        mul             v20.4s,    \c1\().4s, v0.s[2]
        add             v16.4s,    \c0\().4s, \c2\().4s
        sub             v17.4s,    \c0\().4s, \c2\().4s
        mla             v22.4s,    \c3\().4s, v0.s[2]
        mul             v18.4s,    v16.4s,    v0.s[0]
        mul             v24.4s,    v17.4s,    v0.s[0]
        mls             v20.4s,    \c3\().4s, v0.s[3]
        srshr           v22.4s,    v22.4s,    #14
        srshr           v18.4s,    v18.4s,    #14
        srshr           v24.4s,    v24.4s,    #14
        srshr           v20.4s,    v20.4s,    #14
        add             \c0\().4s, v18.4s,    v22.4s
        sub             \c3\().4s, v18.4s,    v22.4s
        add             \c1\().4s, v24.4s,    v20.4s
        sub             \c2\().4s, v24.4s,    v20.4s
.endm

.macro idct4_12 c0, c1, c2, c3
        smull           v22.2d,    \c1\().2s, v0.s[3]
        smull2          v23.2d,    \c1\().4s, v0.s[3]
        smull           v20.2d,    \c1\().2s, v0.s[2]
        smull2          v21.2d,    \c1\().4s, v0.s[2]
        add             v16.4s,    \c0\().4s, \c2\().4s
        sub             v17.4s,    \c0\().4s, \c2\().4s
        smlal           v22.2d,    \c3\().2s, v0.s[2]
        smlal2          v23.2d,    \c3\().4s, v0.s[2]
        smull           v18.2d,    v16.2s,    v0.s[0]
        smull2          v19.2d,    v16.4s,    v0.s[0]
        smull           v24.2d,    v17.2s,    v0.s[0]
        smull2          v25.2d,    v17.4s,    v0.s[0]
        smlsl           v20.2d,    \c3\().2s, v0.s[3]
        smlsl2          v21.2d,    \c3\().4s, v0.s[3]
        rshrn           v22.2s,    v22.2d,    #14
        rshrn2          v22.4s,    v23.2d,    #14
        rshrn           v18.2s,    v18.2d,    #14
        rshrn2          v18.4s,    v19.2d,    #14
        rshrn           v24.2s,    v24.2d,    #14
        rshrn2          v24.4s,    v25.2d,    #14
        rshrn           v20.2s,    v20.2d,    #14
        rshrn2          v20.4s,    v21.2d,    #14
        add             \c0\().4s, v18.4s,    v22.4s
        sub             \c3\().4s, v18.4s,    v22.4s
        add             \c1\().4s, v24.4s,    v20.4s
        sub             \c2\().4s, v24.4s,    v20.4s
.endm

.macro iadst4_10 c0, c1, c2, c3
        mul             v16.4s,    \c0\().4s, v1.s[0]
        mla             v16.4s,    \c2\().4s, v1.s[1]
        mla             v16.4s,    \c3\().4s, v1.s[2]
        mul             v18.4s,    \c0\().4s, v1.s[2]
        mls             v18.4s,    \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        mls             v18.4s,    \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        mul             v22.4s,    \c1\().4s, v1.s[3]
        mul             v20.4s,    \c0\().4s, v1.s[3]
        add             v24.4s,    v16.4s,    v22.4s
        add             v26.4s,    v18.4s,    v22.4s
        srshr           \c0\().4s, v24.4s,    #14
        add             v16.4s,    v16.4s,    v18.4s
        srshr           \c1\().4s, v26.4s,    #14
        sub             v16.4s,    v16.4s,    v22.4s
        srshr           \c2\().4s, v20.4s,    #14
        srshr           \c3\().4s, v16.4s,    #14
.endm

.macro iadst4_12 c0, c1, c2, c3
        smull           v16.2d,    \c0\().2s, v1.s[0]
        smull2          v17.2d,    \c0\().4s, v1.s[0]
        smlal           v16.2d,    \c2\().2s, v1.s[1]
        smlal2          v17.2d,    \c2\().4s, v1.s[1]
        smlal           v16.2d,    \c3\().2s, v1.s[2]
        smlal2          v17.2d,    \c3\().4s, v1.s[2]
        smull           v18.2d,    \c0\().2s, v1.s[2]
        smull2          v19.2d,    \c0\().4s, v1.s[2]
        smlsl           v18.2d,    \c2\().2s, v1.s[0]
        smlsl2          v19.2d,    \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        smlsl           v18.2d,    \c3\().2s, v1.s[1]
        smlsl2          v19.2d,    \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        smull           v22.2d,    \c1\().2s, v1.s[3]
        smull2          v23.2d,    \c1\().4s, v1.s[3]
        smull           v20.2d,    \c0\().2s, v1.s[3]
        smull2          v21.2d,    \c0\().4s, v1.s[3]
        add             v24.2d,    v16.2d,    v22.2d
        add             v25.2d,    v17.2d,    v23.2d
        add             v26.2d,    v18.2d,    v22.2d
        add             v27.2d,    v19.2d,    v23.2d
        rshrn           \c0\().2s, v24.2d,    #14
        rshrn2          \c0\().4s, v25.2d,    #14
        add             v16.2d,    v16.2d,    v18.2d
        add             v17.2d,    v17.2d,    v19.2d
        rshrn           \c1\().2s, v26.2d,    #14
        rshrn2          \c1\().4s, v27.2d,    #14
        sub             v16.2d,    v16.2d,    v22.2d
        sub             v17.2d,    v17.2d,    v23.2d
        rshrn           \c2\().2s, v20.2d,    #14
        rshrn2          \c2\().4s, v21.2d,    #14
        rshrn           \c3\().2s, v16.2d,    #14
        rshrn2          \c3\().4s, v17.2d,    #14
.endm

// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
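// Note that for the 10/12 bpp functions below, dst actually points to
// 16 bit pixels and block to 32 bit coefficients; the prototype is kept
// in the same form as for the 8 bpp functions.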

.macro itxfm_func4x4 txfm1, txfm2, bpp
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h
.endif
.ifc \txfm1,iadst
        movrel          x4,  iadst4_coeffs
        ld1             {v0.d}[1], [x4]
        sxtl2           v1.4s,  v0.8h
.endif
.else
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.8h}, [x4]
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif

        movi            v30.4s, #0
        movi            v31.4s, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.ne            1f
        // DC-only for idct/idct
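        // out = (((dc * 11585 + (1 << 13)) >> 14) * 11585 + (1 << 13)) >> 14,
        // i.e. the 1D DC transform applied once for rows and once for columns.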
        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s, v0.s[0]
        rshrn           v2.2s,  v2.2d, #14
        smull           v2.2d,  v2.2s, v0.s[0]
        rshrn           v2.2s,  v2.2d, #14
        st1             {v31.s}[0], [x2]
        dup             v4.4s,  v2.s[0]
        mov             v5.16b, v4.16b
        mov             v6.16b, v4.16b
        mov             v7.16b, v4.16b
        b               2f
.endif

1:
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x2]
        st1             {v30.4s,v31.4s}, [x2], #32

.ifc \txfm1,iwht
        sshr            v4.4s,  v4.4s,  #2
        sshr            v5.4s,  v5.4s,  #2
        sshr            v6.4s,  v6.4s,  #2
        sshr            v7.4s,  v7.4s,  #2
.endif

        \txfm1\()4_\bpp v4,  v5,  v6,  v7

        st1             {v30.4s,v31.4s}, [x2], #32
        // Transpose 4x4 with 32 bit elements
        transpose_4x4s  v4,  v5,  v6,  v7,  v16, v17, v18, v19

        \txfm2\()4_\bpp v4,  v5,  v6,  v7
2:
        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        ld1             {v0.4h},   [x0], x1
        ld1             {v1.4h},   [x0], x1
.ifnc \txfm1,iwht
        srshr           v4.4s,  v4.4s,  #4
        srshr           v5.4s,  v5.4s,  #4
        srshr           v6.4s,  v6.4s,  #4
        srshr           v7.4s,  v7.4s,  #4
.endif
        uaddw           v4.4s,  v4.4s,  v0.4h
        uaddw           v5.4s,  v5.4s,  v1.4h
        ld1             {v2.4h},   [x0], x1
        ld1             {v3.4h},   [x0], x1
        sqxtun          v0.4h,  v4.4s
        sqxtun2         v0.8h,  v5.4s
        sub             x0,  x0,  x1, lsl #2

        uaddw           v6.4s,  v6.4s,  v2.4h
        umin            v0.8h,  v0.8h,  v31.8h
        uaddw           v7.4s,  v7.4s,  v3.4h
        st1             {v0.4h},   [x0], x1
        sqxtun          v2.4h,  v6.4s
        sqxtun2         v2.8h,  v7.4s
        umin            v2.8h,  v2.8h,  v31.8h

        st1             {v0.d}[1], [x0], x1
        st1             {v2.4h},   [x0], x1
        st1             {v2.d}[1], [x0], x1

        ret
endfunc
.endm

.macro itxfm_funcs4x4 bpp
itxfm_func4x4 idct,  idct,  \bpp
itxfm_func4x4 iadst, idct,  \bpp
itxfm_func4x4 idct,  iadst, \bpp
itxfm_func4x4 iadst, iadst, \bpp
itxfm_func4x4 iwht,  iwht,  \bpp
.endm

itxfm_funcs4x4 10
itxfm_funcs4x4 12

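// DC-only 8x8 path: x0 = dst, x1 = stride, x2 = block, w5 = pixel max.
// The single DC coefficient is cleared after having been read.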
function idct8x8_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h,  #0
        sxtl            v0.4s,  v0.4h

        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        st1             {v1.s}[0],  [x2]
        dup             v2.4s,  v2.s[0]

        srshr           v2.4s,  v2.4s,  #5

        mov             x4,  #8
        mov             x3,  x0
        dup             v31.8h, w5
1:
        // Loop to add the constant from v2 into all 8x8 outputs
        subs            x4,  x4,  #2
        ld1             {v3.8h},  [x0], x1
        ld1             {v4.8h},  [x0], x1
        uaddw           v16.4s, v2.4s,  v3.4h
        uaddw2          v17.4s, v2.4s,  v3.8h
        uaddw           v18.4s, v2.4s,  v4.4h
        uaddw2          v19.4s, v2.4s,  v4.8h
        sqxtun          v3.4h,  v16.4s
        sqxtun2         v3.8h,  v17.4s
        sqxtun          v4.4h,  v18.4s
        sqxtun2         v4.8h,  v19.4s
        umin            v3.8h,  v3.8h,  v31.8h
        umin            v4.8h,  v4.8h,  v31.8h
        st1             {v3.8h},  [x3], x1
        st1             {v4.8h},  [x3], x1
        b.ne            1b

        ret
endfunc

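// 8-point IDCT of four columns at a time; data in r0-r7 (.4s), t0-t5 are
// scratch registers, coefficients are expected in v0-v1 (from idct_coeffs).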
.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a

        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2

        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5

        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
.endm

.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]   // t2,t3 = t1a, t0,t1 = t0a
        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]   // r0,r7 = t5a, t4,t5 = t4a

        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5

        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]   // t4,t5 = t3a, t2,t3 = t2a
        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]   // r2,r5 = t7a, r0,r7 = t6a

        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7

        butterfly_4s    \r7, \r4, \r4, \r0   // r7 = -out[7], r4 = t3
        neg             \r7\().4s, \r7\().4s // r7 = out[7]
        butterfly_4s    \r0, \r1, \r3, \r1   // r0 = out[0],  r1 = t2

        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]   // r2,r3 = t5a, t3,t5 = t4a
        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]   // t0,t1 = t6a, r5,r6 = t7a

        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6],  t2 = t7

        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2           // r3 = -out[3], r4 = out[4]
        neg             \r3\().4s, \r3\().4s  // r3 = out[3]

        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
        neg             \r1\().4s, \r1\().4s  // r1 = out[1]

        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5           // r2 = out[2],  r5 = -out[5]
        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
.endm


.macro itxfm_func8x8 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.eq            idct8x8_dc_add_neon
.endif
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4,  idct_coeffs
.else
        movrel          x4,  iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
        stp             d8,  d9,  [sp, #-0x10]!
        sxtl2           v3.4s,  v1.8h
        sxtl            v2.4s,  v1.4h
.endif
        ld1             {v0.8h}, [x4]
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h

        movi            v4.4s, #0
        movi            v5.4s, #0
        movi            v6.4s, #0
        movi            v7.4s, #0

1:
        ld1             {v16.4s,v17.4s,v18.4s,v19.4s},  [x2], #64
        ld1             {v20.4s,v21.4s,v22.4s,v23.4s},  [x2], #64
        ld1             {v24.4s,v25.4s,v26.4s,v27.4s},  [x2], #64
        ld1             {v28.4s,v29.4s,v30.4s,v31.4s},  [x2], #64
        sub             x2,  x2,  #256
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s},      [x2], #64

.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
.else
        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
.endif

        // Transpose 8x8 with 32 bit elements
        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7

.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2,  v3,  v4,  v5,  v6,  v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2,  v3,  v4,  v5,  v6,  v7
.else
        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4,  v5,  v6,  v7,  v8,  v9
        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4,  v5,  v6,  v7,  v8,  v9
.endif
2:
        mov             x3,  x0
        // Add into the destination
        ld1             {v0.8h},  [x0], x1
        srshr           v16.4s, v16.4s, #5
        srshr           v17.4s, v17.4s, #5
        ld1             {v1.8h},  [x0], x1
        srshr           v18.4s, v18.4s, #5
        srshr           v19.4s, v19.4s, #5
        ld1             {v2.8h},  [x0], x1
        srshr           v20.4s, v20.4s, #5
        srshr           v21.4s, v21.4s, #5
        uaddw           v16.4s, v16.4s, v0.4h
        uaddw2          v17.4s, v17.4s, v0.8h
        ld1             {v3.8h},  [x0], x1
        srshr           v22.4s, v22.4s, #5
        srshr           v23.4s, v23.4s, #5
        uaddw           v18.4s, v18.4s, v1.4h
        uaddw2          v19.4s, v19.4s, v1.8h
        ld1             {v4.8h},  [x0], x1
        srshr           v24.4s, v24.4s, #5
        srshr           v25.4s, v25.4s, #5
        uaddw           v20.4s, v20.4s, v2.4h
        uaddw2          v21.4s, v21.4s, v2.8h
        sqxtun          v0.4h,  v16.4s
        sqxtun2         v0.8h,  v17.4s
        dup             v16.8h, w5
        ld1             {v5.8h},  [x0], x1
        srshr           v26.4s, v26.4s, #5
        srshr           v27.4s, v27.4s, #5
        uaddw           v22.4s, v22.4s, v3.4h
        uaddw2          v23.4s, v23.4s, v3.8h
        sqxtun          v1.4h,  v18.4s
        sqxtun2         v1.8h,  v19.4s
        umin            v0.8h,  v0.8h,  v16.8h
        ld1             {v6.8h},  [x0], x1
        srshr           v28.4s, v28.4s, #5
        srshr           v29.4s, v29.4s, #5
        uaddw           v24.4s, v24.4s, v4.4h
        uaddw2          v25.4s, v25.4s, v4.8h
        sqxtun          v2.4h,  v20.4s
        sqxtun2         v2.8h,  v21.4s
        umin            v1.8h,  v1.8h,  v16.8h
        ld1             {v7.8h},  [x0], x1
        srshr           v30.4s, v30.4s, #5
        srshr           v31.4s, v31.4s, #5
        uaddw           v26.4s, v26.4s, v5.4h
        uaddw2          v27.4s, v27.4s, v5.8h
        sqxtun          v3.4h,  v22.4s
        sqxtun2         v3.8h,  v23.4s
        umin            v2.8h,  v2.8h,  v16.8h

        st1             {v0.8h},  [x3], x1
        uaddw           v28.4s, v28.4s, v6.4h
        uaddw2          v29.4s, v29.4s, v6.8h
        st1             {v1.8h},  [x3], x1
        sqxtun          v4.4h,  v24.4s
        sqxtun2         v4.8h,  v25.4s
        umin            v3.8h,  v3.8h,  v16.8h
        st1             {v2.8h},  [x3], x1
        uaddw           v30.4s, v30.4s, v7.4h
        uaddw2          v31.4s, v31.4s, v7.8h
        st1             {v3.8h},  [x3], x1
        sqxtun          v5.4h,  v26.4s
        sqxtun2         v5.8h,  v27.4s
        umin            v4.8h,  v4.8h,  v16.8h
        st1             {v4.8h},  [x3], x1
        sqxtun          v6.4h,  v28.4s
        sqxtun2         v6.8h,  v29.4s
        umin            v5.8h,  v5.8h,  v16.8h
        st1             {v5.8h},  [x3], x1
        sqxtun          v7.4h,  v30.4s
        sqxtun2         v7.8h,  v31.4s
        umin            v6.8h,  v6.8h,  v16.8h

        st1             {v6.8h},  [x3], x1
        umin            v7.8h,  v7.8h,  v16.8h
        st1             {v7.8h},  [x3], x1

.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8,  d9,  [sp], 0x10
.endif
        ret
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
        mov             x5,  #0x03ff
        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
        mov             x5,  #0x0fff
        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc
.endm

itxfm_func8x8 idct,  idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct,  iadst
itxfm_func8x8 iadst, iadst


function idct16x16_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h

        movi            v1.4h,  #0

        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        st1             {v1.s}[0],  [x2]
        dup             v2.4s,  v2.s[0]

        srshr           v0.4s,  v2.4s,  #6

        mov             x3, x0
        mov             x4, #16
        dup             v31.8h, w13
1:
        // Loop to add the constant from v2 into all 16x16 outputs
        subs            x4,  x4,  #2
        ld1             {v1.8h,v2.8h},  [x0], x1
        uaddw           v16.4s, v0.4s,  v1.4h
        uaddw2          v17.4s, v0.4s,  v1.8h
        ld1             {v3.8h,v4.8h},  [x0], x1
        uaddw           v18.4s, v0.4s,  v2.4h
        uaddw2          v19.4s, v0.4s,  v2.8h
        uaddw           v20.4s, v0.4s,  v3.4h
        uaddw2          v21.4s, v0.4s,  v3.8h
        uaddw           v22.4s, v0.4s,  v4.4h
        uaddw2          v23.4s, v0.4s,  v4.8h
        sqxtun          v1.4h,  v16.4s
        sqxtun2         v1.8h,  v17.4s
        sqxtun          v2.4h,  v18.4s
        sqxtun2         v2.8h,  v19.4s
        sqxtun          v3.4h,  v20.4s
        sqxtun2         v3.8h,  v21.4s
        sqxtun          v4.4h,  v22.4s
        sqxtun2         v4.8h,  v23.4s
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v2.8h,  v2.8h,  v31.8h
        st1             {v1.8h,v2.8h},  [x3], x1
        umin            v3.8h,  v3.8h,  v31.8h
        umin            v4.8h,  v4.8h,  v31.8h
        st1             {v3.8h,v4.8h},  [x3], x1
        b.ne            1b

        ret
endfunc

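// 16-point IDCT of four columns at a time; data in v16-v31 (.4s),
// coefficients in v0-v3 (from idct_coeffs), v4-v9 are clobbered.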
.macro idct16
        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a

        butterfly_4s    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
        butterfly_4s    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
        butterfly_4s    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
        butterfly_4s    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
        butterfly_4s    v16, v25, v17, v25               // v16 = t8,   v25 = t9
        butterfly_4s    v24, v21, v29, v21               // v24 = t11,  v21 = t10
        butterfly_4s    v17, v27, v19, v27               // v17 = t12,  v27 = t13
        butterfly_4s    v29, v23, v31, v23               // v29 = t15,  v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a

        butterfly_4s    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
        butterfly_4s    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
        butterfly_4s    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
        butterfly_4s    v5,  v6,  v28, v6                // v5  = t3a,  v6  = t4
        butterfly_4s    v20, v28, v16, v24               // v20 = t8a,  v28 = t11a
        butterfly_4s    v24, v21, v23, v21               // v24 = t9,   v21 = t10
        butterfly_4s    v23, v27, v25, v27               // v23 = t14,  v27 = t13
        butterfly_4s    v25, v29, v29, v17               // v25 = t15a, v29 = t12a

        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11

        butterfly_4s    v16, v31, v18, v25               // v16 = out[0], v31 = out[15]
        butterfly_4s    v17, v30, v19, v23               // v17 = out[1], v30 = out[14]
        butterfly_4s_r  v25, v22, v22, v24               // v25 = out[9], v22 = out[6]
        butterfly_4s    v23, v24, v7,  v20               // v23 = out[7], v24 = out[8]
        butterfly_4s    v18, v29, v4,  v8                // v18 = out[2], v29 = out[13]
        butterfly_4s    v19, v28, v5,  v28               // v19 = out[3], v28 = out[12]
        butterfly_4s    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
        butterfly_4s    v21, v26, v26, v9                // v21 = out[5], v26 = out[10]
.endm

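// 16-point IADST of four columns at a time; data in v16-v31 (.4s).
// The macro loads its own coefficients: x11 must point to iadst16_coeffs
// and x10 to idct_coeffs. v0-v15 are used as coefficient/scratch registers.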
.macro iadst16
        ld1             {v0.8h,v1.8h}, [x11]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h

        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a

        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a

        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14
        ld1             {v0.8h}, [x10]
        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a

        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
        butterfly_4s_r  v4,  v27, v16, v27               // v4  = t4,   v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
        butterfly_4s_r  v5,  v20, v31, v20               // v5  = t5, v20 = t1
        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a

        butterfly_4s_r  v6,  v25, v18, v25               // v6  = t6, v25 = t2
        butterfly_4s_r  v7,  v22, v29, v22               // v7  = t7, v22 = t3

        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15

        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18   = out[2], v30     = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
        neg             v29.4s, v29.4s                   // v29 = out[13]

        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a

        butterfly_4s    v2,  v6,  v27, v25               // v2 = out[0], v6 = t2a
        butterfly_4s    v3,  v7,  v23, v21               // v3 =-out[1], v7 = t10

        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
        neg             v19.4s, v19.4s                   // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7

        butterfly_4s    v5,  v8,  v20, v22               // v5 =-out[15],v8 = t3a
        butterfly_4s    v4,  v9,  v24, v26               // v4 = out[14],v9 = t11

        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]

        neg             v31.4s,  v5.4s                    // v31 = out[15]
        neg             v17.4s,  v3.4s                    // v17 = out[1]

        mov             v16.16b, v2.16b
        mov             v30.16b, v4.16b
.endm

// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
.macro load i, src, inc
        ld1             {v\i\().4s},  [\src], \inc
.endm
.macro store i, dst, inc
        st1             {v\i\().4s},  [\dst], \inc
.endm
.macro movi_v i, size, imm
        movi            v\i\()\size,  \imm
.endm
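// Load one .4s vector and zero the source memory behind it (v4 must be zero).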
.macro load_clear i, src, inc
        ld1             {v\i\().4s}, [\src]
        st1             {v4.4s},  [\src], \inc
.endm

// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x4 slice and store.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_4x16_pass1_neon
        movi            v4.4s, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
.endr

        \txfm\()16

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        cmp             x1,  #12
        b.eq            1f
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i,  x0,  #16
.endr
        ret
1:
        // Special case: For the last input column (x1 == 12),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 4x4 block).
        add             x0,  x0,  #16
.irp i, 20, 24, 28
        store           \i,  x0,  #16
.endr
        add             x0,  x0,  #16
.irp i, 21, 25, 29
        store           \i,  x0,  #16
.endr
        add             x0,  x0,  #16
.irp i, 22, 26, 30
        store           \i,  x0,  #16
.endr
        add             x0,  x0,  #16
.irp i, 23, 27, 31
        store           \i,  x0,  #16
.endr

        mov             v28.16b, v16.16b
        mov             v29.16b, v17.16b
        mov             v30.16b, v18.16b
        mov             v31.16b, v19.16b
        ret
endfunc

// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 4x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_4x16_pass2_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
        load            \i,  x2,  x9
.endr
        cbz             x3,  1f
.irp i, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        \txfm\()16

        dup             v8.8h, w13
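// Add 8 rows (4 pixels each) of transform output in coef0-coef7 to the
// destination, clamp against the pixel max in v8 and store the result.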
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
        srshr           \coef0, \coef0, #6
        ld1             {v4.4h},   [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v4.d}[1], [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v5.4h},   [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v4.4h
        ld1             {v5.d}[1], [x3], x1
        srshr           \coef4, \coef4, #6
        uaddw2          \coef1, \coef1, v4.8h
        ld1             {v6.4h},   [x0], x1
        srshr           \coef5, \coef5, #6
        uaddw           \coef2, \coef2, v5.4h
        ld1             {v6.d}[1], [x3], x1
        sqxtun          v4.4h,  \coef0
        srshr           \coef6, \coef6, #6
        uaddw2          \coef3, \coef3, v5.8h
        ld1             {v7.4h},   [x0], x1
        sqxtun2         v4.8h,  \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef4, \coef4, v6.4h
        ld1             {v7.d}[1], [x3], x1
        umin            v4.8h,  v4.8h,  v8.8h
        sub             x0,  x0,  x1, lsl #2
        sub             x3,  x3,  x1, lsl #2
        sqxtun          v5.4h,  \coef2
        uaddw2          \coef5, \coef5, v6.8h
        st1             {v4.4h},   [x0], x1
        sqxtun2         v5.8h,  \coef3
        uaddw           \coef6, \coef6, v7.4h
        st1             {v4.d}[1], [x3], x1
        umin            v5.8h,  v5.8h,  v8.8h
        sqxtun          v6.4h,  \coef4
        uaddw2          \coef7, \coef7, v7.8h
        st1             {v5.4h},   [x0], x1
        sqxtun2         v6.8h,  \coef5
        st1             {v5.d}[1], [x3], x1
        umin            v6.8h,  v6.8h,  v8.8h
        sqxtun          v7.4h,  \coef6
        st1             {v6.4h},   [x0], x1
        sqxtun2         v7.8h,  \coef7
        st1             {v6.d}[1], [x3], x1
        umin            v7.8h,  v7.8h,  v8.8h
        st1             {v7.4h},   [x0], x1
        st1             {v7.d}[1], [x3], x1
.endm
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
.purgem load_add_store

        ret
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst

// This is the minimum eob value for each subpartition, in increments of 4
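// If the eob is at or below the threshold for a slice, that slice and all
// following ones only contain zero coefficients, so the first pass can be
// skipped for them (the temp buffer is just zero-filled instead).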
const min_eob_idct_idct_16, align=4
        .short  0, 10, 38, 89
endconst

.macro itxfm_func16x16 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.eq            idct16x16_dc_add_neon
.endif
        mov             x15, x30
        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
.endif
        stp             d8,  d9,  [sp, #-0x10]!

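        // Allocate a temp buffer for the intermediate 16x16 coefficients
        // (16*16 32 bit elements = 1024 bytes).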
        sub             sp,  sp,  #1024

        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif
        movrel          x12, min_eob_idct_idct_16, 2
.ifc \txfm1,idct
        ld1             {v0.8h,v1.8h}, [x10]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif
        mov             x9,  #64

.irp i, 0, 4, 8, 12
        add             x0,  sp,  #(\i*64)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i > 0
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        mov             x1,  #(16 - \i)/4
        b.le            1f
.endif
.endif
        mov             x1,  #\i
        add             x2,  x6,  #(\i*4)
        bl              \txfm1\()16_1d_4x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        ld1             {v0.8h,v1.8h}, [x10]
        sxtl            v2.4s,  v1.4h
        sxtl2           v3.4s,  v1.8h
        sxtl2           v1.4s,  v0.8h
        sxtl            v0.4s,  v0.4h
.endif

.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        // Set v28-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2.
        movi            v28.4s,  #0
        movi            v29.4s,  #0
        movi            v30.4s,  #0
        movi            v31.4s,  #0
2:
        subs            x1,  x1,  #1
.rept 4
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
.endr
        b.ne            2b
3:
.endif

.irp i, 0, 4, 8, 12
        add             x0,  x4,  #(\i*2)
        mov             x1,  x5
        add             x2,  sp,  #(\i*4)
        mov             x3,  #\i
        bl              \txfm2\()16_1d_4x16_pass2_neon
.endr

        add             sp,  sp,  #1024
        ldp             d8,  d9,  [sp], 0x10
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        br              x15
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
        mov             x13, #0x03ff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
        mov             x13, #0x0fff
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc
.endm

itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst


function idct32x32_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]
        sxtl            v0.4s,  v0.4h

        movi            v1.4h,  #0

        ld1             {v2.s}[0],  [x2]
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        smull           v2.2d,  v2.2s,  v0.s[0]
        rshrn           v2.2s,  v2.2d,  #14
        st1             {v1.s}[0],  [x2]
        dup             v2.4s,  v2.s[0]

        srshr           v0.4s,  v2.4s,  #6

        mov             x3,  x0
        mov             x4,  #32
        sub             x1,  x1,  #32
        dup             v31.8h, w13
1:
        // Loop to add the constant v0 into all 32x32 outputs
        subs            x4,  x4,  #1
        ld1             {v1.8h,v2.8h},  [x0], #32
        uaddw           v16.4s, v0.4s,  v1.4h
        uaddw2          v17.4s, v0.4s,  v1.8h
        ld1             {v3.8h,v4.8h},  [x0], x1
        uaddw           v18.4s, v0.4s,  v2.4h
        uaddw2          v19.4s, v0.4s,  v2.8h
        uaddw           v20.4s, v0.4s,  v3.4h
        uaddw2          v21.4s, v0.4s,  v3.8h
        uaddw           v22.4s, v0.4s,  v4.4h
        uaddw2          v23.4s, v0.4s,  v4.8h
        sqxtun          v1.4h,  v16.4s
        sqxtun2         v1.8h,  v17.4s
        sqxtun          v2.4h,  v18.4s
        sqxtun2         v2.8h,  v19.4s
        sqxtun          v3.4h,  v20.4s
        sqxtun2         v3.8h,  v21.4s
        sqxtun          v4.4h,  v22.4s
        sqxtun2         v4.8h,  v23.4s
        umin            v1.8h,  v1.8h,  v31.8h
        umin            v2.8h,  v2.8h,  v31.8h
        st1             {v1.8h,v2.8h},  [x3], #32
        umin            v3.8h,  v3.8h,  v31.8h
        umin            v4.8h,  v4.8h,  v31.8h
        st1             {v3.8h,v4.8h},  [x3], x1
        b.ne            1b

        ret
endfunc

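// 16-point transform of the odd-indexed inputs of the 32-point IDCT;
// data in v16-v31 (.4s). Expects the extra coefficients in v10-v13 and
// the first idct_coeffs entries in v0-v1, and clobbers v4-v9.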
.macro idct32_odd
        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a

        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
.endm

1214 // Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
1215 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
1216 // a normal IDCT16 with every other input component (the even ones, with
1217 // each output written twice), followed by a separate 16-point IDCT
1218 // of the odd inputs, whose outputs are added to/subtracted from those of the first idct16.
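     // As an illustrative sketch (pseudocode, not actual code), the
     // decomposition amounts to:
     //   even[0..15] = idct16(in[0], in[2], ..., in[30])
     //   odd[0..15]  = idct32_odd(in[1], in[3], ..., in[31])
     //   out[i]      = even[i] + odd[i]      for i = 0..15
     //   out[31 - i] = even[i] - odd[i]      for i = 0..15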
1219 // x0 = dst (temp buffer)
1220 // x1 = unused
1221 // x2 = src
1222 // x9 = double input stride
1223 function idct32_1d_4x32_pass1_neon
1224         movi            v4.4s,  #0
1225
1226         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1227 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1228         ld1             {v\i\().4s}, [x2]
1229         st1             {v4.4s},  [x2], x9
1230 .endr
1231
1232         idct16
1233
1234         // Do four 4x4 transposes. Originally, v16-v31 contain the
1235         // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
1236         // contain the four transposed 4x4 blocks.
1237         transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
1238         transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
1239         transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
1240         transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7
1241
1242         // Store the registers a, b, c, d horizontally, followed by the
1243         // same registers d, c, b, a mirrored.
1244 .macro store_rev a, b, c, d
1245         // There's no rev128 instruction, but we reverse each 64-bit
1246         // half and then flip them using an ext with an 8-byte offset.
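             // For example: [a0 a1 a2 a3] --rev64--> [a1 a0 a3 a2]
             //                            --ext #8--> [a3 a2 a1 a0]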
1247         rev64           v7.4s, v\d\().4s
1248         st1             {v\a\().4s},  [x0], #16
1249         ext             v7.16b, v7.16b, v7.16b, #8
1250         st1             {v\b\().4s},  [x0], #16
1251         rev64           v6.4s, v\c\().4s
1252         st1             {v\c\().4s},  [x0], #16
1253         ext             v6.16b, v6.16b, v6.16b, #8
1254         st1             {v\d\().4s},  [x0], #16
1255         rev64           v5.4s, v\b\().4s
1256         st1             {v7.4s},  [x0], #16
1257         ext             v5.16b, v5.16b, v5.16b, #8
1258         st1             {v6.4s},  [x0], #16
1259         rev64           v4.4s, v\a\().4s
1260         st1             {v5.4s},  [x0], #16
1261         ext             v4.16b, v4.16b, v4.16b, #8
1262         st1             {v4.4s},  [x0], #16
1263 .endm
1264         store_rev       16, 20, 24, 28
1265         store_rev       17, 21, 25, 29
1266         store_rev       18, 22, 26, 30
1267         store_rev       19, 23, 27, 31
1268         sub             x0,  x0,  #512
1269 .purgem store_rev
1270
1271         // Move x2 back to the start of the input, and move
1272         // to the first odd row
1273         sub             x2,  x2,  x9, lsl #4
1274         add             x2,  x2,  #128
1275
1276         movi            v4.4s,  #0
1277         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1278 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1279         ld1             {v\i\().4s}, [x2]
1280         st1             {v4.4s},  [x2], x9
1281 .endr
1282
1283         idct32_odd
1284
1285         transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
1286         transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
1287         transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
1288         transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
1289
1290         // Store the registers a, b, c, d horizontally,
1291         // adding into the output first, and then the mirrored
1292         // registers d, c, b, a, subtracted from the output.
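             // Combined with the mirrored even-pass values already in the
             // buffer, this gives out[i] = even[i] + odd[i] for the first 16
             // outputs and out[31-i] = even[i] - odd[i] for the last 16.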
1293 .macro store_rev a, b, c, d
1294         ld1             {v4.4s},  [x0]
1295         rev64           v9.4s, v\d\().4s
1296         add             v4.4s, v4.4s, v\a\().4s
1297         st1             {v4.4s},  [x0], #16
1298         rev64           v8.4s, v\c\().4s
1299         ld1             {v4.4s},  [x0]
1300         ext             v9.16b, v9.16b, v9.16b, #8
1301         add             v4.4s, v4.4s, v\b\().4s
1302         st1             {v4.4s},  [x0], #16
1303         ext             v8.16b, v8.16b, v8.16b, #8
1304         ld1             {v4.4s},  [x0]
1305         rev64           v\b\().4s, v\b\().4s
1306         add             v4.4s, v4.4s, v\c\().4s
1307         st1             {v4.4s},  [x0], #16
1308         rev64           v\a\().4s, v\a\().4s
1309         ld1             {v4.4s},  [x0]
1310         ext             v\b\().16b, v\b\().16b, v\b\().16b, #8
1311         add             v4.4s, v4.4s, v\d\().4s
1312         st1             {v4.4s},  [x0], #16
1313         ext             v\a\().16b, v\a\().16b, v\a\().16b, #8
1314         ld1             {v4.4s},  [x0]
1315         sub             v4.4s, v4.4s, v9.4s
1316         st1             {v4.4s},  [x0], #16
1317         ld1             {v4.4s},  [x0]
1318         sub             v4.4s, v4.4s, v8.4s
1319         st1             {v4.4s},  [x0], #16
1320         ld1             {v4.4s},  [x0]
1321         sub             v4.4s, v4.4s, v\b\().4s
1322         st1             {v4.4s},  [x0], #16
1323         ld1             {v4.4s},  [x0]
1324         sub             v4.4s, v4.4s, v\a\().4s
1325         st1             {v4.4s},  [x0], #16
1326 .endm
1327
1328         store_rev       31, 27, 23, 19
1329         store_rev       30, 26, 22, 18
1330         store_rev       29, 25, 21, 17
1331         store_rev       28, 24, 20, 16
1332 .purgem store_rev
1333         ret
1334 endfunc
1335
1336 // This is mostly the same as 4x32_pass1, but without the transpose;
1337 // it uses the source as a temp buffer between the two idct passes
1338 // and adds into the destination.
1339 // x0 = dst
1340 // x1 = dst stride
1341 // x2 = src (temp buffer)
1342 // x7 = negative double temp buffer stride
1343 // x9 = double temp buffer stride
1344 function idct32_1d_4x32_pass2_neon
1345         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
1346 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1347         ld1             {v\i\().4s}, [x2], x9
1348 .endr
1349         sub             x2,  x2,  x9, lsl #4
1350
1351         idct16
1352
1353 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1354         st1             {v\i\().4s}, [x2], x9
1355 .endr
1356
1357         sub             x2,  x2,  x9, lsl #4
1358         add             x2,  x2,  #128
1359
1360         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
1361 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
1362         ld1             {v\i\().4s}, [x2], x9
1363 .endr
1364         sub             x2,  x2,  x9, lsl #4
1365         sub             x2,  x2,  #128
1366
1367         idct32_odd
1368
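     // Accumulate the idct32_odd outputs into the even-pass results in the
     // temp buffer (adding for the first half, subtracting for the mirrored
     // half), round by 6 bits, add the destination pixels and clamp against
     // the pixel maximum in v15 before storing.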
1369 .macro load_acc_store a, b, c, d, neg=0
1370 .if \neg == 0
1371         ld1             {v4.4s},  [x2], x9
1372         ld1             {v5.4s},  [x2], x9
1373         add             v4.4s, v4.4s, v\a\().4s
1374         ld1             {v6.4s},  [x2], x9
1375         add             v5.4s, v5.4s, v\b\().4s
1376         ld1             {v7.4s},  [x2], x9
1377         add             v6.4s, v6.4s, v\c\().4s
1378         add             v7.4s, v7.4s, v\d\().4s
1379 .else
1380         ld1             {v4.4s},  [x2], x7
1381         ld1             {v5.4s},  [x2], x7
1382         sub             v4.4s, v4.4s, v\a\().4s
1383         ld1             {v6.4s},  [x2], x7
1384         sub             v5.4s, v5.4s, v\b\().4s
1385         ld1             {v7.4s},  [x2], x7
1386         sub             v6.4s, v6.4s, v\c\().4s
1387         sub             v7.4s, v7.4s, v\d\().4s
1388 .endif
1389         ld1             {v8.4h},   [x0], x1
1390         ld1             {v8.d}[1], [x0], x1
1391         srshr           v4.4s, v4.4s, #6
1392         ld1             {v9.4h},   [x0], x1
1393         srshr           v5.4s, v5.4s, #6
1394         uaddw           v4.4s, v4.4s, v8.4h
1395         ld1             {v9.d}[1], [x0], x1
1396         srshr           v6.4s, v6.4s, #6
1397         uaddw2          v5.4s, v5.4s, v8.8h
1398         srshr           v7.4s, v7.4s, #6
1399         sub             x0,  x0,  x1, lsl #2
1400         uaddw           v6.4s, v6.4s, v9.4h
1401         sqxtun          v4.4h, v4.4s
1402         uaddw2          v7.4s, v7.4s, v9.8h
1403         sqxtun2         v4.8h, v5.4s
1404         umin            v4.8h, v4.8h, v15.8h
1405         st1             {v4.4h},   [x0], x1
1406         sqxtun          v5.4h, v6.4s
1407         st1             {v4.d}[1], [x0], x1
1408         sqxtun2         v5.8h, v7.4s
1409         umin            v5.8h, v5.8h, v15.8h
1410         st1             {v5.4h},   [x0], x1
1411         st1             {v5.d}[1], [x0], x1
1412 .endm
1413         load_acc_store  31, 30, 29, 28
1414         load_acc_store  27, 26, 25, 24
1415         load_acc_store  23, 22, 21, 20
1416         load_acc_store  19, 18, 17, 16
1417         sub             x2,  x2,  x9
1418         load_acc_store  16, 17, 18, 19, 1
1419         load_acc_store  20, 21, 22, 23, 1
1420         load_acc_store  24, 25, 26, 27, 1
1421         load_acc_store  28, 29, 30, 31, 1
1422 .purgem load_acc_store
1423         ret
1424 endfunc
1425
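     // Per-slice eob thresholds: once the eob is at or below the threshold
     // for a 4-column slice, that slice and all following ones contain only
     // zero coefficients, so pass 1 can be skipped for them.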
1426 const min_eob_idct_idct_32, align=4
1427         .short  0, 9, 34, 70, 135, 240, 336, 448
1428 endconst
1429
1430 function vp9_idct_idct_32x32_add_16_neon
1431         cmp             w3,  #1
1432         b.eq            idct32x32_dc_add_neon
1433
1434         movrel          x10, idct_coeffs
1435         movrel          x12, min_eob_idct_idct_32, 2
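             // (the offset of 2 skips the first min_eob entry, since the
             // first slice is always processed)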
1436
1437         mov             x15, x30
1438         stp             d8,  d9,  [sp, #-0x10]!
1439         stp             d10, d11, [sp, #-0x10]!
1440         stp             d12, d13, [sp, #-0x10]!
1441         stp             d14, d15, [sp, #-0x10]!
1442
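             // Allocate a temp buffer for the intermediate 32x32 coefficients
             // (32 * 32 * 4 bytes = 4096 bytes).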
1443         sub             sp,  sp,  #4096
1444
1445         mov             x4,  x0
1446         mov             x5,  x1
1447         mov             x6,  x2
1448
1449         // Double stride of the input, since we only read every other line
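             // (one input row is 32 coefficients * 4 bytes = 128 bytes)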
1450         mov             x9,  #256
1451         neg             x7,  x9
1452
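             // Sign-extend the 16-bit coefficients to 32 bits:
             // v0-v3 = idct_coeffs[0..15], v10-v13 = idct_coeffs[16..31].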
1453         ld1             {v0.8h,v1.8h},   [x10], #32
1454         sxtl            v2.4s,  v1.4h
1455         sxtl2           v3.4s,  v1.8h
1456         sxtl2           v1.4s,  v0.8h
1457         sxtl            v0.4s,  v0.4h
1458         ld1             {v10.8h,v11.8h}, [x10]
1459         sxtl            v12.4s, v11.4h
1460         sxtl2           v13.4s, v11.8h
1461         sxtl2           v11.4s, v10.8h
1462         sxtl            v10.4s, v10.4h
1463
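             // v15 = pixel maximum for clamping, passed in w13 by the
             // bitdepth-specific entry points.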
1464         dup             v15.8h, w13
1465
1466 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
1467         add             x0,  sp,  #(\i*128)
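             // For all slices but the first, compare the eob (w3) against the
             // per-slice minimum; if no nonzero coefficients remain, clear the
             // rest of the temp buffer instead (branch to 1f below).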
1468 .if \i > 0
1469         ldrh            w1,  [x12], #2
1470         cmp             w3,  w1
1471         mov             x1,  #(32 - \i)/4
1472         b.le            1f
1473 .endif
1474         add             x2,  x6,  #(\i*4)
1475         bl              idct32_1d_4x32_pass1_neon
1476 .endr
1477         b               3f
1478
1479 1:
1480         // Write zeros to the temp buffer for pass 2
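             // x1 = number of remaining 4x32 slices; each iteration of the
             // loop below clears one 512-byte slice.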
1481         movi            v16.4s,  #0
1482         movi            v17.4s,  #0
1483         movi            v18.4s,  #0
1484         movi            v19.4s,  #0
1485 2:
1486         subs            x1,  x1,  #1
1487 .rept 4
1488         st1             {v16.4s-v19.4s},  [x0], #64
1489         st1             {v16.4s-v19.4s},  [x0], #64
1490 .endr
1491         b.ne            2b
1492 3:
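     // Second pass: process 4-column slices of the temp buffer and add the
     // result into the destination (2 bytes per output pixel, 4 bytes per
     // temp coefficient).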
1493 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
1494         add             x0,  x4,  #(\i*2)
1495         mov             x1,  x5
1496         add             x2,  sp,  #(\i*4)
1497         bl              idct32_1d_4x32_pass2_neon
1498 .endr
1499
1500         add             sp,  sp,  #4096
1501         ldp             d14, d15, [sp], 0x10
1502         ldp             d12, d13, [sp], 0x10
1503         ldp             d10, d11, [sp], 0x10
1504         ldp             d8,  d9,  [sp], 0x10
1505
1506         br              x15
1507 endfunc
1508
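// The bitdepth-specific entry points below only differ in the pixel maximum
// they pass to the common function in w13.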
1509 function ff_vp9_idct_idct_32x32_add_10_neon, export=1
1510         mov             x13, #0x03ff
1511         b               vp9_idct_idct_32x32_add_16_neon
1512 endfunc
1513
1514 function ff_vp9_idct_idct_32x32_add_12_neon, export=1
1515         mov             x13, #0x0fff
1516         b               vp9_idct_idct_32x32_add_16_neon
1517 endfunc