 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#include "libavutil/aarch64/asm.S"
const itxfm4_coeffs, align=4
        .short  11585, 6270, 15137, 0
iadst4_coeffs:
        .short  5283, 15212, 9929, 13377

const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404

const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207

// out1 = ((in1 + in2) * v0.h[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.h[0] + (1 << 13)) >> 14
// in/out are .8h registers; this can be done with 4 temp registers, but is
// more efficient if 6 temp registers are available.
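// As a rough scalar model (an illustrative sketch, not from the original
// source), the non-negated case computes, per 16 bit lane:
//     for (int i = 0; i < 8; i++) {
//         out1[i] = ((in1[i] + in2[i]) * 11585 + (1 << 13)) >> 14;
//         out2[i] = ((in1[i] - in2[i]) * 11585 + (1 << 13)) >> 14;
//     }
// where 11585 ~= 16384/sqrt(2) is the cos(pi/4) constant kept in v0.h[0];
// with neg=1, the coefficient for the first output is negated.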
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4h, v0.4h
.endif
        add             \tmp1\().8h, \in1\().8h, \in2\().8h
        sub             \tmp2\().8h, \in1\().8h, \in2\().8h
.if \neg > 0
        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
.else
        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
.endif
.ifb \tmp5
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
.else
        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        rshrn           \out2\().4h, \tmp5\().4s, #14
        rshrn2          \out2\().8h, \tmp6\().4s, #14
.endif
// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .4s registers, in are 2 x .8h registers
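// Roughly, in scalar form (sketch only; out12/out34 stand for the 32 bit
// register pairs out1:out2 and out3:out4):
//     for (int i = 0; i < 8; i++) {
//         out12[i] = in1[i] * coef1 - in2[i] * coef2;
//         out34[i] = in1[i] * coef2 + in2[i] * coef1;
//     }
// No rounding or narrowing happens here; the results stay at 32 bit.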
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().4s, \in1\().4h, \coef1
        smull2          \out2\().4s, \in1\().8h, \coef1
        smull           \out3\().4s, \in1\().4h, \coef2
        smull2          \out4\().4s, \in1\().8h, \coef2
        smlsl           \out1\().4s, \in2\().4h, \coef2
        smlsl2          \out2\().4s, \in2\().8h, \coef2
        smlal           \out3\().4s, \in2\().4h, \coef1
        smlal2          \out4\().4s, \in2\().8h, \coef1
// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .8h registers
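// This is dmbutterfly_l plus a rounding narrow back to 16 bit; as a scalar
// sketch (temporary names are ours):
//     t1 = inout1[i] * coef1 - inout2[i] * coef2;
//     t2 = inout1[i] * coef2 + inout2[i] * coef1;
//     inout1[i] = (t1 + (1 << 13)) >> 14;
//     inout2[i] = ((neg ? -t2 : t2) + (1 << 13)) >> 14;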
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().4s, \tmp3\().4s
        neg             \tmp4\().4s, \tmp4\().4s
.endif
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
.macro butterfly_8h out1, out2, in1, in2
        add             \out1\().8h, \in1\().8h, \in2\().8h
        sub             \out2\().8h, \in1\().8h, \in2\().8h

.macro butterfly_8h_r out1, out2, in1, in2
        sub             \out1\().8h, \in1\().8h, \in2\().8h
        add             \out2\().8h, \in1\().8h, \in2\().8h
// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .8h registers, in are 4 x .4s registers
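// As a scalar sketch (in12/in34 stand for the 32 bit input pairs
// in1:in2 and in3:in4):
//     out1[i] = (in12[i] + in34[i] + (1 << 13)) >> 14;
//     out2[i] = (in12[i] - in34[i] + (1 << 13)) >> 14;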
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().4s, \in1\().4s, \in3\().4s
        add             \tmp2\().4s, \in2\().4s, \in4\().4s
        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
        rshrn           \out1\().4h, \tmp1\().4s, #14
        rshrn2          \out1\().8h, \tmp2\().4s, #14
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
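// A scalar model of the inverse Walsh-Hadamard butterfly below (a sketch;
// the temporary names e and f are ours, not from the original source):
//     in0 += in1;
//     e    = in2 - in3;
//     f    = (in0 - e) >> 1;
//     out2 = f - in1;
//     out1 = f - in3;
//     out3 = e + out2;
//     out0 = in0 - out1;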
.macro iwht4 c0, c1, c2, c3
        add             \c0\().4h, \c0\().4h, \c1\().4h
        sub             v17.4h, \c2\().4h, \c3\().4h
        sub             v16.4h, \c0\().4h, v17.4h
        sshr            v16.4h, v16.4h, #1
        sub             \c2\().4h, v16.4h, \c1\().4h
        sub             \c1\().4h, v16.4h, \c3\().4h
        add             \c3\().4h, v17.4h, \c2\().4h
        sub             \c0\().4h, \c0\().4h, \c1\().4h
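// A scalar sketch of the 4-point IDCT below, using the Q14 cosines from
// itxfm4_coeffs: c16 = 11585 (v0.h[0]), c24 = 6270 (v0.h[1]),
// c8 = 15137 (v0.h[2]), with round14(x) = (x + (1 << 13)) >> 14:
//     t0 = round14((in0 + in2) * c16);
//     t1 = round14((in0 - in2) * c16);
//     t2 = round14(in1 * c24 - in3 * c8);
//     t3 = round14(in1 * c8  + in3 * c24);
//     out0 = t0 + t3;  out3 = t0 - t3;
//     out1 = t1 + t2;  out2 = t1 - t2;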
.macro idct4 c0, c1, c2, c3
        smull           v22.4s, \c1\().4h, v0.h[2]
        smull           v20.4s, \c1\().4h, v0.h[1]
        add             v16.4h, \c0\().4h, \c2\().4h
        sub             v17.4h, \c0\().4h, \c2\().4h
        smlal           v22.4s, \c3\().4h, v0.h[1]
        smull           v18.4s, v16.4h, v0.h[0]
        smull           v19.4s, v17.4h, v0.h[0]
        smlsl           v20.4s, \c3\().4h, v0.h[2]
        rshrn           v22.4h, v22.4s, #14
        rshrn           v18.4h, v18.4s, #14
        rshrn           v19.4h, v19.4s, #14
        rshrn           v20.4h, v20.4s, #14
        add             \c0\().4h, v18.4h, v22.4h
        sub             \c3\().4h, v18.4h, v22.4h
        add             \c1\().4h, v19.4h, v20.4h
        sub             \c2\().4h, v19.4h, v20.4h
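// A scalar sketch of the 4-point IADST below, using the Q14 sines from
// iadst4_coeffs, loaded as v0.h[4..7] = 5283, 15212, 9929, 13377
// (the sinpi_1_9, sinpi_4_9, sinpi_2_9 and sinpi_3_9 constants):
//     t0 = x0 * 5283 + x2 * 15212 + x3 * 9929;
//     t1 = x0 * 9929 - x2 * 5283  - x3 * 15212;
//     t2 = (x0 - x2 + x3) * 13377;
//     t3 = x1 * 13377;
//     out0 = round14(t0 + t3);
//     out1 = round14(t1 + t3);
//     out2 = round14(t2);
//     out3 = round14(t0 + t1 - t3);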
.macro iadst4 c0, c1, c2, c3
        smull           v16.4s, \c0\().4h, v0.h[4]
        smlal           v16.4s, \c2\().4h, v0.h[5]
        smlal           v16.4s, \c3\().4h, v0.h[6]
        smull           v17.4s, \c0\().4h, v0.h[6]
        smlsl           v17.4s, \c2\().4h, v0.h[4]
        sub             \c0\().4h, \c0\().4h, \c2\().4h
        smlsl           v17.4s, \c3\().4h, v0.h[5]
        add             \c0\().4h, \c0\().4h, \c3\().4h
        smull           v19.4s, \c1\().4h, v0.h[7]
        smull           v18.4s, \c0\().4h, v0.h[7]
        add             v20.4s, v16.4s, v19.4s
        add             v21.4s, v17.4s, v19.4s
        rshrn           \c0\().4h, v20.4s, #14
        add             v16.4s, v16.4s, v17.4s
        rshrn           \c1\().4h, v21.4s, #14
        sub             v16.4s, v16.4s, v19.4s
        rshrn           \c2\().4h, v18.4s, #14
        rshrn           \c3\().4h, v16.4s, #14
// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
.macro itxfm_func4x4 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
        movrel          x4, itxfm4_coeffs
        movrel          x4, iadst4_coeffs
        movrel          x4, itxfm4_coeffs
.ifc \txfm1\()_\txfm2,idct_idct
        // DC-only for idct/idct
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
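        // In scalar terms, this DC-only shortcut amounts to (sketch only):
        //     int dc = (block[0] * 11585 + (1 << 13)) >> 14;
        //     dc     = (dc * 11585 + (1 << 13)) >> 14;
        // after which ((dc + 8) >> 4) is added to every pixel of the block.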
        ld1             {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
        st1             {v31.8h}, [x2], #16

        sshr            v4.4h, v4.4h, #2
        sshr            v5.4h, v5.4h, #2
        sshr            v6.4h, v6.4h, #2
        sshr            v7.4h, v7.4h, #2

        \txfm1\()4      v4, v5, v6, v7
        st1             {v31.8h}, [x2], #16
        // Transpose 4x4 with 16 bit elements
        transpose_4x4H  v4, v5, v6, v7, v16, v17, v18, v19

        \txfm2\()4      v4, v5, v6, v7
        ld1r            {v0.2s}, [x0], x1
        ld1r            {v1.2s}, [x0], x1
        srshr           v4.4h, v4.4h, #4
        srshr           v5.4h, v5.4h, #4
        srshr           v6.4h, v6.4h, #4
        srshr           v7.4h, v7.4h, #4
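        // The final reconstruction per pixel is, in scalar form (sketch):
        //     dst[i] = av_clip_uint8(dst[i] + ((coef[i] + 8) >> 4));
        // the widening adds below do the sum, and a saturating narrow
        // clamps back to 8 bit before the stores.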
        uaddw           v4.8h, v4.8h, v0.8b
        uaddw           v5.8h, v5.8h, v1.8b
        ld1r            {v2.2s}, [x0], x1
        ld1r            {v3.2s}, [x0], x1

        sub             x0, x0, x1, lsl #2

        uaddw           v6.8h, v6.8h, v2.8b
        uaddw           v7.8h, v7.8h, v3.8b
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1

itxfm_func4x4 idct, idct
itxfm_func4x4 iadst, idct
itxfm_func4x4 idct, iadst
itxfm_func4x4 iadst, iadst
itxfm_func4x4 iwht, iwht
        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
        dmbutterfly     v18, v22, v0.h[1], v0.h[2], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
        dmbutterfly     v17, v23, v0.h[3], v0.h[4], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
        dmbutterfly     v21, v19, v0.h[5], v0.h[6], v2, v3, v4, v5 // v21 = t5a, v19 = t6a

        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2

        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5

        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
        dmbutterfly_l   v2, v3, v4, v5, v19, v20, v1.h[5], v1.h[4]     // v2,v3 = t5a, v4,v5 = t4a
        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a

        dbutterfly_n    v4, v5, v26, v27, v4, v5, v6, v7, v26, v27     // v4 = t0, v5 = t4
        dbutterfly_n    v2, v3, v24, v25, v2, v3, v6, v7, v26, v27     // v2 = t1, v3 = t5
        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6, v7, v26, v27 // v24 = t2, v25 = t6
        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6, v7, v26, v27 // v30 = t3, v31 = t7

        butterfly_8h    v16, v6, v4, v24 // v16 = out[0], v6 = t2
        butterfly_8h    v23, v7, v2, v30 // v23 = -out[7], v7 = t3
        neg             v23.8h, v23.8h   // v23 = out[7]

        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
        neg             v19.8h, v19.8h   // v19 = out[3]

        dmbutterfly_l   v26, v27, v28, v29, v5, v3, v0.h[1], v0.h[2]   // v26,v27 = t5a, v28,v29 = t4a
        dmbutterfly_l   v2, v3, v4, v5, v31, v25, v0.h[2], v0.h[1]     // v2,v3 = t6a, v4,v5 = t7a

        dbutterfly_n    v17, v30, v28, v29, v2, v3, v6, v7, v24, v25   // v17 = -out[1], v30 = t6
        dbutterfly_n    v22, v31, v26, v27, v4, v5, v6, v7, v24, v25   // v22 = out[6], v31 = t7
        neg             v17.8h, v17.8h   // v17 = out[1]

        dmbutterfly0    v18, v21, v30, v31, v2, v3, v4, v5, v6, v7     // v18 = out[2], v21 = -out[5]
        neg             v21.8h, v21.8h   // v21 = out[5]
.macro itxfm_func8x8 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4, idct_coeffs
        movrel          x4, iadst8_coeffs
        ld1             {v1.8h}, [x4], #16

.ifc \txfm1\()_\txfm2,idct_idct
        // DC-only for idct/idct
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        ld1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x2], #64
        ld1             {v20.16b,v21.16b,v22.16b,v23.16b}, [x2], #64
        st1             {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64
        st1             {v2.16b,v3.16b,v4.16b,v5.16b}, [x2], #64

        // Transpose 8x8 with 16 bit elements
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        // Add into the destination
        ld1             {v0.8b}, [x0], x1
        srshr           v16.8h, v16.8h, #5
        ld1             {v1.8b}, [x0], x1
        srshr           v17.8h, v17.8h, #5
        ld1             {v2.8b}, [x0], x1
        srshr           v18.8h, v18.8h, #5
        uaddw           v16.8h, v16.8h, v0.8b
        ld1             {v3.8b}, [x0], x1
        srshr           v19.8h, v19.8h, #5
        uaddw           v17.8h, v17.8h, v1.8b
        ld1             {v4.8b}, [x0], x1
        srshr           v20.8h, v20.8h, #5
        uaddw           v18.8h, v18.8h, v2.8b
        ld1             {v5.8b}, [x0], x1
        srshr           v21.8h, v21.8h, #5
        uaddw           v19.8h, v19.8h, v3.8b
        ld1             {v6.8b}, [x0], x1
        srshr           v22.8h, v22.8h, #5
        uaddw           v20.8h, v20.8h, v4.8b
        ld1             {v7.8b}, [x0], x1
        srshr           v23.8h, v23.8h, #5
        uaddw           v21.8h, v21.8h, v5.8b
        st1             {v0.8b}, [x3], x1
        uaddw           v22.8h, v22.8h, v6.8b
        st1             {v1.8b}, [x3], x1
        st1             {v2.8b}, [x3], x1
        uaddw           v23.8h, v23.8h, v7.8b
        st1             {v3.8b}, [x3], x1
        st1             {v4.8b}, [x3], x1
        st1             {v5.8b}, [x3], x1
        st1             {v6.8b}, [x3], x1
        st1             {v7.8b}, [x3], x1

itxfm_func8x8 idct, idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct, iadst
itxfm_func8x8 iadst, iadst
function idct16x16_dc_add_neon
        movrel          x4, idct_coeffs

        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14

        srshr           v2.8h, v2.8h, #6

        // Loop to add the constant from v2 into all 16x16 outputs
        uaddw           v4.8h, v2.8h, v3.8b
        uaddw2          v5.8h, v2.8h, v3.16b
        sqxtun2         v4.16b, v5.8h
        st1             {v4.16b}, [x0], x1
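        // In scalar terms this DC-only function computes (sketch only):
        //     int dc = (block[0] * 11585 + (1 << 13)) >> 14;
        //     dc     = (dc * 11585 + (1 << 13)) >> 14;
        //     dc     = (dc + (1 << 5)) >> 6;
        //     for every pixel of the 16x16 block:
        //         dst[x] = av_clip_uint8(dst[x] + dc);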
        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
        dmbutterfly     v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
        dmbutterfly     v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
        dmbutterfly     v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
        dmbutterfly     v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
        dmbutterfly     v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
        dmbutterfly     v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4, v28, v16, v28  // v4 = t0, v28 = t3
        butterfly_8h    v5, v20, v24, v20  // v5 = t1, v20 = t2
        butterfly_8h    v6, v26, v18, v26  // v6 = t4, v26 = t5
        butterfly_8h    v7, v22, v30, v22  // v7 = t7, v22 = t6
        butterfly_8h    v16, v25, v17, v25 // v16 = t8, v25 = t9
        butterfly_8h    v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_8h    v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a, v26 = t5a
        dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31        // v23 = t9a, v25 = t14a
        dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a

        butterfly_8h    v18, v7, v4, v7    // v18 = t0a, v7 = t7a
        butterfly_8h    v19, v22, v5, v22  // v19 = t1a, v22 = t6
        butterfly_8h    v4, v26, v20, v26  // v4 = t2a, v26 = t5
        butterfly_8h    v5, v6, v28, v6    // v5 = t3a, v6 = t4
        butterfly_8h    v20, v28, v16, v24 // v20 = t8a, v28 = t11a
        butterfly_8h    v24, v21, v23, v21 // v24 = t9, v21 = t10
        butterfly_8h    v23, v27, v25, v27 // v23 = t14, v27 = t13
        butterfly_8h    v25, v29, v29, v17 // v25 = t15a, v29 = t12a

        dmbutterfly0    v2, v3, v27, v21, v2, v3, v16, v17, v30, v31     // v2 = t13a, v3 = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12, v27 = t11

        butterfly_8h    v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
        butterfly_8h    v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
        butterfly_8h_r  v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
        butterfly_8h    v23, v24, v7, v20  // v23 = out[7], v24 = out[8]
        butterfly_8h    v18, v29, v4, v2   // v18 = out[2], v29 = out[13]
        butterfly_8h    v19, v28, v5, v28  // v19 = out[3], v28 = out[12]
        butterfly_8h    v20, v27, v6, v27  // v20 = out[4], v27 = out[11]
        butterfly_8h    v21, v26, v26, v3  // v21 = out[5], v26 = out[10]
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly_l   v6, v7, v4, v5, v31, v16, v0.h[1], v0.h[0]       // v6,v7 = t1, v4,v5 = t0
        dmbutterfly_l   v10, v11, v8, v9, v23, v24, v1.h[1], v1.h[0]     // v10,v11 = t9, v8,v9 = t8
        dbutterfly_n    v31, v24, v6, v7, v10, v11, v12, v13, v10, v11   // v31 = t1a, v24 = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2]   // v14,v15 = t3, v12,v13 = t2
        dbutterfly_n    v16, v23, v4, v5, v8, v9, v6, v7, v8, v9         // v16 = t0a, v23 = t8a

        dmbutterfly_l   v6, v7, v4, v5, v21, v26, v1.h[3], v1.h[2]       // v6,v7 = t11, v4,v5 = t10
        dbutterfly_n    v29, v26, v14, v15, v6, v7, v8, v9, v6, v7       // v29 = t3a, v26 = t11a
        dmbutterfly_l   v10, v11, v8, v9, v27, v20, v0.h[5], v0.h[4]     // v10,v11 = t5, v8,v9 = t4
        dbutterfly_n    v18, v21, v12, v13, v4, v5, v6, v7, v4, v5       // v18 = t2a, v21 = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4]   // v14,v15 = t13, v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4, v5, v14, v15   // v20 = t5a, v28 = t13a
        dmbutterfly_l   v6, v7, v4, v5, v25, v22, v0.h[7], v0.h[6]       // v6,v7 = t7, v4,v5 = t6
        dbutterfly_n    v27, v19, v8, v9, v12, v13, v10, v11, v12, v13   // v27 = t4a, v19 = t12a

        dmbutterfly_l   v10, v11, v8, v9, v17, v30, v1.h[7], v1.h[6]     // v10,v11 = t15, v8,v9 = t14
        dbutterfly_n    v22, v30, v6, v7, v10, v11, v12, v13, v10, v11   // v22 = t7a, v30 = t15a
        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[3], v0.h[4]   // v14,v15 = t9, v12,v13 = t8
        dbutterfly_n    v25, v17, v4, v5, v8, v9, v6, v7, v8, v9         // v25 = t6a, v17 = t14a

        dmbutterfly_l   v4, v5, v6, v7, v28, v19, v0.h[4], v0.h[3]       // v4,v5 = t12, v6,v7 = t13
        dbutterfly_n    v23, v19, v12, v13, v4, v5, v8, v9, v4, v5       // v23 = t8a, v19 = t12a
        dmbutterfly_l   v10, v11, v8, v9, v21, v26, v0.h[5], v0.h[6]     // v10,v11 = t11, v8,v9 = t10
        butterfly_8h_r  v4, v27, v16, v27 // v4 = t4, v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6, v7, v12, v13, v6, v7     // v24 = t9a, v28 = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[6], v0.h[5]   // v12,v13 = t14, v14,v15 = t15
        butterfly_8h_r  v5, v20, v31, v20 // v5 = t5, v20 = t1
        dbutterfly_n    v21, v17, v8, v9, v12, v13, v6, v7, v12, v13     // v21 = t10a, v17 = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8, v9, v14, v15   // v26 = t11a, v30 = t15a

        butterfly_8h_r  v6, v25, v18, v25 // v6 = t6, v25 = t2
        butterfly_8h_r  v7, v22, v29, v22 // v7 = t7, v22 = t3

        dmbutterfly_l   v10, v11, v8, v9, v19, v28, v0.h[1], v0.h[2]     // v10,v11 = t13, v8,v9 = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[2], v0.h[1]   // v12,v13 = t14, v14,v15 = t15

        dbutterfly_n    v18, v30, v8, v9, v12, v13, v16, v17, v12, v13   // v18 = out[2], v30 = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
        neg             v29.8h, v29.8h // v29 = out[13]

        dmbutterfly_l   v10, v11, v8, v9, v4, v5, v0.h[1], v0.h[2]       // v10,v11 = t5a, v8,v9 = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7, v6, v0.h[2], v0.h[1]     // v12,v13 = t6a, v14,v15 = t7a

        butterfly_8h    v2, v6, v27, v25 // v2 = out[0], v6 = t2a
        butterfly_8h    v3, v7, v23, v21 // v3 = -out[1], v7 = t10

        dbutterfly_n    v19, v31, v8, v9, v12, v13, v4, v5, v8, v9       // v19 = -out[3], v31 = t6
        neg             v19.8h, v19.8h // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4, v5, v10, v11   // v28 = out[12], v16 = t7

        butterfly_8h    v5, v8, v20, v22 // v5 = -out[15], v8 = t3a
        butterfly_8h    v4, v9, v24, v26 // v4 = out[14], v9 = t11

        dmbutterfly0    v23, v24, v6, v8, v10, v11, v12, v13, v14, v15, 1   // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9, v7, v10, v11, v12, v13, v14, v15      // v22 = out[6], v25 = out[9]

        neg             v31.8h, v5.8h // v31 = out[15]
        neg             v17.8h, v3.8h // v17 = out[1]
// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
.macro load i, src, inc
        ld1             {v\i\().8h}, [\src], \inc

.macro store i, dst, inc
        st1             {v\i\().8h}, [\dst], \inc

.macro movi_v i, size, imm
        movi            v\i\()\size, \imm

.macro load_clear i, src, inc
        ld1             {v\i\().8h}, [\src]
        st1             {v2.8h}, [\src], \inc

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31

        // Special case: For the last input column (x1 == 8),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 8x8 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 8x8 block).
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 8x16 slice), add and store back.
// x2 = src (temp buffer)
// x9 = temp buffer stride
function \txfm\()16_1d_8x16_pass2_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
        srshr           \coef0, \coef0, #6
        ld1             {v2.8b}, [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v3.8b}, [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v4.8b}, [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v2.8b
        ld1             {v5.8b}, [x3], x1
        uaddw           \coef1, \coef1, v3.8b
        srshr           \coef4, \coef4, #6
        ld1             {v6.8b}, [x0], x1
        srshr           \coef5, \coef5, #6
        ld1             {v7.8b}, [x3], x1
        srshr           \coef6, \coef6, #6
        srshr           \coef7, \coef7, #6
        uaddw           \coef2, \coef2, v4.8b
        ld1             {\tmp1}, [x0], x1
        uaddw           \coef3, \coef3, v5.8b
        ld1             {\tmp2}, [x3], x1

        sub             x0, x0, x1, lsl #2
        sub             x3, x3, x1, lsl #2

        uaddw           \coef4, \coef4, v6.8b
        st1             {v2.8b}, [x0], x1
        uaddw           \coef5, \coef5, v7.8b
        st1             {v3.8b}, [x3], x1
        st1             {v4.8b}, [x0], x1
        st1             {v5.8b}, [x3], x1
        uaddw           \coef6, \coef6, \tmp1
        st1             {v6.8b}, [x0], x1
        uaddw           \coef7, \coef7, \tmp2
        st1             {v7.8b}, [x3], x1
        st1             {\tmp1}, [x0], x1
        st1             {\tmp2}, [x3], x1

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
.purgem load_add_store

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
        b.eq            idct16x16_dc_add_neon

        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
        ld1             {v0.8h,v1.8h}, [x10]

.ifc \txfm1\()_\txfm2,idct_idct
        bl              \txfm1\()16_1d_8x16_pass1_neon
.ifc \txfm1\()_\txfm2,iadst_idct
        ld1             {v0.8h,v1.8h}, [x10]
.ifc \txfm1\()_\txfm2,idct_idct
        // Set v24-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2. Since we only do two slices, this can
        // only ever happen for the second slice. So we only need to store
        // zeros to the temp buffer for the second half of the buffer.
        // Move x0 to the second half, and use x9 == 32 as increment.
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v24.8h}, [x0], x9
        bl              \txfm2\()16_1d_8x16_pass2_neon

.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10

itxfm_func16x16 idct, idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst
function idct32x32_dc_add_neon
        movrel          x4, idct_coeffs

        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14
        smull           v2.4s, v2.4h, v0.h[0]
        rshrn           v2.4h, v2.4s, #14

        srshr           v0.8h, v2.8h, #6

        // Loop to add the constant v0 into all 32x32 outputs
        ld1             {v1.16b,v2.16b}, [x0]
        uaddw           v3.8h, v0.8h, v1.8b
        uaddw2          v4.8h, v0.8h, v1.16b
        uaddw           v5.8h, v0.8h, v2.8b
        uaddw2          v6.8h, v0.8h, v2.16b
        sqxtun2         v3.16b, v4.8h
        sqxtun2         v4.16b, v6.8h
        st1             {v3.16b,v4.16b}, [x0], x1
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly     v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4, v24, v16, v24  // v4 = t16, v24 = t17
        butterfly_8h    v5, v20, v28, v20  // v5 = t19, v20 = t18
        butterfly_8h    v6, v26, v18, v26  // v6 = t20, v26 = t21
        butterfly_8h    v7, v22, v30, v22  // v7 = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a

        butterfly_8h    v16, v5, v4, v5    // v16 = t16a, v5 = t19a
        butterfly_8h    v17, v20, v23, v20 // v17 = t17, v20 = t18
        butterfly_8h    v18, v6, v7, v6    // v18 = t23a, v6 = t20a
        butterfly_8h    v19, v21, v22, v21 // v19 = t22, v21 = t21
        butterfly_8h    v4, v28, v28, v30  // v4 = t24a, v28 = t27a
        butterfly_8h    v23, v26, v25, v26 // v23 = t25, v26 = t26
        butterfly_8h    v7, v3, v29, v31   // v7 = t31a, v3 = t28a
        butterfly_8h    v22, v27, v24, v27 // v22 = t30, v27 = t29

        dmbutterfly     v27, v20, v0.h[1], v0.h[2], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v3, v5, v0.h[1], v0.h[2], v24, v25, v30, v31          // v3 = t19, v5 = t28
        dmbutterfly     v28, v6, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1  // v28 = t27, v6 = t20
        dmbutterfly     v26, v21, v0.h[1], v0.h[2], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_8h    v31, v24, v7, v4   // v31 = t31, v24 = t24
        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23, v16 = t16
        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_8h    v18, v21, v27, v21 // v18 = t18, v21 = t21
        butterfly_8h_r  v27, v28, v5, v28  // v27 = t27a, v28 = t28a
        butterfly_8h    v29, v26, v20, v26 // v29 = t29, v26 = t26
        butterfly_8h    v19, v20, v3, v6   // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27, v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
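// In scalar form the decomposition is roughly (a sketch; idct16_even/_odd
// are our names for the two half transforms, not from the original source):
//     idct16_even(in_even, even); // 16-point IDCT of inputs 0, 2, 4, ...
//     idct16_odd(in_odd, odd);    // 16-point transform of inputs 1, 3, ...
//     for (int i = 0; i < 16; i++) {
//         out[i]      = even[i] + odd[i];
//         out[31 - i] = even[i] - odd[i];
//     }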
// x0 = dst (temp buffer)
// x9 = double input stride
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass1_neon
        ld1             {v0.8h,v1.8h}, [x10]

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x2]
        st1             {v4.8h}, [x2], x9

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
        // two transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the registers a, b horizontally, followed by the
        // same registers b, a mirrored.
.macro store_rev a, b
        // There's no rev128 instruction, but we reverse each 64 bit
        // half, and then flip them using an ext with an 8-byte offset.
        rev64           v1.8h, v\b\().8h
        st1             {v\a\().8h}, [x0], #16
        rev64           v0.8h, v\a\().8h
        ext             v1.16b, v1.16b, v1.16b, #8
        st1             {v\b\().8h}, [x0], #16
        ext             v0.16b, v0.16b, v0.16b, #8
        st1             {v1.8h}, [x0], #16
        st1             {v0.8h}, [x0], #16
        // Move x2 back to the start of the input, and move
        // to the first odd row.
        sub             x2, x2, x9, lsl #4

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x2]
        st1             {v4.8h}, [x2], x9

        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3

        // Store the registers a, b horizontally, adding into the output
        // first, and then the mirrored registers b, a, subtracted from
        // the output.
.macro store_rev a, b
        rev64           v1.8h, v\b\().8h
        add             v4.8h, v4.8h, v\a\().8h
        rev64           v0.8h, v\a\().8h
        st1             {v4.8h}, [x0], #16
        ext             v1.16b, v1.16b, v1.16b, #8
        ext             v0.16b, v0.16b, v0.16b, #8
        add             v5.8h, v5.8h, v\b\().8h
        st1             {v5.8h}, [x0], #16
        sub             v6.8h, v6.8h, v1.8h
        st1             {v6.8h}, [x0], #16
        sub             v7.8h, v7.8h, v0.8h
        st1             {v7.8h}, [x0], #16
// This is mostly the same as 8x32_pass1, but without the transpose; it uses
// the source as a temp buffer between the two idct passes, and adds into
// the destination.
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass2_neon
        ld1             {v0.8h,v1.8h}, [x10]

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x2], x9
        sub             x2, x2, x9, lsl #4

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().8h}, [x2], x9
        sub             x2, x2, x9, lsl #4

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().8h}, [x2], x9
        sub             x2, x2, x9, lsl #4
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.8h}, [x2], x9
        ld1             {v5.8h}, [x2], x9
        add             v4.8h, v4.8h, v\a\().8h
        ld1             {v6.8h}, [x2], x9
        add             v5.8h, v5.8h, v\b\().8h
        ld1             {v7.8h}, [x2], x9
        add             v6.8h, v6.8h, v\c\().8h
        add             v7.8h, v7.8h, v\d\().8h
.else
        ld1             {v4.8h}, [x2], x7
        ld1             {v5.8h}, [x2], x7
        sub             v4.8h, v4.8h, v\a\().8h
        ld1             {v6.8h}, [x2], x7
        sub             v5.8h, v5.8h, v\b\().8h
        ld1             {v7.8h}, [x2], x7
        sub             v6.8h, v6.8h, v\c\().8h
        sub             v7.8h, v7.8h, v\d\().8h
.endif
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        srshr           v4.8h, v4.8h, #6
        ld1             {v2.8b}, [x0], x1
        srshr           v5.8h, v5.8h, #6
        uaddw           v4.8h, v4.8h, v0.8b
        ld1             {v3.8b}, [x0], x1
        srshr           v6.8h, v6.8h, #6
        uaddw           v5.8h, v5.8h, v1.8b
        srshr           v7.8h, v7.8h, #6
        sub             x0, x0, x1, lsl #2
        uaddw           v6.8h, v6.8h, v2.8b
        uaddw           v7.8h, v7.8h, v3.8b

        st1             {v4.8b}, [x0], x1
        st1             {v5.8b}, [x0], x1
        st1             {v6.8b}, [x0], x1
        st1             {v7.8b}, [x0], x1

        load_acc_store  31, 30, 29, 28
        load_acc_store  27, 26, 25, 24
        load_acc_store  23, 22, 21, 20
        load_acc_store  19, 18, 17, 16
        load_acc_store  16, 17, 18, 19, 1
        load_acc_store  20, 21, 22, 23, 1
        load_acc_store  24, 25, 26, 27, 1
        load_acc_store  28, 29, 30, 31, 1
.purgem load_acc_store
const min_eob_idct_idct_32, align=4
        .short  0, 34, 135, 336
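// Our reading (an annotation, not a comment from the original source):
// entry i holds the smallest eob for which the i'th 8-line slice of pass 1
// can still contain nonzero coefficients; below that threshold the slice is
// known to be all zeros and its transform can be skipped.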
function ff_vp9_idct_idct_32x32_add_neon, export=1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs
        movrel          x12, min_eob_idct_idct_32, 2

        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!

        // Double stride of the input, since we only read every other line
.irp i, 0, 8, 16, 24
        add             x0, sp, #(\i*64)
        mov             x1, #(32 - \i)/4
        bl              idct32_1d_8x32_pass1_neon

        // Write zeros to the temp buffer for pass 2
        st1             {v16.8h-v19.8h}, [x0], #64
.irp i, 0, 8, 16, 24
        bl              idct32_1d_8x32_pass2_neon

        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10