/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/aarch64/asm.S"
const itxfm4_coeffs, align=4
        .short  11585, 0, 6270, 15137
iadst4_coeffs:
        .short  5283, 15212, 9929, 13377
endconst
const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst
const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst
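// Note: the tables above hold Q14 fixed point trig constants, e.g.
// 11585 ~= 16384*cos(pi/4) and 15137 ~= 16384*cos(pi/8), so a multiply
// followed by a rounding right shift by 14 approximates the floating
// point product.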
.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
        trn1            \r4\().4s, \r0\().4s, \r1\().4s
        trn2            \r5\().4s, \r0\().4s, \r1\().4s
        trn1            \r6\().4s, \r2\().4s, \r3\().4s
        trn2            \r7\().4s, \r2\().4s, \r3\().4s
        trn1            \r0\().2d, \r4\().2d, \r6\().2d
        trn2            \r2\().2d, \r4\().2d, \r6\().2d
        trn1            \r1\().2d, \r5\().2d, \r7\().2d
        trn2            \r3\().2d, \r5\().2d, \r7\().2d
.endm
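// E.g. for rows (a0 a1 a2 a3) .. (d0 d1 d2 d3), the trn1/trn2 pairs
// interleave the 32 bit lanes and the following .2d trn pair swaps the
// 64 bit halves, leaving the columns (a0 b0 c0 d0) .. (a3 b3 c3 d3)
// in r0-r3.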
// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
// over two registers.
.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
        transpose_4x4s  \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3
        transpose_4x4s  \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3

        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
        // while swapping the two 4x4 matrices between each other

        // First step of the 4x4 transpose of r1-r7, into t0-t3
        trn1            \t0\().4s, \r1\().4s, \r3\().4s
        trn2            \t1\().4s, \r1\().4s, \r3\().4s
        trn1            \t2\().4s, \r5\().4s, \r7\().4s
        trn2            \t3\().4s, \r5\().4s, \r7\().4s

        // First step of the 4x4 transpose of r8-r14, into r1-r7
        trn1            \r1\().4s, \r8\().4s, \r10\().4s
        trn2            \r3\().4s, \r8\().4s, \r10\().4s
        trn1            \r5\().4s, \r12\().4s, \r14\().4s
        trn2            \r7\().4s, \r12\().4s, \r14\().4s

        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
        trn1            \r8\().2d, \t0\().2d, \t2\().2d
        trn2            \r12\().2d, \t0\().2d, \t2\().2d
        trn1            \r10\().2d, \t1\().2d, \t3\().2d
        trn2            \r14\().2d, \t1\().2d, \t3\().2d

        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
        trn1            \t0\().2d, \r1\().2d, \r5\().2d
        trn2            \r5\().2d, \r1\().2d, \r5\().2d
        trn1            \t1\().2d, \r3\().2d, \r7\().2d
        trn2            \r7\().2d, \r3\().2d, \r7\().2d

        // Move the outputs of trn1 back in place
        mov             \r1\().16b, \t0\().16b
        mov             \r3\().16b, \t1\().16b
.endm
// out1 = ((in1 + in2) * v0.s[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0.s[0] + (1 << 13)) >> 14
// in/out are .4s registers; this can do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4s, v0.4s
.endif
        add             \tmp1\().4s, \in1\().4s, \in2\().4s
        sub             \tmp2\().4s, \in1\().4s, \in2\().4s
.if \neg > 0
        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
.else
        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
.endif
.ifb \tmp5
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.else
        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        rshrn           \out2\().2s, \tmp5\().2d, #14
        rshrn2          \out2\().4s, \tmp6\().2d, #14
.endif
.endm
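// E.g. with v0.s[0] = 11585 (cos(pi/4) in Q14) this is the butterfly
// rotation out1 ~= (in1 + in2)/sqrt(2), out2 ~= (in1 - in2)/sqrt(2).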
// Same as dmbutterfly0 above, but treating the input in in2 as zero,
// writing the same output into both out1 and out2.
.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
        smull           \tmp1\().2d, \in1\().2s, v0.s[0]
        smull2          \tmp2\().2d, \in1\().4s, v0.s[0]
        rshrn           \out1\().2s, \tmp1\().2d, #14
        rshrn2          \out1\().4s, \tmp2\().2d, #14
        rshrn           \out2\().2s, \tmp1\().2d, #14
        rshrn2          \out2\().4s, \tmp2\().2d, #14
.endm
// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .2d registers, in are 2 x .4s registers
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().2d, \in1\().2s, \coef1
        smull2          \out2\().2d, \in1\().4s, \coef1
        smull           \out3\().2d, \in1\().2s, \coef2
        smull2          \out4\().2d, \in1\().4s, \coef2
        smlsl           \out1\().2d, \in2\().2s, \coef2
        smlsl2          \out2\().2d, \in2\().4s, \coef2
        smlal           \out3\().2d, \in2\().2s, \coef1
        smlal2          \out4\().2d, \in2\().4s, \coef1
.endm
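// The products are kept in full 64 bit (.2d) precision here; for 12 bpp
// content, an intermediate coefficient times a Q14 constant can exceed
// 32 bits.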
// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .4s registers
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().2d, \tmp3\().2d
        neg             \tmp4\().2d, \tmp4\().2d
.endif
        rshrn           \inout1\().2s, \tmp1\().2d, #14
        rshrn2          \inout1\().4s, \tmp2\().2d, #14
        rshrn           \inout2\().2s, \tmp3\().2d, #14
        rshrn2          \inout2\().4s, \tmp4\().2d, #14
.endm
// Same as dmbutterfly above, but treating the input in inout2 as zero
.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().2d, \inout1\().2s, \coef1
        smull2          \tmp2\().2d, \inout1\().4s, \coef1
        smull           \tmp3\().2d, \inout1\().2s, \coef2
        smull2          \tmp4\().2d, \inout1\().4s, \coef2
        rshrn           \inout1\().2s, \tmp1\().2d, #14
        rshrn2          \inout1\().4s, \tmp2\().2d, #14
        rshrn           \inout2\().2s, \tmp3\().2d, #14
        rshrn2          \inout2\().4s, \tmp4\().2d, #14
.endm
// Same as dmbutterfly above, but treating the input in inout1 as zero
.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().2d, \inout2\().2s, \coef2
        smull2          \tmp2\().2d, \inout2\().4s, \coef2
        smull           \tmp3\().2d, \inout2\().2s, \coef1
        smull2          \tmp4\().2d, \inout2\().4s, \coef1
        neg             \tmp1\().2d, \tmp1\().2d
        neg             \tmp2\().2d, \tmp2\().2d
        rshrn           \inout2\().2s, \tmp3\().2d, #14
        rshrn2          \inout2\().4s, \tmp4\().2d, #14
        rshrn           \inout1\().2s, \tmp1\().2d, #14
        rshrn2          \inout1\().4s, \tmp2\().2d, #14
.endm
.macro dsmull_h out1, out2, in, coef
        smull           \out1\().2d, \in\().2s, \coef
        smull2          \out2\().2d, \in\().4s, \coef
.endm

.macro drshrn_h out, in1, in2, shift
        rshrn           \out\().2s, \in1\().2d, \shift
        rshrn2          \out\().4s, \in2\().2d, \shift
.endm
// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_4s out1, out2, in1, in2
        add             \out1\().4s, \in1\().4s, \in2\().4s
        sub             \out2\().4s, \in1\().4s, \in2\().4s
.endm

// out1 = in1 - in2
// out2 = in1 + in2
.macro butterfly_4s_r out1, out2, in1, in2
        sub             \out1\().4s, \in1\().4s, \in2\().4s
        add             \out2\().4s, \in1\().4s, \in2\().4s
.endm
// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .4s registers, in are 4 x .2d registers
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().2d, \in1\().2d, \in3\().2d
        add             \tmp2\().2d, \in2\().2d, \in4\().2d
        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
        rshrn           \out1\().2s, \tmp1\().2d, #14
        rshrn2          \out1\().4s, \tmp2\().2d, #14
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.endm
.macro iwht4_10 c0, c1, c2, c3
        add             \c0\().4s, \c0\().4s, \c1\().4s
        sub             v17.4s, \c2\().4s, \c3\().4s
        sub             v16.4s, \c0\().4s, v17.4s
        sshr            v16.4s, v16.4s, #1
        sub             \c2\().4s, v16.4s, \c1\().4s
        sub             \c1\().4s, v16.4s, \c3\().4s
        add             \c3\().4s, v17.4s, \c2\().4s
        sub             \c0\().4s, \c0\().4s, \c1\().4s
.endm

.macro iwht4_12 c0, c1, c2, c3
        iwht4_10        \c0, \c1, \c2, \c3
.endm
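// The Walsh-Hadamard transform above is a pure add/subtract/shift
// lifting scheme with no rounding constants, which is why the 12 bit
// version can reuse the 10 bit code unchanged.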
.macro idct4_10 c0, c1, c2, c3
        mul             v22.4s, \c1\().4s, v0.s[3]
        mul             v20.4s, \c1\().4s, v0.s[2]
        add             v16.4s, \c0\().4s, \c2\().4s
        sub             v17.4s, \c0\().4s, \c2\().4s
        mla             v22.4s, \c3\().4s, v0.s[2]
        mul             v18.4s, v16.4s, v0.s[0]
        mul             v24.4s, v17.4s, v0.s[0]
        mls             v20.4s, \c3\().4s, v0.s[3]
        srshr           v22.4s, v22.4s, #14
        srshr           v18.4s, v18.4s, #14
        srshr           v24.4s, v24.4s, #14
        srshr           v20.4s, v20.4s, #14
        add             \c0\().4s, v18.4s, v22.4s
        sub             \c3\().4s, v18.4s, v22.4s
        add             \c1\().4s, v24.4s, v20.4s
        sub             \c2\().4s, v24.4s, v20.4s
.endm
.macro idct4_12 c0, c1, c2, c3
        smull           v22.2d, \c1\().2s, v0.s[3]
        smull2          v23.2d, \c1\().4s, v0.s[3]
        smull           v20.2d, \c1\().2s, v0.s[2]
        smull2          v21.2d, \c1\().4s, v0.s[2]
        add             v16.4s, \c0\().4s, \c2\().4s
        sub             v17.4s, \c0\().4s, \c2\().4s
        smlal           v22.2d, \c3\().2s, v0.s[2]
        smlal2          v23.2d, \c3\().4s, v0.s[2]
        smull           v18.2d, v16.2s, v0.s[0]
        smull2          v19.2d, v16.4s, v0.s[0]
        smull           v24.2d, v17.2s, v0.s[0]
        smull2          v25.2d, v17.4s, v0.s[0]
        smlsl           v20.2d, \c3\().2s, v0.s[3]
        smlsl2          v21.2d, \c3\().4s, v0.s[3]
        rshrn           v22.2s, v22.2d, #14
        rshrn2          v22.4s, v23.2d, #14
        rshrn           v18.2s, v18.2d, #14
        rshrn2          v18.4s, v19.2d, #14
        rshrn           v24.2s, v24.2d, #14
        rshrn2          v24.4s, v25.2d, #14
        rshrn           v20.2s, v20.2d, #14
        rshrn2          v20.4s, v21.2d, #14
        add             \c0\().4s, v18.4s, v22.4s
        sub             \c3\().4s, v18.4s, v22.4s
        add             \c1\().4s, v24.4s, v20.4s
        sub             \c2\().4s, v24.4s, v20.4s
.endm
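// Note the difference between the two bitdepth variants above: for
// 10 bpp the products of coefficients and Q14 constants still fit in
// 32 bits, so plain mul/mla plus srshr is enough, while 12 bpp needs
// the widening smull/smlal forms followed by a narrowing rshrn.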
.macro iadst4_10 c0, c1, c2, c3
        mul             v16.4s, \c0\().4s, v1.s[0]
        mla             v16.4s, \c2\().4s, v1.s[1]
        mla             v16.4s, \c3\().4s, v1.s[2]
        mul             v18.4s, \c0\().4s, v1.s[2]
        mls             v18.4s, \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        mls             v18.4s, \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        mul             v22.4s, \c1\().4s, v1.s[3]
        mul             v20.4s, \c0\().4s, v1.s[3]
        add             v24.4s, v16.4s, v22.4s
        add             v26.4s, v18.4s, v22.4s
        srshr           \c0\().4s, v24.4s, #14
        add             v16.4s, v16.4s, v18.4s
        srshr           \c1\().4s, v26.4s, #14
        sub             v16.4s, v16.4s, v22.4s
        srshr           \c2\().4s, v20.4s, #14
        srshr           \c3\().4s, v16.4s, #14
.endm
.macro iadst4_12 c0, c1, c2, c3
        smull           v16.2d, \c0\().2s, v1.s[0]
        smull2          v17.2d, \c0\().4s, v1.s[0]
        smlal           v16.2d, \c2\().2s, v1.s[1]
        smlal2          v17.2d, \c2\().4s, v1.s[1]
        smlal           v16.2d, \c3\().2s, v1.s[2]
        smlal2          v17.2d, \c3\().4s, v1.s[2]
        smull           v18.2d, \c0\().2s, v1.s[2]
        smull2          v19.2d, \c0\().4s, v1.s[2]
        smlsl           v18.2d, \c2\().2s, v1.s[0]
        smlsl2          v19.2d, \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        smlsl           v18.2d, \c3\().2s, v1.s[1]
        smlsl2          v19.2d, \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        smull           v22.2d, \c1\().2s, v1.s[3]
        smull2          v23.2d, \c1\().4s, v1.s[3]
        smull           v20.2d, \c0\().2s, v1.s[3]
        smull2          v21.2d, \c0\().4s, v1.s[3]
        add             v24.2d, v16.2d, v22.2d
        add             v25.2d, v17.2d, v23.2d
        add             v26.2d, v18.2d, v22.2d
        add             v27.2d, v19.2d, v23.2d
        rshrn           \c0\().2s, v24.2d, #14
        rshrn2          \c0\().4s, v25.2d, #14
        add             v16.2d, v16.2d, v18.2d
        add             v17.2d, v17.2d, v19.2d
        rshrn           \c1\().2s, v26.2d, #14
        rshrn2          \c1\().4s, v27.2d, #14
        sub             v16.2d, v16.2d, v22.2d
        sub             v17.2d, v17.2d, v23.2d
        rshrn           \c2\().2s, v20.2d, #14
        rshrn2          \c2\().4s, v21.2d, #14
        rshrn           \c3\().2s, v16.2d, #14
        rshrn2          \c3\().4s, v17.2d, #14
.endm
// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
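// eob is (roughly) the number of nonzero coefficients in scan order;
// eob == 1 means only the dc coefficient is set, which the idct_idct
// variants handle via a dedicated fast path.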
.macro itxfm_func4x4 txfm1, txfm2, bpp
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1
        movrel          x4, itxfm4_coeffs
        movrel          x4, iadst4_coeffs
        movrel          x4, itxfm4_coeffs
.ifc \txfm1\()_\txfm2,idct_idct
        // DC-only for idct/idct
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
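        // For a dc-only block, both 1D passes degenerate to a single
        // multiply by v0.s[0], hence the two smull/rshrn rounds above.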
        ld1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2]
        st1             {v30.4s,v31.4s}, [x2], #32
        sshr            v4.4s, v4.4s, #2
        sshr            v5.4s, v5.4s, #2
        sshr            v6.4s, v6.4s, #2
        sshr            v7.4s, v7.4s, #2
        \txfm1\()4_\bpp v4, v5, v6, v7
        st1             {v30.4s,v31.4s}, [x2], #32
        // Transpose 4x4 with 32 bit elements
        transpose_4x4s  v4, v5, v6, v7, v16, v17, v18, v19
        \txfm2\()4_\bpp v4, v5, v6, v7
        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
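        // v31 now holds the pixel clamp value (1 << \bpp) - 1, e.g.
        // 0x3ff for 10 bpp.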
        ld1             {v0.4h}, [x0], x1
        ld1             {v1.4h}, [x0], x1
        srshr           v4.4s, v4.4s, #4
        srshr           v5.4s, v5.4s, #4
        srshr           v6.4s, v6.4s, #4
        srshr           v7.4s, v7.4s, #4
        uaddw           v4.4s, v4.4s, v0.4h
        uaddw           v5.4s, v5.4s, v1.4h
        ld1             {v2.4h}, [x0], x1
        ld1             {v3.4h}, [x0], x1
        sqxtun          v0.4h, v4.4s
        sqxtun2         v0.8h, v5.4s
        sub             x0, x0, x1, lsl #2
        uaddw           v6.4s, v6.4s, v2.4h
        umin            v0.8h, v0.8h, v31.8h
        uaddw           v7.4s, v7.4s, v3.4h
        st1             {v0.4h}, [x0], x1
        sqxtun          v2.4h, v6.4s
        sqxtun2         v2.8h, v7.4s
        umin            v2.8h, v2.8h, v31.8h
        st1             {v0.d}[1], [x0], x1
        st1             {v2.4h}, [x0], x1
        st1             {v2.d}[1], [x0], x1
        ret
endfunc
.endm
.macro itxfm_funcs4x4 bpp
itxfm_func4x4 idct,  idct,  \bpp
itxfm_func4x4 iadst, idct,  \bpp
itxfm_func4x4 idct,  iadst, \bpp
itxfm_func4x4 iadst, iadst, \bpp
itxfm_func4x4 iwht,  iwht,  \bpp
.endm

itxfm_funcs4x4 10
itxfm_funcs4x4 12
function idct8x8_dc_add_neon
        movrel          x4, idct_coeffs
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        srshr           v2.4s, v2.4s, #5
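        // The two smull/rshrn rounds above stand in for the two 1D
        // passes of a dc-only block; the rounding shift by 5 matches
        // the 8x8 idct's final output scaling.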
        // Loop to add the constant from v2 into all 8x8 outputs
        ld1             {v3.8h}, [x0], x1
        ld1             {v4.8h}, [x0], x1
        uaddw           v16.4s, v2.4s, v3.4h
        uaddw2          v17.4s, v2.4s, v3.8h
        uaddw           v18.4s, v2.4s, v4.4h
        uaddw2          v19.4s, v2.4s, v4.8h
        sqxtun          v3.4h, v16.4s
        sqxtun2         v3.8h, v17.4s
        sqxtun          v4.4h, v18.4s
        sqxtun2         v4.8h, v19.4s
        umin            v3.8h, v3.8h, v31.8h
        umin            v4.8h, v4.8h, v31.8h
        st1             {v3.8h}, [x3], x1
        st1             {v4.8h}, [x3], x1
.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5  // r0 = t0a, r4 = t1a
        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3    // r2 = t2a, r6 = t3a
        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3    // r1 = t4a, r7 = t7a
        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3    // r5 = t5a, r3 = t6a

        butterfly_4s    \t0, \t1, \r0, \r6  // t0 = t0, t1 = t3
        butterfly_4s    \t2, \r5, \r1, \r5  // t2 = t4, r5 = t5a
        butterfly_4s    \t3, \r6, \r7, \r3  // t3 = t7, r6 = t6a
        butterfly_4s    \r7, \r4, \r4, \r2  // r7 = t1, r4 = t2

        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5  // r6 = t6, r5 = t5

        butterfly_4s    \r1, \r6, \r7, \r6  // r1 = out[1], r6 = out[6]
        butterfly_4s    \r0, \r7, \t0, \t3  // r0 = out[0], r7 = out[7]
        butterfly_4s    \r2, \r5, \r4, \r5  // r2 = out[2], r5 = out[5]
        butterfly_4s    \r3, \r4, \t1, \t2  // r3 = out[3], r4 = out[4]
.endm
.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0]  // t2,t3 = t1a, t0,t1 = t0a
        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0]  // r0,r7 = t5a, t4,t5 = t4a

        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1  // r3 = t0, t0 = t4
        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5  // r4 = t1, t1 = t5

        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2]  // t4,t5 = t3a, t2,t3 = t2a
        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2]  // r2,r5 = t7a, r0,r7 = t6a

        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3  // r1 = t2, t2 = t6
        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5  // r0 = t3, t4 = t7

        butterfly_4s    \r7, \r4, \r4, \r0  // r7 = -out[7], r4 = t3
        neg             \r7\().4s, \r7\().4s  // r7 = out[7]
        butterfly_4s    \r0, \r1, \r3, \r1  // r0 = out[0], r1 = t2

        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3]  // r2,r3 = t5a, t3,t5 = t4a
        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2]  // t0,t1 = t6a, r5,r6 = t7a

        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3  // r6 = out[6], t2 = t7

        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2  // r3 = -out[3], r4 = out[4]
        neg             \r3\().4s, \r3\().4s  // r3 = out[3]

        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1  // r1 = -out[1], t0 = t6
        neg             \r1\().4s, \r1\().4s  // r1 = out[1]

        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5  // r2 = out[2], r5 = -out[5]
        neg             \r5\().4s, \r5\().4s  // r5 = out[5]
.endm
.macro itxfm_func8x8 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #1
        b.eq            idct8x8_dc_add_neon
.endif

        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4, idct_coeffs
.else
        movrel          x4, iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
        stp             d8, d9, [sp, #-0x10]!
.endif

        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64
        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
        ld1             {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64
        ld1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64

        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64

.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
.else
        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
.endif

        // Transpose 8x8 with 32 bit elements
        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7
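        // The 2D transform is separable: the first pass above
        // transformed one dimension, and after this transpose the same
        // 1D transforms are applied in the other dimension.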
.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7
.else
        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9
.endif
        // Add into the destination
        ld1             {v0.8h}, [x0], x1
        srshr           v16.4s, v16.4s, #5
        srshr           v17.4s, v17.4s, #5
        ld1             {v1.8h}, [x0], x1
        srshr           v18.4s, v18.4s, #5
        srshr           v19.4s, v19.4s, #5
        ld1             {v2.8h}, [x0], x1
        srshr           v20.4s, v20.4s, #5
        srshr           v21.4s, v21.4s, #5
        uaddw           v16.4s, v16.4s, v0.4h
        uaddw2          v17.4s, v17.4s, v0.8h
        ld1             {v3.8h}, [x0], x1
        srshr           v22.4s, v22.4s, #5
        srshr           v23.4s, v23.4s, #5
        uaddw           v18.4s, v18.4s, v1.4h
        uaddw2          v19.4s, v19.4s, v1.8h
        ld1             {v4.8h}, [x0], x1
        srshr           v24.4s, v24.4s, #5
        srshr           v25.4s, v25.4s, #5
        uaddw           v20.4s, v20.4s, v2.4h
        uaddw2          v21.4s, v21.4s, v2.8h
        sqxtun          v0.4h, v16.4s
        sqxtun2         v0.8h, v17.4s
        ld1             {v5.8h}, [x0], x1
        srshr           v26.4s, v26.4s, #5
        srshr           v27.4s, v27.4s, #5
        uaddw           v22.4s, v22.4s, v3.4h
        uaddw2          v23.4s, v23.4s, v3.8h
        sqxtun          v1.4h, v18.4s
        sqxtun2         v1.8h, v19.4s
        umin            v0.8h, v0.8h, v16.8h
        ld1             {v6.8h}, [x0], x1
        srshr           v28.4s, v28.4s, #5
        srshr           v29.4s, v29.4s, #5
        uaddw           v24.4s, v24.4s, v4.4h
        uaddw2          v25.4s, v25.4s, v4.8h
        sqxtun          v2.4h, v20.4s
        sqxtun2         v2.8h, v21.4s
        umin            v1.8h, v1.8h, v16.8h
        ld1             {v7.8h}, [x0], x1
        srshr           v30.4s, v30.4s, #5
        srshr           v31.4s, v31.4s, #5
        uaddw           v26.4s, v26.4s, v5.4h
        uaddw2          v27.4s, v27.4s, v5.8h
        sqxtun          v3.4h, v22.4s
        sqxtun2         v3.8h, v23.4s
        umin            v2.8h, v2.8h, v16.8h
        st1             {v0.8h}, [x3], x1
        uaddw           v28.4s, v28.4s, v6.4h
        uaddw2          v29.4s, v29.4s, v6.8h
        st1             {v1.8h}, [x3], x1
        sqxtun          v4.4h, v24.4s
        sqxtun2         v4.8h, v25.4s
        umin            v3.8h, v3.8h, v16.8h
        st1             {v2.8h}, [x3], x1
        uaddw           v30.4s, v30.4s, v7.4h
        uaddw2          v31.4s, v31.4s, v7.8h
        st1             {v3.8h}, [x3], x1
        sqxtun          v5.4h, v26.4s
        sqxtun2         v5.8h, v27.4s
        umin            v4.8h, v4.8h, v16.8h
        st1             {v4.8h}, [x3], x1
        sqxtun          v6.4h, v28.4s
        sqxtun2         v6.8h, v29.4s
        umin            v5.8h, v5.8h, v16.8h
        st1             {v5.8h}, [x3], x1
        sqxtun          v7.4h, v30.4s
        sqxtun2         v7.8h, v31.4s
        umin            v6.8h, v6.8h, v16.8h
        st1             {v6.8h}, [x3], x1
        umin            v7.8h, v7.8h, v16.8h
        st1             {v7.8h}, [x3], x1
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8, d9, [sp], 0x10
.endif
        ret
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1
        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1
        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc
.endm
itxfm_func8x8 idct,  idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct,  iadst
itxfm_func8x8 iadst, iadst
function idct16x16_dc_add_neon
        movrel          x4, idct_coeffs
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        srshr           v0.4s, v2.4s, #6

        // Loop to add the constant from v0 into all 16x16 outputs
        ld1             {v1.8h,v2.8h}, [x0], x1
        uaddw           v16.4s, v0.4s, v1.4h
        uaddw2          v17.4s, v0.4s, v1.8h
        ld1             {v3.8h,v4.8h}, [x0], x1
        uaddw           v18.4s, v0.4s, v2.4h
        uaddw2          v19.4s, v0.4s, v2.8h
        uaddw           v20.4s, v0.4s, v3.4h
        uaddw2          v21.4s, v0.4s, v3.8h
        uaddw           v22.4s, v0.4s, v4.4h
        uaddw2          v23.4s, v0.4s, v4.8h
        sqxtun          v1.4h, v16.4s
        sqxtun2         v1.8h, v17.4s
        sqxtun          v2.4h, v18.4s
        sqxtun2         v2.8h, v19.4s
        sqxtun          v3.4h, v20.4s
        sqxtun2         v3.8h, v21.4s
        sqxtun          v4.4h, v22.4s
        sqxtun2         v4.8h, v23.4s
        umin            v1.8h, v1.8h, v31.8h
        umin            v2.8h, v2.8h, v31.8h
        st1             {v1.8h,v2.8h}, [x3], x1
        umin            v3.8h, v3.8h, v31.8h
        umin            v4.8h, v4.8h, v31.8h
        st1             {v3.8h,v4.8h}, [x3], x1
        butterfly_4s    v18, v7,  v4,  v7   // v18 = t0a,  v7  = t7a
        butterfly_4s    v19, v22, v5,  v22  // v19 = t1a,  v22 = t6
        butterfly_4s    v4,  v26, v20, v26  // v4  = t2a,  v26 = t5
        butterfly_4s    v5,  v6,  v28, v6   // v5  = t3a,  v6  = t4
        butterfly_4s    v20, v28, v16, v24  // v20 = t8a,  v28 = t11a
        butterfly_4s    v24, v21, v23, v21  // v24 = t9,   v21 = t10
        butterfly_4s    v23, v27, v25, v27  // v23 = t14,  v27 = t13
        butterfly_4s    v25, v29, v29, v17  // v25 = t15a, v29 = t12a

        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31  // v8  = t13a, v9  = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31  // v28 = t12,  v27 = t11

        butterfly_4s    v16, v31, v18, v25  // v16 = out[0], v31 = out[15]
        butterfly_4s    v17, v30, v19, v23  // v17 = out[1], v30 = out[14]
        butterfly_4s_r  v25, v22, v22, v24  // v25 = out[9], v22 = out[6]
        butterfly_4s    v23, v24, v7,  v20  // v23 = out[7], v24 = out[8]
        butterfly_4s    v18, v29, v4,  v8   // v18 = out[2], v29 = out[13]
        butterfly_4s    v19, v28, v5,  v28  // v19 = out[3], v28 = out[12]
        butterfly_4s    v20, v27, v6,  v27  // v20 = out[4], v27 = out[11]
        butterfly_4s    v21, v26, v26, v9   // v21 = out[5], v26 = out[10]
function idct16
        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9  // v16 = t0a,  v24 = t1a
        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7  // v20 = t2a,  v28 = t3a
        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7  // v18 = t4a,  v30 = t7a
        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7  // v26 = t5a,  v22 = t6a
        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7  // v17 = t8a,  v31 = t15a
        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7  // v25 = t9a,  v23 = t14a
        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7  // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7  // v29 = t11a, v19 = t12a

        butterfly_4s    v4,  v28, v16, v28  // v4  = t0,  v28 = t3
        butterfly_4s    v5,  v20, v24, v20  // v5  = t1,  v20 = t2
        butterfly_4s    v6,  v26, v18, v26  // v6  = t4,  v26 = t5
        butterfly_4s    v7,  v22, v30, v22  // v7  = t7,  v22 = t6
        butterfly_4s    v16, v25, v17, v25  // v16 = t8,  v25 = t9
        butterfly_4s    v24, v21, v29, v21  // v24 = t11, v21 = t10
        butterfly_4s    v17, v27, v19, v27  // v17 = t12, v27 = t13
        butterfly_4s    v29, v23, v31, v23  // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31         // v22 = t6a,  v26 = t5a
        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31         // v23 = t9a,  v25 = t14a
        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1  // v27 = t13a, v21 = t10a
function idct16_half
        dmbutterfly0_h  v16, v24, v16, v24, v4, v5, v6, v7, v8, v9  // v16 = t0a,  v24 = t1a
        dmbutterfly_h1  v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7  // v20 = t2a,  v28 = t3a
        dmbutterfly_h1  v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7  // v18 = t4a,  v30 = t7a
        dmbutterfly_h2  v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7  // v26 = t5a,  v22 = t6a
        dmbutterfly_h1  v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7  // v17 = t8a,  v31 = t15a
        dmbutterfly_h2  v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7  // v25 = t9a,  v23 = t14a
        dmbutterfly_h1  v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7  // v21 = t10a, v27 = t13a
        dmbutterfly_h2  v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7  // v29 = t11a, v19 = t12a

        butterfly_4s    v4,  v28, v16, v28  // v4  = t0,  v28 = t3
        butterfly_4s    v5,  v20, v24, v20  // v5  = t1,  v20 = t2
        butterfly_4s    v6,  v26, v18, v26  // v6  = t4,  v26 = t5
        butterfly_4s    v7,  v22, v30, v22  // v7  = t7,  v22 = t6
        butterfly_4s    v16, v25, v17, v25  // v16 = t8,  v25 = t9
        butterfly_4s    v24, v21, v29, v21  // v24 = t11, v21 = t10
        butterfly_4s    v17, v27, v19, v27  // v17 = t12, v27 = t13
        butterfly_4s    v29, v23, v31, v23  // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31         // v22 = t6a,  v26 = t5a
        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31         // v23 = t9a,  v25 = t14a
        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1  // v27 = t13a, v21 = t10a
function idct16_quarter
        dsmull_h        v24, v25, v19, v3.s[3]
        dsmull_h        v4,  v5,  v17, v2.s[0]
        dsmull_h        v7,  v6,  v18, v1.s[1]
        dsmull_h        v30, v31, v18, v1.s[0]
        dsmull_h        v29, v28, v17, v2.s[1]
        dsmull_h        v26, v27, v19, v3.s[2]
        dsmull_h        v22, v23, v16, v0.s[0]
        drshrn_h        v24, v24, v25, #14
        drshrn_h        v16, v4,  v5,  #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v6,  v30, v31, #14
        drshrn_h        v29, v29, v28, #14
        drshrn_h        v17, v26, v27, #14
        drshrn_h        v28, v22, v23, #14

        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.s[2], v0.s[3]
        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.s[2], v0.s[3]
        drshrn_h        v27, v20, v21, #14
        drshrn_h        v21, v22, v23, #14
        drshrn_h        v23, v18, v19, #14
        drshrn_h        v25, v30, v31, #14

        dmbutterfly0    v22, v26, v7, v6, v18, v19, v30, v31
function iadst16
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]    // v6,v7   = t1,  v4,v5   = t0
        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]    // v10,v11 = t9,  v8,v9   = t8
        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11  // v31 = t1a, v24 = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]    // v14,v15 = t3,  v12,v13 = t2
        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9   // v16 = t0a, v23 = t8a

        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]    // v6,v7   = t11, v4,v5   = t10
        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7   // v29 = t3a, v26 = t11a
        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]    // v10,v11 = t5,  v8,v9   = t4
        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5   // v18 = t2a, v21 = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]    // v14,v15 = t13, v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15  // v20 = t5a, v28 = t13a
        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]    // v6,v7   = t7,  v4,v5   = t6
        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13  // v27 = t4a, v19 = t12a

        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]    // v10,v11 = t15, v8,v9   = t14
        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11  // v22 = t7a, v30 = t15a

        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]    // v14,v15 = t9,  v12,v13 = t8
        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9   // v25 = t6a, v17 = t14a

        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]    // v4,v5   = t12, v6,v7   = t13
        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5   // v23 = t8a, v19 = t12a
        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]    // v10,v11 = t11, v8,v9   = t10
        butterfly_4s_r  v4,  v27, v16, v27  // v4 = t4, v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7   // v24 = t9a, v28 = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]    // v12,v13 = t14, v14,v15 = t15
        butterfly_4s_r  v5,  v20, v31, v20  // v5 = t5, v20 = t1
        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13  // v21 = t10a, v17 = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15  // v26 = t11a, v30 = t15a

        butterfly_4s_r  v6,  v25, v18, v25  // v6 = t6, v25 = t2
        butterfly_4s_r  v7,  v22, v29, v22  // v7 = t7, v22 = t3

        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]    // v10,v11 = t13, v8,v9   = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]    // v12,v13 = t14, v14,v15 = t15

        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13  // v18 = out[2],  v30 = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15  // v29 = -out[13], v17 = t15a
        neg             v29.4s, v29.4s  // v29 = out[13]

        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]    // v10,v11 = t5a, v8,v9   = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]    // v12,v13 = t6a, v14,v15 = t7a

        butterfly_4s    v2,  v6,  v27, v25  // v2 = out[0],  v6 = t2a
        butterfly_4s    v3,  v7,  v23, v21  // v3 = -out[1], v7 = t10

        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9   // v19 = -out[3], v31 = t6
        neg             v19.4s, v19.4s  // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11  // v28 = out[12], v16 = t7

        butterfly_4s    v5,  v8,  v20, v22  // v5 = -out[15], v8 = t3a
        butterfly_4s    v4,  v9,  v24, v26  // v4 = out[14],  v9 = t11

        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1  // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1  // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15     // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15     // v22 = out[6], v25 = out[9]

        neg             v31.4s, v5.4s  // v31 = out[15]
        neg             v17.4s, v3.4s  // v17 = out[1]
// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
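// E.g. ".irp i, 16, 17" with a "load \i, x2, x9" body expands as
// intended, while spelling out "ld1 {v\i\().4s}, [x2], x9" directly in
// the .irp body would not.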
.macro load i, src, inc
        ld1             {v\i\().4s}, [\src], \inc
.endm
.macro store i, dst, inc
        st1             {v\i\().4s}, [\dst], \inc
.endm
.macro movi_v i, size, imm
        movi            v\i\()\size, \imm
.endm
.macro load_clear i, src, inc
        ld1             {v\i\().4s}, [\src]
        st1             {v4.4s}, [\src], \inc
.endm
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
        srshr           \coef0, \coef0, #6
        ld1             {v4.4h}, [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v4.d}[1], [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v5.4h}, [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v4.4h
        ld1             {v5.d}[1], [x3], x1
        srshr           \coef4, \coef4, #6
        uaddw2          \coef1, \coef1, v4.8h
        ld1             {v6.4h}, [x0], x1
        srshr           \coef5, \coef5, #6
        uaddw           \coef2, \coef2, v5.4h
        ld1             {v6.d}[1], [x3], x1
        sqxtun          v4.4h, \coef0
        srshr           \coef6, \coef6, #6
        uaddw2          \coef3, \coef3, v5.8h
        ld1             {v7.4h}, [x0], x1
        sqxtun2         v4.8h, \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef4, \coef4, v6.4h
        ld1             {v7.d}[1], [x3], x1
        umin            v4.8h, v4.8h, v8.8h
        sub             x0, x0, x1, lsl #2
        sub             x3, x3, x1, lsl #2
        sqxtun          v5.4h, \coef2
        uaddw2          \coef5, \coef5, v6.8h
        st1             {v4.4h}, [x0], x1
        sqxtun2         v5.8h, \coef3
        uaddw           \coef6, \coef6, v7.4h
        st1             {v4.d}[1], [x3], x1
        umin            v5.8h, v5.8h, v8.8h
        sqxtun          v6.4h, \coef4
        uaddw2          \coef7, \coef7, v7.8h
        st1             {v5.4h}, [x0], x1
        sqxtun2         v6.8h, \coef5
        st1             {v5.d}[1], [x3], x1
        umin            v6.8h, v6.8h, v8.8h
        sqxtun          v7.4h, \coef6
        st1             {v6.4h}, [x0], x1
        sqxtun2         v7.8h, \coef7
        st1             {v6.d}[1], [x3], x1
        umin            v7.8h, v7.8h, v8.8h
        st1             {v7.4h}, [x0], x1
        st1             {v7.d}[1], [x3], x1
.endm
// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x4 slice and store.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_4x16_pass1_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr

        bl              \txfm\()16

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i, x0, #16
.endr

        // Special case: For the last input column (x1 == 12),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 4x4 block).
        add             x0, x0, #16
        st1             {v20.4s}, [x0], #16
        st1             {v24.4s}, [x0], #16
        st1             {v28.4s}, [x0], #16
        add             x0, x0, #16
        st1             {v21.4s}, [x0], #16
        st1             {v25.4s}, [x0], #16
        st1             {v29.4s}, [x0], #16
        add             x0, x0, #16
        st1             {v22.4s}, [x0], #16
        st1             {v26.4s}, [x0], #16
        st1             {v30.4s}, [x0], #16
        add             x0, x0, #16
        st1             {v23.4s}, [x0], #16
        st1             {v27.4s}, [x0], #16
        st1             {v31.4s}, [x0], #16
        mov             v28.16b, v16.16b
        mov             v29.16b, v17.16b
        mov             v30.16b, v18.16b
        mov             v31.16b, v19.16b
// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 4x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_4x16_pass2_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
        load            \i, x2, x9
.endr
.irp i, 28, 29, 30, 31
        load            \i, x2, x9
.endr

        bl              \txfm\()16

        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
// This is the minimum eob value for each subpartition, in increments of 4
const min_eob_idct_idct_16, align=4
        .short  0, 10, 38, 89
endconst
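// If eob is below the entry for a given 4-column slice, that slice and
// all following ones contain only zero coefficients, so pass 1 can
// skip them and just zero the corresponding rows of the temp buffer.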
.macro itxfm_func16x16 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #1
        b.eq            idct16x16_dc_add_neon
.endif

        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
.endif
        stp             d8, d9, [sp, #-0x10]!

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif
        ld1             {v0.8h,v1.8h}, [x10]

.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3, #10
        b.le            idct16x16_quarter_add_16_neon
        cmp             w3, #38
        b.le            idct16x16_half_add_16_neon
.endif

        movrel          x12, min_eob_idct_idct_16, 2

        add             x0, sp, #(\i*64)
.ifc \txfm1\()_\txfm2,idct_idct
        mov             x1, #(16 - \i)/4
        bl              \txfm1\()16_1d_4x16_pass1_neon
.ifc \txfm1\()_\txfm2,iadst_idct
        ld1             {v0.8h,v1.8h}, [x10]
.ifc \txfm1\()_\txfm2,idct_idct
        // Set v28-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2.
        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9
        bl              \txfm2\()16_1d_4x16_pass2_neon

        ldp             d8, d9, [sp], 0x10
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        ret
endfunc
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1
        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc
.endm
itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst
function idct16_1d_4x16_pass1_quarter_neon
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr

        bl              idct16_quarter

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
        // The first 4x4 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        add             x0, x0, #16
        st1             {v20.4s}, [x0], #16
        st1             {v24.4s}, [x0], #16
        st1             {v28.4s}, [x0], #16
        add             x0, x0, #16
        st1             {v21.4s}, [x0], #16
        st1             {v25.4s}, [x0], #16
        st1             {v29.4s}, [x0], #16
        add             x0, x0, #16
        st1             {v22.4s}, [x0], #16
        st1             {v26.4s}, [x0], #16
        st1             {v30.4s}, [x0], #16
        add             x0, x0, #16
        st1             {v23.4s}, [x0], #16
        st1             {v27.4s}, [x0], #16
        st1             {v31.4s}, [x0], #16
function idct16_1d_4x16_pass2_quarter_neon
        // Only load the top 4 lines, and only do it for the later slices.
        // For the first slice, v16-v19 are kept in registers from the first pass.
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr

        bl              idct16_quarter

        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
function idct16_1d_4x16_pass1_half_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr

        bl              idct16_half

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
        store           \i, x0, #16
.endr

        // Special case: For the second input column (x1 == 4),
        // which would be stored as the second row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // second 4x4 block).
        add             x0, x0, #16
        st1             {v20.4s}, [x0], #16
        st1             {v24.4s}, [x0], #16
        st1             {v28.4s}, [x0], #16
        add             x0, x0, #16
        st1             {v21.4s}, [x0], #16
        st1             {v25.4s}, [x0], #16
        st1             {v29.4s}, [x0], #16
        add             x0, x0, #16
        st1             {v22.4s}, [x0], #16
        st1             {v26.4s}, [x0], #16
        st1             {v30.4s}, [x0], #16
        add             x0, x0, #16
        st1             {v23.4s}, [x0], #16
        st1             {v27.4s}, [x0], #16
        st1             {v31.4s}, [x0], #16
        mov             v20.16b, v16.16b
        mov             v21.16b, v17.16b
        mov             v22.16b, v18.16b
        mov             v23.16b, v19.16b
function idct16_1d_4x16_pass2_half_neon
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
.irp i, 20, 21, 22, 23
        load            \i, x2, x9
.endr

        bl              idct16_half

        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
.macro idct16_partial size
function idct16x16_\size\()_add_16_neon
        bl              idct16_1d_4x16_pass1_\size\()_neon
        bl              idct16_1d_4x16_pass1_\size\()_neon
        bl              idct16_1d_4x16_pass2_\size\()_neon
        ldp             d8, d9, [sp], 0x10
        ret
endfunc
.endm

idct16_partial quarter
idct16_partial half
function idct32x32_dc_add_neon
        movrel          x4, idct_coeffs
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        srshr           v0.4s, v2.4s, #6

        // Loop to add the constant v0 into all 32x32 outputs
        ld1             {v1.8h,v2.8h}, [x0], #32
        uaddw           v16.4s, v0.4s, v1.4h
        uaddw2          v17.4s, v0.4s, v1.8h
        ld1             {v3.8h,v4.8h}, [x0], x1
        uaddw           v18.4s, v0.4s, v2.4h
        uaddw2          v19.4s, v0.4s, v2.8h
        uaddw           v20.4s, v0.4s, v3.4h
        uaddw2          v21.4s, v0.4s, v3.8h
        uaddw           v22.4s, v0.4s, v4.4h
        uaddw2          v23.4s, v0.4s, v4.8h
        sqxtun          v1.4h, v16.4s
        sqxtun2         v1.8h, v17.4s
        sqxtun          v2.4h, v18.4s
        sqxtun2         v2.8h, v19.4s
        sqxtun          v3.4h, v20.4s
        sqxtun2         v3.8h, v21.4s
        sqxtun          v4.4h, v22.4s
        sqxtun2         v4.8h, v23.4s
        umin            v1.8h, v1.8h, v31.8h
        umin            v2.8h, v2.8h, v31.8h
        st1             {v1.8h,v2.8h}, [x3], #32
        umin            v3.8h, v3.8h, v31.8h
        umin            v4.8h, v4.8h, v31.8h
        st1             {v3.8h,v4.8h}, [x3], x1
        butterfly_4s    v16, v5,  v4,  v5   // v16 = t16a, v5  = t19a
        butterfly_4s    v17, v20, v23, v20  // v17 = t17,  v20 = t18
        butterfly_4s    v18, v6,  v7,  v6   // v18 = t23a, v6  = t20a
        butterfly_4s    v19, v21, v22, v21  // v19 = t22,  v21 = t21
        butterfly_4s    v4,  v28, v28, v30  // v4  = t24a, v28 = t27a
        butterfly_4s    v23, v26, v25, v26  // v23 = t25,  v26 = t26
        butterfly_4s    v7,  v8,  v29, v31  // v7  = t31a, v8  = t28a
        butterfly_4s    v22, v27, v24, v27  // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31         // v27 = t18a, v20 = t29a
        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31         // v8  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1  // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1  // v26 = t26a, v21 = t21a

        butterfly_4s    v31, v24, v7,  v4   // v31 = t31,  v24 = t24
        butterfly_4s    v30, v25, v22, v23  // v30 = t30a, v25 = t25a
        butterfly_4s_r  v23, v16, v16, v18  // v23 = t23,  v16 = t16
        butterfly_4s_r  v22, v17, v17, v19  // v22 = t22a, v17 = t17a
        butterfly_4s    v18, v21, v27, v21  // v18 = t18,  v21 = t21
        butterfly_4s_r  v27, v28, v5,  v28  // v27 = t27a, v28 = t28a
        butterfly_4s    v29, v26, v20, v26  // v29 = t29,  v26 = t26
        butterfly_4s    v19, v20, v8,  v6   // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9  // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9  // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9  // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9  // v24 = t24a, v23 = t23a
function idct32_odd
        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7  // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7  // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7  // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7  // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7  // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7  // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7  // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7  // v30 = t23a, v17 = t24a

        butterfly_4s    v4,  v24, v16, v24  // v4  = t16, v24 = t17
        butterfly_4s    v5,  v20, v28, v20  // v5  = t19, v20 = t18
        butterfly_4s    v6,  v26, v18, v26  // v6  = t20, v26 = t21
        butterfly_4s    v7,  v22, v30, v22  // v7  = t23, v22 = t22
        butterfly_4s    v28, v25, v17, v25  // v28 = t24, v25 = t25
        butterfly_4s    v30, v21, v29, v21  // v30 = t27, v21 = t26
        butterfly_4s    v29, v23, v31, v23  // v29 = t31, v23 = t30
        butterfly_4s    v31, v27, v19, v27  // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19         // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1  // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19         // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1  // v25 = t25a, v22 = t22a
function idct32_odd_half
        dmbutterfly_h1  v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7  // v16 = t16a, v31 = t31a
        dmbutterfly_h2  v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7  // v24 = t17a, v23 = t30a
        dmbutterfly_h1  v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7  // v20 = t18a, v27 = t29a
        dmbutterfly_h2  v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7  // v28 = t19a, v19 = t28a
        dmbutterfly_h1  v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7  // v18 = t20a, v29 = t27a
        dmbutterfly_h2  v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7  // v26 = t21a, v21 = t26a
        dmbutterfly_h1  v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7  // v22 = t22a, v25 = t25a
        dmbutterfly_h2  v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7  // v30 = t23a, v17 = t24a

        butterfly_4s    v4,  v24, v16, v24  // v4  = t16, v24 = t17
        butterfly_4s    v5,  v20, v28, v20  // v5  = t19, v20 = t18
        butterfly_4s    v6,  v26, v18, v26  // v6  = t20, v26 = t21
        butterfly_4s    v7,  v22, v30, v22  // v7  = t23, v22 = t22
        butterfly_4s    v28, v25, v17, v25  // v28 = t24, v25 = t25
        butterfly_4s    v30, v21, v29, v21  // v30 = t27, v21 = t26
        butterfly_4s    v29, v23, v31, v23  // v29 = t31, v23 = t30
        butterfly_4s    v31, v27, v19, v27  // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19         // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1  // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19         // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1  // v25 = t25a, v22 = t22a
function idct32_odd_quarter
        dsmull_h        v4,  v5,  v16, v10.s[0]
        dsmull_h        v28, v29, v19, v11.s[3]
        dsmull_h        v30, v31, v16, v10.s[1]
        dsmull_h        v22, v23, v17, v13.s[2]
        dsmull_h        v7,  v6,  v17, v13.s[3]
        dsmull_h        v26, v27, v19, v11.s[2]
        dsmull_h        v20, v21, v18, v12.s[0]
        dsmull_h        v24, v25, v18, v12.s[1]

        drshrn_h        v4,  v4,  v5,  #14
        drshrn_h        v5,  v28, v29, #14
        drshrn_h        v29, v30, v31, #14
        drshrn_h        v28, v22, v23, #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v31, v26, v27, #14
        drshrn_h        v6,  v20, v21, #14
        drshrn_h        v30, v24, v25, #14

        dmbutterfly_l   v16, v17, v18, v19, v29, v4, v1.s[0], v1.s[1]
        dmbutterfly_l   v27, v26, v20, v21, v31, v5, v1.s[0], v1.s[1]
        drshrn_h        v23, v16, v17, #14
        drshrn_h        v24, v18, v19, #14
        drshrn_h        v27, v27, v26, #14
        drshrn_h        v20, v20, v21, #14
        dmbutterfly_l   v16, v17, v18, v19, v30, v6, v1.s[2], v1.s[3]
        drshrn_h        v21, v16, v17, #14
        drshrn_h        v26, v18, v19, #14
        dmbutterfly_l   v16, v17, v18, v19, v28, v7, v1.s[2], v1.s[3]
        drshrn_h        v25, v16, v17, #14
        drshrn_h        v22, v18, v19, #14
.macro idct32_funcs suffix
// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
// x0 = dst (temp buffer)
// x9 = double input stride
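// In effect, with even-half outputs e[0..15] and odd-half outputs
// o[0..15], the final columns are out[i] = e[i] + o[i] and
// out[31-i] = e[i] - o[i] for i = 0..15.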
function idct32_1d_4x32_pass1\suffix\()_neon
        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr

        bl              idct16\suffix

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the registers a, b, c, d horizontally, followed by the
        // same registers d, c, b, a mirrored.
.macro store_rev a, b, c, d
        // There's no rev128 instruction, but we reverse each 64 bit
        // half, and then flip them using an ext with 8 bytes offset.
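        // E.g. (a0 a1 a2 a3) -> rev64 -> (a1 a0 a3 a2) -> ext #8 ->
        // (a3 a2 a1 a0).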
        ext             v7.16b, v7.16b, v7.16b, #8
        ext             v6.16b, v6.16b, v6.16b, #8
        st1             {v7.4s}, [x0], #16
        ext             v5.16b, v5.16b, v5.16b, #8
        st1             {v6.4s}, [x0], #16
        st1             {v5.4s}, [x0], #16
        ext             v4.16b, v4.16b, v4.16b, #8
        st1             {v4.4s}, [x0], #16
.endm

        store_rev       v16.4s, v20.4s, v24.4s, v28.4s
        store_rev       v17.4s, v21.4s, v25.4s, v29.4s
        store_rev       v18.4s, v22.4s, v26.4s, v30.4s
        store_rev       v19.4s, v23.4s, v27.4s, v31.4s
.purgem store_rev
        // Move x2 back to the start of the input, and move
        // to the first odd row
        sub             x2, x2, x9, lsl #4
.ifc \suffix,_quarter
        sub             x2, x2, x9, lsl #2
        sub             x2, x2, x9, lsl #3
        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i, x2, x9
.endr
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i, x2, x9
.endr

        bl              idct32_odd\suffix

        transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
        transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
        transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
        transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7

        // Store the registers a, b, c, d horizontally,
        // adding into the output first, and then the mirrored values,
        // subtracted from the output.
.macro store_rev a, b, c, d, a16b, b16b
        add             v4.4s, v4.4s, \a
        st1             {v4.4s}, [x0], #16
        ext             v9.16b, v9.16b, v9.16b, #8
        add             v4.4s, v4.4s, \b
        st1             {v4.4s}, [x0], #16
        ext             v8.16b, v8.16b, v8.16b, #8
        add             v4.4s, v4.4s, \c
        st1             {v4.4s}, [x0], #16
        ext             \b16b, \b16b, \b16b, #8
        add             v4.4s, v4.4s, \d
        st1             {v4.4s}, [x0], #16
        ext             \a16b, \a16b, \a16b, #8
        sub             v4.4s, v4.4s, v9.4s
        st1             {v4.4s}, [x0], #16
        sub             v4.4s, v4.4s, v8.4s
        st1             {v4.4s}, [x0], #16
        sub             v4.4s, v4.4s, \b
        st1             {v4.4s}, [x0], #16
        sub             v4.4s, v4.4s, \a
        st1             {v4.4s}, [x0], #16
.endm

        store_rev       v31.4s, v27.4s, v23.4s, v19.4s, v31.16b, v27.16b
        store_rev       v30.4s, v26.4s, v22.4s, v18.4s, v30.16b, v26.16b
        store_rev       v29.4s, v25.4s, v21.4s, v17.4s, v29.16b, v25.16b
        store_rev       v28.4s, v24.4s, v20.4s, v16.4s, v28.16b, v24.16b
.purgem store_rev
// This is mostly the same as 4x32_pass1, but without the transpose;
// it uses the source as a temp buffer between the two idct passes, and
// adds into the destination.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
function idct32_1d_4x32_pass2\suffix\()_neon
        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #4
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #2
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #3

        bl              idct16\suffix

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #4

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #4
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #2
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i, x2, x9
.endr
        sub             x2, x2, x9, lsl #3

        bl              idct32_odd\suffix
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.4s}, [x2], x9
        ld1             {v5.4s}, [x2], x9
        add             v4.4s, v4.4s, \a
        ld1             {v6.4s}, [x2], x9
        add             v5.4s, v5.4s, \b
        ld1             {v7.4s}, [x2], x9
        add             v6.4s, v6.4s, \c
        add             v7.4s, v7.4s, \d
.else
        ld1             {v4.4s}, [x2], x7
        ld1             {v5.4s}, [x2], x7
        sub             v4.4s, v4.4s, \a
        ld1             {v6.4s}, [x2], x7
        sub             v5.4s, v5.4s, \b
        ld1             {v7.4s}, [x2], x7
        sub             v6.4s, v6.4s, \c
        sub             v7.4s, v7.4s, \d
.endif
        ld1             {v8.4h}, [x0], x1
        ld1             {v8.d}[1], [x0], x1
        srshr           v4.4s, v4.4s, #6
        ld1             {v9.4h}, [x0], x1
        srshr           v5.4s, v5.4s, #6
        uaddw           v4.4s, v4.4s, v8.4h
        ld1             {v9.d}[1], [x0], x1
        srshr           v6.4s, v6.4s, #6
        uaddw2          v5.4s, v5.4s, v8.8h
        srshr           v7.4s, v7.4s, #6
        sub             x0, x0, x1, lsl #2
        uaddw           v6.4s, v6.4s, v9.4h
        sqxtun          v4.4h, v4.4s
        uaddw2          v7.4s, v7.4s, v9.8h
        sqxtun2         v4.8h, v5.4s
        umin            v4.8h, v4.8h, v15.8h
        st1             {v4.4h}, [x0], x1
        sqxtun          v5.4h, v6.4s
        st1             {v4.d}[1], [x0], x1
        sqxtun2         v5.8h, v7.4s
        umin            v5.8h, v5.8h, v15.8h
        st1             {v5.4h}, [x0], x1
        st1             {v5.d}[1], [x0], x1
.endm

        load_acc_store  v31.4s, v30.4s, v29.4s, v28.4s
        load_acc_store  v27.4s, v26.4s, v25.4s, v24.4s
        load_acc_store  v23.4s, v22.4s, v21.4s, v20.4s
        load_acc_store  v19.4s, v18.4s, v17.4s, v16.4s

        load_acc_store  v16.4s, v17.4s, v18.4s, v19.4s, 1
        load_acc_store  v20.4s, v21.4s, v22.4s, v23.4s, 1
        load_acc_store  v24.4s, v25.4s, v26.4s, v27.4s, 1
        load_acc_store  v28.4s, v29.4s, v30.4s, v31.4s, 1
.purgem load_acc_store
.endm

idct32_funcs
idct32_funcs _quarter
idct32_funcs _half
const min_eob_idct_idct_32, align=4
        .short  0, 9, 34, 70, 135, 240, 336, 448
endconst
function vp9_idct_idct_32x32_add_16_neon
        cmp             w3, #1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs

        stp             d8,  d9,  [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d14, d15, [sp, #-0x10]!

        // Double stride of the input, since we only read every other line
        ld1             {v0.8h,v1.8h}, [x10], #32
        ld1             {v10.8h,v11.8h}, [x10]
        sxtl            v12.4s, v11.4h
        sxtl2           v13.4s, v11.8h
        sxtl2           v11.4s, v10.8h
        sxtl            v10.4s, v10.4h

        cmp             w3, #34
        b.le            idct32x32_quarter_add_16_neon
        cmp             w3, #135
        b.le            idct32x32_half_add_16_neon

        movrel          x12, min_eob_idct_idct_32, 2

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0, sp, #(\i*128)
        mov             x1, #(32 - \i)/4
        bl              idct32_1d_4x32_pass1_neon

        // Write zeros to the temp buffer for pass 2
        st1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
        st1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        bl              idct32_1d_4x32_pass2_neon

        ldp             d14, d15, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d8,  d9,  [sp], 0x10
function ff_vp9_idct_idct_32x32_add_10_neon, export=1
        b               vp9_idct_idct_32x32_add_16_neon
endfunc

function ff_vp9_idct_idct_32x32_add_12_neon, export=1
        b               vp9_idct_idct_32x32_add_16_neon
endfunc
.macro idct32_partial size
function idct32x32_\size\()_add_16_neon
        add             x0, sp, #(\i*128)
        bl              idct32_1d_4x32_pass1_\size\()_neon
        add             x0, sp, #(\i*128)
        bl              idct32_1d_4x32_pass1_\size\()_neon

        // Write zeros to the temp buffer for pass 2
        st1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
        st1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        bl              idct32_1d_4x32_pass2_\size\()_neon
.endr

        ldp             d14, d15, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d8,  d9,  [sp], 0x10
        ret
endfunc
.endm

idct32_partial quarter
idct32_partial half