/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/aarch64/asm.S"
const itxfm4_coeffs, align=4
        .short  11585, 0, 6270, 15137
iadst4_coeffs:
        .short  5283, 15212, 9929, 13377
endconst

const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst
// Transpose a 4x4 matrix of 32 bit elements, using r4-r7 as scratch.
.macro transpose_4x4s r0, r1, r2, r3, r4, r5, r6, r7
        trn1            \r4\().4s, \r0\().4s, \r1\().4s
        trn2            \r5\().4s, \r0\().4s, \r1\().4s
        trn1            \r6\().4s, \r2\().4s, \r3\().4s
        trn2            \r7\().4s, \r2\().4s, \r3\().4s
        trn1            \r0\().2d, \r4\().2d, \r6\().2d
        trn2            \r2\().2d, \r4\().2d, \r6\().2d
        trn1            \r1\().2d, \r5\().2d, \r7\().2d
        trn2            \r3\().2d, \r5\().2d, \r7\().2d
.endm
// Transpose an 8x8 matrix of 32 bit elements, where each row is spread out
// over two registers.
.macro transpose_8x8s r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, t0, t1, t2, t3
        transpose_4x4s  \r0, \r2, \r4, \r6, \t0, \t1, \t2, \t3
        transpose_4x4s  \r9, \r11, \r13, \r15, \t0, \t1, \t2, \t3

        // Do 4x4 transposes of r1,r3,r5,r7 and r8,r10,r12,r14
        // while swapping the two 4x4 matrices between each other

        // First step of the 4x4 transpose of r1-r7, into t0-t3
        trn1            \t0\().4s, \r1\().4s, \r3\().4s
        trn2            \t1\().4s, \r1\().4s, \r3\().4s
        trn1            \t2\().4s, \r5\().4s, \r7\().4s
        trn2            \t3\().4s, \r5\().4s, \r7\().4s

        // First step of the 4x4 transpose of r8-r14, into r1-r7
        trn1            \r1\().4s, \r8\().4s, \r10\().4s
        trn2            \r3\().4s, \r8\().4s, \r10\().4s
        trn1            \r5\().4s, \r12\().4s, \r14\().4s
        trn2            \r7\().4s, \r12\().4s, \r14\().4s

        // Second step of the 4x4 transpose of r1-r7 (now in t0-t3), into r8-r14
        trn1            \r8\().2d,  \t0\().2d, \t2\().2d
        trn2            \r12\().2d, \t0\().2d, \t2\().2d
        trn1            \r10\().2d, \t1\().2d, \t3\().2d
        trn2            \r14\().2d, \t1\().2d, \t3\().2d

        // Second step of the 4x4 transpose of r8-r14 (now in r1-r7), in place as far as possible
        trn1            \t0\().2d, \r1\().2d, \r5\().2d
        trn2            \r5\().2d, \r1\().2d, \r5\().2d
        trn1            \t1\().2d, \r3\().2d, \r7\().2d
        trn2            \r7\().2d, \r3\().2d, \r7\().2d

        // Move the outputs of trn1 back in place
        mov             \r1\().16b, \t0\().16b
        mov             \r3\().16b, \t1\().16b
.endm
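
// (Added note, a sketch of the layout assumed above:) row i of the 8x8
// matrix lives in the register pair r<2i>:r<2i+1>, so row 0 is r0:r1 and
// row 7 is r14:r15. The two diagonal 4x4 blocks (r0,r2,r4,r6 and
// r9,r11,r13,r15) can be transposed in place; the two off-diagonal blocks
// additionally have to be swapped with each other, which is what the
// interleaved steps above do.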
// out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
// in/out are .4s registers; this macro can do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4s, v0.4s
.endif
        add             \tmp1\().4s, \in1\().4s, \in2\().4s
        sub             \tmp2\().4s, \in1\().4s, \in2\().4s
.if \neg > 0
        smull           \tmp3\().2d, \tmp1\().2s, \tmp4\().s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, \tmp4\().s[0]
.else
        smull           \tmp3\().2d, \tmp1\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp1\().4s, v0.s[0]
.endif
.ifb \tmp5
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        smull           \tmp3\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp4\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.else
        smull           \tmp5\().2d, \tmp2\().2s, v0.s[0]
        smull2          \tmp6\().2d, \tmp2\().4s, v0.s[0]
        rshrn           \out1\().2s, \tmp3\().2d, #14
        rshrn2          \out1\().4s, \tmp4\().2d, #14
        rshrn           \out2\().2s, \tmp5\().2d, #14
        rshrn2          \out2\().4s, \tmp6\().2d, #14
.endif
.endm
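
// In C terms, a sketch of the math above (11585 ~= 16384/sqrt(2), preloaded
// in v0.s[0]; names are illustrative, not from the original source):
//     int64_t s = (int64_t)(in1 + in2) * 11585;
//     int64_t d = (int64_t)(in1 - in2) * 11585;
//     out1 = (int32_t)((s + (1 << 13)) >> 14);
//     out2 = (int32_t)((d + (1 << 13)) >> 14);
// applied per 32 bit lane via smull/smull2 and rshrn/rshrn2.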
// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .2d registers, in are 2 x .4s registers
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().2d, \in1\().2s, \coef1
        smull2          \out2\().2d, \in1\().4s, \coef1
        smull           \out3\().2d, \in1\().2s, \coef2
        smull2          \out4\().2d, \in1\().4s, \coef2
        smlsl           \out1\().2d, \in2\().2s, \coef2
        smlsl2          \out2\().2d, \in2\().4s, \coef2
        smlal           \out3\().2d, \in2\().2s, \coef1
        smlal2          \out4\().2d, \in2\().4s, \coef1
.endm
// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .4s registers
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().2d, \tmp3\().2d
        neg             \tmp4\().2d, \tmp4\().2d
.endif
        rshrn           \inout1\().2s, \tmp1\().2d, #14
        rshrn2          \inout1\().4s, \tmp2\().2d, #14
        rshrn           \inout2\().2s, \tmp3\().2d, #14
        rshrn2          \inout2\().4s, \tmp4\().2d, #14
.endm
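
// Likewise, a per-lane sketch of dmbutterfly (a rotation by the coef1/coef2
// pair, with 64 bit intermediates; round14(x) = (x + (1 << 13)) >> 14):
//     new1 = round14(inout1 * coef1 - inout2 * coef2);
//     new2 = round14(inout1 * coef2 + inout2 * coef1);
// both computed from the original inputs before they are overwritten.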
// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_4s out1, out2, in1, in2
        add             \out1\().4s, \in1\().4s, \in2\().4s
        sub             \out2\().4s, \in1\().4s, \in2\().4s
.endm

// out1 = in1 - in2
// out2 = in1 + in2
.macro butterfly_4s_r out1, out2, in1, in2
        sub             \out1\().4s, \in1\().4s, \in2\().4s
        add             \out2\().4s, \in1\().4s, \in2\().4s
.endm
// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .4s registers, in are 4 x .2d registers
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().2d, \in1\().2d, \in3\().2d
        add             \tmp2\().2d, \in2\().2d, \in4\().2d
        sub             \tmp3\().2d, \in1\().2d, \in3\().2d
        sub             \tmp4\().2d, \in2\().2d, \in4\().2d
        rshrn           \out1\().2s, \tmp1\().2d, #14
        rshrn2          \out1\().4s, \tmp2\().2d, #14
        rshrn           \out2\().2s, \tmp3\().2d, #14
        rshrn2          \out2\().4s, \tmp4\().2d, #14
.endm
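
// (Added note:) the .2d inputs here are the unrounded 64 bit products from
// dmbutterfly_l; adding/subtracting them before the single rshrn rounding
// step avoids rounding twice.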
.macro iwht4_10 c0, c1, c2, c3
        add             \c0\().4s, \c0\().4s, \c1\().4s
        sub             v17.4s, \c2\().4s, \c3\().4s
        sub             v16.4s, \c0\().4s, v17.4s
        sshr            v16.4s, v16.4s, #1
        sub             \c2\().4s, v16.4s, \c1\().4s
        sub             \c1\().4s, v16.4s, \c3\().4s
        add             \c3\().4s, v17.4s, \c2\().4s
        sub             \c0\().4s, \c0\().4s, \c1\().4s
.endm

.macro iwht4_12 c0, c1, c2, c3
        iwht4_10        \c0, \c1, \c2, \c3
.endm
.macro idct4_10 c0, c1, c2, c3
        mul             v22.4s, \c1\().4s, v0.s[3]
        mul             v20.4s, \c1\().4s, v0.s[2]
        add             v16.4s, \c0\().4s, \c2\().4s
        sub             v17.4s, \c0\().4s, \c2\().4s
        mla             v22.4s, \c3\().4s, v0.s[2]
        mul             v18.4s, v16.4s, v0.s[0]
        mul             v24.4s, v17.4s, v0.s[0]
        mls             v20.4s, \c3\().4s, v0.s[3]
        srshr           v22.4s, v22.4s, #14
        srshr           v18.4s, v18.4s, #14
        srshr           v24.4s, v24.4s, #14
        srshr           v20.4s, v20.4s, #14
        add             \c0\().4s, v18.4s, v22.4s
        sub             \c3\().4s, v18.4s, v22.4s
        add             \c1\().4s, v24.4s, v20.4s
        sub             \c2\().4s, v24.4s, v20.4s
.endm
.macro idct4_12 c0, c1, c2, c3
        smull           v22.2d, \c1\().2s, v0.s[3]
        smull2          v23.2d, \c1\().4s, v0.s[3]
        smull           v20.2d, \c1\().2s, v0.s[2]
        smull2          v21.2d, \c1\().4s, v0.s[2]
        add             v16.4s, \c0\().4s, \c2\().4s
        sub             v17.4s, \c0\().4s, \c2\().4s
        smlal           v22.2d, \c3\().2s, v0.s[2]
        smlal2          v23.2d, \c3\().4s, v0.s[2]
        smull           v18.2d, v16.2s, v0.s[0]
        smull2          v19.2d, v16.4s, v0.s[0]
        smull           v24.2d, v17.2s, v0.s[0]
        smull2          v25.2d, v17.4s, v0.s[0]
        smlsl           v20.2d, \c3\().2s, v0.s[3]
        smlsl2          v21.2d, \c3\().4s, v0.s[3]
        rshrn           v22.2s, v22.2d, #14
        rshrn2          v22.4s, v23.2d, #14
        rshrn           v18.2s, v18.2d, #14
        rshrn2          v18.4s, v19.2d, #14
        rshrn           v24.2s, v24.2d, #14
        rshrn2          v24.4s, v25.2d, #14
        rshrn           v20.2s, v20.2d, #14
        rshrn2          v20.4s, v21.2d, #14
        add             \c0\().4s, v18.4s, v22.4s
        sub             \c3\().4s, v18.4s, v22.4s
        add             \c1\().4s, v24.4s, v20.4s
        sub             \c2\().4s, v24.4s, v20.4s
.endm
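
// (Added note:) the _10 variants keep the products in 32 bit lanes
// (mul/mla/mls + srshr), which is safe for 10 bit input ranges; for 12 bit
// content the products would overflow 32 bits, so the _12 variants widen to
// 64 bit intermediates (smull/smlal/smlsl + rshrn).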
.macro iadst4_10 c0, c1, c2, c3
        mul             v16.4s, \c0\().4s, v1.s[0]
        mla             v16.4s, \c2\().4s, v1.s[1]
        mla             v16.4s, \c3\().4s, v1.s[2]
        mul             v18.4s, \c0\().4s, v1.s[2]
        mls             v18.4s, \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        mls             v18.4s, \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        mul             v22.4s, \c1\().4s, v1.s[3]
        mul             v20.4s, \c0\().4s, v1.s[3]
        add             v24.4s, v16.4s, v22.4s
        add             v26.4s, v18.4s, v22.4s
        srshr           \c0\().4s, v24.4s, #14
        add             v16.4s, v16.4s, v18.4s
        srshr           \c1\().4s, v26.4s, #14
        sub             v16.4s, v16.4s, v22.4s
        srshr           \c2\().4s, v20.4s, #14
        srshr           \c3\().4s, v16.4s, #14
.endm
.macro iadst4_12 c0, c1, c2, c3
        smull           v16.2d, \c0\().2s, v1.s[0]
        smull2          v17.2d, \c0\().4s, v1.s[0]
        smlal           v16.2d, \c2\().2s, v1.s[1]
        smlal2          v17.2d, \c2\().4s, v1.s[1]
        smlal           v16.2d, \c3\().2s, v1.s[2]
        smlal2          v17.2d, \c3\().4s, v1.s[2]
        smull           v18.2d, \c0\().2s, v1.s[2]
        smull2          v19.2d, \c0\().4s, v1.s[2]
        smlsl           v18.2d, \c2\().2s, v1.s[0]
        smlsl2          v19.2d, \c2\().4s, v1.s[0]
        sub             \c0\().4s, \c0\().4s, \c2\().4s
        smlsl           v18.2d, \c3\().2s, v1.s[1]
        smlsl2          v19.2d, \c3\().4s, v1.s[1]
        add             \c0\().4s, \c0\().4s, \c3\().4s
        smull           v22.2d, \c1\().2s, v1.s[3]
        smull2          v23.2d, \c1\().4s, v1.s[3]
        smull           v20.2d, \c0\().2s, v1.s[3]
        smull2          v21.2d, \c0\().4s, v1.s[3]
        add             v24.2d, v16.2d, v22.2d
        add             v25.2d, v17.2d, v23.2d
        add             v26.2d, v18.2d, v22.2d
        add             v27.2d, v19.2d, v23.2d
        rshrn           \c0\().2s, v24.2d, #14
        rshrn2          \c0\().4s, v25.2d, #14
        add             v16.2d, v16.2d, v18.2d
        add             v17.2d, v17.2d, v19.2d
        rshrn           \c1\().2s, v26.2d, #14
        rshrn2          \c1\().4s, v27.2d, #14
        sub             v16.2d, v16.2d, v22.2d
        sub             v17.2d, v17.2d, v23.2d
        rshrn           \c2\().2s, v20.2d, #14
        rshrn2          \c2\().4s, v21.2d, #14
        rshrn           \c3\().2s, v16.2d, #14
        rshrn2          \c3\().4s, v17.2d, #14
.endm
// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
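// With the standard AArch64 calling convention, these arguments arrive in
// x0 (dst), x1 (stride), x2 (block) and x3 (eob).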
.macro itxfm_func4x4 txfm1, txfm2, bpp
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_\bpp\()_neon, export=1

        movrel          x4, itxfm4_coeffs

        movrel          x4, iadst4_coeffs

        movrel          x4, itxfm4_coeffs

.ifc \txfm1\()_\txfm2,idct_idct

        // DC-only for idct/idct
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14

        ld1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2]
        st1             {v30.4s,v31.4s}, [x2], #32

        sshr            v4.4s, v4.4s, #2
        sshr            v5.4s, v5.4s, #2
        sshr            v6.4s, v6.4s, #2
        sshr            v7.4s, v7.4s, #2

        \txfm1\()4_\bpp v4, v5, v6, v7

        st1             {v30.4s,v31.4s}, [x2], #32
        // Transpose 4x4 with 32 bit elements
        transpose_4x4s  v4, v5, v6, v7, v16, v17, v18, v19

        \txfm2\()4_\bpp v4, v5, v6, v7

        mvni            v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        ld1             {v0.4h}, [x0], x1
        ld1             {v1.4h}, [x0], x1

        srshr           v4.4s, v4.4s, #4
        srshr           v5.4s, v5.4s, #4
        srshr           v6.4s, v6.4s, #4
        srshr           v7.4s, v7.4s, #4

        uaddw           v4.4s, v4.4s, v0.4h
        uaddw           v5.4s, v5.4s, v1.4h
        ld1             {v2.4h}, [x0], x1
        ld1             {v3.4h}, [x0], x1

        sub             x0, x0, x1, lsl #2

        uaddw           v6.4s, v6.4s, v2.4h
        umin            v0.8h, v0.8h, v31.8h
        uaddw           v7.4s, v7.4s, v3.4h
        st1             {v0.4h}, [x0], x1

        umin            v2.8h, v2.8h, v31.8h

        st1             {v0.d}[1], [x0], x1
        st1             {v2.4h}, [x0], x1
        st1             {v2.d}[1], [x0], x1

        ret
endfunc
.endm
.macro itxfm_funcs4x4 bpp
itxfm_func4x4 idct,  idct,  \bpp
itxfm_func4x4 iadst, idct,  \bpp
itxfm_func4x4 idct,  iadst, \bpp
itxfm_func4x4 iadst, iadst, \bpp
itxfm_func4x4 iwht,  iwht,  \bpp
.endm

itxfm_funcs4x4 10
itxfm_funcs4x4 12
function idct8x8_dc_add_neon
        movrel          x4, idct_coeffs

        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14

        srshr           v2.4s, v2.4s, #5

        // Loop to add the constant from v2 into all 8x8 outputs
        ld1             {v3.8h}, [x0], x1
        ld1             {v4.8h}, [x0], x1
        uaddw           v16.4s, v2.4s, v3.4h
        uaddw2          v17.4s, v2.4s, v3.8h
        uaddw           v18.4s, v2.4s, v4.4h
        uaddw2          v19.4s, v2.4s, v4.8h
        sqxtun          v3.4h, v16.4s
        sqxtun2         v3.8h, v17.4s
        sqxtun          v4.4h, v18.4s
        sqxtun2         v4.8h, v19.4s
        umin            v3.8h, v3.8h, v31.8h
        umin            v4.8h, v4.8h, v31.8h
        st1             {v3.8h}, [x3], x1
        st1             {v4.8h}, [x3], x1
.macro idct8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly0    \r0, \r4, \r0, \r4, \t0, \t1, \t2, \t3, \t4, \t5 // r0 = t0a, r4 = t1a
        dmbutterfly     \r2, \r6, v0.s[2], v0.s[3], \t0, \t1, \t2, \t3   // r2 = t2a, r6 = t3a
        dmbutterfly     \r1, \r7, v1.s[0], v1.s[1], \t0, \t1, \t2, \t3   // r1 = t4a, r7 = t7a
        dmbutterfly     \r5, \r3, v1.s[2], v1.s[3], \t0, \t1, \t2, \t3   // r5 = t5a, r3 = t6a

        butterfly_4s    \t0, \t1, \r0, \r6 // t0 = t0, t1 = t3
        butterfly_4s    \t2, \r5, \r1, \r5 // t2 = t4, r5 = t5a
        butterfly_4s    \t3, \r6, \r7, \r3 // t3 = t7, r6 = t6a
        butterfly_4s    \r7, \r4, \r4, \r2 // r7 = t1, r4 = t2

        dmbutterfly0    \r6, \r5, \r6, \r5, \r0, \r1, \r2, \r3, \t4, \t5 // r6 = t6, r5 = t5

        butterfly_4s    \r1, \r6, \r7, \r6 // r1 = out[1], r6 = out[6]
        butterfly_4s    \r0, \r7, \t0, \t3 // r0 = out[0], r7 = out[7]
        butterfly_4s    \r2, \r5, \r4, \r5 // r2 = out[2], r5 = out[5]
        butterfly_4s    \r3, \r4, \t1, \t2 // r3 = out[3], r4 = out[4]
.endm
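
// (Added note:) each dmbutterfly above implements one rotation of the
// standard 8-point IDCT flow graph, and the final butterfly_4s stage
// combines the even and odd halves into the mirrored output pairs
// (out[i], out[7-i]), as the register comments show.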
.macro iadst8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1, t2, t3, t4, t5
        dmbutterfly_l   \t2, \t3, \t0, \t1, \r7, \r0, v2.s[1], v2.s[0] // t2,t3 = t1a, t0,t1 = t0a
        dmbutterfly_l   \r0, \r7, \t4, \t5, \r3, \r4, v3.s[1], v3.s[0] // r0,r7 = t5a, t4,t5 = t4a

        dbutterfly_n    \r3, \t0, \t0, \t1, \t4, \t5, \r3, \r4, \t0, \t1 // r3 = t0, t0 = t4
        dbutterfly_n    \r4, \t1, \t2, \t3, \r0, \r7, \r4, \t1, \t4, \t5 // r4 = t1, t1 = t5

        dmbutterfly_l   \t4, \t5, \t2, \t3, \r5, \r2, v2.s[3], v2.s[2] // t4,t5 = t3a, t2,t3 = t2a
        dmbutterfly_l   \r2, \r5, \r0, \r7, \r1, \r6, v3.s[3], v3.s[2] // r2,r5 = t7a, r0,r7 = t6a

        dbutterfly_n    \r1, \t2, \t2, \t3, \r0, \r7, \r1, \r6, \t2, \t3 // r1 = t2, t2 = t6
        dbutterfly_n    \r0, \t4, \t4, \t5, \r2, \r5, \r0, \r7, \t4, \t5 // r0 = t3, t4 = t7

        butterfly_4s    \r7, \r4, \r4, \r0 // r7 = -out[7], r4 = t3
        neg             \r7\().4s, \r7\().4s // r7 = out[7]
        butterfly_4s    \r0, \r1, \r3, \r1 // r0 = out[0], r1 = t2

        dmbutterfly_l   \r2, \r3, \t3, \t5, \t0, \t1, v0.s[2], v0.s[3] // r2,r3 = t5a, t3,t5 = t4a
        dmbutterfly_l   \t0, \t1, \r5, \r6, \t4, \t2, v0.s[3], v0.s[2] // t0,t1 = t6a, r5,r6 = t7a

        dbutterfly_n    \r6, \t2, \r2, \r3, \r5, \r6, \t2, \t4, \r2, \r3 // r6 = out[6], t2 = t7

        dmbutterfly0    \r3, \r4, \r1, \r4, \t4, \r5, \r1, \r2 // r3 = -out[3], r4 = out[4]
        neg             \r3\().4s, \r3\().4s // r3 = out[3]

        dbutterfly_n    \r1, \t0, \t3, \t5, \t0, \t1, \r1, \r2, \t0, \t1 // r1 = -out[1], t0 = t6
        neg             \r1\().4s, \r1\().4s // r1 = out[1]

        dmbutterfly0    \r2, \r5, \t0, \t2, \t1, \t3, \t4, \t5 // r2 = out[2], r5 = -out[5]
        neg             \r5\().4s, \r5\().4s // r5 = out[5]
.endm
.macro itxfm_func8x8 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        b.eq            idct8x8_dc_add_neon

        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4, idct_coeffs

        movrel          x4, iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
        stp             d8, d9, [sp, #-0x10]!

        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2], #64
        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2], #64
        ld1             {v24.4s,v25.4s,v26.4s,v27.4s}, [x2], #64
        ld1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64

        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64
        st1             {v4.4s,v5.4s,v6.4s,v7.4s}, [x2], #64

.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7

        \txfm1\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
        \txfm1\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9

        // Transpose 8x8 with 32 bit elements
        transpose_8x8s  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v4, v5, v6, v7

.ifc \txfm1\()_\txfm2,idct_idct
        idct8           v16, v18, v20, v22, v24, v26, v28, v30, v2, v3, v4, v5, v6, v7
        idct8           v17, v19, v21, v23, v25, v27, v29, v31, v2, v3, v4, v5, v6, v7

        \txfm2\()8      v16, v18, v20, v22, v24, v26, v28, v30, v4, v5, v6, v7, v8, v9
        \txfm2\()8      v17, v19, v21, v23, v25, v27, v29, v31, v4, v5, v6, v7, v8, v9

        // Add into the destination
        ld1             {v0.8h}, [x0], x1
        srshr           v16.4s, v16.4s, #5
        srshr           v17.4s, v17.4s, #5
        ld1             {v1.8h}, [x0], x1
        srshr           v18.4s, v18.4s, #5
        srshr           v19.4s, v19.4s, #5
        ld1             {v2.8h}, [x0], x1
        srshr           v20.4s, v20.4s, #5
        srshr           v21.4s, v21.4s, #5
        uaddw           v16.4s, v16.4s, v0.4h
        uaddw2          v17.4s, v17.4s, v0.8h
        ld1             {v3.8h}, [x0], x1
        srshr           v22.4s, v22.4s, #5
        srshr           v23.4s, v23.4s, #5
        uaddw           v18.4s, v18.4s, v1.4h
        uaddw2          v19.4s, v19.4s, v1.8h
        ld1             {v4.8h}, [x0], x1
        srshr           v24.4s, v24.4s, #5
        srshr           v25.4s, v25.4s, #5
        uaddw           v20.4s, v20.4s, v2.4h
        uaddw2          v21.4s, v21.4s, v2.8h
        sqxtun          v0.4h, v16.4s
        sqxtun2         v0.8h, v17.4s

        ld1             {v5.8h}, [x0], x1
        srshr           v26.4s, v26.4s, #5
        srshr           v27.4s, v27.4s, #5
        uaddw           v22.4s, v22.4s, v3.4h
        uaddw2          v23.4s, v23.4s, v3.8h
        sqxtun          v1.4h, v18.4s
        sqxtun2         v1.8h, v19.4s
        umin            v0.8h, v0.8h, v16.8h
        ld1             {v6.8h}, [x0], x1
        srshr           v28.4s, v28.4s, #5
        srshr           v29.4s, v29.4s, #5
        uaddw           v24.4s, v24.4s, v4.4h
        uaddw2          v25.4s, v25.4s, v4.8h
        sqxtun          v2.4h, v20.4s
        sqxtun2         v2.8h, v21.4s
        umin            v1.8h, v1.8h, v16.8h
        ld1             {v7.8h}, [x0], x1
        srshr           v30.4s, v30.4s, #5
        srshr           v31.4s, v31.4s, #5
        uaddw           v26.4s, v26.4s, v5.4h
        uaddw2          v27.4s, v27.4s, v5.8h
        sqxtun          v3.4h, v22.4s
        sqxtun2         v3.8h, v23.4s
        umin            v2.8h, v2.8h, v16.8h

        st1             {v0.8h}, [x3], x1
        uaddw           v28.4s, v28.4s, v6.4h
        uaddw2          v29.4s, v29.4s, v6.8h
        st1             {v1.8h}, [x3], x1
        sqxtun          v4.4h, v24.4s
        sqxtun2         v4.8h, v25.4s
        umin            v3.8h, v3.8h, v16.8h
        st1             {v2.8h}, [x3], x1
        uaddw           v30.4s, v30.4s, v7.4h
        uaddw2          v31.4s, v31.4s, v7.8h
        st1             {v3.8h}, [x3], x1
        sqxtun          v5.4h, v26.4s
        sqxtun2         v5.8h, v27.4s
        umin            v4.8h, v4.8h, v16.8h
        st1             {v4.8h}, [x3], x1
        sqxtun          v6.4h, v28.4s
        sqxtun2         v6.8h, v29.4s
        umin            v5.8h, v5.8h, v16.8h
        st1             {v5.8h}, [x3], x1
        sqxtun          v7.4h, v30.4s
        sqxtun2         v7.8h, v31.4s
        umin            v6.8h, v6.8h, v16.8h

        st1             {v6.8h}, [x3], x1
        umin            v7.8h, v7.8h, v16.8h
        st1             {v7.8h}, [x3], x1

.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8, d9, [sp], 0x10
.endif
        ret
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_10_neon, export=1

        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_12_neon, export=1

        b               vp9_\txfm1\()_\txfm2\()_8x8_add_16_neon
endfunc
.endm

itxfm_func8x8 idct,  idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct,  iadst
itxfm_func8x8 iadst, iadst
function idct16x16_dc_add_neon
        movrel          x4, idct_coeffs

        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14

        srshr           v0.4s, v2.4s, #6

        // Loop to add the constant in v0 into all 16x16 outputs
        ld1             {v1.8h,v2.8h}, [x0], x1
        uaddw           v16.4s, v0.4s, v1.4h
        uaddw2          v17.4s, v0.4s, v1.8h
        ld1             {v3.8h,v4.8h}, [x0], x1
        uaddw           v18.4s, v0.4s, v2.4h
        uaddw2          v19.4s, v0.4s, v2.8h
        uaddw           v20.4s, v0.4s, v3.4h
        uaddw2          v21.4s, v0.4s, v3.8h
        uaddw           v22.4s, v0.4s, v4.4h
        uaddw2          v23.4s, v0.4s, v4.8h
        sqxtun          v1.4h, v16.4s
        sqxtun2         v1.8h, v17.4s
        sqxtun          v2.4h, v18.4s
        sqxtun2         v2.8h, v19.4s
        sqxtun          v3.4h, v20.4s
        sqxtun2         v3.8h, v21.4s
        sqxtun          v4.4h, v22.4s
        sqxtun2         v4.8h, v23.4s
        umin            v1.8h, v1.8h, v31.8h
        umin            v2.8h, v2.8h, v31.8h
        st1             {v1.8h,v2.8h}, [x3], x1
        umin            v3.8h, v3.8h, v31.8h
        umin            v4.8h, v4.8h, v31.8h
        st1             {v3.8h,v4.8h}, [x3], x1
        dmbutterfly0    v16, v24, v16, v24, v4, v5, v6, v7, v8, v9 // v16 = t0a,  v24 = t1a
        dmbutterfly     v20, v28, v0.s[2], v0.s[3], v4, v5, v6, v7 // v20 = t2a,  v28 = t3a
        dmbutterfly     v18, v30, v1.s[0], v1.s[1], v4, v5, v6, v7 // v18 = t4a,  v30 = t7a
        dmbutterfly     v26, v22, v1.s[2], v1.s[3], v4, v5, v6, v7 // v26 = t5a,  v22 = t6a
        dmbutterfly     v17, v31, v2.s[0], v2.s[1], v4, v5, v6, v7 // v17 = t8a,  v31 = t15a
        dmbutterfly     v25, v23, v2.s[2], v2.s[3], v4, v5, v6, v7 // v25 = t9a,  v23 = t14a
        dmbutterfly     v21, v27, v3.s[0], v3.s[1], v4, v5, v6, v7 // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v3.s[2], v3.s[3], v4, v5, v6, v7 // v29 = t11a, v19 = t12a

        butterfly_4s    v4,  v28, v16, v28 // v4  = t0,  v28 = t3
        butterfly_4s    v5,  v20, v24, v20 // v5  = t1,  v20 = t2
        butterfly_4s    v6,  v26, v18, v26 // v6  = t4,  v26 = t5
        butterfly_4s    v7,  v22, v30, v22 // v7  = t7,  v22 = t6
        butterfly_4s    v16, v25, v17, v25 // v16 = t8,  v25 = t9
        butterfly_4s    v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_4s    v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_4s    v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v8, v9, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
        dmbutterfly     v23, v25, v0.s[2], v0.s[3], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
        dmbutterfly     v27, v21, v0.s[2], v0.s[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a

        butterfly_4s    v18, v7,  v4,  v7  // v18 = t0a,  v7  = t7a
        butterfly_4s    v19, v22, v5,  v22 // v19 = t1a,  v22 = t6
        butterfly_4s    v4,  v26, v20, v26 // v4  = t2a,  v26 = t5
        butterfly_4s    v5,  v6,  v28, v6  // v5  = t3a,  v6  = t4
        butterfly_4s    v20, v28, v16, v24 // v20 = t8a,  v28 = t11a
        butterfly_4s    v24, v21, v23, v21 // v24 = t9,   v21 = t10
        butterfly_4s    v23, v27, v25, v27 // v23 = t14,  v27 = t13
        butterfly_4s    v25, v29, v29, v17 // v25 = t15a, v29 = t12a

        dmbutterfly0    v8,  v9,  v27, v21, v8,  v9,  v16, v17, v30, v31 // v8  = t13a, v9  = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11

        butterfly_4s    v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
        butterfly_4s    v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
        butterfly_4s_r  v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
        butterfly_4s    v23, v24, v7,  v20 // v23 = out[7], v24 = out[8]
        butterfly_4s    v18, v29, v4,  v8  // v18 = out[2], v29 = out[13]
        butterfly_4s    v19, v28, v5,  v28 // v19 = out[3], v28 = out[12]
        butterfly_4s    v20, v27, v6,  v27 // v20 = out[4], v27 = out[11]
        butterfly_4s    v21, v26, v26, v9  // v21 = out[5], v26 = out[10]
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.s[1], v0.s[0]   // v6,v7   = t1,   v4,v5   = t0
        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v1.s[1], v1.s[0]   // v10,v11 = t9,   v8,v9   = t8
        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31     = t1a,  v24     = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.s[3], v0.s[2]   // v14,v15 = t3,   v12,v13 = t2
        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v16     = t0a,  v23     = t8a

        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v1.s[3], v1.s[2]   // v6,v7   = t11,  v4,v5   = t10
        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7  // v29     = t3a,  v26     = t11a
        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v2.s[1], v2.s[0]   // v10,v11 = t5,   v8,v9   = t4
        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5  // v18     = t2a,  v21     = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v3.s[1], v3.s[0]   // v14,v15 = t13,  v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20     = t5a,  v28     = t13a
        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v2.s[3], v2.s[2]   // v6,v7   = t7,   v4,v5   = t6
        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27     = t4a,  v19     = t12a

        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v3.s[3], v3.s[2]   // v10,v11 = t15,  v8,v9   = t14

        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22     = t7a,  v30     = t15a

        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v1.s[0], v1.s[1]   // v14,v15 = t9,   v12,v13 = t8
        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9  // v25     = t6a,  v17     = t14a

        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v1.s[1], v1.s[0]   // v4,v5   = t12,  v6,v7   = t13
        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5  // v23     = t8a,  v19     = t12a
        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v1.s[2], v1.s[3]   // v10,v11 = t11,  v8,v9   = t10
        butterfly_4s_r  v4,  v27, v16, v27 // v4 = t4, v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7  // v24     = t9a,  v28     = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v1.s[3], v1.s[2]   // v12,v13 = t14,  v14,v15 = t15
        butterfly_4s_r  v5,  v20, v31, v20 // v5 = t5, v20 = t1
        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21     = t10a, v17     = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26     = t11a, v30     = t15a

        butterfly_4s_r  v6,  v25, v18, v25 // v6 = t6, v25 = t2
        butterfly_4s_r  v7,  v22, v29, v22 // v7 = t7, v22 = t3

        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.s[2], v0.s[3]   // v10,v11 = t13,  v8,v9   = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.s[3], v0.s[2]   // v12,v13 = t14,  v14,v15 = t15

        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18 = out[2],   v30     = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17     = t15a
        neg             v29.4s, v29.4s // v29 = out[13]

        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.s[2], v0.s[3]   // v10,v11 = t5a,  v8,v9   = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.s[3], v0.s[2]   // v12,v13 = t6a,  v14,v15 = t7a

        butterfly_4s    v2,  v6,  v27, v25 // v2 = out[0],  v6 = t2a
        butterfly_4s    v3,  v7,  v23, v21 // v3 = -out[1], v7 = t10

        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9  // v19 = -out[3],  v31 = t6
        neg             v19.4s, v19.4s // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12],  v16 = t7

        butterfly_4s    v5,  v8,  v20, v22 // v5 = -out[15], v8 = t3a
        butterfly_4s    v4,  v9,  v24, v26 // v4 = out[14],  v9 = t11

        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]

        neg             v31.4s, v5.4s // v31 = out[15]
        neg             v17.4s, v3.4s // v17 = out[1]
// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
.macro load i, src, inc
        ld1             {v\i\().4s}, [\src], \inc
.endm
.macro store i, dst, inc
        st1             {v\i\().4s}, [\dst], \inc
.endm
.macro movi_v i, size, imm
        movi            v\i\()\size, \imm
.endm
.macro load_clear i, src, inc
        // Load a register and clear the corresponding source location
        // (v4 is assumed to hold zeros here).
        ld1             {v\i\().4s}, [\src]
        st1             {v4.4s}, [\src], \inc
.endm
// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x4 slice and store.
// x0 = dst (temp buffer)
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_4x16_pass1_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i, x2, x9
.endr

        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the transposed 4x4 blocks horizontally.
.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31

        // Special case: For the last input column (x1 == 12),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 4x4 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // last 4x4 block).
// Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 4x16 slice), add and store back.
// x2 = src (temp buffer)
// x9 = temp buffer stride
function \txfm\()16_1d_4x16_pass2_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
        load            \i, x2, x9
.endr
.irp i, 28, 29, 30, 31
        load            \i, x2, x9
.endr
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7
        srshr           \coef0, \coef0, #6
        ld1             {v4.4h}, [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v4.d}[1], [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v5.4h}, [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v4.4h
        ld1             {v5.d}[1], [x3], x1
        srshr           \coef4, \coef4, #6
        uaddw2          \coef1, \coef1, v4.8h
        ld1             {v6.4h}, [x0], x1
        srshr           \coef5, \coef5, #6
        uaddw           \coef2, \coef2, v5.4h
        ld1             {v6.d}[1], [x3], x1
        sqxtun          v4.4h, \coef0
        srshr           \coef6, \coef6, #6
        uaddw2          \coef3, \coef3, v5.8h
        ld1             {v7.4h}, [x0], x1
        sqxtun2         v4.8h, \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef4, \coef4, v6.4h
        ld1             {v7.d}[1], [x3], x1
        umin            v4.8h, v4.8h, v8.8h
        sub             x0, x0, x1, lsl #2
        sub             x3, x3, x1, lsl #2
        sqxtun          v5.4h, \coef2
        uaddw2          \coef5, \coef5, v6.8h
        st1             {v4.4h}, [x0], x1
        sqxtun2         v5.8h, \coef3
        uaddw           \coef6, \coef6, v7.4h
        st1             {v4.d}[1], [x3], x1
        umin            v5.8h, v5.8h, v8.8h
        sqxtun          v6.4h, \coef4
        uaddw2          \coef7, \coef7, v7.8h
        st1             {v5.4h}, [x0], x1
        sqxtun2         v6.8h, \coef5
        st1             {v5.d}[1], [x3], x1
        umin            v6.8h, v6.8h, v8.8h
        st1             {v6.4h}, [x0], x1
        sqxtun          v7.4h, \coef6
        sqxtun2         v7.8h, \coef7
        st1             {v6.d}[1], [x3], x1
        umin            v7.8h, v7.8h, v8.8h
        st1             {v7.4h}, [x0], x1
        st1             {v7.d}[1], [x3], x1
.endm
        load_add_store  v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
        load_add_store  v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
.purgem load_add_store

        ret
endfunc
.endm
itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
// This is the minimum eob value for each subpartition, in increments of 4
const min_eob_idct_idct_16, align=4
        .short  0, 10, 38, 89
endconst
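
// (Added note:) pass 1 compares the eob against these per-slice thresholds;
// once eob is below the threshold for a slice, all remaining 4-column input
// slices are known to be entirely zero, so their transform is skipped and
// zeros are passed on to pass 2 instead.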
.macro itxfm_func16x16 txfm1, txfm2
function vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
.ifc \txfm1\()_\txfm2,idct_idct
        b.eq            idct16x16_dc_add_neon

        // iadst16 requires clobbering v8-v15, idct16 only clobbers v8-v9.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!

        stp             d8, d9, [sp, #-0x10]!

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs

        movrel          x12, min_eob_idct_idct_16, 2

        ld1             {v0.8h,v1.8h}, [x10]

        add             x0, sp, #(\i*64)
.ifc \txfm1\()_\txfm2,idct_idct

        mov             x1, #(16 - \i)/4

        bl              \txfm1\()16_1d_4x16_pass1_neon

.ifc \txfm1\()_\txfm2,iadst_idct
        ld1             {v0.8h,v1.8h}, [x10]

.ifc \txfm1\()_\txfm2,idct_idct

        // Set v28-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2.

        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x0], x9

        bl              \txfm2\()16_1d_4x16_pass2_neon

        ldp             d8, d9, [sp], 0x10
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        ret
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_10_neon, export=1

        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc

function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_12_neon, export=1

        b               vp9_\txfm1\()_\txfm2\()_16x16_add_16_neon
endfunc
.endm

itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst
function idct32x32_dc_add_neon
        movrel          x4, idct_coeffs

        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14
        smull           v2.2d, v2.2s, v0.s[0]
        rshrn           v2.2s, v2.2d, #14

        srshr           v0.4s, v2.4s, #6

        // Loop to add the constant v0 into all 32x32 outputs
        ld1             {v1.8h,v2.8h}, [x0], #32
        uaddw           v16.4s, v0.4s, v1.4h
        uaddw2          v17.4s, v0.4s, v1.8h
        ld1             {v3.8h,v4.8h}, [x0], x1
        uaddw           v18.4s, v0.4s, v2.4h
        uaddw2          v19.4s, v0.4s, v2.8h
        uaddw           v20.4s, v0.4s, v3.4h
        uaddw2          v21.4s, v0.4s, v3.8h
        uaddw           v22.4s, v0.4s, v4.4h
        uaddw2          v23.4s, v0.4s, v4.8h
        sqxtun          v1.4h, v16.4s
        sqxtun2         v1.8h, v17.4s
        sqxtun          v2.4h, v18.4s
        sqxtun2         v2.8h, v19.4s
        sqxtun          v3.4h, v20.4s
        sqxtun2         v3.8h, v21.4s
        sqxtun          v4.4h, v22.4s
        sqxtun2         v4.8h, v23.4s
        umin            v1.8h, v1.8h, v31.8h
        umin            v2.8h, v2.8h, v31.8h
        st1             {v1.8h,v2.8h}, [x3], #32
        umin            v3.8h, v3.8h, v31.8h
        umin            v4.8h, v4.8h, v31.8h
        st1             {v3.8h,v4.8h}, [x3], x1
        dmbutterfly     v16, v31, v10.s[0], v10.s[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v10.s[2], v10.s[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v11.s[0], v11.s[1], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v11.s[2], v11.s[3], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v12.s[0], v12.s[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v12.s[2], v12.s[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v13.s[0], v13.s[1], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v13.s[2], v13.s[3], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_4s    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_4s    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_4s    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_4s    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_4s    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_4s    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_4s    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_4s    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v1.s[0], v1.s[1], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v1.s[0], v1.s[1], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v1.s[2], v1.s[3], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v1.s[2], v1.s[3], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a

        butterfly_4s    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
        butterfly_4s    v17, v20, v23, v20 // v17 = t17,  v20 = t18
        butterfly_4s    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
        butterfly_4s    v19, v21, v22, v21 // v19 = t22,  v21 = t21
        butterfly_4s    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
        butterfly_4s    v23, v26, v25, v26 // v23 = t25,  v26 = t26
        butterfly_4s    v7,  v8,  v29, v31 // v7  = t31a, v8  = t28a
        butterfly_4s    v22, v27, v24, v27 // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.s[2], v0.s[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v8,  v5,  v0.s[2], v0.s[3], v24, v25, v30, v31        // v8  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.s[2], v0.s[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_4s    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
        butterfly_4s    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_4s_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
        butterfly_4s_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_4s    v18, v21, v27, v21 // v18 = t18,  v21 = t21
        butterfly_4s_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
        butterfly_4s    v29, v26, v20, v26 // v29 = t29,  v26 = t26
        butterfly_4s    v19, v20, v8,  v6  // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v4, v5, v6, v7, v8, v9 // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v4, v5, v6, v7, v8, v9 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v4, v5, v6, v7, v8, v9 // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v4, v5, v6, v7, v8, v9 // v24 = t24a, v23 = t23a
// Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
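//
// As a sketch (added for clarity): with even inputs e[k] = IN(2k) and odd
// inputs o[k] = IN(2k+1),
//     out[i]      = idct16(e)[i] + idct16_odd(o)[i]
//     out[31 - i] = idct16(e)[i] - idct16_odd(o)[i]
// for i = 0..15, which is what the mirrored add/subtract stores in pass 1
// and the load_acc_store calls in pass 2 implement.
//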
// x0 = dst (temp buffer)
// x9 = double input stride
function idct32_1d_4x32_pass1_neon

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().4s}, [x2]
        st1             {v4.4s}, [x2], x9
.endr
        // Do four 4x4 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v19, v20-v23, v24-v27 and v28-v31
        // contain the four transposed 4x4 blocks.
        transpose_4x4s  v16, v17, v18, v19, v4, v5, v6, v7
        transpose_4x4s  v20, v21, v22, v23, v4, v5, v6, v7
        transpose_4x4s  v24, v25, v26, v27, v4, v5, v6, v7
        transpose_4x4s  v28, v29, v30, v31, v4, v5, v6, v7

        // Store the registers a, b, c, d horizontally, followed by the
        // same registers d, c, b, a mirrored.
.macro store_rev a, b, c, d
        // There's no rev128 instruction, but we reverse each 64 bit
        // half, and then flip them using an ext with an 8 byte offset.
        rev64           v7.4s, v\d\().4s
        st1             {v\a\().4s}, [x0], #16
        ext             v7.16b, v7.16b, v7.16b, #8
        st1             {v\b\().4s}, [x0], #16
        rev64           v6.4s, v\c\().4s
        st1             {v\c\().4s}, [x0], #16
        ext             v6.16b, v6.16b, v6.16b, #8
        st1             {v\d\().4s}, [x0], #16
        rev64           v5.4s, v\b\().4s
        st1             {v7.4s}, [x0], #16
        ext             v5.16b, v5.16b, v5.16b, #8
        st1             {v6.4s}, [x0], #16
        rev64           v4.4s, v\a\().4s
        st1             {v5.4s}, [x0], #16
        ext             v4.16b, v4.16b, v4.16b, #8
        st1             {v4.4s}, [x0], #16
.endm
        store_rev       16, 20, 24, 28
        store_rev       17, 21, 25, 29
        store_rev       18, 22, 26, 30
        store_rev       19, 23, 27, 31
.purgem store_rev
        // Move x2 back to the start of the input, and move
        // to the first odd row
        sub             x2, x2, x9, lsl #4

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().4s}, [x2]
        st1             {v4.4s}, [x2], x9
.endr

        transpose_4x4s  v31, v30, v29, v28, v4, v5, v6, v7
        transpose_4x4s  v27, v26, v25, v24, v4, v5, v6, v7
        transpose_4x4s  v23, v22, v21, v20, v4, v5, v6, v7
        transpose_4x4s  v19, v18, v17, v16, v4, v5, v6, v7
        // Store the registers a, b, c, d horizontally,
        // adding into the output first, and the mirrored,
        // subtracted from the output.
.macro store_rev a, b, c, d
        ld1             {v4.4s}, [x0]
        rev64           v9.4s, v\d\().4s
        add             v4.4s, v4.4s, v\a\().4s
        st1             {v4.4s}, [x0], #16
        rev64           v8.4s, v\c\().4s
        ld1             {v4.4s}, [x0]
        ext             v9.16b, v9.16b, v9.16b, #8
        add             v4.4s, v4.4s, v\b\().4s
        st1             {v4.4s}, [x0], #16
        ext             v8.16b, v8.16b, v8.16b, #8
        ld1             {v4.4s}, [x0]
        rev64           v\b\().4s, v\b\().4s
        add             v4.4s, v4.4s, v\c\().4s
        st1             {v4.4s}, [x0], #16
        rev64           v\a\().4s, v\a\().4s
        ld1             {v4.4s}, [x0]
        ext             v\b\().16b, v\b\().16b, v\b\().16b, #8
        add             v4.4s, v4.4s, v\d\().4s
        st1             {v4.4s}, [x0], #16
        ext             v\a\().16b, v\a\().16b, v\a\().16b, #8
        ld1             {v4.4s}, [x0]
        sub             v4.4s, v4.4s, v9.4s
        st1             {v4.4s}, [x0], #16
        ld1             {v4.4s}, [x0]
        sub             v4.4s, v4.4s, v8.4s
        st1             {v4.4s}, [x0], #16
        ld1             {v4.4s}, [x0]
        sub             v4.4s, v4.4s, v\b\().4s
        st1             {v4.4s}, [x0], #16
        ld1             {v4.4s}, [x0]
        sub             v4.4s, v4.4s, v\a\().4s
        st1             {v4.4s}, [x0], #16
.endm

        store_rev       31, 27, 23, 19
        store_rev       30, 26, 22, 18
        store_rev       29, 25, 21, 17
        store_rev       28, 24, 20, 16
.purgem store_rev
        ret
endfunc
// This is mostly the same as 4x32_pass1, but without the transpose;
// it uses the source as a temp buffer between the two idct passes, and
// adds into the destination.
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
function idct32_1d_4x32_pass2_neon
        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().4s}, [x2], x9
.endr
        sub             x2, x2, x9, lsl #4

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        st1             {v\i\().4s}, [x2], x9
.endr
        sub             x2, x2, x9, lsl #4

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        ld1             {v\i\().4s}, [x2], x9
.endr
        sub             x2, x2, x9, lsl #4
.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.4s}, [x2], x9
        ld1             {v5.4s}, [x2], x9
        add             v4.4s, v4.4s, v\a\().4s
        ld1             {v6.4s}, [x2], x9
        add             v5.4s, v5.4s, v\b\().4s
        ld1             {v7.4s}, [x2], x9
        add             v6.4s, v6.4s, v\c\().4s
        add             v7.4s, v7.4s, v\d\().4s
.else
        ld1             {v4.4s}, [x2], x7
        ld1             {v5.4s}, [x2], x7
        sub             v4.4s, v4.4s, v\a\().4s
        ld1             {v6.4s}, [x2], x7
        sub             v5.4s, v5.4s, v\b\().4s
        ld1             {v7.4s}, [x2], x7
        sub             v6.4s, v6.4s, v\c\().4s
        sub             v7.4s, v7.4s, v\d\().4s
.endif
        ld1             {v8.4h}, [x0], x1
        ld1             {v8.d}[1], [x0], x1
        srshr           v4.4s, v4.4s, #6
        ld1             {v9.4h}, [x0], x1
        srshr           v5.4s, v5.4s, #6
        uaddw           v4.4s, v4.4s, v8.4h
        ld1             {v9.d}[1], [x0], x1
        srshr           v6.4s, v6.4s, #6
        uaddw2          v5.4s, v5.4s, v8.8h
        srshr           v7.4s, v7.4s, #6
        sub             x0, x0, x1, lsl #2
        uaddw           v6.4s, v6.4s, v9.4h
        sqxtun          v4.4h, v4.4s
        uaddw2          v7.4s, v7.4s, v9.8h
        sqxtun2         v4.8h, v5.4s
        umin            v4.8h, v4.8h, v15.8h
        st1             {v4.4h}, [x0], x1
        sqxtun          v5.4h, v6.4s
        st1             {v4.d}[1], [x0], x1
        sqxtun2         v5.8h, v7.4s
        umin            v5.8h, v5.8h, v15.8h
        st1             {v5.4h}, [x0], x1
        st1             {v5.d}[1], [x0], x1
.endm
        load_acc_store  31, 30, 29, 28
        load_acc_store  27, 26, 25, 24
        load_acc_store  23, 22, 21, 20
        load_acc_store  19, 18, 17, 16

        load_acc_store  16, 17, 18, 19, 1
        load_acc_store  20, 21, 22, 23, 1
        load_acc_store  24, 25, 26, 27, 1
        load_acc_store  28, 29, 30, 31, 1
.purgem load_acc_store
        ret
endfunc
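
// (Added note:) the neg=0 load_acc_store calls above walk forwards through
// the temp buffer adding the odd-IDCT outputs (out = even + odd), while the
// neg=1 calls walk back through it via the negative stride in x7 and
// subtract them (out = even - odd), completing the even/odd decomposition
// described before idct32_1d_4x32_pass1_neon.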
const min_eob_idct_idct_32, align=4
        .short  0, 9, 34, 70, 135, 240, 336, 448
endconst
function vp9_idct_idct_32x32_add_16_neon

        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs
        movrel          x12, min_eob_idct_idct_32, 2

        stp             d8, d9, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d14, d15, [sp, #-0x10]!

        // Double stride of the input, since we only read every other line

        ld1             {v0.8h,v1.8h}, [x10], #32

        ld1             {v10.8h,v11.8h}, [x10]

        sxtl2           v13.4s, v11.8h
        sxtl2           v11.4s, v10.8h

.irp i, 0, 4, 8, 12, 16, 20, 24, 28
        add             x0, sp, #(\i*128)

        mov             x1, #(32 - \i)/4

        bl              idct32_1d_4x32_pass1_neon

        // Write zeros to the temp buffer for pass 2

        st1             {v16.4s-v19.4s}, [x0], #64
        st1             {v16.4s-v19.4s}, [x0], #64

.irp i, 0, 4, 8, 12, 16, 20, 24, 28

        bl              idct32_1d_4x32_pass2_neon

        ldp             d14, d15, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d8, d9, [sp], 0x10
        ret
endfunc

function ff_vp9_idct_idct_32x32_add_10_neon, export=1

        b               vp9_idct_idct_32x32_add_16_neon
endfunc

function ff_vp9_idct_idct_32x32_add_12_neon, export=1

        b               vp9_idct_idct_32x32_add_16_neon
endfunc