/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

const itxfm4_coeffs, align=4
        .short  11585, 0, 6270, 15137
iadst4_coeffs:
        .short  5283, 15212, 9929, 13377
endconst

const iadst8_coeffs, align=4
        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
idct_coeffs:
        .short  11585, 0, 6270, 15137, 3196, 16069, 13623, 9102
        .short  1606, 16305, 12665, 10394, 7723, 14449, 15679, 4756
        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
endconst

const iadst16_coeffs, align=4
        .short  16364, 804, 15893, 3981, 11003, 12140, 8423, 14053
        .short  14811, 7005, 13160, 9760, 5520, 15426, 2404, 16207
endconst
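
// The idct coefficients above are Q14 fixed point values,
// round(16384 * cos(k*pi/64)); e.g. 11585 == round(16384 * cos(pi/4))
// and 15137 == round(16384 * cos(pi/8)). The iadst tables hold the
// correspondingly scaled sin() terms. (Noted here for reference; the
// values themselves come from the VP9 specification.)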

// out1 = ((in1 + in2) * v0[0] + (1 << 13)) >> 14
// out2 = ((in1 - in2) * v0[0] + (1 << 13)) >> 14
// in/out are .8h registers; this can do with 4 temp registers, but is
// more efficient if 6 temp registers are available.
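// (Per lane this is out1 = (int16_t)(((in1 + in2) * 11585 + 8192) >> 14),
// since v0.h[0] holds 11585, i.e. cos(pi/4) in Q14, when the idct
// coefficients are loaded; a scalar model for reference.)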
.macro dmbutterfly0 out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, neg=0
.if \neg > 0
        neg             \tmp4\().4h, v0.4h
.endif
        add             \tmp1\().8h, \in1\().8h, \in2\().8h
        sub             \tmp2\().8h, \in1\().8h, \in2\().8h
.if \neg > 0
        smull           \tmp3\().4s, \tmp1\().4h, \tmp4\().h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, \tmp4\().h[0]
.else
        smull           \tmp3\().4s, \tmp1\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp1\().8h, v0.h[0]
.endif
.ifb \tmp5
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        smull           \tmp3\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp4\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
.else
        smull           \tmp5\().4s, \tmp2\().4h, v0.h[0]
        smull2          \tmp6\().4s, \tmp2\().8h, v0.h[0]
        rshrn           \out1\().4h, \tmp3\().4s, #14
        rshrn2          \out1\().8h, \tmp4\().4s, #14
        rshrn           \out2\().4h, \tmp5\().4s, #14
        rshrn2          \out2\().8h, \tmp6\().4s, #14
.endif
.endm

// Same as dmbutterfly0 above, but treating the input in in2 as zero,
// writing the same output into both out1 and out2.
.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
        smull           \tmp1\().4s, \in1\().4h, v0.h[0]
        smull2          \tmp2\().4s, \in1\().8h, v0.h[0]
        rshrn           \out1\().4h, \tmp1\().4s, #14
        rshrn2          \out1\().8h, \tmp2\().4s, #14
        rshrn           \out2\().4h, \tmp1\().4s, #14
        rshrn2          \out2\().8h, \tmp2\().4s, #14
.endm

// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .4s registers, in are 2 x .8h registers
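// (The full 32 bit products are deliberately kept unrounded here:
// users such as dbutterfly_n below add/subtract two such products
// before doing a single rounding shift.)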
.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, coef1, coef2
        smull           \out1\().4s, \in1\().4h, \coef1
        smull2          \out2\().4s, \in1\().8h, \coef1
        smull           \out3\().4s, \in1\().4h, \coef2
        smull2          \out4\().4s, \in1\().8h, \coef2
        smlsl           \out1\().4s, \in2\().4h, \coef2
        smlsl2          \out2\().4s, \in2\().8h, \coef2
        smlal           \out3\().4s, \in2\().4h, \coef1
        smlal2          \out4\().4s, \in2\().8h, \coef1
.endm

// inout1 = (inout1 * coef1 - inout2 * coef2 + (1 << 13)) >> 14
// inout2 = (inout1 * coef2 + inout2 * coef1 + (1 << 13)) >> 14
// inout are 2 x .8h registers
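// (Both right hand sides use the original register values; as a scalar
// model: t = inout1; inout1 = (t * coef1 - inout2 * coef2 + 8192) >> 14;
// inout2 = (t * coef2 + inout2 * coef1 + 8192) >> 14.)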
.macro dmbutterfly inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4, neg=0
        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \coef1, \coef2
.if \neg > 0
        neg             \tmp3\().4s, \tmp3\().4s
        neg             \tmp4\().4s, \tmp4\().4s
.endif
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
.endm

// Same as dmbutterfly above, but treating the input in inout2 as zero
.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().4s, \inout1\().4h, \coef1
        smull2          \tmp2\().4s, \inout1\().8h, \coef1
        smull           \tmp3\().4s, \inout1\().4h, \coef2
        smull2          \tmp4\().4s, \inout1\().8h, \coef2
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
.endm

// Same as dmbutterfly above, but treating the input in inout1 as zero
.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
        smull           \tmp1\().4s, \inout2\().4h, \coef2
        smull2          \tmp2\().4s, \inout2\().8h, \coef2
        smull           \tmp3\().4s, \inout2\().4h, \coef1
        smull2          \tmp4\().4s, \inout2\().8h, \coef1
        neg             \tmp1\().4s, \tmp1\().4s
        neg             \tmp2\().4s, \tmp2\().4s
        rshrn           \inout2\().4h, \tmp3\().4s, #14
        rshrn2          \inout2\().8h, \tmp4\().4s, #14
        rshrn           \inout1\().4h, \tmp1\().4s, #14
        rshrn2          \inout1\().8h, \tmp2\().4s, #14
.endm

.macro dsmull_h out1, out2, in, coef
        smull           \out1\().4s, \in\().4h, \coef
        smull2          \out2\().4s, \in\().8h, \coef
.endm

.macro drshrn_h out, in1, in2, shift
        rshrn           \out\().4h, \in1\().4s, \shift
        rshrn2          \out\().8h, \in2\().4s, \shift
.endm

// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_8h out1, out2, in1, in2
        add             \out1\().8h, \in1\().8h, \in2\().8h
        sub             \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = in1 - in2
// out2 = in1 + in2
.macro butterfly_8h_r out1, out2, in1, in2
        sub             \out1\().8h, \in1\().8h, \in2\().8h
        add             \out2\().8h, \in1\().8h, \in2\().8h
.endm

// out1 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
// out2 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
// out are 2 x .8h registers, in are 4 x .4s registers
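// (in1,in2 and in3,in4 are 32 bit accumulator pairs, as produced by
// dmbutterfly_l; summing them before the single rounding shift avoids
// rounding twice.)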
.macro dbutterfly_n out1, out2, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
        add             \tmp1\().4s, \in1\().4s, \in3\().4s
        add             \tmp2\().4s, \in2\().4s, \in4\().4s
        sub             \tmp3\().4s, \in1\().4s, \in3\().4s
        sub             \tmp4\().4s, \in2\().4s, \in4\().4s
        rshrn           \out1\().4h, \tmp1\().4s, #14
        rshrn2          \out1\().8h, \tmp2\().4s, #14
        rshrn           \out2\().4h, \tmp3\().4s, #14
        rshrn2          \out2\().8h, \tmp4\().4s, #14
.endm
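
// In-place 4-point inverse Walsh-Hadamard transform (used for lossless
// blocks), on 4 .4h registers; v16-v17 are used as scratch.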
.macro iwht4 c0, c1, c2, c3
        add             \c0\().4h, \c0\().4h, \c1\().4h
        sub             v17.4h, \c2\().4h, \c3\().4h
        sub             v16.4h, \c0\().4h, v17.4h
        sshr            v16.4h, v16.4h, #1
        sub             \c2\().4h, v16.4h, \c1\().4h
        sub             \c1\().4h, v16.4h, \c3\().4h
        add             \c3\().4h, v17.4h, \c2\().4h
        sub             \c0\().4h, \c0\().4h, \c1\().4h
.endm
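
// In-place 4-point IDCT on 4 .4h registers, using v16-v22 as scratch.
// As a scalar model: t0/t1 = ((c0 +/- c2) * 11585 + 8192) >> 14,
// t2 = (c1 * 6270 - c3 * 15137 + 8192) >> 14,
// t3 = (c1 * 15137 + c3 * 6270 + 8192) >> 14,
// out = { t0 + t3, t1 + t2, t1 - t2, t0 - t3 }.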
.macro idct4 c0, c1, c2, c3
        smull           v22.4s, \c1\().4h, v0.h[3]
        smull           v20.4s, \c1\().4h, v0.h[2]
        add             v16.4h, \c0\().4h, \c2\().4h
        sub             v17.4h, \c0\().4h, \c2\().4h
        smlal           v22.4s, \c3\().4h, v0.h[2]
        smull           v18.4s, v16.4h, v0.h[0]
        smull           v19.4s, v17.4h, v0.h[0]
        smlsl           v20.4s, \c3\().4h, v0.h[3]
        rshrn           v22.4h, v22.4s, #14
        rshrn           v18.4h, v18.4s, #14
        rshrn           v19.4h, v19.4s, #14
        rshrn           v20.4h, v20.4s, #14
        add             \c0\().4h, v18.4h, v22.4h
        sub             \c3\().4h, v18.4h, v22.4h
        add             \c1\().4h, v19.4h, v20.4h
        sub             \c2\().4h, v19.4h, v20.4h
.endm
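
// In-place 4-point ADST on 4 .4h registers, using the sinpi coefficients
// in v0.h[4]-v0.h[7]; v16-v21 are used as scratch.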
.macro iadst4 c0, c1, c2, c3
        smull           v16.4s, \c0\().4h, v0.h[4]
        smlal           v16.4s, \c2\().4h, v0.h[5]
        smlal           v16.4s, \c3\().4h, v0.h[6]
        smull           v17.4s, \c0\().4h, v0.h[6]
        smlsl           v17.4s, \c2\().4h, v0.h[4]
        sub             \c0\().4h, \c0\().4h, \c2\().4h
        smlsl           v17.4s, \c3\().4h, v0.h[5]
        add             \c0\().4h, \c0\().4h, \c3\().4h
        smull           v19.4s, \c1\().4h, v0.h[7]
        smull           v18.4s, \c0\().4h, v0.h[7]
        add             v20.4s, v16.4s, v19.4s
        add             v21.4s, v17.4s, v19.4s
        rshrn           \c0\().4h, v20.4s, #14
        add             v16.4s, v16.4s, v17.4s
        rshrn           \c1\().4h, v21.4s, #14
        sub             v16.4s, v16.4s, v19.4s
        rshrn           \c2\().4h, v18.4s, #14
        rshrn           \c3\().4h, v16.4s, #14
.endm

// The public functions in this file have the following signature:
// void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
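// dst and stride describe the destination block; block holds the
// dequantized coefficients (the transform clears it as it reads it),
// and eob is the end-of-block count of nonzero coefficients, which is
// used to pick cheaper partial code paths below.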

.macro itxfm_func4x4 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
.ifc \txfm1,\txfm2
.ifc \txfm1,idct
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.4h}, [x4]
.endif
.ifc \txfm1,iadst
        movrel          x4,  iadst4_coeffs
        ld1             {v0.d}[1], [x4]
.endif
.else
        movrel          x4,  itxfm4_coeffs
        ld1             {v0.8h}, [x4]
.endif

        movi            v31.8h, #0
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.ne            1f
        // DC-only for idct/idct
        ld1r            {v2.4h}, [x2]
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
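        // (For a DC-only block, the two multiplications by v0.h[0] above
        // correspond to the row pass and the column pass of the 2D
        // transform applied to the single nonzero coefficient.)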
        st1             {v31.h}[0], [x2]
        dup             v4.4h,  v2.h[0]
        mov             v5.16b, v4.16b
        mov             v6.16b, v4.16b
        mov             v7.16b, v4.16b
        b               2f
.endif
1:
        ld1             {v4.4h,v5.4h,v6.4h,v7.4h}, [x2]
        st1             {v31.8h}, [x2], #16

.ifc \txfm1,iwht
        sshr            v4.4h,  v4.4h,  #2
        sshr            v5.4h,  v5.4h,  #2
        sshr            v6.4h,  v6.4h,  #2
        sshr            v7.4h,  v7.4h,  #2
.endif

        \txfm1\()4      v4,  v5,  v6,  v7

        st1             {v31.8h}, [x2], #16
        // Transpose 4x4 with 16 bit elements
        transpose_4x4H  v4,  v5,  v6,  v7,  v16, v17, v18, v19

        \txfm2\()4      v4,  v5,  v6,  v7
2:
        ld1             {v0.s}[0],  [x0], x1
        ld1             {v1.s}[0],  [x0], x1

        srshr           v4.4h,  v4.4h,  #4
        srshr           v5.4h,  v5.4h,  #4
        srshr           v6.4h,  v6.4h,  #4
        srshr           v7.4h,  v7.4h,  #4

        uaddw           v4.8h,  v4.8h,  v0.8b
        uaddw           v5.8h,  v5.8h,  v1.8b
        ld1             {v2.s}[0],  [x0], x1
        ld1             {v3.s}[0],  [x0], x1
        sqxtun          v0.8b,  v4.8h
        sqxtun          v1.8b,  v5.8h

        sub             x0,  x0,  x1, lsl #2

        uaddw           v6.8h,  v6.8h,  v2.8b
        uaddw           v7.8h,  v7.8h,  v3.8b
        st1             {v0.s}[0],  [x0], x1
        sqxtun          v2.8b,  v6.8h
        sqxtun          v3.8b,  v7.8h

        st1             {v1.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1

        ret
endfunc
.endm

itxfm_func4x4 idct,  idct
itxfm_func4x4 iadst, idct
itxfm_func4x4 idct,  iadst
itxfm_func4x4 iadst, iadst
itxfm_func4x4 iwht,  iwht
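
// 8-point IDCT, operating in-place on v16-v23 (with v2-v7 as scratch),
// using the idct coefficients in v0.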
.macro idct8
        dmbutterfly0    v16, v20, v16, v20, v2, v3, v4, v5, v6, v7 // v16 = t0a, v20 = t1a
        dmbutterfly     v18, v22, v0.h[2], v0.h[3], v2, v3, v4, v5 // v18 = t2a, v22 = t3a
        dmbutterfly     v17, v23, v0.h[4], v0.h[5], v2, v3, v4, v5 // v17 = t4a, v23 = t7a
        dmbutterfly     v21, v19, v0.h[6], v0.h[7], v2, v3, v4, v5 // v21 = t5a, v19 = t6a

        butterfly_8h    v24, v25, v16, v22 // v24 = t0, v25 = t3
        butterfly_8h    v28, v29, v17, v21 // v28 = t4, v29 = t5a
        butterfly_8h    v30, v31, v23, v19 // v30 = t7, v31 = t6a
        butterfly_8h    v26, v27, v20, v18 // v26 = t1, v27 = t2

        dmbutterfly0    v31, v29, v31, v29, v2, v3, v4, v5, v6, v7 // v31 = t6, v29 = t5

        butterfly_8h    v16, v23, v24, v30 // v16 = out[0], v23 = out[7]
        butterfly_8h    v17, v22, v26, v31 // v17 = out[1], v22 = out[6]
        butterfly_8h    v18, v21, v27, v29 // v18 = out[2], v21 = out[5]
        butterfly_8h    v19, v20, v25, v28 // v19 = out[3], v20 = out[4]
.endm

.macro iadst8
        dmbutterfly_l   v24, v25, v26, v27, v23, v16, v1.h[1], v1.h[0] // v24,v25 = t1a, v26,v27 = t0a
        dmbutterfly_l   v28, v29, v30, v31, v21, v18, v1.h[3], v1.h[2] // v28,v29 = t3a, v30,v31 = t2a
        dmbutterfly_l   v2,  v3,  v4,  v5,  v19, v20, v1.h[5], v1.h[4] // v2,v3 = t5a, v4,v5 = t4a
        dmbutterfly_l   v16, v18, v21, v23, v17, v22, v1.h[7], v1.h[6] // v16,v18 = t7a, v21,v23 = t6a

        dbutterfly_n    v4,  v5,  v26, v27, v4,  v5,  v6,  v7,  v26, v27 // v4  = t0, v5  = t4
        dbutterfly_n    v2,  v3,  v24, v25, v2,  v3,  v6,  v7,  v26, v27 // v2  = t1, v3  = t5
        dbutterfly_n    v24, v25, v30, v31, v21, v23, v6,  v7,  v26, v27 // v24 = t2, v25 = t6
        dbutterfly_n    v30, v31, v28, v29, v16, v18, v6,  v7,  v26, v27 // v30 = t3, v31 = t7

        butterfly_8h    v16, v6,  v4,  v24 // v16 = out[0],  v6 = t2
        butterfly_8h    v23, v7,  v2,  v30 // v23 = -out[7], v7 = t3
        neg             v23.8h, v23.8h     // v23 = out[7]

        dmbutterfly0    v19, v20, v6, v7, v24, v26, v27, v28, v29, v30 // v19 = -out[3], v20 = out[4]
        neg             v19.8h, v19.8h     // v19 = out[3]

        dmbutterfly_l   v26, v27, v28, v29, v5,  v3,  v0.h[2], v0.h[3] // v26,v27 = t5a, v28,v29 = t4a
        dmbutterfly_l   v2,  v3,  v4,  v5,  v31, v25, v0.h[3], v0.h[2] // v2,v3 = t6a, v4,v5 = t7a

        dbutterfly_n    v17, v30, v28, v29, v2, v3, v6, v7, v24, v25 // v17 = -out[1], v30 = t6
        dbutterfly_n    v22, v31, v26, v27, v4, v5, v6, v7, v24, v25 // v22 = out[6],  v31 = t7
        neg             v17.8h, v17.8h     // v17 = out[1]

        dmbutterfly0    v18, v21, v30, v31, v2, v3, v4, v5, v6, v7 // v18 = out[2], v21 = -out[5]
        neg             v21.8h, v21.8h     // v21 = out[5]
.endm

.macro itxfm_func8x8 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
        // The iadst also uses a few coefficients from
        // idct, so those always need to be loaded.
.ifc \txfm1\()_\txfm2,idct_idct
        movrel          x4,  idct_coeffs
.else
        movrel          x4,  iadst8_coeffs
        ld1             {v1.8h}, [x4], #16
.endif
        ld1             {v0.8h}, [x4]

        movi            v2.8h,  #0
        movi            v3.8h,  #0
        movi            v4.8h,  #0
        movi            v5.8h,  #0

.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.ne            1f
        // DC-only for idct/idct
        ld1r            {v2.4h}, [x2]
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        st1             {v3.h}[0], [x2]
        dup             v16.8h,  v2.h[0]
        mov             v17.16b, v16.16b
        mov             v18.16b, v16.16b
        mov             v19.16b, v16.16b
        mov             v20.16b, v16.16b
        mov             v21.16b, v16.16b
        mov             v22.16b, v16.16b
        mov             v23.16b, v16.16b
        b               2f
.endif
1:
        ld1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x2], #64
        ld1             {v20.8h,v21.8h,v22.8h,v23.8h}, [x2], #64
        sub             x2,  x2,  #128
        st1             {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64
        st1             {v2.8h,v3.8h,v4.8h,v5.8h}, [x2], #64

        \txfm1\()8

        // Transpose 8x8 with 16 bit elements
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25

        \txfm2\()8
2:
        mov             x3,  x0
        // Add into the destination
        ld1             {v0.8b},  [x0], x1
        srshr           v16.8h, v16.8h, #5
        ld1             {v1.8b},  [x0], x1
        srshr           v17.8h, v17.8h, #5
        ld1             {v2.8b},  [x0], x1
        srshr           v18.8h, v18.8h, #5
        uaddw           v16.8h, v16.8h, v0.8b
        ld1             {v3.8b},  [x0], x1
        srshr           v19.8h, v19.8h, #5
        uaddw           v17.8h, v17.8h, v1.8b
        ld1             {v4.8b},  [x0], x1
        srshr           v20.8h, v20.8h, #5
        uaddw           v18.8h, v18.8h, v2.8b
        sqxtun          v0.8b,  v16.8h
        ld1             {v5.8b},  [x0], x1
        srshr           v21.8h, v21.8h, #5
        uaddw           v19.8h, v19.8h, v3.8b
        sqxtun          v1.8b,  v17.8h
        ld1             {v6.8b},  [x0], x1
        srshr           v22.8h, v22.8h, #5
        uaddw           v20.8h, v20.8h, v4.8b
        sqxtun          v2.8b,  v18.8h
        ld1             {v7.8b},  [x0], x1
        srshr           v23.8h, v23.8h, #5
        uaddw           v21.8h, v21.8h, v5.8b
        sqxtun          v3.8b,  v19.8h

        st1             {v0.8b},  [x3], x1
        uaddw           v22.8h, v22.8h, v6.8b
        st1             {v1.8b},  [x3], x1
        sqxtun          v4.8b,  v20.8h
        st1             {v2.8b},  [x3], x1
        uaddw           v23.8h, v23.8h, v7.8b
        st1             {v3.8b},  [x3], x1
        sqxtun          v5.8b,  v21.8h
        st1             {v4.8b},  [x3], x1
        sqxtun          v6.8b,  v22.8h
        st1             {v5.8b},  [x3], x1
        sqxtun          v7.8b,  v23.8h

        st1             {v6.8b},  [x3], x1
        st1             {v7.8b},  [x3], x1

        ret
endfunc
.endm

itxfm_func8x8 idct,  idct
itxfm_func8x8 iadst, idct
itxfm_func8x8 idct,  iadst
itxfm_func8x8 iadst, iadst

function idct16x16_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h,  #0

        ld1r            {v2.4h}, [x2]
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        dup             v2.8h,  v2.h[0]
        st1             {v1.h}[0], [x2]

        srshr           v2.8h,  v2.8h, #6

        mov             x3,  x0
        mov             x4,  #16
1:
        // Loop to add the constant from v2 into all 16x16 outputs
        ld1             {v3.16b},  [x0], x1
        ld1             {v4.16b},  [x0], x1
        uaddw           v16.8h, v2.8h, v3.8b
        uaddw2          v17.8h, v2.8h, v3.16b
        uaddw           v18.8h, v2.8h, v4.8b
        uaddw2          v19.8h, v2.8h, v4.16b
        sqxtun          v3.8b,  v16.8h
        sqxtun2         v3.16b, v17.8h
        sqxtun          v4.8b,  v18.8h
        sqxtun2         v4.16b, v19.8h
        st1             {v3.16b},  [x3], x1
        st1             {v4.16b},  [x3], x1
        subs            x4,  x4,  #2
        b.ne            1b

        ret
endfunc

.macro idct16_end
        butterfly_8h    v18, v7,  v4,  v7  // v18 = t0a,  v7  = t7a
        butterfly_8h    v19, v22, v5,  v22 // v19 = t1a,  v22 = t6
        butterfly_8h    v4,  v26, v20, v26 // v4  = t2a,  v26 = t5
        butterfly_8h    v5,  v6,  v28, v6  // v5  = t3a,  v6  = t4
        butterfly_8h    v20, v28, v16, v24 // v20 = t8a,  v28 = t11a
        butterfly_8h    v24, v21, v23, v21 // v24 = t9,   v21 = t10
        butterfly_8h    v23, v27, v25, v27 // v23 = t14,  v27 = t13
        butterfly_8h    v25, v29, v29, v17 // v25 = t15a, v29 = t12a

        dmbutterfly0    v2,  v3,  v27, v21, v2,  v3,  v16, v17, v30, v31 // v2  = t13a, v3  = t10a
        dmbutterfly0    v28, v27, v29, v28, v21, v29, v16, v17, v30, v31 // v28 = t12,  v27 = t11

        butterfly_8h    v16, v31, v18, v25 // v16 = out[0], v31 = out[15]
        butterfly_8h    v17, v30, v19, v23 // v17 = out[1], v30 = out[14]
        butterfly_8h_r  v25, v22, v22, v24 // v25 = out[9], v22 = out[6]
        butterfly_8h    v23, v24, v7,  v20 // v23 = out[7], v24 = out[8]
        butterfly_8h    v18, v29, v4,  v2  // v18 = out[2], v29 = out[13]
        butterfly_8h    v19, v28, v5,  v28 // v19 = out[3], v28 = out[12]
        butterfly_8h    v20, v27, v6,  v27 // v20 = out[4], v27 = out[11]
        butterfly_8h    v21, v26, v26, v3  // v21 = out[5], v26 = out[10]
        ret
.endm

function idct16
        dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
        dmbutterfly     v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
        dmbutterfly     v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
        dmbutterfly     v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
        dmbutterfly     v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
        dmbutterfly     v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
        dmbutterfly     v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly     v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4,  v28, v16, v28 // v4  = t0,  v28 = t3
        butterfly_8h    v5,  v20, v24, v20 // v5  = t1,  v20 = t2
        butterfly_8h    v6,  v26, v18, v26 // v6  = t4,  v26 = t5
        butterfly_8h    v7,  v22, v30, v22 // v7  = t7,  v22 = t6
        butterfly_8h    v16, v25, v17, v25 // v16 = t8,  v25 = t9
        butterfly_8h    v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_8h    v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a

        idct16_end
endfunc

function idct16_half
        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
        dmbutterfly_h1  v20, v28, v0.h[2], v0.h[3], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
        dmbutterfly_h1  v18, v30, v0.h[4], v0.h[5], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
        dmbutterfly_h2  v26, v22, v0.h[6], v0.h[7], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
        dmbutterfly_h1  v17, v31, v1.h[0], v1.h[1], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
        dmbutterfly_h2  v25, v23, v1.h[2], v1.h[3], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
        dmbutterfly_h1  v21, v27, v1.h[4], v1.h[5], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
        dmbutterfly_h2  v29, v19, v1.h[6], v1.h[7], v2, v3, v4, v5 // v29 = t11a, v19 = t12a

        butterfly_8h    v4,  v28, v16, v28 // v4  = t0,  v28 = t3
        butterfly_8h    v5,  v20, v24, v20 // v5  = t1,  v20 = t2
        butterfly_8h    v6,  v26, v18, v26 // v6  = t4,  v26 = t5
        butterfly_8h    v7,  v22, v30, v22 // v7  = t7,  v22 = t6
        butterfly_8h    v16, v25, v17, v25 // v16 = t8,  v25 = t9
        butterfly_8h    v24, v21, v29, v21 // v24 = t11, v21 = t10
        butterfly_8h    v17, v27, v19, v27 // v17 = t12, v27 = t13
        butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14

        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
        dmbutterfly     v23, v25, v0.h[2], v0.h[3], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
        dmbutterfly     v27, v21, v0.h[2], v0.h[3], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a

        idct16_end
endfunc

function idct16_quarter
        dsmull_h        v24, v25, v19, v1.h[7]
        dsmull_h        v4,  v5,  v17, v1.h[0]
        dsmull_h        v7,  v6,  v18, v0.h[5]
        dsmull_h        v30, v31, v18, v0.h[4]

        dsmull_h        v29, v28, v17, v1.h[1]
        dsmull_h        v26, v27, v19, v1.h[6]
        dsmull_h        v22, v23, v16, v0.h[0]
        drshrn_h        v24, v24, v25, #14
        drshrn_h        v16, v4,  v5,  #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v6,  v30, v31, #14
        drshrn_h        v29, v29, v28, #14
        drshrn_h        v17, v26, v27, #14
        drshrn_h        v28, v22, v23, #14

        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[2], v0.h[3]
        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[2], v0.h[3]
        drshrn_h        v27, v20, v21, #14
        drshrn_h        v21, v22, v23, #14
        drshrn_h        v23, v18, v19, #14
        drshrn_h        v25, v30, v31, #14

        dmbutterfly0    v22, v26, v7, v6, v18, v19, v30, v31

        idct16_end
endfunc

function iadst16
        ld1             {v0.8h,v1.8h}, [x11]

        dmbutterfly_l   v6,  v7,  v4,  v5,  v31, v16, v0.h[1], v0.h[0] // v6,v7   = t1,  v4,v5   = t0
        dmbutterfly_l   v10, v11, v8,  v9,  v23, v24, v0.h[5], v0.h[4] // v10,v11 = t9,  v8,v9   = t8
        dbutterfly_n    v31, v24, v6,  v7,  v10, v11, v12, v13, v10, v11 // v31 = t1a, v24 = t9a
        dmbutterfly_l   v14, v15, v12, v13, v29, v18, v0.h[3], v0.h[2] // v14,v15 = t3,  v12,v13 = t2
        dbutterfly_n    v16, v23, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9 // v16 = t0a, v23 = t8a

        dmbutterfly_l   v6,  v7,  v4,  v5,  v21, v26, v0.h[7], v0.h[6] // v6,v7   = t11, v4,v5   = t10
        dbutterfly_n    v29, v26, v14, v15, v6,  v7,  v8,  v9,  v6,  v7 // v29 = t3a, v26 = t11a
        dmbutterfly_l   v10, v11, v8,  v9,  v27, v20, v1.h[1], v1.h[0] // v10,v11 = t5,  v8,v9   = t4
        dbutterfly_n    v18, v21, v12, v13, v4,  v5,  v6,  v7,  v4,  v5 // v18 = t2a, v21 = t10a

        dmbutterfly_l   v14, v15, v12, v13, v19, v28, v1.h[5], v1.h[4] // v14,v15 = t13, v12,v13 = t12
        dbutterfly_n    v20, v28, v10, v11, v14, v15, v4,  v5,  v14, v15 // v20 = t5a, v28 = t13a
        dmbutterfly_l   v6,  v7,  v4,  v5,  v25, v22, v1.h[3], v1.h[2] // v6,v7   = t7,  v4,v5   = t6
        dbutterfly_n    v27, v19, v8,  v9,  v12, v13, v10, v11, v12, v13 // v27 = t4a, v19 = t12a

        dmbutterfly_l   v10, v11, v8,  v9,  v17, v30, v1.h[7], v1.h[6] // v10,v11 = t15, v8,v9   = t14

        dbutterfly_n    v22, v30, v6,  v7,  v10, v11, v12, v13, v10, v11 // v22 = t7a, v30 = t15a
        dmbutterfly_l   v14, v15, v12, v13, v23, v24, v0.h[4], v0.h[5] // v14,v15 = t9,  v12,v13 = t8
        dbutterfly_n    v25, v17, v4,  v5,  v8,  v9,  v6,  v7,  v8,  v9 // v25 = t6a, v17 = t14a

        dmbutterfly_l   v4,  v5,  v6,  v7,  v28, v19, v0.h[5], v0.h[4] // v4,v5   = t12, v6,v7   = t13
        dbutterfly_n    v23, v19, v12, v13, v4,  v5,  v8,  v9,  v4,  v5 // v23 = t8a, v19 = t12a
        dmbutterfly_l   v10, v11, v8,  v9,  v21, v26, v0.h[6], v0.h[7] // v10,v11 = t11, v8,v9   = t10
        butterfly_8h_r  v4,  v27, v16, v27 // v4 = t4, v27 = t0
        dbutterfly_n    v24, v28, v14, v15, v6,  v7,  v12, v13, v6,  v7 // v24 = t9a, v28 = t13a

        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[7], v0.h[6] // v12,v13 = t14, v14,v15 = t15
        butterfly_8h_r  v5,  v20, v31, v20 // v5 = t5, v20 = t1
        dbutterfly_n    v21, v17, v8,  v9,  v12, v13, v6,  v7,  v12, v13 // v21 = t10a, v17 = t14a
        dbutterfly_n    v26, v30, v10, v11, v14, v15, v8,  v9,  v14, v15 // v26 = t11a, v30 = t15a

        butterfly_8h_r  v6,  v25, v18, v25 // v6 = t6, v25 = t2
        butterfly_8h_r  v7,  v22, v29, v22 // v7 = t7, v22 = t3

        dmbutterfly_l   v10, v11, v8,  v9,  v19, v28, v0.h[2], v0.h[3] // v10,v11 = t13, v8,v9   = t12
        dmbutterfly_l   v12, v13, v14, v15, v30, v17, v0.h[3], v0.h[2] // v12,v13 = t14, v14,v15 = t15

        dbutterfly_n    v18, v30, v8,  v9,  v12, v13, v16, v17, v12, v13 // v18 = out[2],   v30 = t14a
        dbutterfly_n    v29, v17, v10, v11, v14, v15, v12, v13, v14, v15 // v29 = -out[13], v17 = t15a
        neg             v29.8h, v29.8h // v29 = out[13]

        dmbutterfly_l   v10, v11, v8,  v9,  v4,  v5,  v0.h[2], v0.h[3] // v10,v11 = t5a, v8,v9   = t4a
        dmbutterfly_l   v12, v13, v14, v15, v7,  v6,  v0.h[3], v0.h[2] // v12,v13 = t6a, v14,v15 = t7a

        butterfly_8h    v2,  v6,  v27, v25 // v2 = out[0],  v6 = t2a
        butterfly_8h    v3,  v7,  v23, v21 // v3 = -out[1], v7 = t10

        dbutterfly_n    v19, v31, v8,  v9,  v12, v13, v4,  v5,  v8,  v9 // v19 = -out[3], v31 = t6
        neg             v19.8h, v19.8h // v19 = out[3]
        dbutterfly_n    v28, v16, v10, v11, v14, v15, v4,  v5,  v10, v11 // v28 = out[12], v16 = t7

        butterfly_8h    v5,  v8,  v20, v22 // v5 = -out[15], v8 = t3a
        butterfly_8h    v4,  v9,  v24, v26 // v4 = out[14],  v9 = t11

        dmbutterfly0    v23, v24, v6,  v8,  v10, v11, v12, v13, v14, v15, 1 // v23 = out[7], v24 = out[8]
        dmbutterfly0    v21, v26, v30, v17, v10, v11, v12, v13, v14, v15, 1 // v21 = out[5], v26 = out[10]
        dmbutterfly0    v20, v27, v16, v31, v10, v11, v12, v13, v14, v15    // v20 = out[4], v27 = out[11]
        dmbutterfly0    v22, v25, v9,  v7,  v10, v11, v12, v13, v14, v15    // v22 = out[6], v25 = out[9]

        neg             v31.8h, v5.8h // v31 = out[15]
        neg             v17.8h, v3.8h // v17 = out[1]

        ret
endfunc

// Helper macros; we can't use these expressions directly within
// e.g. .irp due to the extra concatenation \(). Therefore wrap
// them in macros to allow using .irp below.
.macro load i, src, inc
        ld1             {v\i\().8h},  [\src], \inc
.endm
.macro store i, dst, inc
        st1             {v\i\().8h},  [\dst], \inc
.endm
.macro movi_v i, size, imm
        movi            v\i\()\size,  \imm
.endm
.macro load_clear i, src, inc
        ld1             {v\i\().8h},  [\src]
        st1             {v2.8h},  [\src], \inc
.endm

.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
        srshr           \coef0, \coef0, #6
        ld1             {v2.8b},  [x0], x1
        srshr           \coef1, \coef1, #6
        ld1             {v3.8b},  [x3], x1
        srshr           \coef2, \coef2, #6
        ld1             {v4.8b},  [x0], x1
        srshr           \coef3, \coef3, #6
        uaddw           \coef0, \coef0, v2.8b
        ld1             {v5.8b},  [x3], x1
        uaddw           \coef1, \coef1, v3.8b
        srshr           \coef4, \coef4, #6
        ld1             {v6.8b},  [x0], x1
        srshr           \coef5, \coef5, #6
        ld1             {v7.8b},  [x3], x1
        sqxtun          v2.8b,  \coef0
        srshr           \coef6, \coef6, #6
        sqxtun          v3.8b,  \coef1
        srshr           \coef7, \coef7, #6
        uaddw           \coef2, \coef2, v4.8b
        ld1             {\tmp1},  [x0], x1
        uaddw           \coef3, \coef3, v5.8b
        ld1             {\tmp2},  [x3], x1
        sqxtun          v4.8b,  \coef2
        sub             x0,  x0,  x1, lsl #2
        sub             x3,  x3,  x1, lsl #2
        sqxtun          v5.8b,  \coef3
        uaddw           \coef4, \coef4, v6.8b
        st1             {v2.8b},  [x0], x1
        uaddw           \coef5, \coef5, v7.8b
        st1             {v3.8b},  [x3], x1
        sqxtun          v6.8b,  \coef4
        st1             {v4.8b},  [x0], x1
        sqxtun          v7.8b,  \coef5
        st1             {v5.8b},  [x3], x1
        uaddw           \coef6, \coef6, \tmp1
        st1             {v6.8b},  [x0], x1
        uaddw           \coef7, \coef7, \tmp2
        st1             {v7.8b},  [x3], x1
        sqxtun          \tmp1,  \coef6
        sqxtun          \tmp2,  \coef7
        st1             {\tmp1},  [x0], x1
        st1             {\tmp2},  [x3], x1
.endm

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
// x1 = slice offset
// x2 = src
// x9 = input stride
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
        mov             x14, x30

        movi            v2.8h,  #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
.endr

        bl              \txfm\()16

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        cmp             x1,  #8
        b.eq            1f
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
        store           \i,  x0,  #16
.endr
        br              x14
1:
        // Special case: For the last input column (x1 == 8),
        // which would be stored as the last row in the temp buffer,
        // don't store the first 8x8 block, but keep it in registers
        // for the first slice of the second pass (where it is the
        // first 8x8 block).
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        add             x0,  x0,  #16
        store           \i,  x0,  #16
.endr
        mov             v24.16b, v16.16b
        mov             v25.16b, v17.16b
        mov             v26.16b, v18.16b
        mov             v27.16b, v19.16b
        mov             v28.16b, v20.16b
        mov             v29.16b, v21.16b
        mov             v30.16b, v22.16b
        mov             v31.16b, v23.16b
        br              x14
endfunc

// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// load the destination pixels (from a similar 8x16 slice), add and store back.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x3 = slice offset
// x9 = temp buffer stride
function \txfm\()16_1d_8x16_pass2_neon
        mov             x14, x30
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
        cbz             x3,  1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
1:

        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              \txfm\()16

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        br              x14
endfunc
.endm

itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst

.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #1
        b.eq            idct16x16_dc_add_neon
.endif

        // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
.ifnc \txfm1\()_\txfm2,idct_idct
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif

        mov             x15, x30
        mov             x4,  x0
        mov             x5,  x1
        sub             sp,  sp,  #512
        mov             x9,  #32

        movrel          x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
        movrel          x11, iadst16_coeffs
.endif

        ld1             {v0.8h,v1.8h}, [x10]

.ifc \txfm1\()_\txfm2,idct_idct
        cmp             w3,  #10
        b.le            idct16x16_quarter_add_neon
        cmp             w3,  #38
        b.le            idct16x16_half_add_neon
.endif

.irp i, 0, 8
        add             x0,  sp,  #(\i*32)
.ifc \txfm1\()_\txfm2,idct_idct
.if \i == 8
        cmp             w3,  #38
        b.le            1f
.endif
.endif
        mov             x1,  #\i
        add             x2,  x2,  #(\i*2)
        bl              \txfm1\()16_1d_8x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
        ld1             {v0.8h,v1.8h}, [x10]
.endif
.ifc \txfm1\()_\txfm2,idct_idct
        b               3f
1:
        // Set v24-v31 to zero, for the in-register passthrough of
        // coefficients to pass 2. Since we only do two slices, this can
        // only ever happen for the second slice. So we only need to store
        // zeros to the temp buffer for the second half of the buffer.
        // Move x0 to the second half, and use x9 == 32 as increment.
        add             x0,  sp,  #256
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        movi_v          \i,  .8h,  #0
        st1             {v24.8h},  [x0], x9
.endr
3:
.endif

.irp i, 0, 8
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        mov             x3,  #\i
        bl              \txfm2\()16_1d_8x16_pass2_neon
.endr

        add             sp,  sp,  #512
.ifnc \txfm1\()_\txfm2,idct_idct
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        br              x15
endfunc
.endm

itxfm_func16x16 idct,  idct
itxfm_func16x16 iadst, idct
itxfm_func16x16 idct,  iadst
itxfm_func16x16 iadst, iadst

function idct16_1d_8x16_pass1_quarter_neon
        mov             x14, x30

        movi            v2.8h,  #0
.irp i, 16, 17, 18, 19
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_quarter

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
        // Since only a 4x4 part of the input was nonzero, this means that
        // only 4 rows are nonzero after transposing, and the second pass
        // only reads the topmost 4 rows. Therefore only store the topmost
        // 4 rows.
.irp i, 24, 25, 26, 27
        add             x0,  x0,  #16
        store           \i,  x0,  #16
.endr
        br              x14
endfunc

function idct16_1d_8x16_pass2_quarter_neon
        mov             x14, x30
        cbz             x3,  1f
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
1:
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_quarter

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        br              x14
endfunc

function idct16_1d_8x16_pass1_half_neon
        mov             x14, x30

        movi            v2.8h,  #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr

        bl              idct16_half

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
        // transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the transposed 8x8 blocks horizontally.
        // The first 8x8 block is kept in registers for the second pass,
        // store the rest in the temp buffer.
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        add             x0,  x0,  #16
        store           \i,  x0,  #16
.endr
        br              x14
endfunc

function idct16_1d_8x16_pass2_half_neon
        mov             x14, x30
        cbz             x3,  1f
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
1:
        add             x3,  x0,  x1
        lsl             x1,  x1,  #1
        bl              idct16_half

        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b

        br              x14
endfunc

.macro idct16_partial size
function idct16x16_\size\()_add_neon
        add             x0,  sp,  #(0*32)
        mov             x1,  #0
        bl              idct16_1d_8x16_pass1_\size\()_neon
.irp i, 0, 8
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        mov             x3,  #\i
        bl              idct16_1d_8x16_pass2_\size\()_neon
.endr

        add             sp,  sp,  #512
        br              x15
endfunc
.endm

idct16_partial quarter
idct16_partial half

function idct32x32_dc_add_neon
        movrel          x4,  idct_coeffs
        ld1             {v0.4h}, [x4]

        movi            v1.4h,  #0

        ld1r            {v2.4h}, [x2]
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        smull           v2.4s,  v2.4h, v0.h[0]
        rshrn           v2.4h,  v2.4s, #14
        dup             v2.8h,  v2.h[0]
        st1             {v1.h}[0], [x2]

        srshr           v0.8h,  v2.8h, #6

        mov             x3,  x0
        mov             x4,  #32
1:
        // Loop to add the constant v0 into all 32x32 outputs
        ld1             {v1.16b,v2.16b},  [x0], x1
        uaddw           v16.8h, v0.8h, v1.8b
        uaddw2          v17.8h, v0.8h, v1.16b
        ld1             {v3.16b,v4.16b},  [x0], x1
        uaddw           v18.8h, v0.8h, v2.8b
        uaddw2          v19.8h, v0.8h, v2.16b
        uaddw           v20.8h, v0.8h, v3.8b
        uaddw2          v21.8h, v0.8h, v3.16b
        uaddw           v22.8h, v0.8h, v4.8b
        uaddw2          v23.8h, v0.8h, v4.16b
        sqxtun          v1.8b,  v16.8h
        sqxtun2         v1.16b, v17.8h
        sqxtun          v2.8b,  v18.8h
        sqxtun2         v2.16b, v19.8h
        sqxtun          v3.8b,  v20.8h
        sqxtun2         v3.16b, v21.8h
        st1             {v1.16b,v2.16b},  [x3], x1
        sqxtun          v4.8b,  v22.8h
        sqxtun2         v4.16b, v23.8h
        st1             {v3.16b,v4.16b},  [x3], x1
        subs            x4,  x4,  #2
        b.ne            1b

        ret
endfunc

.macro idct32_end
        butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
        butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
        butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
        butterfly_8h    v19, v21, v22, v21 // v19 = t22,  v21 = t21
        butterfly_8h    v4,  v28, v28, v30 // v4  = t24a, v28 = t27a
        butterfly_8h    v23, v26, v25, v26 // v23 = t25,  v26 = t26
        butterfly_8h    v7,  v3,  v29, v31 // v7  = t31a, v3  = t28a
        butterfly_8h    v22, v27, v24, v27 // v22 = t30,  v27 = t29

        dmbutterfly     v27, v20, v0.h[2], v0.h[3], v24, v25, v30, v31        // v27 = t18a, v20 = t29a
        dmbutterfly     v3,  v5,  v0.h[2], v0.h[3], v24, v25, v30, v31        // v3  = t19,  v5  = t28
        dmbutterfly     v28, v6,  v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v28 = t27,  v6  = t20
        dmbutterfly     v26, v21, v0.h[2], v0.h[3], v24, v25, v30, v31, neg=1 // v26 = t26a, v21 = t21a

        butterfly_8h    v31, v24, v7,  v4  // v31 = t31,  v24 = t24
        butterfly_8h    v30, v25, v22, v23 // v30 = t30a, v25 = t25a
        butterfly_8h_r  v23, v16, v16, v18 // v23 = t23,  v16 = t16
        butterfly_8h_r  v22, v17, v17, v19 // v22 = t22a, v17 = t17a
        butterfly_8h    v18, v21, v27, v21 // v18 = t18,  v21 = t21
        butterfly_8h_r  v27, v28, v5,  v28 // v27 = t27a, v28 = t28a
        butterfly_8h    v29, v26, v20, v26 // v29 = t29,  v26 = t26
        butterfly_8h    v19, v20, v3,  v6  // v19 = t19a, v20 = t20

        dmbutterfly0    v27, v20, v27, v20, v2, v3, v4, v5, v6, v7 // v27 = t27,  v20 = t20
        dmbutterfly0    v26, v21, v26, v21, v2, v3, v4, v5, v6, v7 // v26 = t26a, v21 = t21a
        dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
        dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
        ret
.endm

function idct32_odd
        dmbutterfly     v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly     v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly     v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly     v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly     v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly     v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly     v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly     v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a

        idct32_end
endfunc

function idct32_odd_half
        dmbutterfly_h1  v16, v31, v8.h[0], v8.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
        dmbutterfly_h2  v24, v23, v8.h[2], v8.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
        dmbutterfly_h1  v20, v27, v8.h[4], v8.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
        dmbutterfly_h2  v28, v19, v8.h[6], v8.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
        dmbutterfly_h1  v18, v29, v9.h[0], v9.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
        dmbutterfly_h2  v26, v21, v9.h[2], v9.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
        dmbutterfly_h1  v22, v25, v9.h[4], v9.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
        dmbutterfly_h2  v30, v17, v9.h[6], v9.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a

        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29

        dmbutterfly     v23, v24, v0.h[4], v0.h[5], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
        dmbutterfly     v27, v20, v0.h[4], v0.h[5], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
        dmbutterfly     v21, v26, v0.h[6], v0.h[7], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
        dmbutterfly     v25, v22, v0.h[6], v0.h[7], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a

        idct32_end
endfunc

function idct32_odd_quarter
        dsmull_h        v4,  v5,  v16, v8.h[0]
        dsmull_h        v28, v29, v19, v8.h[7]
        dsmull_h        v30, v31, v16, v8.h[1]
        dsmull_h        v22, v23, v17, v9.h[6]
        dsmull_h        v7,  v6,  v17, v9.h[7]
        dsmull_h        v26, v27, v19, v8.h[6]
        dsmull_h        v20, v21, v18, v9.h[0]
        dsmull_h        v24, v25, v18, v9.h[1]

        drshrn_h        v4,  v4,  v5,  #14
        drshrn_h        v5,  v28, v29, #14
        drshrn_h        v29, v30, v31, #14
        drshrn_h        v28, v22, v23, #14
        drshrn_h        v7,  v7,  v6,  #14
        drshrn_h        v31, v26, v27, #14
        drshrn_h        v6,  v20, v21, #14
        drshrn_h        v30, v24, v25, #14

        dmbutterfly_l   v16, v17, v18, v19, v29, v4, v0.h[4], v0.h[5]
        dmbutterfly_l   v27, v26, v20, v21, v31, v5, v0.h[4], v0.h[5]
        drshrn_h        v23, v16, v17, #14
        drshrn_h        v24, v18, v19, #14
        drshrn_h        v27, v27, v26, #14
        drshrn_h        v20, v20, v21, #14
        dmbutterfly_l   v16, v17, v18, v19, v30, v6, v0.h[6], v0.h[7]
        drshrn_h        v21, v16, v17, #14
        drshrn_h        v26, v18, v19, #14
        dmbutterfly_l   v16, v17, v18, v19, v28, v7, v0.h[6], v0.h[7]
        drshrn_h        v25, v16, v17, #14
        drshrn_h        v22, v18, v19, #14

        idct32_end
endfunc

.macro idct32_funcs suffix
// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
// x0 = dst (temp buffer)
// x1 = unused
// x2 = src
// x9 = double input stride
function idct32_1d_8x32_pass1\suffix\()_neon
        mov             x14, x30
        movi            v2.8h,  #0

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i,  x2,  x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr
.endif

        bl              idct16\suffix

        // Do two 8x8 transposes. Originally, v16-v31 contain the
        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
        // two transposed 8x8 blocks.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3

        // Store the registers a, b horizontally, followed by the
        // same registers b, a mirrored.
.macro store_rev a, b
        // There's no rev128 instruction, but we reverse each 64 bit
        // half, and then flip them using an ext with 8 bytes offset.
        rev64           v3.8h, \b
        st1             {\a},  [x0], #16
        rev64           v2.8h, \a
        ext             v3.16b, v3.16b, v3.16b, #8
        st1             {\b},  [x0], #16
        ext             v2.16b, v2.16b, v2.16b, #8
        st1             {v3.8h},  [x0], #16
        st1             {v2.8h},  [x0], #16
.endm
        store_rev       v16.8h, v24.8h
        store_rev       v17.8h, v25.8h
        store_rev       v18.8h, v26.8h
        store_rev       v19.8h, v27.8h
        store_rev       v20.8h, v28.8h
        store_rev       v21.8h, v29.8h
        store_rev       v22.8h, v30.8h
        store_rev       v23.8h, v31.8h
.purgem store_rev

        // Move x2 back to the start of the input, and move
        // to the first odd row
.ifb \suffix
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
        sub             x2,  x2,  x9, lsl #3
.endif
        add             x2,  x2,  #64

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load_clear      \i,  x2,  x9
.endr
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load_clear      \i,  x2,  x9
.endr
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load_clear      \i,  x2,  x9
.endr
.endif

        bl              idct32_odd\suffix

        transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
        transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3

        // Store the registers a, b horizontally, adding into the output
        // first, and then the mirrored registers b, a, subtracted from
        // the output.
.macro store_rev a, b
        ld1             {v4.8h},  [x0]
        rev64           v3.8h, \b
        add             v4.8h, v4.8h, \a
        rev64           v2.8h, \a
        st1             {v4.8h},  [x0], #16
        ext             v3.16b, v3.16b, v3.16b, #8
        ld1             {v5.8h},  [x0]
        ext             v2.16b, v2.16b, v2.16b, #8
        add             v5.8h, v5.8h, \b
        st1             {v5.8h},  [x0], #16
        ld1             {v6.8h},  [x0]
        sub             v6.8h, v6.8h, v3.8h
        st1             {v6.8h},  [x0], #16
        ld1             {v7.8h},  [x0]
        sub             v7.8h, v7.8h, v2.8h
        st1             {v7.8h},  [x0], #16
.endm

        store_rev       v31.8h, v23.8h
        store_rev       v30.8h, v22.8h
        store_rev       v29.8h, v21.8h
        store_rev       v28.8h, v20.8h
        store_rev       v27.8h, v19.8h
        store_rev       v26.8h, v18.8h
        store_rev       v25.8h, v17.8h
        store_rev       v24.8h, v16.8h
.purgem store_rev

        br              x14
endfunc

// This is mostly the same as 8x32_pass1, but without the transpose;
// it uses the source as a temp buffer between the two idct passes,
// and adds into the destination.
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
// x7 = negative double temp buffer stride
// x9 = double temp buffer stride
function idct32_1d_8x32_pass2\suffix\()_neon
        mov             x14, x30

        // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #3
.endif

        bl              idct16\suffix

.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        store           \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #4
        add             x2,  x2,  #64

        // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
        load            \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #4
.endif
.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
        load            \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #2
.endif
.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
        load            \i,  x2,  x9
.endr
        sub             x2,  x2,  x9, lsl #3
.endif
        sub             x2,  x2,  #64

        bl              idct32_odd\suffix

.macro load_acc_store a, b, c, d, neg=0
.if \neg == 0
        ld1             {v4.8h},  [x2], x9
        ld1             {v5.8h},  [x2], x9
        add             v4.8h, v4.8h, \a
        ld1             {v6.8h},  [x2], x9
        add             v5.8h, v5.8h, \b
        ld1             {v7.8h},  [x2], x9
        add             v6.8h, v6.8h, \c
        add             v7.8h, v7.8h, \d
.else
        ld1             {v4.8h},  [x2], x7
        ld1             {v5.8h},  [x2], x7
        sub             v4.8h, v4.8h, \a
        ld1             {v6.8h},  [x2], x7
        sub             v5.8h, v5.8h, \b
        ld1             {v7.8h},  [x2], x7
        sub             v6.8h, v6.8h, \c
        sub             v7.8h, v7.8h, \d
.endif
        ld1             {v10.8b},  [x0], x1
        ld1             {v11.8b},  [x0], x1
        srshr           v4.8h, v4.8h, #6
        ld1             {v2.8b},  [x0], x1
        srshr           v5.8h, v5.8h, #6
        uaddw           v4.8h, v4.8h, v10.8b
        ld1             {v3.8b},  [x0], x1
        srshr           v6.8h, v6.8h, #6
        uaddw           v5.8h, v5.8h, v11.8b
        srshr           v7.8h, v7.8h, #6
        sub             x0,  x0,  x1, lsl #2
        uaddw           v6.8h, v6.8h, v2.8b
        sqxtun          v4.8b, v4.8h
        uaddw           v7.8h, v7.8h, v3.8b
        sqxtun          v5.8b, v5.8h
        st1             {v4.8b},  [x0], x1
        sqxtun          v6.8b, v6.8h
        st1             {v5.8b},  [x0], x1
        sqxtun          v7.8b, v7.8h
        st1             {v6.8b},  [x0], x1
        st1             {v7.8b},  [x0], x1
.endm
        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h

        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
.purgem load_acc_store

        br              x14
endfunc
.endm

idct32_funcs
idct32_funcs _quarter
idct32_funcs _half

const min_eob_idct_idct_32, align=4
        .short  0, 34, 135, 336
endconst
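
// min_eob_idct_idct_32[i] is the smallest eob value for which the i'th
// group of 8 columns can contain any nonzero coefficients; it is used
// below to skip pass 1 slices that are known to be all-zero.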

function ff_vp9_idct_idct_32x32_add_neon, export=1
        cmp             w3,  #1
        b.eq            idct32x32_dc_add_neon

        movrel          x10, idct_coeffs

        mov             x15, x30

        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!

        mov             x4,  x0
        mov             x5,  x1
        mov             x6,  x2

        sub             sp,  sp,  #2048

        // Double stride of the input, since we only read every other line
        mov             x9,  #128
        mov             x7,  #-128

        ld1             {v0.8h,v1.8h}, [x10], #32
        ld1             {v8.8h,v9.8h}, [x10]

        cmp             w3,  #34
        b.le            idct32x32_quarter_add_neon
        cmp             w3,  #135
        b.le            idct32x32_half_add_neon

        movrel          x12, min_eob_idct_idct_32, 2

.irp i, 0, 8, 16, 24
        add             x0,  sp,  #(\i*64)
.if \i > 0
        ldrh            w1,  [x12], #2
        cmp             w3,  w1
        mov             x1,  #(32 - \i)/4
        b.le            1f
.endif
        add             x2,  x6,  #(\i*2)
        bl              idct32_1d_8x32_pass1_neon
.endr
        b               3f

1:
        // Write zeros to the temp buffer for pass 2
        movi            v16.8h,  #0
        movi            v17.8h,  #0
        movi            v18.8h,  #0
        movi            v19.8h,  #0
2:
        subs            x1,  x1,  #1
.rept 4
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x0], #64
.endr
        b.ne            2b
3:
.irp i, 0, 8, 16, 24
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        bl              idct32_1d_8x32_pass2_neon
.endr

        add             sp,  sp,  #2048

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10

        br              x15
endfunc

.macro idct32_partial size
function idct32x32_\size\()_add_neon
        add             x0,  sp,  #(0*64)
        add             x2,  x6,  #(0*2)
        bl              idct32_1d_8x32_pass1_\size\()_neon
.ifc \size,half
        add             x0,  sp,  #(8*64)
        add             x2,  x6,  #(8*2)
        bl              idct32_1d_8x32_pass1_\size\()_neon
.endif
.irp i, 0, 8, 16, 24
        add             x0,  x4,  #(\i)
        mov             x1,  x5
        add             x2,  sp,  #(\i*2)
        bl              idct32_1d_8x32_pass2_\size\()_neon
.endr

        add             sp,  sp,  #2048

        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10

        br              x15
endfunc
.endm

idct32_partial quarter
idct32_partial half