/*
 * ARM NEON optimised IDCT functions for HEVC decoding
 * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 * Copyright (c) 2017 Alexandra Hájková
 *
 * Ported from arm/hevcdsp_idct_neon.S by
 * Copyright (c) 2020 Reimar Döffinger
 * Copyright (c) 2020 Josh Dekker
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
27 #include "libavutil/aarch64/asm.S"
40 .macro clip10 in1, in2, c1, c2
47 function ff_hevc_add_residual_4x4_8_neon, export=1
48 ld1 {v0.8h-v1.8h}, [x1]
49 ld1 {v2.s}[0], [x0], x2
50 ld1 {v2.s}[1], [x0], x2
51 ld1 {v2.s}[2], [x0], x2
52 ld1 {v2.s}[3], [x0], x2
53 sub x0, x0, x2, lsl #2
56 sqadd v0.8h, v0.8h, v6.8h
57 sqadd v1.8h, v1.8h, v7.8h
60 st1 {v0.s}[0], [x0], x2
61 st1 {v0.s}[1], [x0], x2
62 st1 {v0.s}[2], [x0], x2
63 st1 {v0.s}[3], [x0], x2
67 function ff_hevc_add_residual_4x4_10_neon, export=1
69 ld1 {v0.8h-v1.8h}, [x1]
70 ld1 {v2.d}[0], [x12], x2
71 ld1 {v2.d}[1], [x12], x2
72 ld1 {v3.d}[0], [x12], x2
73 sqadd v0.8h, v0.8h, v2.8h
74 ld1 {v3.d}[1], [x12], x2
76 sqadd v1.8h, v1.8h, v3.8h
77 mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
78 clip10 v0.8h, v1.8h, v4.8h, v5.8h
79 st1 {v0.d}[0], [x0], x2
80 st1 {v0.d}[1], [x0], x2
81 st1 {v1.d}[0], [x0], x2
82 st1 {v1.d}[1], [x0], x2
86 function ff_hevc_add_residual_8x8_8_neon, export=1
94 ld1 {v0.8h-v1.8h}, [x1], #32
96 sqadd v0.8h, v0.8h, v3.8h
97 sqadd v1.8h, v1.8h, v2.8h
100 st1 {v0.d}[0], [x0], x2
101 st1 {v0.d}[1], [x12], x2
106 function ff_hevc_add_residual_8x8_10_neon, export=1
111 mvni v5.8h, #0xFC, lsl #8 // movi #0x3FF
113 ld1 {v0.8h-v1.8h}, [x1], #32
115 sqadd v0.8h, v0.8h, v2.8h
117 sqadd v1.8h, v1.8h, v3.8h
118 clip10 v0.8h, v1.8h, v4.8h, v5.8h
119 st1 {v0.8h}, [x0], x2
120 st1 {v1.8h}, [x12], x2
125 function ff_hevc_add_residual_16x16_8_neon, export=1
131 ld1 {v0.8h-v3.8h}, [x1], #64
134 uxtl2 v18.8h, v16.16b
136 uxtl2 v21.8h, v19.16b
137 sqadd v0.8h, v0.8h, v17.8h
138 sqadd v1.8h, v1.8h, v18.8h
139 sqadd v2.8h, v2.8h, v20.8h
140 sqadd v3.8h, v3.8h, v21.8h
142 sqxtun2 v0.16b, v1.8h
144 sqxtun2 v1.16b, v3.8h
145 st1 {v0.16b}, [x0], x2
146 st1 {v1.16b}, [x12], x2
151 function ff_hevc_add_residual_16x16_10_neon, export=1
154 mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF
158 ld1 {v16.8h-v17.8h}, [x0]
159 ld1 {v0.8h-v3.8h}, [x1], #64
160 sqadd v0.8h, v0.8h, v16.8h
161 ld1 {v18.8h-v19.8h}, [x12]
162 sqadd v1.8h, v1.8h, v17.8h
163 sqadd v2.8h, v2.8h, v18.8h
164 sqadd v3.8h, v3.8h, v19.8h
165 clip10 v0.8h, v1.8h, v20.8h, v21.8h
166 clip10 v2.8h, v3.8h, v20.8h, v21.8h
167 st1 {v0.8h-v1.8h}, [x0], x2
168 st1 {v2.8h-v3.8h}, [x12], x2
173 function ff_hevc_add_residual_32x32_8_neon, export=1
178 ld1 {v20.16b, v21.16b}, [x0]
180 uxtl2 v17.8h, v20.16b
181 ld1 {v22.16b, v23.16b}, [x12]
183 uxtl2 v19.8h, v21.16b
185 ld1 {v0.8h-v3.8h}, [x1], #64
186 ld1 {v4.8h-v7.8h}, [x1], #64
187 uxtl2 v21.8h, v22.16b
189 uxtl2 v23.8h, v23.16b
190 sqadd v0.8h, v0.8h, v16.8h
191 sqadd v1.8h, v1.8h, v17.8h
192 sqadd v2.8h, v2.8h, v18.8h
193 sqadd v3.8h, v3.8h, v19.8h
194 sqadd v4.8h, v4.8h, v20.8h
195 sqadd v5.8h, v5.8h, v21.8h
196 sqadd v6.8h, v6.8h, v22.8h
197 sqadd v7.8h, v7.8h, v23.8h
199 sqxtun2 v0.16b, v1.8h
201 sqxtun2 v1.16b, v3.8h
203 sqxtun2 v2.16b, v5.8h
204 st1 {v0.16b, v1.16b}, [x0], x2
206 sqxtun2 v3.16b, v7.8h
207 st1 {v2.16b, v3.16b}, [x12], x2
212 function ff_hevc_add_residual_32x32_10_neon, export=1
215 mvni v21.8h, #0xFC, lsl #8 // movi #0x3FF
217 ld1 {v0.8h-v3.8h}, [x1], #64
218 ld1 {v16.8h-v19.8h}, [x0]
219 sqadd v0.8h, v0.8h, v16.8h
220 sqadd v1.8h, v1.8h, v17.8h
221 sqadd v2.8h, v2.8h, v18.8h
222 sqadd v3.8h, v3.8h, v19.8h
223 clip10 v0.8h, v1.8h, v20.8h, v21.8h
224 clip10 v2.8h, v3.8h, v20.8h, v21.8h
225 st1 {v0.8h-v3.8h}, [x0], x2
230 .macro sum_sub out, in, c, op, p
232 smlal\p \out, \in, \c
234 smlsl\p \out, \in, \c
238 .macro fixsqrshrn d, dt, n, m
240 sqrshrn2 \d\dt, \n\().4s, \m
242 sqrshrn \n\().4h, \n\().4s, \m
243 mov \d\().d[0], \n\().d[0]
247 // uses and clobbers v28-v31 as temp registers
248 .macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
249 sshll\p1 v28.4s, \in0, #6
251 smull\p1 v30.4s, \in1, v0.h[1]
252 smull\p1 v31.4s, \in1, v0.h[3]
253 smlal\p2 v28.4s, \in2, v0.h[0] //e0
254 smlsl\p2 v29.4s, \in2, v0.h[0] //e1
255 smlal\p2 v30.4s, \in3, v0.h[3] //o0
256 smlsl\p2 v31.4s, \in3, v0.h[1] //o1
258 add \out0, v28.4s, v30.4s
259 add \out1, v29.4s, v31.4s
260 sub \out2, v29.4s, v31.4s
261 sub \out3, v28.4s, v30.4s
264 .macro transpose8_4x4 r0, r1, r2, r3
265 trn1 v2.8h, \r0\().8h, \r1\().8h
266 trn2 v3.8h, \r0\().8h, \r1\().8h
267 trn1 v4.8h, \r2\().8h, \r3\().8h
268 trn2 v5.8h, \r2\().8h, \r3\().8h
269 trn1 \r0\().4s, v2.4s, v4.4s
270 trn2 \r2\().4s, v2.4s, v4.4s
271 trn1 \r1\().4s, v3.4s, v5.4s
272 trn2 \r3\().4s, v3.4s, v5.4s
275 .macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
276 transpose8_4x4 \r0, \r1, \r2, \r3
277 transpose8_4x4 \r4, \r5, \r6, \r7
280 .macro tr_8x4 shift, in0,in0t, in1,in1t, in2,in2t, in3,in3t, in4,in4t, in5,in5t, in6,in6t, in7,in7t, p1, p2
281 tr_4x4_8 \in0\in0t, \in2\in2t, \in4\in4t, \in6\in6t, v24.4s, v25.4s, v26.4s, v27.4s, \p1, \p2
283 smull\p1 v30.4s, \in1\in1t, v0.h[6]
284 smull\p1 v28.4s, \in1\in1t, v0.h[4]
285 smull\p1 v29.4s, \in1\in1t, v0.h[5]
286 sum_sub v30.4s, \in3\in3t, v0.h[4], -, \p1
287 sum_sub v28.4s, \in3\in3t, v0.h[5], +, \p1
288 sum_sub v29.4s, \in3\in3t, v0.h[7], -, \p1
290 sum_sub v30.4s, \in5\in5t, v0.h[7], +, \p2
291 sum_sub v28.4s, \in5\in5t, v0.h[6], +, \p2
292 sum_sub v29.4s, \in5\in5t, v0.h[4], -, \p2
294 sum_sub v30.4s, \in7\in7t, v0.h[5], +, \p2
295 sum_sub v28.4s, \in7\in7t, v0.h[7], +, \p2
296 sum_sub v29.4s, \in7\in7t, v0.h[6], -, \p2
298 add v31.4s, v26.4s, v30.4s
299 sub v26.4s, v26.4s, v30.4s
300 fixsqrshrn \in2,\in2t, v31, \shift
303 smull\p1 v31.4s, \in1\in1t, v0.h[7]
304 sum_sub v31.4s, \in3\in3t, v0.h[6], -, \p1
305 sum_sub v31.4s, \in5\in5t, v0.h[5], +, \p2
306 sum_sub v31.4s, \in7\in7t, v0.h[4], -, \p2
307 fixsqrshrn \in5,\in5t, v26, \shift
310 add v26.4s, v24.4s, v28.4s
311 sub v24.4s, v24.4s, v28.4s
312 add v28.4s, v25.4s, v29.4s
313 sub v25.4s, v25.4s, v29.4s
314 add v30.4s, v27.4s, v31.4s
315 sub v27.4s, v27.4s, v31.4s
317 fixsqrshrn \in0,\in0t, v26, \shift
318 fixsqrshrn \in7,\in7t, v24, \shift
319 fixsqrshrn \in1,\in1t, v28, \shift
320 fixsqrshrn \in6,\in6t, v25, \shift
321 fixsqrshrn \in3,\in3t, v30, \shift
322 fixsqrshrn \in4,\in4t, v27, \shift
325 .macro idct_8x8 bitdepth
326 function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
329 ld1 {v16.8h-v19.8h}, [x1], #64
330 ld1 {v20.8h-v23.8h}, [x1]
335 tr_8x4 7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
336 tr_8x4 7, v16,.8h, v17,.8h, v18,.8h, v19,.8h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, 2, 2
338 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23
340 tr_8x4 20 - \bitdepth, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v16,.8h, v17,.8h, v18,.8h, v19,.8h, , 2
341 tr_8x4 20 - \bitdepth, v20,.4h, v21,.4h, v22,.4h, v23,.4h, v20,.8h, v21,.8h, v22,.8h, v23,.8h, , 2
343 transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23
346 st1 {v16.8h-v19.8h}, [x1], #64
347 st1 {v20.8h-v23.8h}, [x1]
353 .macro butterfly e, o, tmp_p, tmp_m
358 .macro tr16_8x4 in0, in1, in2, in3, offset
359 tr_4x4_8 \in0\().4h, \in1\().4h, \in2\().4h, \in3\().4h, v24.4s, v25.4s, v26.4s, v27.4s
361 smull2 v28.4s, \in0\().8h, v0.h[4]
362 smull2 v29.4s, \in0\().8h, v0.h[5]
363 smull2 v30.4s, \in0\().8h, v0.h[6]
364 smull2 v31.4s, \in0\().8h, v0.h[7]
365 sum_sub v28.4s, \in1\().8h, v0.h[5], +, 2
366 sum_sub v29.4s, \in1\().8h, v0.h[7], -, 2
367 sum_sub v30.4s, \in1\().8h, v0.h[4], -, 2
368 sum_sub v31.4s, \in1\().8h, v0.h[6], -, 2
370 sum_sub v28.4s, \in2\().8h, v0.h[6], +, 2
371 sum_sub v29.4s, \in2\().8h, v0.h[4], -, 2
372 sum_sub v30.4s, \in2\().8h, v0.h[7], +, 2
373 sum_sub v31.4s, \in2\().8h, v0.h[5], +, 2
375 sum_sub v28.4s, \in3\().8h, v0.h[7], +, 2
376 sum_sub v29.4s, \in3\().8h, v0.h[6], -, 2
377 sum_sub v30.4s, \in3\().8h, v0.h[5], +, 2
378 sum_sub v31.4s, \in3\().8h, v0.h[4], -, 2
380 butterfly v24.4s, v28.4s, v16.4s, v23.4s
381 butterfly v25.4s, v29.4s, v17.4s, v22.4s
382 butterfly v26.4s, v30.4s, v18.4s, v21.4s
383 butterfly v27.4s, v31.4s, v19.4s, v20.4s
385 st1 {v16.4s-v19.4s}, [x4], #64
386 st1 {v20.4s-v23.4s}, [x4]
389 .macro load16 in0, in1, in2, in3
390 ld1 {\in0}[0], [x1], x2
391 ld1 {\in0}[1], [x3], x2
392 ld1 {\in1}[0], [x1], x2
393 ld1 {\in1}[1], [x3], x2
394 ld1 {\in2}[0], [x1], x2
395 ld1 {\in2}[1], [x3], x2
396 ld1 {\in3}[0], [x1], x2
397 ld1 {\in3}[1], [x3], x2
400 .macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
401 sum_sub v21.4s, \in, \t0, \op0, \p
402 sum_sub v22.4s, \in, \t1, \op1, \p
403 sum_sub v23.4s, \in, \t2, \op2, \p
404 sum_sub v24.4s, \in, \t3, \op3, \p
405 sum_sub v25.4s, \in, \t4, \op4, \p
406 sum_sub v26.4s, \in, \t5, \op5, \p
407 sum_sub v27.4s, \in, \t6, \op6, \p
408 sum_sub v28.4s, \in, \t7, \op7, \p
411 .macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
412 add v20.4s, \in0, \in1
422 .macro store16 in0, in1, in2, in3, rx
423 st1 {\in0}[0], [x1], x2
424 st1 {\in0}[1], [x3], \rx
425 st1 {\in1}[0], [x1], x2
426 st1 {\in1}[1], [x3], \rx
427 st1 {\in2}[0], [x1], x2
428 st1 {\in2}[1], [x3], \rx
429 st1 {\in3}[0], [x1], x2
430 st1 {\in3}[1], [x3], \rx
433 .macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift
434 sqrshrn \out0\().4h, \in0, \shift
435 sqrshrn2 \out0\().8h, \in1, \shift
436 sqrshrn \out1\().4h, \in2, \shift
437 sqrshrn2 \out1\().8h, \in3, \shift
438 sqrshrn \out2\().4h, \in4, \shift
439 sqrshrn2 \out2\().8h, \in5, \shift
440 sqrshrn \out3\().4h, \in6, \shift
441 sqrshrn2 \out3\().8h, \in7, \shift
444 .macro transpose16_4x4_2 r0, r1, r2, r3
446 trn1 v2.4h, \r0\().4h, \r1\().4h
447 trn2 v3.4h, \r0\().4h, \r1\().4h
448 trn1 v4.4h, \r2\().4h, \r3\().4h
449 trn2 v5.4h, \r2\().4h, \r3\().4h
450 trn1 v6.2s, v2.2s, v4.2s
451 trn2 v7.2s, v2.2s, v4.2s
452 trn1 v2.2s, v3.2s, v5.2s
453 trn2 v4.2s, v3.2s, v5.2s
454 mov \r0\().d[0], v6.d[0]
455 mov \r2\().d[0], v7.d[0]
456 mov \r1\().d[0], v2.d[0]
457 mov \r3\().d[0], v4.d[0]
459 // upper halves in reverse order
460 trn1 v2.8h, \r3\().8h, \r2\().8h
461 trn2 v3.8h, \r3\().8h, \r2\().8h
462 trn1 v4.8h, \r1\().8h, \r0\().8h
463 trn2 v5.8h, \r1\().8h, \r0\().8h
464 trn1 v6.4s, v2.4s, v4.4s
465 trn2 v7.4s, v2.4s, v4.4s
466 trn1 v2.4s, v3.4s, v5.4s
467 trn2 v4.4s, v3.4s, v5.4s
468 mov \r3\().d[1], v6.d[1]
469 mov \r1\().d[1], v7.d[1]
470 mov \r2\().d[1], v2.d[1]
471 mov \r0\().d[1], v4.d[1]
474 .macro tr_16x4 name, shift, offset, step
475 function func_tr_16x4_\name
477 add x3, x5, #(\step * 64)
478 mov x2, #(\step * 128)
479 load16 v16.d, v17.d, v18.d, v19.d
483 tr16_8x4 v16, v17, v18, v19, \offset
485 add x1, x5, #(\step * 32)
486 add x3, x5, #(\step * 3 *32)
487 mov x2, #(\step * 128)
488 load16 v20.d, v17.d, v18.d, v19.d
491 smull v21.4s, v20.4h, v1.h[0]
492 smull v22.4s, v20.4h, v1.h[1]
493 smull v23.4s, v20.4h, v1.h[2]
494 smull v24.4s, v20.4h, v1.h[3]
495 smull v25.4s, v20.4h, v1.h[4]
496 smull v26.4s, v20.4h, v1.h[5]
497 smull v27.4s, v20.4h, v1.h[6]
498 smull v28.4s, v20.4h, v1.h[7]
500 add_member v20.8h, v1.h[1], v1.h[4], v1.h[7], v1.h[5], v1.h[2], v1.h[0], v1.h[3], v1.h[6], +, +, +, -, -, -, -, -, 2
501 add_member v17.4h, v1.h[2], v1.h[7], v1.h[3], v1.h[1], v1.h[6], v1.h[4], v1.h[0], v1.h[5], +, +, -, -, -, +, +, +
502 add_member v17.8h, v1.h[3], v1.h[5], v1.h[1], v1.h[7], v1.h[0], v1.h[6], v1.h[2], v1.h[4], +, -, -, +, +, +, -, -, 2
503 add_member v18.4h, v1.h[4], v1.h[2], v1.h[6], v1.h[0], v1.h[7], v1.h[1], v1.h[5], v1.h[3], +, -, -, +, -, -, +, +
504 add_member v18.8h, v1.h[5], v1.h[0], v1.h[4], v1.h[6], v1.h[1], v1.h[3], v1.h[7], v1.h[2], +, -, +, +, -, +, +, -, 2
505 add_member v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
506 add_member v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2
509 ld1 {v16.4s-v19.4s}, [x4], #64
511 butterfly16 v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
512 scale v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
513 transpose16_4x4_2 v29, v30, v31, v24
515 add x3, x6, #(24 +3*32)
518 store16 v29.d, v30.d, v31.d, v24.d, x4
520 add x4, sp, #(\offset + 64)
521 ld1 {v16.4s-v19.4s}, [x4]
522 butterfly16 v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, v28.4s
523 scale v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
524 transpose16_4x4_2 v29, v30, v31, v20
527 add x3, x6, #(16 + 3 * 32)
530 store16 v29.d, v30.d, v31.d, v20.d, x4
536 .macro idct_16x16 bitdepth
537 function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
541 // allocate a temp buffer
545 add x5, x0, #(8 * \i)
546 add x6, sp, #(8 * \i * 16)
547 bl func_tr_16x4_firstpass
551 add x5, sp, #(8 * \i)
552 add x6, x0, #(8 * \i * 16)
553 bl func_tr_16x4_secondpass_\bitdepth
566 tr_16x4 firstpass, 7, 512, 1
567 tr_16x4 secondpass_8, 20 - 8, 512, 1
568 tr_16x4 secondpass_10, 20 - 10, 512, 1
573 // void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
574 .macro idct_dc size, bitdepth
575 function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
576 movi v1.8h, #((1 << (14 - \bitdepth))+1)
578 add v4.8h, v4.8h, v1.8h
579 sshr v0.8h, v4.8h, #(15 - \bitdepth)
580 sshr v1.8h, v4.8h, #(15 - \bitdepth)
582 sshr v2.8h, v4.8h, #(15 - \bitdepth)
583 sshr v3.8h, v4.8h, #(15 - \bitdepth)
584 .if \size > 16 /* dc 32x32 */
591 .if \size > 8 /* dc 16x16 */
592 st1 {v0.8h-v3.8h}, [x0], x13
593 st1 {v0.8h-v3.8h}, [x12], x13
594 st1 {v0.8h-v3.8h}, [x0], x13
595 st1 {v0.8h-v3.8h}, [x12], x13
596 st1 {v0.8h-v3.8h}, [x0], x13
597 st1 {v0.8h-v3.8h}, [x12], x13
599 st1 {v0.8h-v3.8h}, [x0], x13
600 st1 {v0.8h-v3.8h}, [x12], x13
601 .if \size > 16 /* dc 32x32 */
605 st1 {v0.8h-v1.8h}, [x0]