/*
 * ARM NEON optimised IDCT functions for HEVC decoding
 * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 * Copyright (c) 2017 Alexandra Hájková
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
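@ The *_dc functions handle blocks where only the DC coefficient is set:
@ the DC value is carried through both rounding stages of the IDCT
@ ((dc + 1) >> 1, then (dc + 32) >> 6 at 8-bit depth) and the result is
@ broadcast over the whole coefficient block instead of computing the
@ full 2-D transform.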
function ff_hevc_idct_4x4_dc_neon_8, export=1
        vst1.16         {q0, q1}, [r0]
function ff_hevc_idct_8x8_dc_neon_8, export=1
function ff_hevc_idct_16x16_dc_neon_8, export=1
function ff_hevc_idct_32x32_dc_neon_8, export=1
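@ ff_hevc_add_residual_NxN: add the decoded residual (int16_t, r1) to the
@ predicted pixels (uint8_t, r0, line stride r2) and saturate the sums
@ back to the 8-bit pixel range.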
function ff_hevc_add_residual_4x4_neon_8, export=1
        vld1.32         {d4[0]}, [r0], r2
        vld1.32         {d4[1]}, [r0], r2
        vld1.32         {d5[0]}, [r0], r2
        vld1.32         {d5[1]}, [r0], r2
        sub             r0, r0, r2, lsl #2      @ rewind dst by four lines
        vst1.32         {d0[0]}, [r0], r2
        vst1.32         {d0[1]}, [r0], r2
        vst1.32         {d1[0]}, [r0], r2
        vst1.32         {d1[1]}, [r0], r2
function ff_hevc_add_residual_8x8_neon_8, export=1
function ff_hevc_add_residual_16x16_neon_8, export=1
        vld1.16         {q0, q1}, [r1]!
        vst1.8          {q0}, [r0], r2
function ff_hevc_add_residual_32x32_neon_8, export=1
        vld1.8          {q8, q9}, [r0]
        vst1.8          {q0, q1}, [r0], r2
/* uses registers q2 - q9 for temporary values */
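@ Inverse 4x4 DST (used for intra luma 4x4 blocks). With
@   c0 = src0 + src2, c1 = src2 + src3, c2 = src0 - src3, c3 = 74 * src1,
@ one pass computes
@   dst0 = 29 * c0 + 55 * c1 + c3
@   dst1 = 55 * c2 - 29 * c1 + c3
@   dst2 = 74 * (src0 - src2 + src3)
@   dst3 = 55 * c0 + 29 * c2 - c3
@ followed by a rounding narrowing shift (7 after the first pass, 12 after
@ the second for 8-bit content).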
.macro tr4_luma_shift r0, r1, r2, r3, shift
        vaddl.s16       q5, \r0, \r2    // c0 = src0 + src2
        vaddl.s16       q2, \r2, \r3    // c1 = src2 + src3
        vsubl.s16       q4, \r0, \r3    // c2 = src0 - src3
        vmull.s16       q6, \r1, d0[0]  // c3 = 74 * src1

        vaddl.s16       q7, \r0, \r3    // src0 + src3
        vsubw.s16       q7, q7, \r2     // src0 - src2 + src3
        vmul.s32        q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)

        vmul.s32        q8, q5, d0[1]   // 29 * c0
        vmul.s32        q9, q2, d1[0]   // 55 * c1
        vadd.s32        q8, q9          // 29 * c0 + 55 * c1
        vadd.s32        q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3

        vmul.s32        q2, q2, d0[1]   // 29 * c1
        vmul.s32        q9, q4, d1[0]   // 55 * c2
        vsub.s32        q9, q2          // 55 * c2 - 29 * c1
        vadd.s32        q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3

        vmul.s32        q5, q5, d1[0]   // 55 * c0
        vmul.s32        q4, q4, d0[1]   // 29 * c2
        vadd.s32        q5, q4          // 55 * c0 + 29 * c2
        vsub.s32        q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3

        vqrshrn.s32     \r0, q8, \shift
        vqrshrn.s32     \r1, q9, \shift
        vqrshrn.s32     \r2, q7, \shift
        vqrshrn.s32     \r3, q5, \shift
.endm
function ff_hevc_transform_luma_4x4_neon_8, export=1
        vld1.16         {q14, q15}, [r0]        // coeffs

        tr4_luma_shift  d28, d29, d30, d31, #7  // first pass, shift 7

        tr4_luma_shift  d28, d29, d30, d31, #12 // second pass, shift 12

        vst1.16         {q14, q15}, [r0]
@ Multiply-accumulate (\op == +) or multiply-subtract (\op == -) \in
@ scaled by \c into \out.
.macro sum_sub out, in, c, op
  .ifc \op, +
        vmlal.s16       \out, \in, \c
  .else
        vmlsl.s16       \out, \in, \c
  .endif
.endm
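@ One pass of the 4-point inverse DCT, as an even/odd butterfly:
@   e0 = 64 * src0 + 64 * src2        o0 = 83 * src1 + 36 * src3
@   e1 = 64 * src0 - 64 * src2        o1 = 36 * src1 - 83 * src3
@   dst0 = e0 + o0,  dst1 = e1 + o1,  dst2 = e1 - o1,  dst3 = e0 - o0
@ d4 is expected to hold the factors {64, 83, 64, 36}.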
.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift, tmp0, tmp1, tmp2, tmp3, tmp4
        vshll.s16       \tmp0, \in0, #6
        vmull.s16       \tmp2, \in1, d4[1]
        vmov            \tmp1, \tmp0
        vmull.s16       \tmp3, \in1, d4[3]
        vmlal.s16       \tmp0, \in2, d4[0]   @e0
        vmlsl.s16       \tmp1, \in2, d4[0]   @e1
        vmlal.s16       \tmp2, \in3, d4[3]   @o0
        vmlsl.s16       \tmp3, \in3, d4[1]   @o1

        vadd.s32        \tmp4, \tmp0, \tmp2
        vsub.s32        \tmp0, \tmp0, \tmp2
        vadd.s32        \tmp2, \tmp1, \tmp3
        vsub.s32        \tmp1, \tmp1, \tmp3
        vqrshrn.s32     \out0, \tmp4, #\shift
        vqrshrn.s32     \out3, \tmp0, #\shift
        vqrshrn.s32     \out1, \tmp2, #\shift
        vqrshrn.s32     \out2, \tmp1, #\shift
.endm
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3
        vshll.s16       \tmp0, \in0, #6
        vld1.s16        {\in0}, [r1, :64]!
        vmov            \tmp1, \tmp0
        vmull.s16       \tmp2, \in1, \in0[1]
        vmull.s16       \tmp3, \in1, \in0[3]
        vmlal.s16       \tmp0, \in2, \in0[0]   @e0
        vmlsl.s16       \tmp1, \in2, \in0[0]   @e1
        vmlal.s16       \tmp2, \in3, \in0[3]   @o0
        vmlsl.s16       \tmp3, \in3, \in0[1]   @o1

        vld1.s16        {\in0}, [r1, :64]
        sub             r1, r1, #8             @ keep r1 pointing at the first coefficient row

        vadd.s32        \out0, \tmp0, \tmp2
        vadd.s32        \out1, \tmp1, \tmp3
        vsub.s32        \out2, \tmp1, \tmp3
        vsub.s32        \out3, \tmp0, \tmp2
.endm
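@ Same butterfly as tr_4x4, but the coefficient rows are (re)loaded from
@ r1 on the fly (freeing d4 for data) and the outputs are kept at 32 bits:
@ the caller performs the rounding shift. The second load leaves the
@ odd-part factors in \in0 for tr_8x4 below.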
@ Do a 4x4 transpose, using q registers for the sub-transposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.32         \rq0, \rq1
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
.endm
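@ In-place 4x4 IDCT: a row pass with shift 7, a transpose, a column pass
@ with shift 20 - bitdepth, and a final transpose.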
.macro idct_4x4 bitdepth
function ff_hevc_idct_4x4_\bitdepth\()_neon, export=1
        vld1.s16        {q0-q1}, [r0, :128]

        movrel          r1, trans
        vld1.s16        {d4}, [r1, :64]

        tr_4x4          d0, d1, d2, d3, d16, d17, d18, d19, 7, q10, q11, q12, q13, q0
        transpose_4x4   q8, q9, d16, d17, d18, d19

        tr_4x4          d16, d17, d18, d19, d0, d1, d2, d3, 20 - \bitdepth, q10, q11, q12, q13, q0
        transpose_4x4   q0, q1, d0, d1, d2, d3
        vst1.s16        {d0-d3}, [r0, :128]
        bx              lr
endfunc
.endm
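@ This macro is expanded once per supported bit depth, e.g. idct_4x4 8
@ yields ff_hevc_idct_4x4_8_neon.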
.macro transpose8_4x4 r0, r1, r2, r3
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
.endm

.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7, l0, l1, l2, l3, l4, l5, l6, l7
        transpose8_4x4  \r0, \r1, \r2, \r3
        transpose8_4x4  \r4, \r5, \r6, \r7

        transpose8_4x4  \l0, \l1, \l2, \l3
        transpose8_4x4  \l4, \l5, \l6, \l7
.endm
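@ One pass of the 8-point inverse DCT. The even-indexed inputs go through
@ the 4-point transform above; the odd-indexed inputs are combined with
@ the odd-part factors (89, 75, 50, 18, left in \in0[] by tr_4x4_8) via
@ sum_sub, and even/odd sums and differences produce the eight outputs.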
.macro tr_8x4 shift, in0, in1, in2, in3, in4, in5, in6, in7
        tr_4x4_8        \in0, \in2, \in4, \in6, q8, q9, q10, q11, q12, q13, q14, q15

        vmull.s16       q14, \in1, \in0[2]
        vmull.s16       q12, \in1, \in0[0]
        vmull.s16       q13, \in1, \in0[1]
        sum_sub         q14, \in3, \in0[0], -
        sum_sub         q12, \in3, \in0[1], +
        sum_sub         q13, \in3, \in0[3], -

        sum_sub         q14, \in5, \in0[3], +
        sum_sub         q12, \in5, \in0[2], +
        sum_sub         q13, \in5, \in0[0], -

        sum_sub         q14, \in7, \in0[1], +
        sum_sub         q12, \in7, \in0[3], +
        sum_sub         q13, \in7, \in0[2], -

        vadd.s32        q15, q10, q14
        vsub.s32        q10, q10, q14
        vqrshrn.s32     \in2, q15, \shift

        vmull.s16       q15, \in1, \in0[3]
        sum_sub         q15, \in3, \in0[2], -
        sum_sub         q15, \in5, \in0[1], +
        sum_sub         q15, \in7, \in0[0], -

        vqrshrn.s32     \in5, q10, \shift

        vadd.s32        q10, q8, q12
        vsub.s32        q8,  q8, q12
        vadd.s32        q12, q9, q13
        vsub.s32        q9,  q9, q13
        vadd.s32        q14, q11, q15
        vsub.s32        q11, q11, q15

        vqrshrn.s32     \in0, q10, \shift
        vqrshrn.s32     \in7, q8,  \shift
        vqrshrn.s32     \in1, q12, \shift
        vqrshrn.s32     \in6, q9,  \shift
        vqrshrn.s32     \in3, q14, \shift
        vqrshrn.s32     \in4, q11, \shift
.endm
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
        vpush           {q4-q7}

        mov             r1, r0
        mov             r2, #64
        add             r3, r0, #32
        vld1.s16        {q0-q1}, [r1,:128], r2
        vld1.s16        {q2-q3}, [r3,:128], r2
        vld1.s16        {q4-q5}, [r1,:128], r2
        vld1.s16        {q6-q7}, [r3,:128], r2

        movrel          r1, trans

        tr_8x4          7, d0, d2, d4, d6, d8, d10, d12, d14
        tr_8x4          7, d1, d3, d5, d7, d9, d11, d13, d15

        @ Transpose each 4x4 block, and swap how d4-d7 and d8-d11 are used.
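        @ Layout before the transpose (one row of the 8x8 block per q register):
        @ d0  d1
        @ d2  d3
        @ d4  d5
        @ d6  d7
        @ d8  d9
        @ d10 d11
        @ d12 d13
        @ d14 d15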
        transpose_8x8   d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13, d15
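        @ Now each column is split across two d registers:
        @ d0  d8
        @ d2  d10
        @ d4  d12
        @ d6  d14
        @ d1  d9
        @ d3  d11
        @ d5  d13
        @ d7  d15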
        tr_8x4          20 - \bitdepth, d0, d2, d4, d6, d1, d3, d5, d7
        vswp            d0, d8
        tr_8x4          20 - \bitdepth, d0, d10, d12, d14, d9, d11, d13, d15
        vswp            d0, d8

        transpose_8x8   d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13, d15
        mov             r1, r0
        add             r3, r0, #32     @ r2 still holds the 64-byte stride
        vst1.s16        {q0-q1}, [r1,:128], r2
        vst1.s16        {q2-q3}, [r3,:128], r2
        vst1.s16        {q4-q5}, [r1,:128], r2
        vst1.s16        {q6-q7}, [r3,:128], r2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm
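@ Like idct_4x4, this macro is expanded once per supported bit depth,
@ e.g. idct_8x8 8 giving ff_hevc_idct_8x8_8_neon.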