/*
 * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/arm/asm.S"
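/* DC-only inverse transforms (8 bit).  Each ff_hevc_idct_NxN_dc_neon_8 gets a
 * pointer to the int16_t coefficient block in r0, computes the rounded DC
 * value from coeffs[0] and replicates it over the whole NxN block. */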
function ff_hevc_idct_4x4_dc_neon_8, export=1
        vst1.16 {q0, q1}, [r0]

function ff_hevc_idct_8x8_dc_neon_8, export=1

function ff_hevc_idct_16x16_dc_neon_8, export=1

function ff_hevc_idct_32x32_dc_neon_8, export=1
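/* transform_add: add the decoded residual to the prediction in place.
 * r0 = uint8_t *dst, r1 = int16_t *coeffs, r2 = stride.  The prediction is
 * widened to 16 bit, the residual is added and the sum is narrowed back to
 * 8 bit with unsigned saturation before being stored. */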
function ff_hevc_transform_add_4x4_neon_8, export=1
        vld1.32 d4[0], [r0], r2
        vld1.32 d4[1], [r0], r2
        vld1.32 d5[0], [r0], r2
        vld1.32 d5[1], [r0], r2
        sub     r0, r0, r2, lsl #2
        vst1.32 d0[0], [r0], r2
        vst1.32 d0[1], [r0], r2
        vst1.32 d1[0], [r0], r2
        vst1.32 d1[1], [r0], r2
function ff_hevc_transform_add_8x8_neon_8, export=1

function ff_hevc_transform_add_16x16_neon_8, export=1
        vld1.16 {q0, q1}, [r1]!
        vst1.8  {q0}, [r0], r2

function ff_hevc_transform_add_32x32_neon_8, export=1
        vld1.8  {q8, q9}, [r0]
        vst1.8  {q0, q1}, [r0], r2
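/* In-register transposes of 16-bit elements, used to switch between the
 * vertical and horizontal passes of the 2-D inverse transforms. */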
.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7

.macro transpose_16b_4x4 r0, r1, r2, r3
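/* tr4_luma_shift: one pass of the inverse 4x4 DST-VII used for intra luma
 * blocks.  The four int16x4 inputs/outputs are \r0..\r3; intermediate values
 * are computed at 32 bit with the factors 74 (d0[0]), 29 (d0[1]) and
 * 55 (d1[0]), then narrowed with a rounding right shift by \shift. */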
/* uses registers q2 - q9 for temp values */
.macro tr4_luma_shift r0, r1, r2, r3, shift
        vaddl.s16   q5, \r0, \r2    // c0 = src0 + src2
        vaddl.s16   q2, \r2, \r3    // c1 = src2 + src3
        vsubl.s16   q4, \r0, \r3    // c2 = src0 - src3
        vmull.s16   q6, \r1, d0[0]  // c3 = 74 * src1

        vaddl.s16   q7, \r0, \r3    // src0 + src3
        vsubw.s16   q7, q7, \r2     // src0 - src2 + src3
        vmul.s32    q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)

        vmul.s32    q8, q5, d0[1]   // 29 * c0
        vmul.s32    q9, q2, d1[0]   // 55 * c1
        vadd.s32    q8, q9          // 29 * c0 + 55 * c1
        vadd.s32    q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3

        vmul.s32    q2, q2, d0[1]   // 29 * c1
        vmul.s32    q9, q4, d1[0]   // 55 * c2
        vsub.s32    q9, q2          // 55 * c2 - 29 * c1
        vadd.s32    q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3

        vmul.s32    q5, q5, d1[0]   // 55 * c0
        vmul.s32    q4, q4, d0[1]   // 29 * c2
        vadd.s32    q5, q4          // 55 * c0 + 29 * c2
        vsub.s32    q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3

        vqrshrn.s32 \r0, q8, \shift
        vqrshrn.s32 \r1, q9, \shift
        vqrshrn.s32 \r2, q7, \shift
        vqrshrn.s32 \r3, q5, \shift
.endm
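/* tr4: even/odd decomposition of the 4-point inverse DCT.  e0/e1 are built
 * from 64 * (src0 +/- src2), o0/o1 from the 83/36 factors applied to
 * src1/src3 (d0[0] = 83, d0[1] = 36); the sums and differences e +/- o are
 * left in q3..q6 at 32-bit precision. */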
/* uses registers q2 - q6 for temp values */
.macro tr4 r0, r1, r2, r3
        vmull.s16   q4, \r1, d0[0]  // 83 * src1
        vmull.s16   q6, \r1, d0[1]  // 36 * src1
        vshll.s16   q2, \r0, #6     // 64 * src0
        vshll.s16   q3, \r2, #6     // 64 * src2
        vadd.s32    q5, q2, q3      // 64 * (src0 + src2)     e0
        vsub.s32    q2, q2, q3      // 64 * (src0 - src2)     e1
        vmlal.s16   q4, \r3, d0[1]  // 83 * src1 + 36 * src3  o0
        vmlsl.s16   q6, \r3, d0[0]  // 36 * src1 - 83 * src3  o1

        vsub.s32    q3, q5, q4      // e0 - o0
        vadd.s32    q4, q5, q4      // e0 + o0
        vadd.s32    q5, q2, q6      // e1 + o1
        vsub.s32    q6, q2, q6      // e1 - o1
.endm
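/* tr4_shift: same butterfly as tr4, but the four results are narrowed back to
 * int16 with a rounding right shift by \shift (7 for the first pass, 12 for
 * the second pass at 8-bit depth). */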
.macro tr4_shift r0, r1, r2, r3, shift
        vmull.s16   q4, \r1, d0[0]  // 83 * src1
        vmull.s16   q6, \r1, d0[1]  // 36 * src1
        vshll.s16   q2, \r0, #6     // 64 * src0
        vshll.s16   q3, \r2, #6     // 64 * src2
        vadd.s32    q5, q2, q3      // 64 * (src0 + src2)     e0
        vsub.s32    q2, q2, q3      // 64 * (src0 - src2)     e1
        vmlal.s16   q4, \r3, d0[1]  // 83 * src1 + 36 * src3  o0
        vmlsl.s16   q6, \r3, d0[0]  // 36 * src1 - 83 * src3  o1

        vsub.s32    q3, q5, q4      // e0 - o0
        vadd.s32    q4, q5, q4      // e0 + o0
        vadd.s32    q5, q2, q6      // e1 + o1
        vsub.s32    q6, q2, q6      // e1 - o1

        vqrshrn.s32 \r0, q4, \shift
        vqrshrn.s32 \r1, q5, \shift
        vqrshrn.s32 \r2, q6, \shift
        vqrshrn.s32 \r3, q3, \shift
.endm
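/* ff_hevc_transform_4x4_neon_8: full 4x4 inverse DCT for 8-bit content.
 * r0 points to the int16_t coefficients; one pass with shift 7, a transpose,
 * a second pass with shift 12, and the block is written back in place. */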
function ff_hevc_transform_4x4_neon_8, export=1
        vld1.16 {q14, q15}, [r0]    // coeffs
        ldr     r3, =0x00240053     // 36 and 83
        tr4_shift d28, d29, d30, d31, #7
        tr4_shift d28, d29, d30, d31, #12
        vst1.16 {q14, q15}, [r0]
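/* ff_hevc_transform_luma_4x4_neon_8: 4x4 inverse DST-VII for intra luma.
 * Same two-pass structure as the 4x4 DCT above (shift 7, transpose, shift 12),
 * but using tr4_luma_shift with the 74/29/55 factors. */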
function ff_hevc_transform_luma_4x4_neon_8, export=1
        vld1.16 {q14, q15}, [r0]    // coeffs
        tr4_luma_shift d28, d29, d30, d31, #7
        tr4_luma_shift d28, d29, d30, d31, #12
        vst1.16 {q14, q15}, [r0]
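/* tr8_begin: odd part of the 8-point inverse DCT.  \in0..\in3 are the four
 * odd-index input rows (src1, src3, src5, src7); the accumulators q7..q10
 * end up holding o_8[0]..o_8[3], built from the factors 89, 75, 50 and 18
 * (d1[1], d1[0], d1[3], d1[2]). */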
.macro tr8_begin in0, in1, in2, in3
        vmull.s16   q7,  \in0, d1[1]    //  89 * src1
        vmull.s16   q8,  \in0, d1[0]    //  75 * src1
        vmull.s16   q9,  \in0, d1[3]    //  50 * src1
        vmull.s16   q10, \in0, d1[2]    //  18 * src1

        vmlal.s16   q7,  \in1, d1[0]    //  75 * src3
        vmlsl.s16   q8,  \in1, d1[2]    // -18 * src3
        vmlsl.s16   q9,  \in1, d1[1]    // -89 * src3
        vmlsl.s16   q10, \in1, d1[3]    // -50 * src3

        vmlal.s16   q7,  \in2, d1[3]    //  50 * src5
        vmlsl.s16   q8,  \in2, d1[1]    // -89 * src5
        vmlal.s16   q9,  \in2, d1[2]    //  18 * src5
        vmlal.s16   q10, \in2, d1[0]    //  75 * src5

        vmlal.s16   q7,  \in3, d1[2]    //  18 * src7
        vmlsl.s16   q8,  \in3, d1[3]    // -50 * src7
        vmlal.s16   q9,  \in3, d1[0]    //  75 * src7
        vmlsl.s16   q10, \in3, d1[1]    // -89 * src7
.endm
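/* tr8_end: combine the even part of the 8-point transform (q3..q6, produced
 * by tr4 on the even-index rows) with the odd part from tr8_begin (q7..q10),
 * then narrow dst[0]..dst[7] to int16 into d2..d9 with a rounding right
 * shift by \shift. */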
.macro tr8_end shift
        vadd.s32    q1,  q4, q7     // e_8[0] + o_8[0], dst[0]
        vsub.s32    q4,  q4, q7     // e_8[0] - o_8[0], dst[7]

        vadd.s32    q2,  q5, q8     // e_8[1] + o_8[1], dst[1]
        vsub.s32    q5,  q5, q8     // e_8[1] - o_8[1], dst[6]

        vadd.s32    q11, q6, q9     // e_8[2] + o_8[2], dst[2]
        vsub.s32    q6,  q6, q9     // e_8[2] - o_8[2], dst[5]

        vadd.s32    q12, q3, q10    // e_8[3] + o_8[3], dst[3]
        vsub.s32    q3,  q3, q10    // e_8[3] - o_8[3], dst[4]
        vqrshrn.s32 d2, q1,  \shift
        vqrshrn.s32 d3, q2,  \shift
        vqrshrn.s32 d4, q11, \shift
        vqrshrn.s32 d5, q12, \shift
        vqrshrn.s32 d6, q3,  \shift
        vqrshrn.s32 d7, q6,  \shift
        vqrshrn.s32 d9, q4,  \shift
        vqrshrn.s32 d8, q5,  \shift
.endm
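/* ff_hevc_transform_8x8_neon_8: 8x8 inverse DCT (8 bit), r0 = int16_t
 * coefficients, r1 = col_limit.  The vertical pass is done in two 4-column
 * slices, then the data is transposed in 4x4 tiles and the horizontal pass is
 * done in two 4-row slices, with the same 7/12 shifts as the 4x4 transform.
 * r5 holds the byte stride between rows of the coefficient block. */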
function ff_hevc_transform_8x8_neon_8, export=1
        vld1.16 {d0, d1}, [r3]
        vld1.16 {d24}, [r0], r5
        vld1.16 {d25}, [r0], r5
        vld1.16 {d26}, [r0], r5
        vld1.16 {d27}, [r0], r5
        vld1.16 {d28}, [r0], r5
        vld1.16 {d29}, [r0], r5
        vld1.16 {d30}, [r0], r5
        vld1.16 {d31}, [r0], r5

        tr8_begin d25, d27, d29, d31
        tr4 d24, d26, d28, d30
        tr8_end #7
        vst1.16 {d2}, [r0], r5
        vst1.16 {d3}, [r0], r5
        vst1.16 {d4}, [r0], r5
        vst1.16 {d5}, [r0], r5
        vst1.16 {d6}, [r0], r5
        vst1.16 {d7}, [r0], r5
        vst1.16 {d8}, [r0], r5
        vst1.16 {d9}, [r0], r5

        // skip right half if col_limit in r1 is less than 4
        vld1.16 {d24}, [r0], r5
        vld1.16 {d25}, [r0], r5
        vld1.16 {d26}, [r0], r5
        vld1.16 {d27}, [r0], r5
        vld1.16 {d28}, [r0], r5
        vld1.16 {d29}, [r0], r5
        vld1.16 {d30}, [r0], r5
        vld1.16 {d31}, [r0], r5

        tr8_begin d25, d27, d29, d31
        tr4 d24, d26, d28, d30
        tr8_end #7
        vst1.16 {d2}, [r0], r5
        vst1.16 {d3}, [r0], r5
        vst1.16 {d4}, [r0], r5
        vst1.16 {d5}, [r0], r5
        vst1.16 {d6}, [r0], r5
        vst1.16 {d7}, [r0], r5
        vst1.16 {d8}, [r0], r5
        vst1.16 {d9}, [r0], r5
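/* horizontal pass: reload the vertically transformed coefficients, transpose
 * each 4x4 tile so rows become columns, run the 8-point transform again and
 * transpose back before storing. */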
        vldm r0, {q12-q15}      // coeffs
        transpose_16b_4x4 d24, d26, d28, d30
        transpose_16b_4x4 d25, d27, d29, d31
        tr8_begin d26, d30, d27, d31
        tr4 d24, d28, d25, d29
        tr8_end #12
        transpose_16b_4x4 d2, d3, d4, d5
        transpose_16b_4x4 d6, d7, d8, d9

        vldm r0, {q12-q15}      // coeffs
        transpose_16b_4x4 d24, d26, d28, d30
        transpose_16b_4x4 d25, d27, d29, d31
        tr8_begin d26, d30, d27, d31
        tr4 d24, d28, d25, d29
        tr8_end #12
        transpose_16b_4x4 d2, d3, d4, d5
        transpose_16b_4x4 d6, d7, d8, d9

        vst1.16 {q1-q2}, [r0]
        vst1.16 {q3-q4}, [r0]
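/* Transform factors, stored as pairs of 16-bit values (low halfword first).
 * The 8x8 function above loads the first eight halfwords with
 * vld1.16 {d0, d1}, so d0[0] = 83, d0[1] = 36, d1[0] = 75, d1[1] = 89,
 * d1[2] = 18, d1[3] = 50.  The remaining words (87/90, 70/80, 43/57, 9/25)
 * are the extra odd factors used by the 16-point transform. */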
.word 0x00240053 // 83, 36
.word 0x00000000 // padding, so that the 8-point factors below land in d1
.word 0x0059004b // 75, 89
.word 0x00320012 // 18, 50
.word 0x005a0057 // 87, 90
.word 0x00500046 // 70, 80
.word 0x0039002b // 43, 57
.word 0x00190009 //  9, 25