/*
 * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
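/* The idct_NxN_dc functions below handle blocks in which only the DC
 * coefficient is non-zero: the DC value is rounded and scaled once (for
 * 8-bit content this works out to dc = (((coeffs[0] + 1) >> 1) + 32) >> 6)
 * and then broadcast to every position of the coefficient block; the 4x4
 * case fills q0/q1 with the value and stores them in a single vst1.16. */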
function ff_hevc_idct_4x4_dc_neon_8, export=1
        vst1.16     {q0, q1}, [r0]

function ff_hevc_idct_8x8_dc_neon_8, export=1

function ff_hevc_idct_16x16_dc_neon_8, export=1

function ff_hevc_idct_32x32_dc_neon_8, export=1
function ff_hevc_transform_add_4x4_neon_8, export=1
        vld1.32     {d4[0]}, [r0], r2
        vld1.32     {d4[1]}, [r0], r2
        vld1.32     {d5[0]}, [r0], r2
        vld1.32     {d5[1]}, [r0], r2
        sub         r0, r0, r2, lsl #2

        vst1.32     {d0[0]}, [r0], r2
        vst1.32     {d0[1]}, [r0], r2
        vst1.32     {d1[0]}, [r0], r2
        vst1.32     {d1[1]}, [r0], r2
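/* The transform_add functions add the decoded residual (16-bit, pointed to
 * by r1) to the reconstructed picture (8-bit, r0 with stride r2):
 *     dst[x] = clip_uint8(dst[x] + res[x])
 * The 4x4 variant above gathers four 4-pixel destination rows into d4/d5,
 * rewinds r0 by four strides, and in the arithmetic between the loads and
 * stores widens the pixels to 16 bits, adds the residual and narrows back
 * with unsigned saturation before storing the rows. */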
function ff_hevc_transform_add_8x8_neon_8, export=1

function ff_hevc_transform_add_16x16_neon_8, export=1
        vld1.16     {q0, q1}, [r1]!
        vst1.8      {q0}, [r0], r2

function ff_hevc_transform_add_32x32_neon_8, export=1
        vld1.8      {q8, q9}, [r0]
        vst1.8      {q0, q1}, [r0], r2
.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7

.macro transpose_16b_4x4 r0, r1, r2, r3

/* uses registers q2 - q9 for temp values */
.macro tr4_luma_shift r0, r1, r2, r3, shift
        vaddl.s16   q5, \r0, \r2    // c0 = src0 + src2
        vaddl.s16   q2, \r2, \r3    // c1 = src2 + src3
        vsubl.s16   q4, \r0, \r3    // c2 = src0 - src3
        vmull.s16   q6, \r1, d0[0]  // c3 = 74 * src1

        vaddl.s16   q7, \r0, \r3    // src0 + src3
        vsubw.s16   q7, q7, \r2     // src0 - src2 + src3
        vmul.s32    q7, q7, d0[0]   // dst2 = 74 * (src0 - src2 + src3)

        vmul.s32    q8, q5, d0[1]   // 29 * c0
        vmul.s32    q9, q2, d1[0]   // 55 * c1
        vadd.s32    q8, q9          // 29 * c0 + 55 * c1
        vadd.s32    q8, q6          // dst0 = 29 * c0 + 55 * c1 + c3

        vmul.s32    q2, q2, d0[1]   // 29 * c1
        vmul.s32    q9, q4, d1[0]   // 55 * c2
        vsub.s32    q9, q2          // 55 * c2 - 29 * c1
        vadd.s32    q9, q6          // dst1 = 55 * c2 - 29 * c1 + c3

        vmul.s32    q5, q5, d1[0]   // 55 * c0
        vmul.s32    q4, q4, d0[1]   // 29 * c2
        vadd.s32    q5, q4          // 55 * c0 + 29 * c2
        vsub.s32    q5, q6          // dst3 = 55 * c0 + 29 * c2 - c3

        vqrshrn.s32 \r0, q8, \shift
        vqrshrn.s32 \r1, q9, \shift
        vqrshrn.s32 \r2, q7, \shift
        vqrshrn.s32 \r3, q5, \shift
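/* With c0 = src0 + src2, c1 = src2 + src3, c2 = src0 - src3 and
 * c3 = 74 * src1, the four outputs expand to the HEVC 4x4 DST basis:
 *   dst0 = 29*c0 + 55*c1 + c3 = 29*src0 + 74*src1 + 84*src2 + 55*src3
 *   dst1 = 55*c2 - 29*c1 + c3 = 55*src0 + 74*src1 - 29*src2 - 84*src3
 *   dst2 = 74*(src0 - src2 + src3)
 *   dst3 = 55*c0 + 29*c2 - c3 = 84*src0 - 74*src1 + 55*src2 - 29*src3
 * Each result is rounded and narrowed back to 16 bits with vqrshrn by the
 * given shift. */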
/* uses registers q2 - q6 for temp values */
.macro tr4 r0, r1, r2, r3
        vmull.s16   q4, \r1, d0[0]  // 83 * src1
        vmull.s16   q6, \r1, d0[1]  // 36 * src1
        vshll.s16   q2, \r0, #6     // 64 * src0
        vshll.s16   q3, \r2, #6     // 64 * src2
        vadd.s32    q5, q2, q3      // 64 * (src0 + src2)    e0
        vsub.s32    q2, q2, q3      // 64 * (src0 - src2)    e1
        vmlal.s16   q4, \r3, d0[1]  // 83 * src1 + 36 * src3 o0
        vmlsl.s16   q6, \r3, d0[0]  // 36 * src1 - 83 * src3 o1

        vsub.s32    q3, q5, q4      // e0 - o0
        vadd.s32    q4, q5, q4      // e0 + o0
        vadd.s32    q5, q2, q6      // e1 + o1
        vsub.s32    q6, q2, q6      // e1 - o1
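/* tr4 is the partial butterfly of the 4-point inverse DCT:
 *   e0 = 64*(src0 + src2)        o0 = 83*src1 + 36*src3
 *   e1 = 64*(src0 - src2)        o1 = 36*src1 - 83*src3
 *   dst0 = e0 + o0, dst1 = e1 + o1, dst2 = e1 - o1, dst3 = e0 - o0
 * The results are left as 32-bit sums in q4, q5, q6 and q3; tr4_shift is the
 * same sequence followed by a rounding narrow to 16 bits. */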
.macro tr4_shift r0, r1, r2, r3, shift
        vmull.s16   q4, \r1, d0[0]  // 83 * src1
        vmull.s16   q6, \r1, d0[1]  // 36 * src1
        vshll.s16   q2, \r0, #6     // 64 * src0
        vshll.s16   q3, \r2, #6     // 64 * src2
        vadd.s32    q5, q2, q3      // 64 * (src0 + src2)    e0
        vsub.s32    q2, q2, q3      // 64 * (src0 - src2)    e1
        vmlal.s16   q4, \r3, d0[1]  // 83 * src1 + 36 * src3 o0
        vmlsl.s16   q6, \r3, d0[0]  // 36 * src1 - 83 * src3 o1

        vsub.s32    q3, q5, q4      // e0 - o0
        vadd.s32    q4, q5, q4      // e0 + o0
        vadd.s32    q5, q2, q6      // e1 + o1
        vsub.s32    q6, q2, q6      // e1 - o1

        vqrshrn.s32 \r0, q4, \shift
        vqrshrn.s32 \r1, q5, \shift
        vqrshrn.s32 \r2, q6, \shift
        vqrshrn.s32 \r3, q3, \shift
function ff_hevc_transform_4x4_neon_8, export=1
        vld1.16     {q14, q15}, [r0]    // coeffs
        ldr         r3, =0x00240053     // 36 and 83

        tr4_shift   d28, d29, d30, d31, #7

        tr4_shift   d28, d29, d30, d31, #12

        vst1.16     {q14, q15}, [r0]
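/* Both 4x4 transforms run two passes over the coefficients, with a 4x4
 * transpose between them: the first (row) pass rounds with shift 7, the
 * second (column) pass with shift 12, matching HEVC's shifts of 7 and
 * 20 - bit depth for 8-bit output. */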
function ff_hevc_transform_luma_4x4_neon_8, export=1
        vld1.16     {q14, q15}, [r0]    // coeffs

        tr4_luma_shift d28, d29, d30, d31, #7

        tr4_luma_shift d28, d29, d30, d31, #12

        vst1.16     {q14, q15}, [r0]
.macro tr8_begin in0, in1, in2, in3
        vmull.s16   q7, \in0, d1[1]     //  89 * src1
        vmull.s16   q8, \in0, d1[0]     //  75 * src1
        vmull.s16   q9, \in0, d1[3]     //  50 * src1
        vmull.s16   q10, \in0, d1[2]    //  18 * src1

        vmlal.s16   q7, \in1, d1[0]     //  75 * src3
        vmlsl.s16   q8, \in1, d1[2]     // -18 * src3
        vmlsl.s16   q9, \in1, d1[1]     // -89 * src3
        vmlsl.s16   q10, \in1, d1[3]    // -50 * src3

        vmlal.s16   q7, \in2, d1[3]     //  50 * src5
        vmlsl.s16   q8, \in2, d1[1]     // -89 * src5
        vmlal.s16   q9, \in2, d1[2]     //  18 * src5
        vmlal.s16   q10, \in2, d1[0]    //  75 * src5

        vmlal.s16   q7, \in3, d1[2]     //  18 * src7
        vmlsl.s16   q8, \in3, d1[3]     // -50 * src7
        vmlal.s16   q9, \in3, d1[0]     //  75 * src7
        vmlsl.s16   q10, \in3, d1[1]    // -89 * src7
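/* tr8_begin computes the odd half of the 8-point inverse DCT from the four
 * odd-indexed input rows (src1, src3, src5, src7):
 *   o_8[0] = 89*src1 + 75*src3 + 50*src5 + 18*src7   (q7)
 *   o_8[1] = 75*src1 - 18*src3 - 89*src5 - 50*src7   (q8)
 *   o_8[2] = 50*src1 - 89*src3 + 18*src5 + 75*src7   (q9)
 *   o_8[3] = 18*src1 - 50*src3 + 75*src5 - 89*src7   (q10)
 * The even half e_8[0..3] comes from running tr4 on the even rows. */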
        vadd.s32    q1, q4, q7      // e_8[0] + o_8[0], dst[0]
        vsub.s32    q4, q4, q7      // e_8[0] - o_8[0], dst[7]

        vadd.s32    q2, q5, q8      // e_8[1] + o_8[1], dst[1]
        vsub.s32    q5, q5, q8      // e_8[1] - o_8[1], dst[6]

        vadd.s32    q11, q6, q9     // e_8[2] + o_8[2], dst[2]
        vsub.s32    q6, q6, q9      // e_8[2] - o_8[2], dst[5]

        vadd.s32    q12, q3, q10    // e_8[3] + o_8[3], dst[3]
        vsub.s32    q3, q3, q10     // e_8[3] - o_8[3], dst[4]
        vqrshrn.s32 d2, q1, \shift
        vqrshrn.s32 d3, q2, \shift
        vqrshrn.s32 d4, q11, \shift
        vqrshrn.s32 d5, q12, \shift
        vqrshrn.s32 d6, q3, \shift
        vqrshrn.s32 d7, q6, \shift
        vqrshrn.s32 d9, q4, \shift
        vqrshrn.s32 d8, q5, \shift
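/* Final stage of the 8-point transform: the even and odd halves are combined
 * as dst[k] = e_8[k] + o_8[k] and dst[7 - k] = e_8[k] - o_8[k], then every
 * result is rounded and narrowed to 16 bits with vqrshrn by the given shift,
 * leaving dst0..dst7 in d2..d9. */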
function ff_hevc_transform_8x8_neon_8, export=1
        vld1.16     {d0, d1}, [r3]

        vld1.16     {d24}, [r0], r5
        vld1.16     {d25}, [r0], r5
        vld1.16     {d26}, [r0], r5
        vld1.16     {d27}, [r0], r5
        vld1.16     {d28}, [r0], r5
        vld1.16     {d29}, [r0], r5
        vld1.16     {d30}, [r0], r5
        vld1.16     {d31}, [r0], r5

        tr8_begin   d25, d27, d29, d31
        tr4         d24, d26, d28, d30

        vst1.16     {d2}, [r0], r5
        vst1.16     {d3}, [r0], r5
        vst1.16     {d4}, [r0], r5
        vst1.16     {d5}, [r0], r5
        vst1.16     {d6}, [r0], r5
        vst1.16     {d7}, [r0], r5
        vst1.16     {d8}, [r0], r5
        vst1.16     {d9}, [r0], r5
        // skip the right half if col_limit (in r1) is less than 4
        vld1.16     {d24}, [r0], r5
        vld1.16     {d25}, [r0], r5
        vld1.16     {d26}, [r0], r5
        vld1.16     {d27}, [r0], r5
        vld1.16     {d28}, [r0], r5
        vld1.16     {d29}, [r0], r5
        vld1.16     {d30}, [r0], r5
        vld1.16     {d31}, [r0], r5

        tr8_begin   d25, d27, d29, d31
        tr4         d24, d26, d28, d30

        vst1.16     {d2}, [r0], r5
        vst1.16     {d3}, [r0], r5
        vst1.16     {d4}, [r0], r5
        vst1.16     {d5}, [r0], r5
        vst1.16     {d6}, [r0], r5
        vst1.16     {d7}, [r0], r5
        vst1.16     {d8}, [r0], r5
        vst1.16     {d9}, [r0], r5
        vldm        r0, {q12-q15}   // coeffs
        transpose_16b_4x4 d24, d26, d28, d30
        transpose_16b_4x4 d25, d27, d29, d31
        tr8_begin   d26, d30, d27, d31
        tr4         d24, d28, d25, d29

        transpose_16b_4x4 d2, d3, d4, d5
        transpose_16b_4x4 d6, d7, d8, d9

        vldm        r0, {q12-q15}   // coeffs
        transpose_16b_4x4 d24, d26, d28, d30
        transpose_16b_4x4 d25, d27, d29, d31
        tr8_begin   d26, d30, d27, d31
        tr4         d24, d28, d25, d29

        transpose_16b_4x4 d2, d3, d4, d5
        transpose_16b_4x4 d6, d7, d8, d9

        vst1.16     {q1-q2}, [r0]

        vst1.16     {q3-q4}, [r0]
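/* Second stage of the 8x8 transform: each vldm above pulls in an 8x4 half of
 * the partially transformed block, the 4x4 transposes turn its columns into
 * rows, the same tr8_begin/tr4 combination is applied, and the results are
 * transposed back and stored; the sequence appears twice, once per half. */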
.word 0x00240053    // d0[1] = 36, d0[0] = 83

.word 0x0059004b    // d1[1] = 89, d1[0] = 75
.word 0x00320012    // d1[3] = 50, d1[2] = 18

.word 0x005a0057    // d2[1] = 90, d2[0] = 87
.word 0x00500046    // d2[3] = 80, d2[2] = 70
.word 0x0039002b    // d3[1] = 57, d3[0] = 43
.word 0x00190009    // d3[3] = 25, d3[2] = 9
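/* Each .word packs two 16-bit coefficients; on a little-endian build a
 * vld1.16 of this table puts the low halfword into the even lane, so
 * 0x00240053 loads as d0[0] = 83 and d0[1] = 36.  The first word holds the
 * 4-point DCT factors (83, 36), the next two the 8-point odd factors
 * (89, 75, 50, 18), and the last four the additional 16-point factors
 * (90, 87, 80, 70, 57, 43, 25, 9). */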