/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/arm/asm.S"
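
@ Zero one block of 16-bit coefficients (an 8x8 block, 128 bytes); r0 points at
@ the block, so the prototype is presumably void ff_clear_block_neon(int16_t *).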
function ff_clear_block_neon, export=1
        vst1.16         {q0},  [r0,:128]!
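
@ Zero six consecutive 16-bit coefficient blocks (6 x 128 bytes), i.e. the
@ blocks of one macroblock; r0 points at the first block.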
function ff_clear_blocks_neon, export=1
        vst1.16         {q0},  [r0,:128]!
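
@ The pixels16*/pixels8* macros implement the dsputil pixel ops: a straight
@ copy ("put") or an average with the existing destination ("avg") of 16- or
@ 8-pixel-wide rows.  The _x2, _y2 and _xy2 variants add horizontal, vertical
@ and diagonal half-pel interpolation respectively.  Judging by the loads and
@ stores, r0 = dst, r1 = src and r2 = line stride, with the row count
@ presumably passed in r3 as usual for these ops; \rnd selects rounded
@ averaging and \avg selects averaging into the destination.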
.macro  pixels16        rnd=1, avg=0
1:      vld1.8          {q0},  [r1], r2
        vld1.8          {q8},  [r12,:128], r2
        vld1.8          {q9},  [r12,:128], r2
        vld1.8          {q10}, [r12,:128], r2
        vld1.8          {q11}, [r12,:128], r2
        vst1.64         {q0},  [r0,:128], r2
        vst1.64         {q1},  [r0,:128], r2
        vst1.64         {q2},  [r0,:128], r2
        vst1.64         {q3},  [r0,:128], r2

.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
        vld1.8          {q1},  [r0,:128], r2
        vld1.8          {q3},  [r0,:128]
        vst1.8          {q0},  [r0,:128], r2
        vst1.8          {q2},  [r0,:128], r2

.macro  pixels16_y2     rnd=1, avg=0
        vld1.8          {q0},  [r1], r2
        vld1.8          {q1},  [r1], r2
        vld1.8          {q8},  [r0,:128], r2
        vld1.8          {q9},  [r0,:128]
        vst1.8          {q2},  [r0,:128], r2
        vst1.8          {q3},  [r0,:128], r2
        vld1.8          {q0},  [r1], r2
        vld1.8          {q8},  [r0,:128], r2
        vld1.8          {q9},  [r0,:128]
        vst1.8          {q2},  [r0,:128], r2
        vst1.8          {q3},  [r0,:128], r2

.macro  pixels16_xy2    rnd=1, avg=0
        vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
NRND    vmov.i16        q13, #1
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vld1.8          {d0-d2}, [r1], r2
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
NRND    vadd.u16        q1,  q1,  q13
        vld1.8          {q8},  [r0,:128]
        vrhadd.u8       q14, q14, q8
        vld1.8          {d2-d4}, [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14}, [r0,:128], r2
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
NRND    vadd.u16        q0,  q0,  q13
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q15, q15, q9
        vst1.8          {q15}, [r0,:128], r2
        vld1.8          {d0-d2}, [r1], r2
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
NRND    vadd.u16        q1,  q1,  q13
        vld1.8          {q8},  [r0,:128]
        vrhadd.u8       q14, q14, q8
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14}, [r0,:128], r2
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
NRND    vadd.u16        q0,  q0,  q13
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q15, q15, q9
        vst1.8          {q15}, [r0,:128], r2

.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},  [r1], r2
        vld1.8          {d1},  [r1], r2
        vld1.8          {d2},  [r1], r2
        vld1.8          {d3},  [r1], r2
        vld1.8          {d4},  [r0,:64], r2
        vld1.8          {d5},  [r0,:64], r2
        vld1.8          {d6},  [r0,:64], r2
        vld1.8          {d7},  [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #2
        vst1.8          {d0},  [r0,:64], r2
        vst1.8          {d1},  [r0,:64], r2
        vst1.8          {d2},  [r0,:64], r2
        vst1.8          {d3},  [r0,:64], r2

.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d4},  [r0,:64], r2
        vld1.8          {d5},  [r0,:64]
        vst1.8          {d0},  [r0,:64], r2
        vst1.8          {d1},  [r0,:64], r2

.macro  pixels8_y2      rnd=1, avg=0
        vld1.8          {d0},  [r1], r2
        vld1.8          {d1},  [r1], r2
        vld1.8          {d0},  [r1], r2
        vld1.8          {d1},  [r1], r2
        vld1.8          {d2},  [r0,:64], r2
        vld1.8          {d3},  [r0,:64]
        vst1.8          {d4},  [r0,:64], r2
        vst1.8          {d5},  [r0,:64], r2
        vld1.8          {d0},  [r1], r2
        vld1.8          {d2},  [r0,:64], r2
        vld1.8          {d3},  [r0,:64]
        vst1.8          {d4},  [r0,:64], r2
        vst1.8          {d5},  [r0,:64], r2

.macro  pixels8_xy2     rnd=1, avg=0
        vld1.8          {q0},  [r1], r2
        vld1.8          {q1},  [r1], r2
NRND    vmov.i16        q11, #1
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vld1.8          {q0},  [r1], r2
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vld1.8          {q1},  [r1], r2
        vld1.8          {d7},  [r0,:64]
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},  [r0,:64], r2
        vld1.8          {d5},  [r0,:64]
        vext.8          d6,  d2,  d3,  #1
        vst1.8          {d7},  [r0,:64], r2
        vld1.8          {q0},  [r1], r2
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vld1.8          {d7},  [r0,:64]
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},  [r0,:64], r2
        vld1.8          {d5},  [r0,:64]
        vst1.8          {d7},  [r0,:64], r2
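
@ pixfunc emits ff_<pfx><name><suf>_neon from one of the pixel macros above.
@ For rnd=1 the helper macros avg/shrn expand to the rounding forms
@ (vrhadd.u8 / vrshrn.u16) and NRND discards its argument; for rnd=0 the
@ truncating forms (vhadd.u8 / vshrn.u16) are used and NRND emits its argument,
@ so the "+1" bias corrections appear only in the _no_rnd variants.
@ pixfunc2 builds both the rounded and the _no_rnd version of a function.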
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
  .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
  .macro NRND insn:vararg
  .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
  .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
  .macro NRND insn:vararg
function ff_\pfx\name\suf\()_neon, export=1

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
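
@ The *_h264_qpel*_mc00 entry points cover the (0,0) sub-pel position of the
@ H.264 quarter-pel motion compensation, which is just a copy (put) or average
@ (avg).  Each function presumably only sets up the row count and then falls
@ through into the corresponding pixels16/pixels8 body instantiated by the
@ pixfunc lines directly below it.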
function ff_put_h264_qpel16_mc00_neon, export=1
        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1
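
@ ff_put_pixels_clamped_neon: convert an 8x8 block of 16-bit coefficients (r0)
@ to unsigned 8-bit pixels with saturation and store them as eight 8-byte rows
@ at r1 with stride r2.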
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vld1.16         {d20-d23}, [r0,:128]!
        vld1.16         {d24-d27}, [r0,:128]!
        vld1.16         {d28-d31}, [r0,:128]!
        vst1.8          {d0},  [r1,:64], r2
        vst1.8          {d1},  [r1,:64], r2
        vst1.8          {d2},  [r1,:64], r2
        vst1.8          {d3},  [r1,:64], r2
        vst1.8          {d4},  [r1,:64], r2
        vst1.8          {d5},  [r1,:64], r2
        vst1.8          {d6},  [r1,:64], r2
        vst1.8          {d7},  [r1,:64], r2
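
@ ff_put_signed_pixels_clamped_neon: as above, but the input values are signed
@ pixels, so 128 is added before clamping to the 0..255 range.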
function ff_put_signed_pixels_clamped_neon, export=1
        vld1.16         {d16-d17}, [r0,:128]!
        vld1.16         {d18-d19}, [r0,:128]!
        vld1.16         {d16-d17}, [r0,:128]!
        vld1.16         {d18-d19}, [r0,:128]!
        vld1.16         {d20-d21}, [r0,:128]!
        vld1.16         {d22-d23}, [r0,:128]!
        vst1.8          {d0},  [r1,:64], r2
        vst1.8          {d1},  [r1,:64], r2
        vst1.8          {d2},  [r1,:64], r2
        vld1.16         {d24-d25}, [r0,:128]!
        vld1.16         {d26-d27}, [r0,:128]!
        vst1.8          {d3},  [r1,:64], r2
        vst1.8          {d4},  [r1,:64], r2
        vst1.8          {d5},  [r1,:64], r2
        vst1.8          {d6},  [r1,:64], r2
        vst1.8          {d7},  [r1,:64], r2
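
@ ff_add_pixels_clamped_neon: add an 8x8 block of 16-bit coefficients (r0) to
@ the 8-bit pixels read from r1 (stride r2) and store the clamped sums through
@ r3, which shadows the pixel pointer so loads can run ahead of the stores.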
function ff_add_pixels_clamped_neon, export=1
        vld1.8          {d16}, [r1,:64], r2
        vld1.16         {d0-d1},   [r0,:128]!
        vld1.8          {d17}, [r1,:64], r2
        vld1.16         {d2-d3},   [r0,:128]!
        vld1.8          {d18}, [r1,:64], r2
        vld1.16         {d4-d5},   [r0,:128]!
        vst1.8          {d0},  [r3,:64], r2
        vld1.8          {d19}, [r1,:64], r2
        vld1.16         {d6-d7},   [r0,:128]!
        vst1.8          {d2},  [r3,:64], r2
        vld1.8          {d16}, [r1,:64], r2
        vld1.16         {d0-d1},   [r0,:128]!
        vst1.8          {d4},  [r3,:64], r2
        vld1.8          {d17}, [r1,:64], r2
        vld1.16         {d2-d3},   [r0,:128]!
        vst1.8          {d6},  [r3,:64], r2
        vld1.8          {d18}, [r1,:64], r2
        vld1.16         {d4-d5},   [r0,:128]!
        vst1.8          {d0},  [r3,:64], r2
        vld1.8          {d19}, [r1,:64], r2
        vld1.16         {d6-d7},   [r0,:128]!
        vst1.8          {d2},  [r3,:64], r2
        vst1.8          {d4},  [r3,:64], r2
        vst1.8          {d6},  [r3,:64], r2
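
@ ff_vector_fmul_window_neon: windowed overlap-add.  A rough scalar reference,
@ assuming the usual (dst, src0, src1, win, len) argument order:
@
@     dst += len; win += len; src0 += len;
@     for (i = -len, j = len - 1; i < 0; i++, j--) {
@         float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
@         dst[i] = s0 * wj - s1 * wi;
@         dst[j] = s0 * wi + s1 * wj;
@     }
@
@ src0 and the first halves of dst and win are walked forwards, while src1 and
@ the second halves are walked backwards using r5 as a negative stride, so each
@ iteration produces one vector of each output half.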
function ff_vector_fmul_window_neon, export=1
        add             r2,  r2,  r5,  lsl #2
        add             r4,  r3,  r5,  lsl #3
        add             ip,  r0,  r5,  lsl #3
        vld1.32         {d0,d1},   [r1,:128]!
        vld1.32         {d2,d3},   [r2,:128], r5
        vld1.32         {d4,d5},   [r3,:128]!
        vld1.32         {d6,d7},   [r4,:128], r5
        vld1.32         {d0,d1},   [r1,:128]!
        vld1.32         {d18,d19}, [r2,:128], r5
        vld1.32         {d24,d25}, [r3,:128]!
        vld1.32         {d6,d7},   [r4,:128], r5
        vst1.32         {d20,d21}, [r0,:128]!
        vst1.32         {d22,d23}, [ip,:128], r5
2:      vmla.f32        d22, d3,  d7
        vst1.32         {d20,d21}, [r0,:128]!
        vst1.32         {d22,d23}, [ip,:128], r5

#if CONFIG_VORBIS_DECODER
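
@ ff_vorbis_inverse_coupling_neon: in-place inverse channel coupling for the
@ Vorbis decoder.  r0 and r1 point at the magnitude and angle vectors of a
@ channel pair; each element pair is rebuilt from sign-dependent sums and
@ differences, four floats at a time, with the trailing 2:/3: sections covering
@ the remainder.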
function ff_vorbis_inverse_coupling_neon, export=1
        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},   [r1,:128]!
        vld1.32         {d0-d1},   [r0,:128]!
        vst1.32         {d24-d25}, [r3, :128]!
        vst1.32         {d22-d23}, [r12,:128]!
        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
2:      vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
3:      vld1.32         {d2-d3},   [r1,:128]
        vld1.32         {d0-d1},   [r0,:128]
        vst1.32         {d2-d3},   [r0,:128]!
        vst1.32         {d0-d1},   [r1,:128]!
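
@ ff_butterflies_float_neon: in-place butterfly over two float vectors
@ (r0 and r1), per element:  t = v1[i] - v2[i];  v1[i] += v2[i];  v2[i] = t.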
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0}, [r0,:128]
        vld1.32         {q1}, [r1,:128]
        vst1.32         {q2}, [r1,:128]!
        vst1.32         {q1}, [r0,:128]!
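
@ ff_scalarproduct_float_neon: dot product of the two float vectors at r0 and
@ r1.  The NOVFP move copies the accumulated result into r0 for the soft-float
@ ABI; with hard-float it is simply returned in s0.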
function ff_scalarproduct_float_neon, export=1
1:      vld1.32         {q0}, [r0,:128]!
        vld1.32         {q1}, [r1,:128]!
NOVFP   vmov.32         r0,  d0[0]
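
@ ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len-1-i].  r2 (src1) is
@ first advanced to the end of the array and then read backwards with a
@ negative stride in r12.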
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
2:      vst1.32         {q8-q9},  [r0,:128]!
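
@ ff_vector_fmul_add_neon: dst[i] = src0[i] * src1[i] + src2[i], with the
@ three source pointers in r1, r2 and r3 and the destination in r0.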
function ff_vector_fmul_add_neon, export=1
        vld1.32         {q0-q1},   [r1,:128]!
        vld1.32         {q8-q9},   [r2,:128]!
        vld1.32         {q2-q3},   [r3,:128]!
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        vld1.32         {q0},      [r1,:128]!
        vld1.32         {q8},      [r2,:128]!
        vld1.32         {q1},      [r1,:128]!
        vld1.32         {q9},      [r2,:128]!
        vld1.32         {q2-q3},   [r3,:128]!
        vst1.32         {q12-q13}, [r0,:128]!
2:      vst1.32         {q12-q13}, [r0,:128]!
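
@ ff_vector_clipf_neon: clamp every float read from r1 into the [min, max]
@ range and store to r0.  With a hard-float ABI the bounds arrive in s0/s1,
@ hence the VFP-only vdup lines splatting d0[0]/d0[1]; under soft-float they
@ are moved in from core registers instead.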
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
        vld1.f32        {q2}, [r1,:128]!
        vld1.f32        {q3}, [r1,:128]!
1:      vmax.f32        q8,  q10, q0
        vld1.f32        {q2}, [r1,:128]!
        vld1.f32        {q3}, [r1,:128]!
        vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
2:      vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
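
@ ff_apply_window_int16_neon: multiply 16-bit samples by a symmetric 16-bit
@ (Q15) window with rounding, via vqrdmulh.  The sample buffers are walked from
@ both ends at once (r1/r0 forwards, r4/lr backwards through r12), so only one
@ half of the window needs to be read.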
function ff_apply_window_int16_neon, export=1
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        vld1.16         {q0}, [r1,:128]!
        vld1.16         {q2}, [r2,:128]!
        vld1.16         {q1}, [r4,:128], r12
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0}, [r0,:128]!
        vst1.16         {q1}, [lr,:128], r12
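
@ ff_vector_clip_int32_neon: clamp each 32-bit integer read from r1 to a
@ [min, max] range (presumably passed in r2/r3) and store to r0, eight values
@ per iteration.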
function ff_vector_clip_int32_neon, export=1
        vld1.32         {q2-q3}, [r1,:128]!
        vst1.32         {q2-q3}, [r0,:128]!