/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23 #include "libavutil/arm/asm.S"
@ Zero a DCT coefficient block at r0 with 128-bit aligned 16-bit stores.
@ NOTE(review): extraction is incomplete -- the instruction(s) zeroing q0, any
@ .rept/loop around the store, and the return/endfunc are missing here; confirm
@ against the full source before relying on this listing.
27 function ff_clear_block_neon, export=1
30 vst1.16 {q0}, [r0,:128]!             @ store one q register (16 bytes) of (presumably zeroed) data, post-increment r0
@ Zero multiple consecutive DCT blocks at r0 (plural variant of ff_clear_block_neon).
@ NOTE(review): incomplete extraction -- zeroing of q0, repetition count, and
@ return are missing from this view.
35 function ff_clear_blocks_neon, export=1
38 vst1.16 {q0}, [r0,:128]!             @ aligned 16-byte store, post-increment destination
@ 16xH pixel copy (put) or average (avg) primitive.
@ r0 = dst, r1 = src, r2 = stride; rnd/avg select rounding and put-vs-avg variants.
@ NOTE(review): interior lines (loop counter, avg combining via the avg macro,
@ branch back to 1:, .endm) are missing from this extraction -- confirm upstream.
43 .macro pixels16 rnd=1, avg=0
47 1: vld1.8 {q0}, [r1], r2             @ load 16 source pixels per row, advance by stride
56 vld1.8 {q8}, [r12,:128], r2          @ avg path: load existing dst rows (r12 presumably a copy of r0 -- line missing)
58 vld1.8 {q9}, [r12,:128], r2
60 vld1.8 {q10}, [r12,:128], r2
62 vld1.8 {q11}, [r12,:128], r2
66 vst1.64 {q0}, [r0,:128], r2          @ store 4 result rows, 128-bit aligned
67 vst1.64 {q1}, [r0,:128], r2
68 vst1.64 {q2}, [r0,:128], r2
69 vst1.64 {q3}, [r0,:128], r2
@ 16xH horizontal half-pel interpolation: averages each pixel with its right
@ neighbour (hence the 3-register loads for 17 bytes of source per row).
@ NOTE(review): the vext/averaging arithmetic between loads and stores is
@ missing from this extraction.
74 .macro pixels16_x2 rnd=1, avg=0
75 1: vld1.8 {d0-d2}, [r1], r2          @ 24-byte load covers the 17 source bytes needed per row
76 vld1.8 {d4-d6}, [r1], r2
85 vld1.8 {q1}, [r0,:128], r2           @ avg path: read back destination rows
86 vld1.8 {q3}, [r0,:128]
91 vst1.8 {q0}, [r0,:128], r2           @ store two interpolated rows
92 vst1.8 {q2}, [r0,:128], r2
@ 16xH vertical half-pel interpolation: averages each row with the row below,
@ keeping the previous row in registers across iterations (q0/q1 ping-pong).
@ NOTE(review): the vertical averaging instructions and loop control are
@ missing from this extraction.
97 .macro pixels16_y2 rnd=1, avg=0
100 vld1.8 {q1}, [r1], r2               @ prime with the first source row
103 vld1.8 {q0}, [r1], r2               @ main loop: next row
105 vld1.8 {q1}, [r1], r2               @ row after that (two rows per iteration)
109 vld1.8 {q8}, [r0,:128], r2          @ avg path: read back destination rows
110 vld1.8 {q9}, [r0,:128]
115 vst1.8 {q2}, [r0,:128], r2          @ store two output rows
116 vst1.8 {q3}, [r0,:128], r2
120 vld1.8 {q0}, [r1], r2               @ tail: final source row
123 vld1.8 {q8}, [r0,:128], r2
124 vld1.8 {q9}, [r0,:128]
129 vst1.8 {q2}, [r0,:128], r2
130 vst1.8 {q3}, [r0,:128], r2
@ 16xH 2D (diagonal) half-pel interpolation: each output pixel is the average
@ of a 2x2 source neighbourhood. Works in 16-bit via vaddl.u8 partial sums;
@ the NRND-guarded "vadd #1" lines implement the no-rounding variant (NRND
@ expands to its argument only when rnd=0 -- see the pixfunc macro).
@ NOTE(review): many interior lines (vaddl partial sums, narrowing shifts via
@ the shrn macro, loop control, .endm) are missing from this extraction.
135 .macro pixels16_xy2 rnd=1, avg=0
137 vld1.8 {d0-d2}, [r1], r2            @ prime: two source rows of 17 bytes each
138 vld1.8 {d4-d6}, [r1], r2
139 NRND vmov.i16 q13, #1               @ no-rnd: constant 1 used to bias the sums
142 vext.8 q1, q0, q1, #1               @ q1 = row shifted left by one pixel
143 vext.8 q3, q2, q3, #1
149 vld1.8 {d0-d2}, [r1], r2            @ loop body: next source row
152 NRND vadd.u16 q12, q12, q13
153 vext.8 q15, q0, q1, #1
154 vadd.u16 q1 , q10, q11              @ combine vertical partial sums
156 NRND vadd.u16 q1, q1, q13
159 vld1.8 {q8}, [r0,:128]              @ avg path: read back destination
160 vrhadd.u8 q14, q14, q8              @ rounding average with existing dst
163 vld1.8 {d2-d4}, [r1], r2
164 vaddl.u8 q10, d1, d31               @ widen+add pixel pairs to 16 bit
165 vst1.8 {q14}, [r0,:128], r2
168 NRND vadd.u16 q12, q12, q13
169 vext.8 q2, q1, q2, #1
170 vadd.u16 q0, q10, q11
172 NRND vadd.u16 q0, q0, q13
175 vld1.8 {q9}, [r0,:128]
176 vrhadd.u8 q15, q15, q9
180 vst1.8 {q15}, [r0,:128], r2
183 vld1.8 {d0-d2}, [r1], r2            @ tail: final two rows, same pattern as the loop
185 NRND vadd.u16 q12, q12, q13
186 vext.8 q15, q0, q1, #1
187 vadd.u16 q1 , q10, q11
189 NRND vadd.u16 q1, q1, q13
192 vld1.8 {q8}, [r0,:128]
193 vrhadd.u8 q14, q14, q8
196 vaddl.u8 q10, d1, d31
197 vst1.8 {q14}, [r0,:128], r2
199 NRND vadd.u16 q12, q12, q13
200 vadd.u16 q0, q10, q11
202 NRND vadd.u16 q0, q0, q13
205 vld1.8 {q9}, [r0,:128]
206 vrhadd.u8 q15, q15, q9
208 vst1.8 {q15}, [r0,:128], r2
@ 8xH pixel copy (put) or average (avg) primitive, 4 rows per iteration.
@ r0 = dst, r1 = src, r2 = stride.
@ NOTE(review): the avg combining instructions and loop branch are missing
@ from this extraction.
213 .macro pixels8 rnd=1, avg=0
214 1: vld1.8 {d0}, [r1], r2            @ load 4 rows of 8 source pixels
215 vld1.8 {d1}, [r1], r2
216 vld1.8 {d2}, [r1], r2
218 vld1.8 {d3}, [r1], r2
223 vld1.8 {d4}, [r0,:64], r2           @ avg path: read back 4 destination rows
225 vld1.8 {d5}, [r0,:64], r2
227 vld1.8 {d6}, [r0,:64], r2
229 vld1.8 {d7}, [r0,:64], r2
231 sub r0, r0, r2, lsl #2              @ rewind dst by 4 rows before storing
234 vst1.8 {d0}, [r0,:64], r2
235 vst1.8 {d1}, [r0,:64], r2
236 vst1.8 {d2}, [r0,:64], r2
237 vst1.8 {d3}, [r0,:64], r2
@ 8xH horizontal half-pel interpolation (average with right neighbour).
@ NOTE(review): the averaging arithmetic between the vext and the stores is
@ missing from this extraction.
242 .macro pixels8_x2 rnd=1, avg=0
243 1: vld1.8 {q0}, [r1], r2            @ 16-byte load covers the 9 needed source bytes
244 vext.8 d1, d0, d1, #1              @ d1 = row shifted left one pixel
245 vld1.8 {q1}, [r1], r2
246 vext.8 d3, d2, d3, #1
253 vld1.8 {d4}, [r0,:64], r2           @ avg path: read back destination rows
254 vld1.8 {d5}, [r0,:64]
258 vst1.8 {d0}, [r0,:64], r2           @ store two interpolated rows
259 vst1.8 {d1}, [r0,:64], r2
@ 8xH vertical half-pel interpolation: averages adjacent rows, keeping the
@ previous row in d0/d1 across iterations.
@ NOTE(review): the vertical averaging instructions and loop control are
@ missing from this extraction.
264 .macro pixels8_y2 rnd=1, avg=0
266 vld1.8 {d0}, [r1], r2               @ prime with two source rows
267 vld1.8 {d1}, [r1], r2
270 vld1.8 {d0}, [r1], r2               @ main loop: next two rows
272 vld1.8 {d1}, [r1], r2
276 vld1.8 {d2}, [r0,:64], r2           @ avg path: read back destination rows
277 vld1.8 {d3}, [r0,:64]
281 vst1.8 {d4}, [r0,:64], r2           @ store two output rows
282 vst1.8 {d5}, [r0,:64], r2
286 vld1.8 {d0}, [r1], r2               @ tail: final source row
289 vld1.8 {d2}, [r0,:64], r2
290 vld1.8 {d3}, [r0,:64]
294 vst1.8 {d4}, [r0,:64], r2
295 vst1.8 {d5}, [r0,:64], r2
@ 8xH 2D (diagonal) half-pel interpolation: 2x2 neighbourhood average, done
@ with 16-bit partial sums; NRND lines add the +1 bias for the no-rounding
@ variant (NRND is a no-op when rnd=1 -- see pixfunc).
@ NOTE(review): vaddl partial sums, narrowing (shrn), avg combining, and loop
@ control are missing from this extraction.
300 .macro pixels8_xy2 rnd=1, avg=0
302 vld1.8 {q0}, [r1], r2               @ prime: two 9-byte source rows
303 vld1.8 {q1}, [r1], r2
304 NRND vmov.i16 q11, #1               @ no-rnd: bias constant
307 vext.8 d4, d0, d1, #1               @ shifted copy of each row
308 vext.8 d6, d2, d3, #1
312 vld1.8 {q0}, [r1], r2               @ loop body: next row
315 vext.8 d4, d0, d1, #1
316 NRND vadd.u16 q10, q10, q11
319 vld1.8 {q1}, [r1], r2
323 vld1.8 {d7}, [r0,:64]               @ avg path: read back destination
326 NRND vadd.u16 q10, q10, q11
327 vst1.8 {d5}, [r0,:64], r2
330 vld1.8 {d5}, [r0,:64]
333 vext.8 d6, d2, d3, #1
335 vst1.8 {d7}, [r0,:64], r2
338 vld1.8 {q0}, [r1], r2               @ tail: final rows, same pattern
340 vext.8 d4, d0, d1, #1
341 NRND vadd.u16 q10, q10, q11
346 vld1.8 {d7}, [r0,:64]
349 NRND vadd.u16 q10, q10, q11
350 vst1.8 {d5}, [r0,:64], r2
353 vld1.8 {d5}, [r0,:64]
356 vst1.8 {d7}, [r0,:64], r2
@ Instantiate one exported pixel function ff_<pfx><name><suf>_neon from the
@ pixelsN macros above. Defines three helper macros whose expansion depends
@ on rnd: avg (rounding vs truncating byte average), shrn (rounding vs plain
@ narrowing shift), and NRND (emits its argument only in the no-rounding
@ case, i.e. it is empty when rnd=1).
@ NOTE(review): the .if rnd / .else / .endif structure and the .endm/.purgem
@ lines are missing from this extraction; the two avg/shrn/NRND definition
@ pairs visible below are presumably the rnd=1 and rnd=0 branches.
361 .macro pixfunc pfx, name, suf, rnd=1, avg=0
363 .macro avg rd, rn, rm
364 vrhadd.u8 \rd, \rn, \rm             @ rounding halving add (rnd=1 branch)
366 .macro shrn rd, rn, rm
367 vrshrn.u16 \rd, \rn, \rm            @ rounding narrowing shift (rnd=1 branch)
369 .macro NRND insn:vararg
372 .macro avg rd, rn, rm
373 vhadd.u8 \rd, \rn, \rm              @ truncating halving add (rnd=0 branch)
375 .macro shrn rd, rn, rm
376 vshrn.u16 \rd, \rn, \rm             @ plain narrowing shift (rnd=0 branch)
378 .macro NRND insn:vararg
382 function ff_\pfx\name\suf\()_neon, export=1
@ Emit both the rounding and the no-rounding (_no_rnd suffix) variants of a
@ pixel function via pixfunc.
390 .macro pixfunc2 pfx, name, avg=0
391 pixfunc \pfx, \name, rnd=1, avg=\avg
392 pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
@ H.264 qpel16 mc00 (full-pel copy) entry point, followed by instantiation of
@ the put_pixels16* function family.
@ NOTE(review): the body of ff_put_h264_qpel16_mc00_neon (presumably setting
@ the row count and falling through into put_pixels16) is missing here.
395 function ff_put_h264_qpel16_mc00_neon, export=1
399 pixfunc put_, pixels16, avg=0
400 pixfunc2 put_, pixels16_x2, avg=0
401 pixfunc2 put_, pixels16_y2, avg=0
402 pixfunc2 put_, pixels16_xy2, avg=0
@ H.264 qpel16 mc00 averaging entry point plus the avg_pixels16* family.
@ NOTE(review): the function body is missing from this extraction.
404 function ff_avg_h264_qpel16_mc00_neon, export=1
408 pixfunc avg_, pixels16, avg=1
409 pixfunc2 avg_, pixels16_x2, avg=1
410 pixfunc2 avg_, pixels16_y2, avg=1
411 pixfunc2 avg_, pixels16_xy2, avg=1
@ H.264 qpel8 mc00 (full-pel copy) entry point plus the put_pixels8* family.
@ NOTE(review): the function body is missing from this extraction.
413 function ff_put_h264_qpel8_mc00_neon, export=1
417 pixfunc put_, pixels8, avg=0
418 pixfunc2 put_, pixels8_x2, avg=0
419 pixfunc2 put_, pixels8_y2, avg=0
420 pixfunc2 put_, pixels8_xy2, avg=0
@ H.264 qpel8 mc00 averaging entry point plus the avg_pixels8* family.
@ NOTE(review): the function body is missing from this extraction.
422 function ff_avg_h264_qpel8_mc00_neon, export=1
426 pixfunc avg_, pixels8, avg=1
427 pixfunc2 avg_, pixels8_x2, avg=1
428 pixfunc2 avg_, pixels8_y2, avg=1
429 pixfunc2 avg_, pixels8_xy2, avg=1
@ Store an 8x8 block of 16-bit coefficients (r0) as unsigned-clamped bytes to
@ r1 with stride r2.
@ NOTE(review): the vqmovun (saturating narrow) instructions producing d0-d7
@ from q8-q15 are missing from this extraction.
431 function ff_put_pixels_clamped_neon, export=1
432 vld1.16 {d16-d19}, [r0,:128]!       @ load 64 int16 coefficients in 4 chunks
434 vld1.16 {d20-d23}, [r0,:128]!
436 vld1.16 {d24-d27}, [r0,:128]!
438 vld1.16 {d28-d31}, [r0,:128]!
440 vst1.8 {d0}, [r1,:64], r2           @ store 8 clamped rows of 8 bytes
442 vst1.8 {d1}, [r1,:64], r2
444 vst1.8 {d2}, [r1,:64], r2
446 vst1.8 {d3}, [r1,:64], r2
448 vst1.8 {d4}, [r1,:64], r2
449 vst1.8 {d5}, [r1,:64], r2
450 vst1.8 {d6}, [r1,:64], r2
451 vst1.8 {d7}, [r1,:64], r2
@ Store an 8x8 block of 16-bit coefficients (r0) to r1/stride r2 as bytes,
@ signed variant (values presumably biased by +128 -- the arithmetic lines
@ are missing from this extraction, confirm against full source).
455 function ff_put_signed_pixels_clamped_neon, export=1
457 vld1.16 {d16-d17}, [r0,:128]!       @ load coefficients one q register at a time,
459 vld1.16 {d18-d19}, [r0,:128]!       @ interleaved with the stores below
461 vld1.16 {d16-d17}, [r0,:128]!
463 vld1.16 {d18-d19}, [r0,:128]!
465 vld1.16 {d20-d21}, [r0,:128]!
467 vld1.16 {d22-d23}, [r0,:128]!
469 vst1.8 {d0}, [r1,:64], r2           @ store converted rows
471 vst1.8 {d1}, [r1,:64], r2
473 vst1.8 {d2}, [r1,:64], r2
475 vld1.16 {d24-d25}, [r0,:128]!
477 vld1.16 {d26-d27}, [r0,:128]!
480 vst1.8 {d3}, [r1,:64], r2
482 vst1.8 {d4}, [r1,:64], r2
484 vst1.8 {d5}, [r1,:64], r2
487 vst1.8 {d6}, [r1,:64], r2
488 vst1.8 {d7}, [r1,:64], r2
@ Add an 8x8 block of 16-bit coefficients (r0) to existing pixels read from
@ r1 (stride r2), clamp to u8, and store; r3 is presumably a second copy of
@ the pixel pointer used for the stores (its setup line is missing).
@ NOTE(review): the widening-add and saturating-narrow instructions between
@ the loads and stores are missing from this extraction.
492 function ff_add_pixels_clamped_neon, export=1
494 vld1.8 {d16}, [r1,:64], r2          @ load existing pixel rows
495 vld1.16 {d0-d1}, [r0,:128]!         @ load corresponding coefficient rows
497 vld1.8 {d17}, [r1,:64], r2
498 vld1.16 {d2-d3}, [r0,:128]!
500 vld1.8 {d18}, [r1,:64], r2
502 vld1.16 {d4-d5}, [r0,:128]!
504 vst1.8 {d0}, [r3,:64], r2           @ store clamped result rows via r3
506 vld1.8 {d19}, [r1,:64], r2
507 vld1.16 {d6-d7}, [r0,:128]!
510 vst1.8 {d2}, [r3,:64], r2
511 vld1.8 {d16}, [r1,:64], r2
513 vld1.16 {d0-d1}, [r0,:128]!
515 vst1.8 {d4}, [r3,:64], r2
516 vld1.8 {d17}, [r1,:64], r2
517 vld1.16 {d2-d3}, [r0,:128]!
519 vst1.8 {d6}, [r3,:64], r2
521 vld1.8 {d18}, [r1,:64], r2
522 vld1.16 {d4-d5}, [r0,:128]!
524 vst1.8 {d0}, [r3,:64], r2
526 vld1.8 {d19}, [r1,:64], r2
528 vld1.16 {d6-d7}, [r0,:128]!
530 vst1.8 {d2}, [r3,:64], r2
532 vst1.8 {d4}, [r3,:64], r2
533 vst1.8 {d6}, [r3,:64], r2
@ Elementwise float multiply: dst[r0] = src0[r1] * src1[r2], software-pipelined
@ 8 floats per iteration with separate prologue (1 below label 2/3), steady
@ state, and drain paths.
@ NOTE(review): the vmul.f32 instructions, length handling, and branches are
@ missing from this extraction.
537 function ff_vector_fmul_neon, export=1
539 vld1.32 {d0-d3}, [r1,:128]!         @ prologue: first 8 floats of each input
540 vld1.32 {d4-d7}, [r2,:128]!
547 vld1.32 {d0-d1}, [r1,:128]!         @ steady-state loop: load next inputs
548 vld1.32 {d4-d5}, [r2,:128]!
550 vld1.32 {d2-d3}, [r1,:128]!
551 vld1.32 {d6-d7}, [r2,:128]!
553 vst1.32 {d16-d19},[r0,:128]!        @ store previous iteration's products
554 vld1.32 {d0-d1}, [r1,:128]!
555 vld1.32 {d4-d5}, [r2,:128]!
557 vld1.32 {d2-d3}, [r1,:128]!
558 vld1.32 {d6-d7}, [r2,:128]!
560 vst1.32 {d20-d23},[r0,:128]!
564 2: vld1.32 {d0-d1}, [r1,:128]!      @ drain path
565 vld1.32 {d4-d5}, [r2,:128]!
566 vst1.32 {d16-d17},[r0,:128]!
568 vld1.32 {d2-d3}, [r1,:128]!
569 vld1.32 {d6-d7}, [r2,:128]!
570 vst1.32 {d18-d19},[r0,:128]!
572 3: vst1.32 {d16-d19},[r0,:128]!     @ final store
@ Overlap-add windowing: combines src0 (r1, forward) and src1 (r3, forward)
@ with a window (r2/r4 walked from both ends, r5 presumably a negative byte
@ stride -- its setup is missing) writing results outward from both ends of
@ dst (r0 forward, ip backward).
@ NOTE(review): the vmul/vmla/vrev multiply-accumulate core and loop control
@ are missing from this extraction.
576 function ff_vector_fmul_window_neon, export=1
581 add r2, r2, r5, lsl #2              @ point r2 at the far end of its operand (r5 = length)
582 add r4, r3, r5, lsl #3
583 add ip, r0, r5, lsl #3              @ ip = far end of dst (written backwards)
585 vld1.32 {d0,d1}, [r1,:128]!         @ prologue loads: one vector from each stream
586 vld1.32 {d2,d3}, [r2,:128], r5      @ r5 here is a register stride (presumably negative)
587 vld1.32 {d4,d5}, [r3,:128]!
588 vld1.32 {d6,d7}, [r4,:128], r5
598 vld1.32 {d0,d1}, [r1,:128]!         @ steady-state loads
600 vld1.32 {d18,d19},[r2,:128], r5
602 vld1.32 {d24,d25},[r3,:128]!
604 vld1.32 {d6,d7}, [r4,:128], r5
609 vst1.32 {d20,d21},[r0,:128]!        @ store result pair: forward half / backward half
610 vst1.32 {d22,d23},[ip,:128], r5
612 2: vmla.f32 d22, d3, d7             @ drain: final multiply-accumulate
618 vst1.32 {d20,d21},[r0,:128]!
619 vst1.32 {d22,d23},[ip,:128], r5
@ Vorbis inverse channel coupling: transforms (mag, ang) pairs from r0/r1
@ in place (r3/r12 presumably rewound copies of the pointers -- setup lines
@ missing). Compiled only when the Vorbis decoder is enabled.
@ NOTE(review): the sign-test/select logic (vcle/vand/veor and friends) and
@ loop control are missing from this extraction.
623 #if CONFIG_VORBIS_DECODER
624 function ff_vorbis_inverse_coupling_neon, export=1
631 vld1.32 {d24-d25},[r1,:128]!        @ prologue: first 4 ang / mag values
632 vld1.32 {d22-d23},[r0,:128]!
638 vadd.f32 q12, q11, q2               @ combine per the coupling sign selection
639 vsub.f32 q11, q11, q3
640 1: vld1.32 {d2-d3}, [r1,:128]!      @ main loop: next 4 values of each stream
641 vld1.32 {d0-d1}, [r0,:128]!
645 vst1.32 {d24-d25},[r3, :128]!       @ write previous results back in place
646 vst1.32 {d22-d23},[r12,:128]!
653 vld1.32 {d24-d25},[r1,:128]!
654 vld1.32 {d22-d23},[r0,:128]!
658 vst1.32 {d2-d3}, [r3, :128]!
659 vst1.32 {d0-d1}, [r12,:128]!
662 vadd.f32 q12, q11, q2
663 vsub.f32 q11, q11, q3
666 2: vst1.32 {d2-d3}, [r3, :128]!     @ drain stores
667 vst1.32 {d0-d1}, [r12,:128]!
671 3: vld1.32 {d2-d3}, [r1,:128]       @ scalar-tail path: last partial vector
672 vld1.32 {d0-d1}, [r0,:128]
680 vst1.32 {d2-d3}, [r0,:128]!
681 vst1.32 {d0-d1}, [r1,:128]!
@ dst[r0] = src[r1] * scalar; scalar arrives in s0/d0[0] on hard-float (VFP
@ lines) or in a core register otherwise (NOVFP lines, not visible here).
@ NOTE(review): length handling, the vmul for q1..q3, and branches are
@ missing from this extraction.
686 function ff_vector_fmul_scalar_neon, export=1
689 VFP vdup.32 q8, d0[0]               @ broadcast scalar across q8
693 vld1.32 {q0},[r1,:128]!             @ prologue loads
694 vld1.32 {q1},[r1,:128]!
695 1: vmul.f32 q0, q0, q8              @ main loop: multiply, keep pipeline full
696 vld1.32 {q2},[r1,:128]!
698 vld1.32 {q3},[r1,:128]!
700 vst1.32 {q0},[r0,:128]!
702 vst1.32 {q1},[r0,:128]!
705 vld1.32 {q0},[r1,:128]!
706 vst1.32 {q2},[r0,:128]!
707 vld1.32 {q1},[r1,:128]!
708 vst1.32 {q3},[r0,:128]!
710 2: vst1.32 {q2},[r0,:128]!          @ drain stores
711 vst1.32 {q3},[r0,:128]!
715 3: vld1.32 {q0},[r1,:128]!          @ short-tail path: one vector at a time
717 vst1.32 {q0},[r0,:128]!
@ dst[r0] += src[r1] * scalar (fused multiply-accumulate over a float vector);
@ "acc" is presumably an alias for a register tracking the accumulator read
@ pointer (its definition is missing from this extraction). VFP/NOVFP select
@ how the scalar argument arrives (d0 vs r2).
@ NOTE(review): length handling and loop branches are missing here.
724 function ff_vector_fmac_scalar_neon, export=1
729 VFP vdup.32 q15, d0[0]              @ broadcast scalar (hard-float ABI)
730 NOVFP vdup.32 q15, r2               @ broadcast scalar (soft-float ABI)
734 vld1.32 {q0}, [r1,:128]!            @ prologue: src and current dst values
735 vld1.32 {q8}, [acc,:128]!
736 vld1.32 {q1}, [r1,:128]!
737 vld1.32 {q9}, [acc,:128]!
738 1: vmla.f32 q8, q0, q15             @ q8 += q0 * scalar
739 vld1.32 {q2}, [r1,:128]!
740 vld1.32 {q10}, [acc,:128]!
742 vld1.32 {q3}, [r1,:128]!
743 vld1.32 {q11}, [acc,:128]!
744 vmla.f32 q10, q2, q15
745 vst1.32 {q8}, [r0,:128]!
746 vmla.f32 q11, q3, q15
747 vst1.32 {q9}, [r0,:128]!
750 vld1.32 {q0}, [r1,:128]!
751 vld1.32 {q8}, [acc,:128]!
752 vst1.32 {q10}, [r0,:128]!
753 vld1.32 {q1}, [r1,:128]!
754 vld1.32 {q9}, [acc,:128]!
755 vst1.32 {q11}, [r0,:128]!
757 2: vst1.32 {q10}, [r0,:128]!        @ drain stores
758 vst1.32 {q11}, [r0,:128]!
762 3: vld1.32 {q0}, [r1,:128]!         @ short-tail path: one vector at a time
763 vld1.32 {q8}, [acc,:128]!
765 vst1.32 {q8}, [r0,:128]!
@ In-place butterfly: for each 4-float vector, v0[r0] and v1[r1] become
@ (v0+v1, v0-v1) -- the add/sub instructions between load and store are
@ missing from this extraction, as is the loop counter/branch.
772 function ff_butterflies_float_neon, export=1
773 1: vld1.32 {q0},[r0,:128]           @ load current vectors from both arrays
774 vld1.32 {q1},[r1,:128]
777 vst1.32 {q2},[r1,:128]!             @ write results back in place, advance
778 vst1.32 {q1},[r0,:128]!
@ Dot product of two float vectors (r0, r1); result returned in s0 on
@ hard-float, or moved to r0 on soft-float (NOVFP line).
@ NOTE(review): the vmla accumulation, horizontal pairwise reduction, and
@ loop control are missing from this extraction.
784 function ff_scalarproduct_float_neon, export=1
786 1: vld1.32 {q0},[r0,:128]!          @ load 4 floats from each operand
787 vld1.32 {q1},[r1,:128]!
793 NOVFP vmov.32 r0, d0[0]             @ soft-float ABI: return scalar in r0
@ dst[i] = src0[i] * src1[len-1-i]: r2 is advanced to the end of src1 and
@ walked backwards (r12 presumably holds a negative stride -- its setup line
@ is missing), with vrev/vmul work between loads and stores also missing.
797 function ff_vector_fmul_reverse_neon, export=1
798 add r2, r2, r3, lsl #2              @ r2 = src1 + len (floats), i.e. one past the end
801 vld1.32 {q0-q1}, [r1,:128]!         @ prologue loads: forward / backward streams
802 vld1.32 {q2-q3}, [r2,:128], r12
813 vld1.32 {q0-q1}, [r1,:128]!         @ steady-state loop
814 vld1.32 {q2-q3}, [r2,:128], r12
815 vst1.32 {q8-q9}, [r0,:128]!         @ store previous products
817 2: vst1.32 {q8-q9}, [r0,:128]!      @ drain store
@ dst[r0] = src0[r1] * src1[r2] + src2[r3], 8 floats per iteration.
@ NOTE(review): the vmul producing q10/q11 and the loop control are missing
@ from this extraction; the visible vadd combines products with src2.
821 function ff_vector_fmul_add_neon, export=1
823 vld1.32 {q0-q1}, [r1,:128]!         @ prologue: 8 floats from each of the 3 inputs
824 vld1.32 {q8-q9}, [r2,:128]!
825 vld1.32 {q2-q3}, [r3,:128]!
828 1: vadd.f32 q12, q2, q10            @ result = src2 + (src0*src1)
829 vadd.f32 q13, q3, q11
835 vld1.32 {q0}, [r1,:128]!            @ reload for next iteration
836 vld1.32 {q8}, [r2,:128]!
838 vld1.32 {q1}, [r1,:128]!
839 vld1.32 {q9}, [r2,:128]!
841 vld1.32 {q2-q3}, [r3,:128]!
842 vst1.32 {q12-q13},[r0,:128]!        @ store 8 results
844 2: vst1.32 {q12-q13},[r0,:128]!     @ drain store
@ Clip each float of src[r1] into [min, max] and write to dst[r0]; the bounds
@ arrive in d0 on hard-float (VFP lines) and are broadcast to q0 (min) and
@ q1 (max). Clamping is vmax-then-vmin.
@ NOTE(review): the vmin instructions, the q10 staging moves, and loop
@ control are missing from this extraction.
848 function ff_vector_clipf_neon, export=1
849 VFP vdup.32 q1, d0[1]               @ q1 = max broadcast
850 VFP vdup.32 q0, d0[0]               @ q0 = min broadcast
854 vld1.f32 {q2},[r1,:128]!            @ prologue loads
856 vld1.f32 {q3},[r1,:128]!
858 1: vmax.f32 q8, q10, q0             @ lower clamp (upper clamp lines missing)
862 vld1.f32 {q2},[r1,:128]!
864 vld1.f32 {q3},[r1,:128]!
866 vst1.f32 {q8},[r0,:128]!            @ store clipped vectors
867 vst1.f32 {q9},[r0,:128]!
869 2: vst1.f32 {q8},[r0,:128]!         @ drain stores
870 vst1.f32 {q9},[r0,:128]!
@ Apply a symmetric int16 window (r2) to input r1 producing r0, processing
@ the buffer from both ends simultaneously (r4/lr point at the far ends,
@ walked backwards via r12 -- presumably a negative stride, setup missing).
@ Uses saturating rounding doubling multiply (vqrdmulh) as the fixed-point
@ windowing step; d7/d6 reversed operand order mirrors the window for the
@ backward half.
874 function ff_apply_window_int16_neon, export=1
876 add r4, r1, r3, lsl #1              @ r4 = input + 2*len (end of input)
877 add lr, r0, r3, lsl #1              @ lr = output + 2*len (end of output)
882 vld1.16 {q0}, [r1,:128]!            @ forward input samples
883 vld1.16 {q2}, [r2,:128]!            @ window coefficients
884 vld1.16 {q1}, [r4,:128], r12        @ backward input samples
886 vqrdmulh.s16 q0, q0, q2             @ forward half * window
887 vqrdmulh.s16 d2, d2, d7             @ backward half * mirrored window
888 vqrdmulh.s16 d3, d3, d6
889 vst1.16 {q0}, [r0,:128]!
890 vst1.16 {q1}, [lr,:128], r12
@ Clip each int32 of src[r1] into a [min, max] range and write to dst[r0],
@ 8 values per iteration.
@ NOTE(review): broadcast of the bounds, the vmax/vmin clamp instructions,
@ and loop control are missing from this extraction.
897 function ff_vector_clip_int32_neon, export=1
902 vld1.32 {q2-q3}, [r1,:128]!         @ load 8 int32 values
907 vst1.32 {q2-q3}, [r0,:128]!         @ store 8 clipped values