/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
@ void ff_clear_block_neon(int16_t *block)
@ Zeroes one block of 16-bit coefficients at r0 via 128-bit aligned stores.
@ q0 is presumably zeroed before the store -- setup not visible; TODO confirm.
@ NOTE(review): incomplete excerpt; the leading decimal on each line is an
@ extraction artifact kept verbatim, and endfunc is outside this view.
27 function ff_clear_block_neon, export=1
30 vst1.16 {q0}, [r0,:128]!
@ void ff_clear_blocks_neon(int16_t *blocks)
@ Zeroes consecutive 16-bit coefficient blocks at r0 (aligned 128-bit stores).
@ NOTE(review): incomplete excerpt -- loop, count, and endfunc not visible;
@ leading decimals are extraction artifacts.
35 function ff_clear_blocks_neon, export=1
38 vst1.16 {q0}, [r0,:128]!
@ pixels16 rnd=1 avg=0
@ Copy four 16-byte rows per iteration from src r1 (row stride r2) to
@ dst r0.  With avg=1 the existing dst rows are read back through r12 and
@ presumably averaged in before the stores -- the vrhadd lines are not
@ visible here; TODO confirm.  .endm and the loop branch are outside this
@ excerpt; leading decimals are extraction artifacts.
43 .macro pixels16 rnd=1, avg=0
47 1: vld1.64 {q0}, [r1], r2
48 vld1.64 {q1}, [r1], r2
49 vld1.64 {q2}, [r1], r2
51 vld1.64 {q3}, [r1], r2
56 vld1.64 {q8}, [r12,:128], r2
58 vld1.64 {q9}, [r12,:128], r2
60 vld1.64 {q10}, [r12,:128], r2
62 vld1.64 {q11}, [r12,:128], r2
66 vst1.64 {q0}, [r0,:128], r2
67 vst1.64 {q1}, [r0,:128], r2
68 vst1.64 {q2}, [r0,:128], r2
69 vst1.64 {q3}, [r0,:128], r2
@ pixels16_x2 rnd=1 avg=0
@ Horizontal half-pel case: loads 24 src bytes per row (d0-d2 / d4-d6) so a
@ one-byte-shifted copy can be averaged with the original -- the vext/avg
@ lines are not visible in this excerpt; TODO confirm.  avg=1 additionally
@ blends with existing dst rows (q1/q3 loads from r0).
74 .macro pixels16_x2 rnd=1, avg=0
75 1: vld1.64 {d0-d2}, [r1], r2
76 vld1.64 {d4-d6}, [r1], r2
85 vld1.8 {q1}, [r0,:128], r2
86 vld1.8 {q3}, [r0,:128]
91 vst1.64 {q0}, [r0,:128], r2
92 vst1.64 {q2}, [r0,:128], r2
@ pixels16_y2 rnd=1 avg=0
@ Vertical half-pel case: keeps two successive src rows (q0/q1) and
@ presumably averages adjacent rows into q2/q3 before storing -- averaging
@ instructions not visible; TODO confirm.  avg=1 blends with dst via q8/q9.
@ Incomplete excerpt; leading decimals are extraction artifacts.
97 .macro pixels16_y2 rnd=1, avg=0
98 vld1.64 {q0}, [r1], r2
99 vld1.64 {q1}, [r1], r2
102 vld1.64 {q0}, [r1], r2
104 vld1.64 {q1}, [r1], r2
108 vld1.8 {q8}, [r0,:128], r2
109 vld1.8 {q9}, [r0,:128]
114 vst1.64 {q2}, [r0,:128], r2
115 vst1.64 {q3}, [r0,:128], r2
@ pixels16_xy2 rnd=1 avg=0
@ Diagonal half-pel case: 24-byte row loads plus one-byte vext shifts give
@ the horizontally adjacent pair; widening adds (vaddl.u8/vadd.u16)
@ accumulate the 2x2 neighbourhood, narrowed back to bytes before the
@ store (narrowing shifts not visible here -- TODO confirm).  avg=1 blends
@ the result with dst rows via vrhadd.u8 (q8/q9 read-backs).
@ Incomplete excerpt; leading decimals are extraction artifacts.
120 .macro pixels16_xy2 rnd=1, avg=0
121 vld1.64 {d0-d2}, [r1], r2
122 vld1.64 {d4-d6}, [r1], r2
128 vext.8 q1, q0, q1, #1
129 vext.8 q3, q2, q3, #1
135 vld1.64 {d0-d2}, [r1], r2
139 vadd.u16 q12, q12, q13
141 vext.8 q15, q0, q1, #1
142 vadd.u16 q1 , q10, q11
149 vld1.8 {q8}, [r0,:128]
150 vrhadd.u8 q14, q14, q8
153 vld1.64 {d2-d4}, [r1], r2
154 vaddl.u8 q10, d1, d31
155 vst1.64 {q14}, [r0,:128], r2
159 vadd.u16 q12, q12, q13
161 vext.8 q2, q1, q2, #1
162 vadd.u16 q0, q10, q11
169 vld1.8 {q9}, [r0,:128]
170 vrhadd.u8 q15, q15, q9
174 vst1.64 {q15}, [r0,:128], r2
@ pixels8 rnd=1 avg=0
@ 8-byte-wide variant of pixels16: copies four 8-byte rows per iteration
@ from r1 (stride r2) to r0.  avg=1 reads dst rows into d4-d7 and rewinds
@ r0 by four rows (sub r0, r0, r2, lsl #2) before storing the blended
@ result.  Incomplete excerpt; leading decimals are extraction artifacts.
179 .macro pixels8 rnd=1, avg=0
180 1: vld1.64 {d0}, [r1], r2
181 vld1.64 {d1}, [r1], r2
182 vld1.64 {d2}, [r1], r2
184 vld1.64 {d3}, [r1], r2
189 vld1.64 {d4}, [r0,:64], r2
191 vld1.64 {d5}, [r0,:64], r2
193 vld1.64 {d6}, [r0,:64], r2
195 vld1.64 {d7}, [r0,:64], r2
197 sub r0, r0, r2, lsl #2
200 vst1.64 {d0}, [r0,:64], r2
201 vst1.64 {d1}, [r0,:64], r2
202 vst1.64 {d2}, [r0,:64], r2
203 vst1.64 {d3}, [r0,:64], r2
@ pixels8_x2 rnd=1 avg=0
@ Horizontal half-pel, 8 wide: 16-byte loads give each row plus its
@ one-byte-shifted neighbour (vext.8 #1); the averaging between d0/d1 and
@ d2/d3 is not visible in this excerpt -- TODO confirm.  avg=1 blends with
@ dst rows d4/d5.  Leading decimals are extraction artifacts.
208 .macro pixels8_x2 rnd=1, avg=0
209 1: vld1.64 {q0}, [r1], r2
210 vext.8 d1, d0, d1, #1
211 vld1.64 {q1}, [r1], r2
212 vext.8 d3, d2, d3, #1
219 vld1.8 {d4}, [r0,:64], r2
220 vld1.8 {d5}, [r0,:64]
224 vst1.64 {d0}, [r0,:64], r2
225 vst1.64 {d1}, [r0,:64], r2
@ pixels8_y2 rnd=1 avg=0
@ Vertical half-pel, 8 wide: successive rows loaded into d0/d1 and
@ presumably averaged into d4/d5 before storing -- averaging lines not
@ visible; TODO confirm.  avg=1 blends with dst rows d2/d3.
@ Incomplete excerpt; leading decimals are extraction artifacts.
230 .macro pixels8_y2 rnd=1, avg=0
231 vld1.64 {d0}, [r1], r2
232 vld1.64 {d1}, [r1], r2
235 vld1.64 {d0}, [r1], r2
237 vld1.64 {d1}, [r1], r2
241 vld1.8 {d2}, [r0,:64], r2
242 vld1.8 {d3}, [r0,:64]
246 vst1.64 {d4}, [r0,:64], r2
247 vst1.64 {d5}, [r0,:64], r2
@ pixels8_xy2 rnd=1 avg=0
@ Diagonal half-pel, 8 wide: vext.8 #1 forms the horizontally adjacent
@ bytes; vadd.u16 accumulates the widened 2x2 sums (widening adds and the
@ narrowing step are outside this excerpt -- TODO confirm).  avg=1 blends
@ with dst rows d5/d7.  Leading decimals are extraction artifacts.
252 .macro pixels8_xy2 rnd=1, avg=0
253 vld1.64 {q0}, [r1], r2
254 vld1.64 {q1}, [r1], r2
260 vext.8 d4, d0, d1, #1
261 vext.8 d6, d2, d3, #1
265 vld1.64 {q0}, [r1], r2
268 vext.8 d4, d0, d1, #1
270 vadd.u16 q10, q10, q11
274 vld1.64 {q1}, [r1], r2
278 vld1.8 {d7}, [r0,:64]
282 vadd.u16 q10, q10, q11
284 vst1.64 {d5}, [r0,:64], r2
287 vld1.8 {d5}, [r0,:64]
290 vext.8 d6, d2, d3, #1
292 vst1.64 {d7}, [r0,:64], r2
@ pixfunc pfx, name, suf, rnd=1, avg=0
@ Emits one exported pixel function ff_<pfx><name><suf>_neon.  Defines
@ per-instantiation helper macros: with rounding, avg = vrhadd.u8 and
@ shrn = vrshrn.u16; without, avg = vhadd.u8 and shrn = vshrn.u16.  The
@ .if rnd / .else selection, .endm lines, and the call into \name are not
@ visible in this excerpt; leading decimals are extraction artifacts.
297 .macro pixfunc pfx, name, suf, rnd=1, avg=0
299 .macro avg rd, rn, rm
300 vrhadd.u8 \rd, \rn, \rm
302 .macro shrn rd, rn, rm
303 vrshrn.u16 \rd, \rn, \rm
306 .macro avg rd, rn, rm
307 vhadd.u8 \rd, \rn, \rm
309 .macro shrn rd, rn, rm
310 vshrn.u16 \rd, \rn, \rm
313 function ff_\pfx\name\suf\()_neon, export=1
@ pixfunc2 pfx, name, avg=0
@ Emits both the rounding and the _no_rnd variant of a pixel function by
@ invoking pixfunc twice.  .endm not visible in this excerpt.
320 .macro pixfunc2 pfx, name, avg=0
321 pixfunc \pfx, \name, rnd=1, avg=\avg
322 pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
@ H.264 qpel16 mc00 (full-pel put) entry -- presumably falls through into
@ the put_pixels16 body emitted below; its own body/endfunc are not
@ visible here (TODO confirm).  Then the put_ pixels16 family is
@ instantiated via pixfunc/pixfunc2.
325 function ff_put_h264_qpel16_mc00_neon, export=1
329 pixfunc put_, pixels16, avg=0
330 pixfunc2 put_, pixels16_x2, avg=0
331 pixfunc2 put_, pixels16_y2, avg=0
332 pixfunc2 put_, pixels16_xy2, avg=0
@ H.264 qpel16 mc00 averaging entry, followed by the avg_ pixels16 family
@ instantiations (avg=1 selects the dst read-back/blend paths in the
@ macros above).  Function body/endfunc not visible in this excerpt.
334 function ff_avg_h264_qpel16_mc00_neon, export=1
338 pixfunc avg_, pixels16, avg=1
339 pixfunc2 avg_, pixels16_x2, avg=1
340 pixfunc2 avg_, pixels16_y2, avg=1
341 pixfunc2 avg_, pixels16_xy2, avg=1
@ H.264 qpel8 mc00 put entry plus the put_ pixels8 family instantiations.
@ Function body/endfunc not visible in this excerpt.
343 function ff_put_h264_qpel8_mc00_neon, export=1
347 pixfunc put_, pixels8, avg=0
348 pixfunc2 put_, pixels8_x2, avg=0
349 pixfunc2 put_, pixels8_y2, avg=0
350 pixfunc2 put_, pixels8_xy2, avg=0
@ H.264 qpel8 mc00 averaging entry plus the avg_ pixels8 family
@ instantiations.  Function body/endfunc not visible in this excerpt.
352 function ff_avg_h264_qpel8_mc00_neon, export=1
356 pixfunc avg_, pixels8, avg=1
357 pixfunc2 avg_, pixels8_x2, avg=1
358 pixfunc2 avg_, pixels8_y2, avg=1
359 pixfunc2 avg_, pixels8_xy2, avg=1
@ ff_put_pixels_clamped_neon(const int16_t *block, uint8_t *pixels, int line_size)
@ Loads 64 16-bit coefficients from r0 (d16-d31), presumably saturates
@ them to unsigned bytes (narrowing instructions not visible -- TODO
@ confirm), and stores eight 8-byte rows to r1 with stride r2.
@ Incomplete excerpt; leading decimals are extraction artifacts.
361 function ff_put_pixels_clamped_neon, export=1
362 vld1.64 {d16-d19}, [r0,:128]!
364 vld1.64 {d20-d23}, [r0,:128]!
366 vld1.64 {d24-d27}, [r0,:128]!
368 vld1.64 {d28-d31}, [r0,:128]!
370 vst1.64 {d0}, [r1,:64], r2
372 vst1.64 {d1}, [r1,:64], r2
374 vst1.64 {d2}, [r1,:64], r2
376 vst1.64 {d3}, [r1,:64], r2
378 vst1.64 {d4}, [r1,:64], r2
379 vst1.64 {d5}, [r1,:64], r2
380 vst1.64 {d6}, [r1,:64], r2
381 vst1.64 {d7}, [r1,:64], r2
@ ff_put_signed_pixels_clamped_neon(const int16_t *block, uint8_t *pixels,
@                                   int line_size)
@ Like put_pixels_clamped but for signed input: presumably adds a 128 bias
@ during narrowing (bias/narrow instructions not visible -- TODO confirm).
@ Coefficient loads from r0 are interleaved with the row stores to r1
@ (stride r2).  Incomplete excerpt; leading decimals are extraction
@ artifacts.
385 function ff_put_signed_pixels_clamped_neon, export=1
387 vld1.64 {d16-d17}, [r0,:128]!
389 vld1.64 {d18-d19}, [r0,:128]!
391 vld1.64 {d16-d17}, [r0,:128]!
393 vld1.64 {d18-d19}, [r0,:128]!
395 vld1.64 {d20-d21}, [r0,:128]!
397 vld1.64 {d22-d23}, [r0,:128]!
399 vst1.64 {d0}, [r1,:64], r2
401 vst1.64 {d1}, [r1,:64], r2
403 vst1.64 {d2}, [r1,:64], r2
405 vld1.64 {d24-d25}, [r0,:128]!
407 vld1.64 {d26-d27}, [r0,:128]!
410 vst1.64 {d3}, [r1,:64], r2
412 vst1.64 {d4}, [r1,:64], r2
414 vst1.64 {d5}, [r1,:64], r2
417 vst1.64 {d6}, [r1,:64], r2
418 vst1.64 {d7}, [r1,:64], r2
@ ff_add_pixels_clamped_neon(const int16_t *block, uint8_t *pixels, int line_size)
@ Reads 8-byte pixel rows from r1 (stride r2) and 16-bit coefficient pairs
@ from r0, presumably widens the pixels, adds the coefficients and
@ saturates back to bytes (add/narrow instructions not visible -- TODO
@ confirm), writing rows through r3 with stride r2.  Loads and stores are
@ software-pipelined across iterations.  Incomplete excerpt; leading
@ decimals are extraction artifacts.
422 function ff_add_pixels_clamped_neon, export=1
424 vld1.64 {d16}, [r1,:64], r2
425 vld1.64 {d0-d1}, [r0,:128]!
427 vld1.64 {d17}, [r1,:64], r2
428 vld1.64 {d2-d3}, [r0,:128]!
430 vld1.64 {d18}, [r1,:64], r2
432 vld1.64 {d4-d5}, [r0,:128]!
434 vst1.64 {d0}, [r3,:64], r2
436 vld1.64 {d19}, [r1,:64], r2
437 vld1.64 {d6-d7}, [r0,:128]!
440 vst1.64 {d2}, [r3,:64], r2
441 vld1.64 {d16}, [r1,:64], r2
443 vld1.64 {d0-d1}, [r0,:128]!
445 vst1.64 {d4}, [r3,:64], r2
446 vld1.64 {d17}, [r1,:64], r2
447 vld1.64 {d2-d3}, [r0,:128]!
449 vst1.64 {d6}, [r3,:64], r2
451 vld1.64 {d18}, [r1,:64], r2
452 vld1.64 {d4-d5}, [r0,:128]!
454 vst1.64 {d0}, [r3,:64], r2
456 vld1.64 {d19}, [r1,:64], r2
458 vld1.64 {d6-d7}, [r0,:128]!
460 vst1.64 {d2}, [r3,:64], r2
462 vst1.64 {d4}, [r3,:64], r2
463 vst1.64 {d6}, [r3,:64], r2
@ ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len)
@ Elementwise float multiply: streams 128-bit aligned loads from r1 and
@ r2, results accumulate in q8-q11 (the vmul.f32 instructions are not
@ visible in this excerpt -- TODO confirm) and are stored to r0.  The
@ main loop (1:) processes 16 floats per iteration; 2:/3: handle the
@ shorter tail.  Leading decimals are extraction artifacts.
467 function ff_vector_fmul_neon, export=1
469 vld1.64 {d0-d3}, [r1,:128]!
470 vld1.64 {d4-d7}, [r2,:128]!
477 vld1.64 {d0-d1}, [r1,:128]!
478 vld1.64 {d4-d5}, [r2,:128]!
480 vld1.64 {d2-d3}, [r1,:128]!
481 vld1.64 {d6-d7}, [r2,:128]!
483 vst1.64 {d16-d19},[r0,:128]!
484 vld1.64 {d0-d1}, [r1,:128]!
485 vld1.64 {d4-d5}, [r2,:128]!
487 vld1.64 {d2-d3}, [r1,:128]!
488 vld1.64 {d6-d7}, [r2,:128]!
490 vst1.64 {d20-d23},[r0,:128]!
494 2: vld1.64 {d0-d1}, [r1,:128]!
495 vld1.64 {d4-d5}, [r2,:128]!
496 vst1.64 {d16-d17},[r0,:128]!
498 vld1.64 {d2-d3}, [r1,:128]!
499 vld1.64 {d6-d7}, [r2,:128]!
500 vst1.64 {d18-d19},[r0,:128]!
502 3: vst1.64 {d16-d19},[r0,:128]!
@ ff_vector_fmul_window_neon(float *dst, const float *src0, const float *src1,
@                            const float *win, int len)
@ Windowed overlap-add multiply: r1/r3 walk forward while r2/r4 walk
@ backward (negative stride r5 -- presumably set up from the len argument;
@ setup not fully visible, TODO confirm).  Forward results go to r0,
@ mirrored results to ip.  vmla.f32 at 2: is part of the visible
@ multiply-accumulate tail.  Incomplete excerpt; leading decimals are
@ extraction artifacts.
506 function ff_vector_fmul_window_neon, export=1
511 add r2, r2, r5, lsl #2
512 add r4, r3, r5, lsl #3
513 add ip, r0, r5, lsl #3
515 vld1.64 {d0,d1}, [r1,:128]!
516 vld1.64 {d2,d3}, [r2,:128], r5
517 vld1.64 {d4,d5}, [r3,:128]!
518 vld1.64 {d6,d7}, [r4,:128], r5
528 vld1.64 {d0,d1}, [r1,:128]!
530 vld1.64 {d18,d19},[r2,:128], r5
532 vld1.64 {d24,d25},[r3,:128]!
534 vld1.64 {d6,d7}, [r4,:128], r5
539 vst1.64 {d20,d21},[r0,:128]!
540 vst1.64 {d22,d23},[ip,:128], r5
542 2: vmla.f32 d22, d3, d7
548 vst1.64 {d20,d21},[r0,:128]!
549 vst1.64 {d22,d23},[ip,:128], r5
@ ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize)
@ Vorbis floor/residue inverse channel coupling, built only when the
@ Vorbis decoder is configured (matching #endif not visible in this
@ excerpt).  Streams q11/q12 pairs from r0/r1, forms sum/difference
@ (vadd/vsub.f32 with masks q2/q3 -- mask setup not visible, TODO
@ confirm), and writes results through r3/r12; 3: handles the final
@ in-place pair.  Leading decimals are extraction artifacts.
553 #if CONFIG_VORBIS_DECODER
554 function ff_vorbis_inverse_coupling_neon, export=1
561 vld1.32 {d24-d25},[r1,:128]!
562 vld1.32 {d22-d23},[r0,:128]!
568 vadd.f32 q12, q11, q2
569 vsub.f32 q11, q11, q3
570 1: vld1.32 {d2-d3}, [r1,:128]!
571 vld1.32 {d0-d1}, [r0,:128]!
575 vst1.32 {d24-d25},[r3, :128]!
576 vst1.32 {d22-d23},[r12,:128]!
583 vld1.32 {d24-d25},[r1,:128]!
584 vld1.32 {d22-d23},[r0,:128]!
588 vst1.32 {d2-d3}, [r3, :128]!
589 vst1.32 {d0-d1}, [r12,:128]!
592 vadd.f32 q12, q11, q2
593 vsub.f32 q11, q11, q3
596 2: vst1.32 {d2-d3}, [r3, :128]!
597 vst1.32 {d0-d1}, [r12,:128]!
601 3: vld1.32 {d2-d3}, [r1,:128]
602 vld1.32 {d0-d1}, [r0,:128]
610 vst1.32 {d2-d3}, [r0,:128]!
611 vst1.32 {d0-d1}, [r1,:128]!
@ ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul, int len)
@ Multiplies a float vector from r1 by a scalar broadcast into q8 (VFP
@ prefix = hard-float ABI variant; the NOVFP counterpart is not visible
@ in this excerpt -- TODO confirm).  Main loop (1:) does 8 floats per
@ iteration; 2:/3: flush and handle the 4-float tail.  Leading decimals
@ are extraction artifacts.
616 function ff_vector_fmul_scalar_neon, export=1
619 VFP vdup.32 q8, d0[0]
623 vld1.32 {q0},[r1,:128]!
624 vld1.32 {q1},[r1,:128]!
625 1: vmul.f32 q0, q0, q8
626 vld1.32 {q2},[r1,:128]!
628 vld1.32 {q3},[r1,:128]!
630 vst1.32 {q0},[r0,:128]!
632 vst1.32 {q1},[r0,:128]!
635 vld1.32 {q0},[r1,:128]!
636 vst1.32 {q2},[r0,:128]!
637 vld1.32 {q1},[r1,:128]!
638 vst1.32 {q3},[r0,:128]!
640 2: vst1.32 {q2},[r0,:128]!
641 vst1.32 {q3},[r0,:128]!
645 3: vld1.32 {q0},[r1,:128]!
647 vst1.32 {q0},[r0,:128]!
@ ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul, int len)
@ dst += src * mul.  The scalar is broadcast into q15 from d0 (VFP /
@ hard-float) or r2 (NOVFP / soft-float); `acc` is presumably an alias
@ for the accumulator pointer (register alias definition not visible --
@ TODO confirm).  Main loop (1:) fuses 16 floats of vmla.f32 per
@ iteration; 2:/3: handle the tail.  Leading decimals are extraction
@ artifacts.
654 function ff_vector_fmac_scalar_neon, export=1
659 VFP vdup.32 q15, d0[0]
660 NOVFP vdup.32 q15, r2
664 vld1.32 {q0}, [r1,:128]!
665 vld1.32 {q8}, [acc,:128]!
666 vld1.32 {q1}, [r1,:128]!
667 vld1.32 {q9}, [acc,:128]!
668 1: vmla.f32 q8, q0, q15
669 vld1.32 {q2}, [r1,:128]!
670 vld1.32 {q10}, [acc,:128]!
672 vld1.32 {q3}, [r1,:128]!
673 vld1.32 {q11}, [acc,:128]!
674 vmla.f32 q10, q2, q15
675 vst1.32 {q8}, [r0,:128]!
676 vmla.f32 q11, q3, q15
677 vst1.32 {q9}, [r0,:128]!
680 vld1.32 {q0}, [r1,:128]!
681 vld1.32 {q8}, [acc,:128]!
682 vst1.32 {q10}, [r0,:128]!
683 vld1.32 {q1}, [r1,:128]!
684 vld1.32 {q9}, [acc,:128]!
685 vst1.32 {q11}, [r0,:128]!
687 2: vst1.32 {q10}, [r0,:128]!
688 vst1.32 {q11}, [r0,:128]!
692 3: vld1.32 {q0}, [r1,:128]!
693 vld1.32 {q8}, [acc,:128]!
695 vst1.32 {q8}, [r0,:128]!
@ ff_butterflies_float_neon(float *v1, float *v2, int len)
@ In-place butterfly: per 4-float chunk, presumably v1 += v2 and
@ v2 = old_v1 - v2 (the vadd/vsub between the loads and the swapped-
@ destination stores are not visible -- TODO confirm).  Loop counter and
@ branch outside this excerpt; leading decimals are extraction artifacts.
702 function ff_butterflies_float_neon, export=1
703 1: vld1.32 {q0},[r0,:128]
704 vld1.32 {q1},[r1,:128]
707 vst1.32 {q2},[r1,:128]!
708 vst1.32 {q1},[r0,:128]!
@ float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len)
@ Dot product of two float vectors; accumulation and horizontal reduction
@ are not visible in this excerpt -- TODO confirm.  The NOVFP line moves
@ the float result into r0 for the soft-float return convention.
714 function ff_scalarproduct_float_neon, export=1
716 1: vld1.32 {q0},[r0,:128]!
717 vld1.32 {q1},[r1,:128]!
723 NOVFP vmov.32 r0, d0[0]
@ ff_vector_fmul_reverse_neon(float *dst, const float *src0,
@                             const float *src1, int len)
@ dst[i] = src0[i] * src1[len-1-i]: r2 is advanced to the end of src1 and
@ walked backwards with negative stride r12 (r12 setup and the vrev/vmul
@ work are not visible -- TODO confirm).  Results in q8-q9 are stored to
@ r0; 2: flushes the tail.  Leading decimals are extraction artifacts.
727 function ff_vector_fmul_reverse_neon, export=1
728 add r2, r2, r3, lsl #2
731 vld1.32 {q0-q1}, [r1,:128]!
732 vld1.32 {q2-q3}, [r2,:128], r12
743 vld1.32 {q0-q1}, [r1,:128]!
744 vld1.32 {q2-q3}, [r2,:128], r12
745 vst1.32 {q8-q9}, [r0,:128]!
747 2: vst1.32 {q8-q9}, [r0,:128]!
@ ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
@                         const float *src2, int len)
@ dst = src0 * src1 + src2: products land in q10/q11 (multiplies not
@ visible in this excerpt -- TODO confirm), the visible vadd.f32 lines add
@ the src2 terms from r3, and q12-q13 results are stored to r0.  2:
@ flushes the tail.  Leading decimals are extraction artifacts.
751 function ff_vector_fmul_add_neon, export=1
753 vld1.32 {q0-q1}, [r1,:128]!
754 vld1.32 {q8-q9}, [r2,:128]!
755 vld1.32 {q2-q3}, [r3,:128]!
758 1: vadd.f32 q12, q2, q10
759 vadd.f32 q13, q3, q11
765 vld1.32 {q0}, [r1,:128]!
766 vld1.32 {q8}, [r2,:128]!
768 vld1.32 {q1}, [r1,:128]!
769 vld1.32 {q9}, [r2,:128]!
771 vld1.32 {q2-q3}, [r3,:128]!
772 vst1.32 {q12-q13},[r0,:128]!
774 2: vst1.32 {q12-q13},[r0,:128]!
@ ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
@                      int len)
@ Clamps each float to [min, max]: q0 = min, q1 = max (broadcast from d0
@ under the hard-float VFP ABI; NOVFP path not visible -- TODO confirm).
@ The visible vmax.f32 applies the lower bound; the vmin side is outside
@ this excerpt.  Leading decimals are extraction artifacts.
778 function ff_vector_clipf_neon, export=1
779 VFP vdup.32 q1, d0[1]
780 VFP vdup.32 q0, d0[0]
784 vld1.f32 {q2},[r1,:128]!
786 vld1.f32 {q3},[r1,:128]!
788 1: vmax.f32 q8, q10, q0
792 vld1.f32 {q2},[r1,:128]!
794 vld1.f32 {q3},[r1,:128]!
796 vst1.f32 {q8},[r0,:128]!
797 vst1.f32 {q9},[r0,:128]!
799 2: vst1.f32 {q8},[r0,:128]!
800 vst1.f32 {q9},[r0,:128]!
@ ff_apply_window_int16_neon(int16_t *output, const int16_t *input,
@                            const int16_t *window, unsigned n)
@ Applies a symmetric int16 window via vqrdmulh.s16 (Q15 fixed-point
@ rounding multiply): r1/r0 walk forward while r4/lr walk backward from
@ the far end (negative stride r12 -- its setup is not visible, TODO
@ confirm); the mirrored half reuses the window halves d7/d6 in reverse.
@ Leading decimals are extraction artifacts.
804 function ff_apply_window_int16_neon, export=1
806 add r4, r1, r3, lsl #1
807 add lr, r0, r3, lsl #1
812 vld1.16 {q0}, [r1,:128]!
813 vld1.16 {q2}, [r2,:128]!
814 vld1.16 {q1}, [r4,:128], r12
816 vqrdmulh.s16 q0, q0, q2
817 vqrdmulh.s16 d2, d2, d7
818 vqrdmulh.s16 d3, d3, d6
819 vst1.16 {q0}, [r0,:128]!
820 vst1.16 {q1}, [lr,:128], r12
@ ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
@                           int32_t max, unsigned len)
@ Clamps 32-bit ints from r1 into r0; the min/max broadcast and the
@ vmin/vmax work between load and store are not visible in this excerpt
@ -- TODO confirm.  Leading decimals are extraction artifacts.
827 function ff_vector_clip_int32_neon, export=1
832 vld1.32 {q2-q3}, [r1,:128]!
837 vst1.32 {q2-q3}, [r0,:128]!