2 * ARM NEON optimised DSP functions
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/*
 * Fragment of a loop over 16-byte pixel rows. The enclosing definition's
 * header is not visible here (presumably the pixels16 macro -- TODO confirm).
 * Visible memory traffic: four rows loaded from [r1], four rows loaded from
 * [ip], four rows stored to [r0]; each access post-increments its pointer by
 * r2. The ":128" qualifier marks the [ip] and [r0] accesses as 128-bit aligned.
 * NOTE(review): the leading numbers look like original line numbers and they
 * skip, so instructions between the lines shown are not visible here.
 */
32 1: vld1.64 {d0, d1}, [r1], r2
33 vld1.64 {d2, d3}, [r1], r2
34 vld1.64 {d4, d5}, [r1], r2
36 vld1.64 {d6, d7}, [r1], r2
/* Second set of four rows, read through ip into q8-q11. */
41 vld1.64 {d16,d17}, [ip,:128], r2
43 vld1.64 {d18,d19}, [ip,:128], r2
45 vld1.64 {d20,d21}, [ip,:128], r2
47 vld1.64 {d22,d23}, [ip,:128], r2
/* Write q0-q3 out as four rows. */
51 vst1.64 {d0, d1}, [r0,:128], r2
52 vst1.64 {d2, d3}, [r0,:128], r2
53 vst1.64 {d4, d5}, [r0,:128], r2
54 vst1.64 {d6, d7}, [r0,:128], r2
/*
 * pixels16_x2: macro taking one parameter, vhadd, which defaults to vrhadd.u8.
 * Visible behavior: per iteration two rows of 24 bytes (d0-d2, d4-d6) are
 * loaded from [r1] and two rows of 16 bytes (q0, q2) are stored to [r0]
 * (128-bit aligned); both pointers post-increment by r2.
 * NOTE(review): the embedded line numbers skip, so the instructions that use
 * the extra 8 bytes and the vhadd parameter are not visible here; presumably
 * they combine each row with itself shifted by one byte -- TODO confirm.
 */
59 .macro pixels16_x2 vhadd=vrhadd.u8
60 1: vld1.64 {d0-d2}, [r1], r2
61 vld1.64 {d4-d6}, [r1], r2
69 vst1.64 {d0, d1}, [r0,:128], r2
70 vst1.64 {d4, d5}, [r0,:128], r2
/*
 * pixels16_y2: macro taking one parameter, vhadd, which defaults to vrhadd.u8.
 * Visible behavior: two 16-byte rows are loaded from [r1] into q0 and q1,
 * then q0 and q1 are reloaded with the following rows, and two result rows
 * held in q2 and q3 are stored to [r0] (128-bit aligned). All accesses
 * post-increment by r2.
 * NOTE(review): the embedded line numbers skip, so the instructions producing
 * q2/q3 (presumably \vhadd of vertically adjacent rows) and the loop control
 * are not visible here -- TODO confirm.
 */
75 .macro pixels16_y2 vhadd=vrhadd.u8
76 vld1.64 {d0, d1}, [r1], r2
77 vld1.64 {d2, d3}, [r1], r2
80 vld1.64 {d0, d1}, [r1], r2
82 vld1.64 {d2, d3}, [r1], r2
85 vst1.64 {d4, d5}, [r0,:128], r2
86 vst1.64 {d6, d7}, [r0,:128], r2
/*
 * pixels16_xy2: macro with two parameters, vshrn (default vrshrn.u16) and
 * no_rnd (default 0).
 * Visible behavior: 24-byte rows are loaded from [r1] (post-increment r2),
 * vext.8 with #1 builds a copy of a row offset by one byte, 16-bit sums are
 * accumulated with vaddl.u8 / vadd.u16, and 16-byte result rows (q14, q15)
 * are stored to [r0] (128-bit aligned, post-increment r2).
 * NOTE(review): the embedded line numbers skip; the uses of \vshrn and
 * \no_rnd, the source of q13, and the loop control are not visible here.
 */
91 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
92 vld1.64 {d0-d2}, [r1], r2
93 vld1.64 {d4-d6}, [r1], r2
/* q3 = bytes 1..16 of the q2:q3 pair, i.e. the row offset by one byte. */
100 vext.8 q3, q2, q3, #1
106 vld1.64 {d0-d2}, [r1], r2
110 vadd.u16 q12, q12, q13
/* q15 = next row offset by one byte; q1 is then reused for a 16-bit sum. */
112 vext.8 q15, q0, q1, #1
113 vadd.u16 q1 , q10, q11
/* The following row is loaded into d2-d4 rather than d0-d2. */
120 vld1.64 {d2-d4}, [r1], r2
121 vaddl.u8 q10, d1, d31
122 vst1.64 {d28,d29}, [r0,:128], r2
126 vadd.u16 q12, q12, q13
128 vext.8 q2, q1, q2, #1
129 vadd.u16 q0, q10, q11
137 vst1.64 {d30,d31}, [r0,:128], r2
/*
 * Fragment of a loop over 8-byte pixel rows. The enclosing definition's
 * header is not visible here (presumably the pixels8 macro -- TODO confirm).
 * Visible behavior: four rows are loaded from [r1] into d0-d3, four rows are
 * loaded from [r0] (64-bit aligned) into d4-d7, r0 is rewound, and d0-d3 are
 * stored to [r0]. Every load/store post-increments its pointer by r2.
 * NOTE(review): the embedded line numbers skip, so whatever combines d4-d7
 * with d0-d3, and the loop control, are not visible here.
 */
143 1: vld1.64 {d0}, [r1], r2
144 vld1.64 {d1}, [r1], r2
145 vld1.64 {d2}, [r1], r2
147 vld1.64 {d3}, [r1], r2
/* Read the four rows currently at the destination. */
152 vld1.64 {d4}, [r0,:64], r2
154 vld1.64 {d5}, [r0,:64], r2
156 vld1.64 {d6}, [r0,:64], r2
158 vld1.64 {d7}, [r0,:64], r2
/* r0 -= 4 * r2: undo the four post-increments done by the loads above. */
160 sub r0, r0, r2, lsl #2
163 vst1.64 {d0}, [r0,:64], r2
164 vst1.64 {d1}, [r0,:64], r2
165 vst1.64 {d2}, [r0,:64], r2
166 vst1.64 {d3}, [r0,:64], r2
/*
 * pixels8_x2: macro taking one parameter, vhadd, which defaults to vrhadd.u8.
 * Visible behavior: per iteration two 16-byte rows are loaded from [r1];
 * for each, vext.8 with #1 replaces the upper half with bytes 1..8 of the
 * row, so d0/d1 (and d2/d3) hold the row and the row offset by one byte.
 * Two 8-byte results (d0, d1) are stored to [r0] (64-bit aligned). All
 * accesses post-increment by r2.
 * NOTE(review): the embedded line numbers skip, so the use of \vhadd and the
 * loop control are not visible here.
 */
171 .macro pixels8_x2 vhadd=vrhadd.u8
172 1: vld1.64 {d0, d1}, [r1], r2
173 vext.8 d1, d0, d1, #1
174 vld1.64 {d2, d3}, [r1], r2
175 vext.8 d3, d2, d3, #1
181 vst1.64 {d0}, [r0,:64], r2
182 vst1.64 {d1}, [r0,:64], r2
/*
 * pixels8_y2: macro taking one parameter, vhadd, which defaults to vrhadd.u8.
 * Visible behavior: two 8-byte rows are loaded from [r1] into d0 and d1,
 * d0 and d1 are then reloaded with the following rows, and two result rows
 * held in d4 and d5 are stored to [r0] (64-bit aligned). All accesses
 * post-increment by r2.
 * NOTE(review): the embedded line numbers skip, so the instructions producing
 * d4/d5 (presumably \vhadd of vertically adjacent rows) and the loop control
 * are not visible here -- TODO confirm.
 */
187 .macro pixels8_y2 vhadd=vrhadd.u8
188 vld1.64 {d0}, [r1], r2
189 vld1.64 {d1}, [r1], r2
192 vld1.64 {d0}, [r1], r2
194 vld1.64 {d1}, [r1], r2
197 vst1.64 {d4}, [r0,:64], r2
198 vst1.64 {d5}, [r0,:64], r2
/*
 * pixels8_xy2: macro with two parameters, vshrn (default vrshrn.u16) and
 * no_rnd (default 0).
 * Visible behavior: 16-byte rows are loaded from [r1] (post-increment r2),
 * vext.8 with #1 extracts bytes 1..8 of each row (the row offset by one
 * byte) into d4 / d6, 16-bit sums are accumulated in q10, and 8-byte result
 * rows (d5, d7) are stored to [r0] (64-bit aligned, post-increment r2).
 * NOTE(review): the embedded line numbers skip; the uses of \vshrn and
 * \no_rnd, the source of q11, and the loop control are not visible here.
 */
203 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
204 vld1.64 {d0, d1}, [r1], r2
205 vld1.64 {d2, d3}, [r1], r2
211 vext.8 d4, d0, d1, #1
212 vext.8 d6, d2, d3, #1
216 vld1.64 {d0, d1}, [r1], r2
219 vext.8 d4, d0, d1, #1
221 vadd.u16 q10, q10, q11
225 vld1.64 {d2, d3}, [r1], r2
229 vadd.u16 q10, q10, q11
231 vst1.64 {d5}, [r0,:64], r2
233 vext.8 d6, d2, d3, #1
235 vst1.64 {d7}, [r0,:64], r2
/*
 * pixfunc: macro with parameters pfx, name, suf, rnd_op and a trailing
 * vararg list (args). It opens an exported function whose symbol is built by
 * concatenation: ff_<pfx><name><suf>_neon. The "\()" only terminates the
 * \suf parameter name so that "_neon" is not read as part of it.
 * NOTE(review): "function" is a helper macro defined outside this listing,
 * and the rest of this macro body (where rnd_op and args are consumed) is
 * not visible here.
 */
240 .macro pixfunc pfx name suf rnd_op args:vararg
241 function ff_\pfx\name\suf\()_neon, export=1
/*
 * pixfunc2: macro with parameters pfx, name and a trailing vararg list.
 * The visible line invokes pixfunc with the same pfx and name and forwards
 * the remaining arguments unchanged.
 * NOTE(review): the embedded line numbers skip (246 -> 248), so another line
 * of this macro body is not visible here; the "2" in the name suggests a
 * second pixfunc invocation -- TODO confirm.
 */
246 .macro pixfunc2 pfx name args:vararg
248 pixfunc \pfx \name \args
/*
 * Instantiation of the pixel copy/average routines, plus the exported
 * ff_{put,avg}_h264_qpel{16,8}_mc00_neon entry points.
 * pixfunc builds each symbol as ff_<pfx><name><suf>_neon, so for example
 * "pixfunc put_ pixels16" yields ff_put_pixels16_neon.
 * NOTE(review): the embedded line numbers skip; the bodies of the four
 * "function" entries and any further instantiations are not visible here.
 */
251 function ff_put_h264_qpel16_mc00_neon, export=1
255 pixfunc put_ pixels16
/*
 * Arguments after the name are forwarded by pixfunc2 to pixfunc: "_no_rnd"
 * lands in suf and the next value (vhadd.u8 / vshrn.u16) in rnd_op; the
 * trailing "1" on the xy2 lines goes to the vararg list.
 */
256 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
257 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
258 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
260 function ff_avg_h264_qpel16_mc00_neon, export=1
/* Empty suf (",,") followed by 1 as the rnd_op argument. */
264 pixfunc avg_ pixels16,, 1
266 function ff_put_h264_qpel8_mc00_neon, export=1
271 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
272 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
273 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
275 function ff_avg_h264_qpel8_mc00_neon, export=1
279 pixfunc avg_ pixels8,, 1
/*
 * ff_put_pixels_clamped_neon (exported).
 * Visible behavior: 128 bytes are read from [r0] (128-bit aligned, with
 * writeback) into q8-q15 in four 32-byte loads, and eight 8-byte rows
 * (d0-d7) are written to [r1] (64-bit aligned), stepping by r2.
 * NOTE(review): the embedded line numbers skip, so the instructions that
 * narrow/clamp q8-q15 into d0-d7 and the return are not visible here.
 * Presumably r0 = 8x8 block of 16-bit coefficients, r1 = destination pixels,
 * r2 = line size -- TODO confirm against the C prototype.
 */
281 function ff_put_pixels_clamped_neon, export=1
282 vld1.64 {d16-d19}, [r0,:128]!
284 vld1.64 {d20-d23}, [r0,:128]!
286 vld1.64 {d24-d27}, [r0,:128]!
288 vld1.64 {d28-d31}, [r0,:128]!
290 vst1.64 {d0}, [r1,:64], r2
292 vst1.64 {d1}, [r1,:64], r2
294 vst1.64 {d2}, [r1,:64], r2
296 vst1.64 {d3}, [r1,:64], r2
298 vst1.64 {d4}, [r1,:64], r2
299 vst1.64 {d5}, [r1,:64], r2
300 vst1.64 {d6}, [r1,:64], r2
301 vst1.64 {d7}, [r1,:64], r2
/*
 * ff_put_signed_pixels_clamped_neon (exported).
 * Visible behavior: the input at [r0] (128-bit aligned, with writeback) is
 * read 16 bytes at a time, eight loads in total, interleaved with eight
 * 8-byte row stores (d0-d7) to [r1] (64-bit aligned), stepping by r2.
 * q8/q9 are loaded twice (reused as scratch for successive input rows).
 * NOTE(review): the embedded line numbers skip, so the conversion from the
 * loaded q registers to d0-d7 and the return are not visible here.
 * Presumably r0 = 8x8 block of 16-bit coefficients, r1 = destination pixels,
 * r2 = line size -- TODO confirm against the C prototype.
 */
305 function ff_put_signed_pixels_clamped_neon, export=1
307 vld1.64 {d16-d17}, [r0,:128]!
309 vld1.64 {d18-d19}, [r0,:128]!
311 vld1.64 {d16-d17}, [r0,:128]!
313 vld1.64 {d18-d19}, [r0,:128]!
315 vld1.64 {d20-d21}, [r0,:128]!
317 vld1.64 {d22-d23}, [r0,:128]!
319 vst1.64 {d0}, [r1,:64], r2
321 vst1.64 {d1}, [r1,:64], r2
323 vst1.64 {d2}, [r1,:64], r2
325 vld1.64 {d24-d25}, [r0,:128]!
327 vld1.64 {d26-d27}, [r0,:128]!
330 vst1.64 {d3}, [r1,:64], r2
332 vst1.64 {d4}, [r1,:64], r2
334 vst1.64 {d5}, [r1,:64], r2
337 vst1.64 {d6}, [r1,:64], r2
338 vst1.64 {d7}, [r1,:64], r2
/*
 * ff_add_pixels_clamped_neon (exported).
 * Visible behavior: eight times over, an 8-byte row is read from [r1]
 * (64-bit aligned, step r2) into d16-d19 and 16 bytes are read from [r0]
 * (128-bit aligned, with writeback) into q0-q3; eight 8-byte rows (d0, d2,
 * d4, d6, twice) are written to [r3] (64-bit aligned, step r2). Loads for
 * later rows are interleaved with stores of earlier rows.
 * NOTE(review): the embedded line numbers skip, so the add/clamp arithmetic,
 * the initialisation of r3 (presumably a copy of r1 so rows are updated in
 * place -- TODO confirm) and the return are not visible here.
 */
342 function ff_add_pixels_clamped_neon, export=1
344 vld1.64 {d16}, [r1,:64], r2
345 vld1.64 {d0-d1}, [r0,:128]!
347 vld1.64 {d17}, [r1,:64], r2
348 vld1.64 {d2-d3}, [r0,:128]!
350 vld1.64 {d18}, [r1,:64], r2
352 vld1.64 {d4-d5}, [r0,:128]!
354 vst1.64 {d0}, [r3,:64], r2
356 vld1.64 {d19}, [r1,:64], r2
357 vld1.64 {d6-d7}, [r0,:128]!
360 vst1.64 {d2}, [r3,:64], r2
/* Second group of four rows reuses d16-d19 and q0-q3. */
361 vld1.64 {d16}, [r1,:64], r2
363 vld1.64 {d0-d1}, [r0,:128]!
365 vst1.64 {d4}, [r3,:64], r2
366 vld1.64 {d17}, [r1,:64], r2
367 vld1.64 {d2-d3}, [r0,:128]!
369 vst1.64 {d6}, [r3,:64], r2
371 vld1.64 {d18}, [r1,:64], r2
372 vld1.64 {d4-d5}, [r0,:128]!
374 vst1.64 {d0}, [r3,:64], r2
376 vld1.64 {d19}, [r1,:64], r2
378 vld1.64 {d6-d7}, [r0,:128]!
380 vst1.64 {d2}, [r3,:64], r2
382 vst1.64 {d4}, [r3,:64], r2
383 vst1.64 {d6}, [r3,:64], r2
/*
 * ff_float_to_int16_neon (exported): converts floats read through r1 into
 * 16-bit integers written through r0 (both 128-bit aligned, with writeback).
 * Conversion idiom: "vcvt.s32.f32 qN, qM, #16" produces signed 32-bit fixed
 * point with 16 fraction bits; "vshrn.s32 dN, qM, #16" then shifts right by
 * 16 and narrows, keeping the upper halfword of each lane as the int16.
 * Loads and conversions for the next group are interleaved with the
 * narrowing and stores of the previous group.
 * NOTE(review): the embedded line numbers skip, so the element-count
 * handling, the branches to labels 2 and 3, and the return are not visible
 * here.
 */
387 function ff_float_to_int16_neon, export=1
389 vld1.64 {d0-d1}, [r1,:128]!
390 vcvt.s32.f32 q8, q0, #16
391 vld1.64 {d2-d3}, [r1,:128]!
392 vcvt.s32.f32 q9, q1, #16
/* Narrow the 8 values already converted while converting the next 8. */
397 vshrn.s32 d4, q8, #16
398 vld1.64 {d0-d1}, [r1,:128]!
399 vcvt.s32.f32 q0, q0, #16
400 vshrn.s32 d5, q9, #16
401 vld1.64 {d2-d3}, [r1,:128]!
402 vcvt.s32.f32 q1, q1, #16
403 vshrn.s32 d6, q0, #16
404 vst1.64 {d4-d5}, [r0,:128]!
405 vshrn.s32 d7, q1, #16
406 vld1.64 {d16-d17},[r1,:128]!
407 vcvt.s32.f32 q8, q8, #16
408 vld1.64 {d18-d19},[r1,:128]!
409 vcvt.s32.f32 q9, q9, #16
410 vst1.64 {d6-d7}, [r0,:128]!
/* Label 2: narrows the pending q8/q9, converts and stores 8 more values. */
414 2: vld1.64 {d0-d1}, [r1,:128]!
415 vshrn.s32 d4, q8, #16
416 vcvt.s32.f32 q0, q0, #16
417 vld1.64 {d2-d3}, [r1,:128]!
418 vshrn.s32 d5, q9, #16
419 vcvt.s32.f32 q1, q1, #16
420 vshrn.s32 d6, q0, #16
421 vst1.64 {d4-d5}, [r0,:128]!
422 vshrn.s32 d7, q1, #16
423 vst1.64 {d6-d7}, [r0,:128]!
/* Label 3: narrows and stores the 8 values still pending in q8/q9. */
425 3: vshrn.s32 d4, q8, #16
426 vshrn.s32 d5, q9, #16
427 vst1.64 {d4-d5}, [r0,:128]!
/*
 * ff_float_to_int16_interleave_neon (exported): float to int16 conversion in
 * which the outputs of several input buffers are written interleaved.
 * Conversion idiom seen throughout: "vcvt.s32.f32 qN, qM, #16" produces
 * signed 32-bit fixed point with 16 fraction bits, so the 16-bit result sits
 * in the upper halfword of each lane. "vsri.32 a, b, #16" shifts b right by
 * 16 and inserts it below the preserved upper halfword of a, packing two
 * 16-bit results into one 32-bit lane.
 * NOTE(review): the embedded line numbers skip, so the channel-count
 * dispatch, pointer setup (r3, r4-r8, ip), most vsri packing steps, loop
 * control and the return are not visible here. Which code path serves which
 * channel count is inferred only from the registers used -- TODO confirm.
 */
431 function ff_float_to_int16_interleave_neon, export=1
/* Hand over to the plain converter when the preceding compare (not visible) gives "less than". */
434 blt ff_float_to_int16_neon
/* Path reading two inputs through r3 and r1; packed output is written through r0. */
441 vld1.64 {d0-d1}, [r3,:128]!
442 vcvt.s32.f32 q8, q0, #16
443 vld1.64 {d2-d3}, [r3,:128]!
444 vcvt.s32.f32 q9, q1, #16
445 vld1.64 {d20-d21},[r1,:128]!
446 vcvt.s32.f32 q10, q10, #16
447 vld1.64 {d22-d23},[r1,:128]!
448 vcvt.s32.f32 q11, q11, #16
453 vld1.64 {d0-d1}, [r3,:128]!
454 vcvt.s32.f32 q0, q0, #16
456 vld1.64 {d2-d3}, [r3,:128]!
457 vcvt.s32.f32 q1, q1, #16
458 vld1.64 {d24-d25},[r1,:128]!
459 vcvt.s32.f32 q12, q12, #16
460 vld1.64 {d26-d27},[r1,:128]!
462 vst1.64 {d20-d21},[r0,:128]!
463 vcvt.s32.f32 q13, q13, #16
464 vst1.64 {d22-d23},[r0,:128]!
466 vld1.64 {d16-d17},[r3,:128]!
468 vst1.64 {d24-d25},[r0,:128]!
469 vcvt.s32.f32 q8, q8, #16
470 vld1.64 {d18-d19},[r3,:128]!
471 vcvt.s32.f32 q9, q9, #16
472 vld1.64 {d20-d21},[r1,:128]!
473 vcvt.s32.f32 q10, q10, #16
474 vld1.64 {d22-d23},[r1,:128]!
475 vcvt.s32.f32 q11, q11, #16
476 vst1.64 {d26-d27},[r0,:128]!
/* Label 2: packs the pending q8 into q10, then converts and stores a further group. */
480 2: vsri.32 q10, q8, #16
481 vld1.64 {d0-d1}, [r3,:128]!
482 vcvt.s32.f32 q0, q0, #16
483 vld1.64 {d2-d3}, [r3,:128]!
484 vcvt.s32.f32 q1, q1, #16
485 vld1.64 {d24-d25},[r1,:128]!
486 vcvt.s32.f32 q12, q12, #16
488 vld1.64 {d26-d27},[r1,:128]!
489 vcvt.s32.f32 q13, q13, #16
490 vst1.64 {d20-d21},[r0,:128]!
492 vst1.64 {d22-d23},[r0,:128]!
494 vst1.64 {d24-d27},[r0,:128]!
/* Label 3: packs and stores the group still pending in q8-q11. */
496 3: vsri.32 q10, q8, #16
498 vst1.64 {d20-d23},[r0,:128]!
/*
 * Label 5: loads four input pointers into r4-r7 from the table at r1
 * (r1 advances past them). Output is written to [r8] in 8-byte pieces,
 * stepping by ip.
 */
507 5: ldmia r1!, {r4-r7}
510 vld1.64 {d16-d17},[r4,:128]!
511 vcvt.s32.f32 q8, q8, #16
512 vld1.64 {d18-d19},[r5,:128]!
513 vcvt.s32.f32 q9, q9, #16
514 vld1.64 {d20-d21},[r6,:128]!
515 vcvt.s32.f32 q10, q10, #16
516 vld1.64 {d22-d23},[r7,:128]!
517 vcvt.s32.f32 q11, q11, #16
519 vld1.64 {d0-d1}, [r4,:128]!
520 vcvt.s32.f32 q0, q0, #16
522 vld1.64 {d2-d3}, [r5,:128]!
523 vcvt.s32.f32 q1, q1, #16
524 vsri.32 q11, q10, #16
525 vld1.64 {d4-d5}, [r6,:128]!
526 vcvt.s32.f32 q2, q2, #16
528 vld1.64 {d6-d7}, [r7,:128]!
529 vcvt.s32.f32 q3, q3, #16
531 vst1.64 {d18}, [r8], ip
533 vst1.64 {d22}, [r8], ip
535 vst1.64 {d19}, [r8], ip
537 vst1.64 {d23}, [r8], ip
540 vld1.64 {d16-d17},[r4,:128]!
541 vcvt.s32.f32 q8, q8, #16
542 vst1.64 {d2}, [r8], ip
543 vld1.64 {d18-d19},[r5,:128]!
544 vcvt.s32.f32 q9, q9, #16
545 vst1.64 {d6}, [r8], ip
546 vld1.64 {d20-d21},[r6,:128]!
547 vcvt.s32.f32 q10, q10, #16
548 vst1.64 {d3}, [r8], ip
549 vld1.64 {d22-d23},[r7,:128]!
550 vcvt.s32.f32 q11, q11, #16
551 vst1.64 {d7}, [r8], ip
553 7: vst1.64 {d2}, [r8], ip
554 vst1.64 {d6}, [r8], ip
555 vst1.64 {d3}, [r8], ip
556 vst1.64 {d7}, [r8], ip
/*
 * Path reading two inputs through r4 and r5. Each pair of results is packed
 * into one 32-bit lane with vsri and written one lane at a time to [r8],
 * stepping by ip.
 */
570 vld1.64 {d16-d17},[r4,:128]!
571 vcvt.s32.f32 q8, q8, #16
572 vld1.64 {d18-d19},[r5,:128]!
573 vcvt.s32.f32 q9, q9, #16
574 vld1.64 {d20-d21},[r4,:128]!
575 vcvt.s32.f32 q10, q10, #16
576 vld1.64 {d22-d23},[r5,:128]!
577 vcvt.s32.f32 q11, q11, #16
581 vsri.32 d18, d16, #16
582 vsri.32 d19, d17, #16
583 vld1.64 {d16-d17},[r4,:128]!
584 vcvt.s32.f32 q8, q8, #16
585 vst1.32 {d18[0]}, [r8], ip
586 vsri.32 d22, d20, #16
587 vst1.32 {d18[1]}, [r8], ip
588 vsri.32 d23, d21, #16
589 vst1.32 {d19[0]}, [r8], ip
590 vst1.32 {d19[1]}, [r8], ip
591 vld1.64 {d18-d19},[r5,:128]!
592 vcvt.s32.f32 q9, q9, #16
593 vst1.32 {d22[0]}, [r8], ip
594 vst1.32 {d22[1]}, [r8], ip
595 vld1.64 {d20-d21},[r4,:128]!
596 vcvt.s32.f32 q10, q10, #16
597 vst1.32 {d23[0]}, [r8], ip
598 vst1.32 {d23[1]}, [r8], ip
599 vld1.64 {d22-d23},[r5,:128]!
600 vcvt.s32.f32 q11, q11, #16
602 vld1.64 {d0-d1}, [r4,:128]!
603 vcvt.s32.f32 q0, q0, #16
604 vsri.32 d18, d16, #16
605 vld1.64 {d2-d3}, [r5,:128]!
606 vcvt.s32.f32 q1, q1, #16
607 vsri.32 d19, d17, #16
608 vld1.64 {d4-d5}, [r4,:128]!
609 vcvt.s32.f32 q2, q2, #16
610 vld1.64 {d6-d7}, [r5,:128]!
611 vcvt.s32.f32 q3, q3, #16
612 vst1.32 {d18[0]}, [r8], ip
613 vsri.32 d22, d20, #16
614 vst1.32 {d18[1]}, [r8], ip
615 vsri.32 d23, d21, #16
616 vst1.32 {d19[0]}, [r8], ip
618 vst1.32 {d19[1]}, [r8], ip
620 vst1.32 {d22[0]}, [r8], ip
622 vst1.32 {d22[1]}, [r8], ip
624 vst1.32 {d23[0]}, [r8], ip
625 vst1.32 {d23[1]}, [r8], ip
627 vld1.64 {d16-d17},[r4,:128]!
628 vcvt.s32.f32 q8, q8, #16
629 vst1.32 {d2[0]}, [r8], ip
630 vst1.32 {d2[1]}, [r8], ip
631 vld1.64 {d18-d19},[r5,:128]!
632 vcvt.s32.f32 q9, q9, #16
633 vst1.32 {d3[0]}, [r8], ip
634 vst1.32 {d3[1]}, [r8], ip
635 vld1.64 {d20-d21},[r4,:128]!
636 vcvt.s32.f32 q10, q10, #16
637 vst1.32 {d6[0]}, [r8], ip
638 vst1.32 {d6[1]}, [r8], ip
639 vld1.64 {d22-d23},[r5,:128]!
640 vcvt.s32.f32 q11, q11, #16
641 vst1.32 {d7[0]}, [r8], ip
642 vst1.32 {d7[1]}, [r8], ip
644 6: vst1.32 {d2[0]}, [r8], ip
645 vst1.32 {d2[1]}, [r8], ip
646 vst1.32 {d3[0]}, [r8], ip
647 vst1.32 {d3[1]}, [r8], ip
648 vst1.32 {d6[0]}, [r8], ip
649 vst1.32 {d6[1]}, [r8], ip
650 vst1.32 {d7[0]}, [r8], ip
651 vst1.32 {d7[1]}, [r8], ip
653 7: vsri.32 d18, d16, #16
654 vsri.32 d19, d17, #16
655 vst1.32 {d18[0]}, [r8], ip
656 vsri.32 d22, d20, #16
657 vst1.32 {d18[1]}, [r8], ip
658 vsri.32 d23, d21, #16
659 vst1.32 {d19[0]}, [r8], ip
660 vst1.32 {d19[1]}, [r8], ip
661 vst1.32 {d22[0]}, [r8], ip
662 vst1.32 {d22[1]}, [r8], ip
663 vst1.32 {d23[0]}, [r8], ip
664 vst1.32 {d23[1]}, [r8], ip
/*
 * Path reading a single input through r4. Only the odd 16-bit lanes (the
 * upper halfword of each converted 32-bit value) are stored, one at a time,
 * to [r5] (16-bit aligned), stepping by ip.
 */
674 vld1.64 {d0-d1}, [r4,:128]!
675 vcvt.s32.f32 q0, q0, #16
676 vld1.64 {d2-d3}, [r4,:128]!
677 vcvt.s32.f32 q1, q1, #16
680 vld1.64 {d4-d5}, [r4,:128]!
681 vcvt.s32.f32 q2, q2, #16
682 vld1.64 {d6-d7}, [r4,:128]!
683 vcvt.s32.f32 q3, q3, #16
684 vst1.16 {d0[1]}, [r5,:16], ip
685 vst1.16 {d0[3]}, [r5,:16], ip
686 vst1.16 {d1[1]}, [r5,:16], ip
687 vst1.16 {d1[3]}, [r5,:16], ip
688 vst1.16 {d2[1]}, [r5,:16], ip
689 vst1.16 {d2[3]}, [r5,:16], ip
690 vst1.16 {d3[1]}, [r5,:16], ip
691 vst1.16 {d3[3]}, [r5,:16], ip
693 vld1.64 {d0-d1}, [r4,:128]!
694 vcvt.s32.f32 q0, q0, #16
695 vld1.64 {d2-d3}, [r4,:128]!
696 vcvt.s32.f32 q1, q1, #16
697 7: vst1.16 {d4[1]}, [r5,:16], ip
698 vst1.16 {d4[3]}, [r5,:16], ip
699 vst1.16 {d5[1]}, [r5,:16], ip
700 vst1.16 {d5[3]}, [r5,:16], ip
701 vst1.16 {d6[1]}, [r5,:16], ip
702 vst1.16 {d6[3]}, [r5,:16], ip
703 vst1.16 {d7[1]}, [r5,:16], ip
704 vst1.16 {d7[3]}, [r5,:16], ip
708 vst1.16 {d0[1]}, [r5,:16], ip
709 vst1.16 {d0[3]}, [r5,:16], ip
710 vst1.16 {d1[1]}, [r5,:16], ip
711 vst1.16 {d1[3]}, [r5,:16], ip
712 vst1.16 {d2[1]}, [r5,:16], ip
713 vst1.16 {d2[3]}, [r5,:16], ip
714 vst1.16 {d3[1]}, [r5,:16], ip
715 vst1.16 {d3[3]}, [r5,:16], ip
717 vld1.64 {d0-d1}, [r4,:128]!
718 vcvt.s32.f32 q0, q0, #16
719 vld1.64 {d2-d3}, [r4,:128]!
720 vcvt.s32.f32 q1, q1, #16
/*
 * ff_vector_fmul_neon (exported).
 * Visible behavior: two input streams are read through r0 (into q0/q1) and
 * r1 (into q2/q3), and results held in q8-q11 are written through r3. All
 * accesses are 128-bit aligned with writeback.
 * NOTE(review): the embedded line numbers skip, so the multiplies that fill
 * q8-q11, the initialisation of r3 (presumably a copy of r0, making the
 * operation in place -- TODO confirm), loop control and the return are not
 * visible here.
 */
724 function ff_vector_fmul_neon, export=1
727 vld1.64 {d0-d3}, [r0,:128]!
728 vld1.64 {d4-d7}, [r1,:128]!
735 vld1.64 {d0-d1}, [r0,:128]!
736 vld1.64 {d4-d5}, [r1,:128]!
738 vld1.64 {d2-d3}, [r0,:128]!
739 vld1.64 {d6-d7}, [r1,:128]!
741 vst1.64 {d16-d19},[r3,:128]!
742 vld1.64 {d0-d1}, [r0,:128]!
743 vld1.64 {d4-d5}, [r1,:128]!
745 vld1.64 {d2-d3}, [r0,:128]!
746 vld1.64 {d6-d7}, [r1,:128]!
748 vst1.64 {d20-d23},[r3,:128]!
752 2: vld1.64 {d0-d1}, [r0,:128]!
753 vld1.64 {d4-d5}, [r1,:128]!
754 vst1.64 {d16-d17},[r3,:128]!
756 vld1.64 {d2-d3}, [r0,:128]!
757 vld1.64 {d6-d7}, [r1,:128]!
758 vst1.64 {d18-d19},[r3,:128]!
760 3: vst1.64 {d16-d19},[r3,:128]!
/*
 * ff_vector_fmul_window_neon (exported).
 * Lines prefixed VFP / NOVFP are alternatives selected by helper macros
 * defined outside this listing (presumably by float calling convention --
 * TODO confirm). Either way q8 ends up holding one scalar replicated in all
 * four lanes, and lr is loaded from a stack argument.
 * Visible behavior: r1 and r3 are read forwards (writeback), r2 and r4 are
 * read with r5 as post-index; results go to r0 (forwards) and ip (post-index
 * r5). All vector accesses are 128-bit aligned.
 * NOTE(review): the embedded line numbers skip, so the push/pop, the values
 * of r5 (apparently changed between its use as an index and as a stride),
 * most of the multiply-accumulate chain, loop control and the return are not
 * visible here.
 */
764 function ff_vector_fmul_window_neon, export=1
765 VFP vdup.32 q8, d0[0]
766 NOVFP vld1.32 {d16[],d17[]}, [sp,:32]
768 VFP ldr lr, [sp, #12]
769 NOVFP ldr lr, [sp, #16]
/* r2 += 4*r5; r4 = r3 + 8*r5; ip = r0 + 8*r5. */
772 add r2, r2, r5, lsl #2
773 add r4, r3, r5, lsl #3
774 add ip, r0, r5, lsl #3
776 vld1.64 {d0,d1}, [r1,:128]!
777 vld1.64 {d2,d3}, [r2,:128], r5
778 vld1.64 {d4,d5}, [r3,:128]!
779 vld1.64 {d6,d7}, [r4,:128], r5
/* Next iteration's operands are loaded into q0, q9, q12 and q3. */
791 vld1.64 {d0,d1}, [r1,:128]!
793 vld1.64 {d18,d19},[r2,:128], r5
795 vld1.64 {d24,d25},[r3,:128]!
797 vld1.64 {d6,d7}, [r4,:128], r5
802 vst1.64 {d20,d21},[r0,:128]!
803 vst1.64 {d22,d23},[ip,:128], r5
/* Label 2: d22 += d3 * d7, followed by the final pair of stores. */
805 2: vmla.f32 d22, d3, d7
811 vst1.64 {d20,d21},[r0,:128]!
812 vst1.64 {d22,d23},[ip,:128], r5
/*
 * ff_vorbis_inverse_coupling_neon (exported), assembled only when
 * CONFIG_VORBIS_DECODER is non-zero.
 * Visible behavior: four 32-bit values at a time are read through r1 and r0
 * (128-bit aligned); in the main path results are written through r3 and
 * r12, and in the label-3 path the loads have no writeback and the results
 * are written back through r0 and r1 themselves.
 * "vadd.f32 q12, q11, q2" / "vsub.f32 q11, q11, q3" derive both outputs from
 * q11 and the terms in q2 / q3.
 * NOTE(review): the embedded line numbers skip, so the mask/select
 * instructions producing q2 and q3, the setup of r3 / r12 (presumably copies
 * of r0 / r1 -- TODO confirm), loop control, the return and the matching
 * #endif are not visible here.
 */
816 #if CONFIG_VORBIS_DECODER
817 function ff_vorbis_inverse_coupling_neon, export=1
824 vld1.32 {d24-d25},[r1,:128]!
825 vld1.32 {d22-d23},[r0,:128]!
831 vadd.f32 q12, q11, q2
832 vsub.f32 q11, q11, q3
/* Label 1: the next group is loaded into q1/q0 while q12/q11 are stored. */
833 1: vld1.32 {d2-d3}, [r1,:128]!
834 vld1.32 {d0-d1}, [r0,:128]!
838 vst1.32 {d24-d25},[r3, :128]!
839 vst1.32 {d22-d23},[r12,:128]!
846 vld1.32 {d24-d25},[r1,:128]!
847 vld1.32 {d22-d23},[r0,:128]!
851 vst1.32 {d2-d3}, [r3, :128]!
852 vst1.32 {d0-d1}, [r12,:128]!
855 vadd.f32 q12, q11, q2
856 vsub.f32 q11, q11, q3
859 2: vst1.32 {d2-d3}, [r3, :128]!
860 vst1.32 {d0-d1}, [r12,:128]!
/* Label 3: a single group of four, loaded without writeback and stored back through the same pointers. */
863 3: vld1.32 {d2-d3}, [r1,:128]
864 vld1.32 {d0-d1}, [r0,:128]
872 vst1.32 {d2-d3}, [r0,:128]!
873 vst1.32 {d0-d1}, [r1,:128]!
/*
 * ff_vector_fmul_scalar_neon (exported).
 * Visible behavior: floats are read through r1 and written through r0, four
 * at a time (128-bit aligned, with writeback). On the VFP variant the scalar
 * in lane 0 of d0 is replicated into q8, and "vmul.f32 q0, q0, q8" at label
 * 1 multiplies a loaded group by it.
 * NOTE(review): the embedded line numbers skip, so the NOVFP scalar setup,
 * the remaining multiplies, loop control, the branches to labels 2 and 3
 * and the return are not visible here.
 */
878 function ff_vector_fmul_scalar_neon, export=1
881 VFP vdup.32 q8, d0[0]
885 vld1.32 {q0},[r1,:128]!
886 vld1.32 {q1},[r1,:128]!
887 1: vmul.f32 q0, q0, q8
888 vld1.32 {q2},[r1,:128]!
890 vld1.32 {q3},[r1,:128]!
892 vst1.32 {q0},[r0,:128]!
894 vst1.32 {q1},[r0,:128]!
897 vld1.32 {q0},[r1,:128]!
898 vst1.32 {q2},[r0,:128]!
899 vld1.32 {q1},[r1,:128]!
900 vst1.32 {q3},[r0,:128]!
902 2: vst1.32 {q2},[r0,:128]!
903 vst1.32 {q3},[r0,:128]!
/* Label 3: handles one group of four at a time. */
906 3: vld1.32 {q0},[r1,:128]!
908 vst1.32 {q0},[r0,:128]!
/*
 * ff_vector_fmul_sv_scalar_2_neon (exported).
 * The scalar is replicated into both lanes of d16: from lane 0 of d0 on the
 * VFP variant, from r3 on the NOVFP variant.
 * Visible behavior: pairs of floats are read through r1 (64-bit aligned,
 * with writeback) into d0/d1 and through r12 (no writeback) into d2/d3;
 * results in d4/d5 are written through r0.
 * NOTE(review): the embedded line numbers skip, so how r12 is obtained
 * (presumably fetched from a table of vector pointers -- TODO confirm), the
 * multiplies, loop control and the return are not visible here.
 */
915 function ff_vector_fmul_sv_scalar_2_neon, export=1
916 VFP vdup.32 d16, d0[0]
917 NOVFP vdup.32 d16, r3
919 vld1.32 {d0},[r1,:64]!
920 vld1.32 {d1},[r1,:64]!
925 vld1.32 {d2},[r12,:64]
927 vld1.32 {d3},[r12,:64]
931 vld1.32 {d0},[r1,:64]!
932 vld1.32 {d1},[r1,:64]!
933 vst1.32 {d4},[r0,:64]!
934 vst1.32 {d5},[r0,:64]!
936 2: vst1.32 {d4},[r0,:64]!
937 vst1.32 {d5},[r0,:64]!
/*
 * ff_vector_fmul_sv_scalar_4_neon (exported).
 * The scalar is replicated into all four lanes of q10: from lane 0 of d0 on
 * the VFP variant, from r3 on the NOVFP variant.
 * Visible behavior: groups of four floats are read through r1 (128-bit
 * aligned, with writeback) into q0/q2 and through r12 (no writeback) into
 * q1/q3; results in q8/q9 are written through r0. Label 3 handles a single
 * group, storing q0.
 * NOTE(review): the embedded line numbers skip, so how r12 is obtained
 * (presumably fetched from a table of vector pointers -- TODO confirm), the
 * multiplies, loop control and the return are not visible here.
 */
941 function ff_vector_fmul_sv_scalar_4_neon, export=1
942 VFP vdup.32 q10, d0[0]
943 NOVFP vdup.32 q10, r3
948 vld1.32 {q0},[r1,:128]!
949 vld1.32 {q2},[r1,:128]!
951 vld1.32 {q1},[r12,:128]
953 vld1.32 {q3},[r12,:128]
960 vld1.32 {q0},[r1,:128]!
961 vld1.32 {q2},[r1,:128]!
962 vst1.32 {q8},[r0,:128]!
963 vst1.32 {q9},[r0,:128]!
965 2: vst1.32 {q8},[r0,:128]!
966 vst1.32 {q9},[r0,:128]!
969 3: vld1.32 {q0},[r1,:128]!
971 vld1.32 {q1},[r12,:128]
974 vst1.32 {q0},[r0,:128]!
/*
 * ff_sv_fmul_scalar_2_neon (exported).
 * Visible behavior: two 2-float vectors are read through r12 (64-bit
 * aligned, no writeback) into d0 and d1, "vmul.f32 q1, q0, q8" multiplies
 * both by the scalar replicated in q8, and the 16-byte result is written
 * through r0 (128-bit aligned, with writeback).
 * NOTE(review): the embedded line numbers skip, so the NOVFP scalar setup,
 * how r12 is obtained before each load (presumably fetched from a table of
 * vector pointers -- TODO confirm), loop control and the return are not
 * visible here.
 */
980 function ff_sv_fmul_scalar_2_neon, export=1
983 VFP vdup.32 q8, d0[0]
986 vld1.32 {d0},[r12,:64]
988 vld1.32 {d1},[r12,:64]
989 1: vmul.f32 q1, q0, q8
993 vld1.32 {d0},[r12,:64]
995 vld1.32 {d1},[r12,:64]
996 vst1.32 {q1},[r0,:128]!
998 2: vst1.32 {q1},[r0,:128]!
/*
 * ff_sv_fmul_scalar_4_neon (exported).
 * The scalar is replicated into q8: from lane 0 of d0 on the VFP variant,
 * from r2 on the NOVFP variant.
 * Visible behavior, per iteration of label 1: a pointer is fetched from the
 * table at r1 (r1 advances by 4), four floats are loaded through it into q0
 * (128-bit aligned), and q0 is stored through r0 (128-bit aligned, with
 * writeback).
 * NOTE(review): the embedded line numbers skip, so the multiply by q8, loop
 * control and the return are not visible here.
 */
1003 function ff_sv_fmul_scalar_4_neon, export=1
1006 VFP vdup.32 q8, d0[0]
1007 NOVFP vdup.32 q8, r2
1008 1: ldr r12, [r1], #4
1009 vld1.32 {q0},[r12,:128]
1011 vst1.32 {q0},[r0,:128]!
/*
 * ff_butterflies_float_neon (exported).
 * Visible behavior, per iteration of label 1: four floats are loaded from
 * [r0] into q0 and four from [r1] into q1 (128-bit aligned, no writeback);
 * q2 is then stored to [r1] and q1 to [r0], both with writeback, so the two
 * buffers are updated in place.
 * NOTE(review): the embedded line numbers skip, so the arithmetic producing
 * q2 and the new q1 (presumably difference and sum -- TODO confirm), loop
 * control and the return are not visible here.
 */
1018 function ff_butterflies_float_neon, export=1
1019 1: vld1.32 {q0},[r0,:128]
1020 vld1.32 {q1},[r1,:128]
1023 vst1.32 {q2},[r1,:128]!
1024 vst1.32 {q1},[r0,:128]!
/*
 * ff_scalarproduct_float_neon (exported).
 * Visible behavior: per iteration of label 1, four floats are loaded through
 * r0 and four through r1 (128-bit aligned, with writeback). At the end
 * "vpadd.f32 d0, d0, d0" adds the two lanes of d0 so that lane 0 holds their
 * sum, and on the NOVFP variant that lane is copied to r0.
 * NOTE(review): the embedded line numbers skip, so the accumulator setup,
 * the multiply-accumulate, the earlier reduction into d0, loop control and
 * the return are not visible here.
 */
1030 function ff_scalarproduct_float_neon, export=1
1032 1: vld1.32 {q0},[r0,:128]!
1033 vld1.32 {q1},[r1,:128]!
1038 vpadd.f32 d0, d0, d0
1039 NOVFP vmov.32 r0, d0[0]
/*
 * ff_int32_to_float_fmul_scalar_neon (exported).
 * The scalar is replicated into q0: from lane 0 of d0 on the VFP variant,
 * from r2 on the NOVFP variant.
 * Visible behavior: groups of four 32-bit values are read through r1 into
 * q1/q2 (128-bit aligned, with writeback); the loop at label 1 counts "len"
 * down by 8 per iteration; "vmul.f32 q10, q8, q0" scales one converted
 * group; results in q9/q10 are written through r0.
 * NOTE(review): "len" is a register alias defined outside this listing. The
 * embedded line numbers skip, so the int-to-float conversions into q8, the
 * multiply producing q9, the loop branch and the return are not visible.
 */
1043 function ff_int32_to_float_fmul_scalar_neon, export=1
1044 VFP vdup.32 q0, d0[0]
1046 NOVFP vdup.32 q0, r2
1049 vld1.32 {q1},[r1,:128]!
1051 vld1.32 {q2},[r1,:128]!
1053 1: subs len, len, #8
1056 vmul.f32 q10, q8, q0
1058 vld1.32 {q1},[r1,:128]!
1060 vld1.32 {q2},[r1,:128]!
1062 vst1.32 {q9}, [r0,:128]!
1063 vst1.32 {q10},[r0,:128]!
1065 2: vst1.32 {q9}, [r0,:128]!
1066 vst1.32 {q10},[r0,:128]!
/*
 * ff_vector_fmul_reverse_neon (exported).
 * Visible behavior: r2 is first advanced by 4*r3 bytes. Eight floats are
 * then read through r1 (forwards, with writeback) into q0-q1 and eight
 * through r2 (post-indexed by r12) into q2-q3. The products pair the d
 * registers of the first operand with those of the second in reverse
 * register order (d0*d7, d1*d6, d2*d5, d3*d4), and the result in q8-q9 is
 * written through r0.
 * NOTE(review): the embedded line numbers skip, so the value of r12
 * (presumably negative, to walk the second input backwards -- TODO confirm),
 * any in-register element reversal, loop control and the return are not
 * visible here.
 */
1071 function ff_vector_fmul_reverse_neon, export=1
1072 add r2, r2, r3, lsl #2
1075 vld1.32 {q0-q1}, [r1,:128]!
1076 vld1.32 {q2-q3}, [r2,:128], r12
1079 vmul.f32 d16, d0, d7
1080 vmul.f32 d17, d1, d6
1083 vmul.f32 d18, d2, d5
1084 vmul.f32 d19, d3, d4
1087 vld1.32 {q0-q1}, [r1,:128]!
1088 vld1.32 {q2-q3}, [r2,:128], r12
1089 vst1.32 {q8-q9}, [r0,:128]!
1091 2: vst1.32 {q8-q9}, [r0,:128]!
/*
 * ff_vector_fmul_add_neon (exported).
 * Computes, eight floats at a time, [r0] = [r1] * [r2] + [r3]: the inputs
 * are loaded into q0-q1, q8-q9 and q2-q3, the products go to q10/q11, the
 * sums to q12/q13, and q12-q13 are stored through r0. All accesses are
 * 128-bit aligned with writeback. Loads and multiplies for the next group
 * are issued before the store of the current one.
 * NOTE(review): the embedded line numbers skip, so the element-count
 * handling, loop branches and the return are not visible here.
 */
1095 function ff_vector_fmul_add_neon, export=1
1097 vld1.32 {q0-q1}, [r1,:128]!
1098 vld1.32 {q8-q9}, [r2,:128]!
1099 vld1.32 {q2-q3}, [r3,:128]!
1100 vmul.f32 q10, q0, q8
1101 vmul.f32 q11, q1, q9
/* Label 1: add the third input to the pending products. */
1102 1: vadd.f32 q12, q2, q10
1103 vadd.f32 q13, q3, q11
1109 vld1.32 {q0}, [r1,:128]!
1110 vld1.32 {q8}, [r2,:128]!
1111 vmul.f32 q10, q0, q8
1112 vld1.32 {q1}, [r1,:128]!
1113 vld1.32 {q9}, [r2,:128]!
1114 vmul.f32 q11, q1, q9
1115 vld1.32 {q2-q3}, [r3,:128]!
1116 vst1.32 {q12-q13},[r0,:128]!
/* Label 2: store the final group. */
1118 2: vst1.32 {q12-q13},[r0,:128]!
/*
 * ff_vector_clipf_neon (exported).
 * Clamps floats read through r1 and writes them through r0, eight per
 * iteration (128-bit aligned, with writeback).
 * Bounds: q0 holds the lower bound and q1 the upper bound, each replicated
 * in all lanes. The VFP variant takes them from lanes 0 and 1 of d0, the
 * NOVFP variant from r2 and r3.
 * Each value is first limited from above with vmin against q1, then from
 * below with vmax against q0.
 * NOTE(review): the embedded line numbers skip, so the element-count
 * handling, loop branches and the return are not visible here.
 */
1122 function ff_vector_clipf_neon, export=1
1123 VFP vdup.32 q1, d0[1]
1124 VFP vdup.32 q0, d0[0]
1125 NOVFP vdup.32 q0, r2
1126 NOVFP vdup.32 q1, r3
1128 vld1.f32 {q2},[r1,:128]!
1129 vmin.f32 q10, q2, q1
1130 vld1.f32 {q3},[r1,:128]!
1131 vmin.f32 q11, q3, q1
1132 1: vmax.f32 q8, q10, q0
1133 vmax.f32 q9, q11, q0
1136 vld1.f32 {q2},[r1,:128]!
1137 vmin.f32 q10, q2, q1
1138 vld1.f32 {q3},[r1,:128]!
1139 vmin.f32 q11, q3, q1
1140 vst1.f32 {q8},[r0,:128]!
1141 vst1.f32 {q9},[r0,:128]!
/* Label 2: store the final group. */
1143 2: vst1.f32 {q8},[r0,:128]!
1144 vst1.f32 {q9},[r0,:128]!