/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/arm/asm.S"
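
@ ff_clear_block_neon zeroes one 64-coefficient (128-byte) int16 DCT block;
@ ff_clear_blocks_neon does the same for six consecutive blocks.  Roughly
@ equivalent C (a behavioural sketch, not FFmpeg's reference code):
@     memset(block, 0, 64 * sizeof(int16_t));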
function ff_clear_block_neon, export=1
        vst1.16         {q0},  [r0,:128]!

function ff_clear_blocks_neon, export=1
        vst1.16         {q0},  [r0,:128]!
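
@ put/avg pixel macros: "put" writes the (optionally interpolated) source
@ block to dst, "avg" first averages it with what dst already holds.  With
@ rnd=1 the averages round up, with rnd=0 they truncate.  Scalar sketch of
@ the two averaging modes (illustration only):
@     rounded:    dst[i] = (a + b + 1) >> 1;
@     truncated:  dst[i] = (a + b)     >> 1;
@
@ pixels16: copy/average a 16-pixel-wide block, one q register per row.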
.macro  pixels16        rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q8},     [r12,:128], r2
        vld1.8          {q9},     [r12,:128], r2
        vld1.8          {q10},    [r12,:128], r2
        vld1.8          {q11},    [r12,:128], r2
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
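
@ pixels16_x2: horizontal half-pel interpolation; each output pixel is the
@ average of a source pixel and its right neighbour, obtained by
@ vext-shifting the loaded row one byte and averaging the two vectors.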
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
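
@ pixels16_y2: vertical half-pel interpolation; each output row is the
@ average of two consecutive source rows, so two rows stay live in
@ registers and every new load yields one output row.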
.macro  pixels16_y2     rnd=1, avg=0
        vld1.8          {q1},     [r1], r2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
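
@ pixels16_xy2: 2D half-pel interpolation over a 2x2 neighbourhood.  In
@ scalar terms (sketch):
@     rounded:     dst[i] = (a + b + c + d + 2) >> 2;
@     no-rounding: dst[i] = (a + b + c + d + 1) >> 2;
@ The widening adds (vaddl.u8) keep full 16-bit precision, and the
@ NRND-only "+1" constant in q13 plus the truncating narrow turn the
@ rounded shift into the no-rounding one.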
.macro  pixels16_xy2    rnd=1, avg=0
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vld1.8          {d0-d2},  [r1], r2
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
NRND    vadd.u16        q1,  q1,  q13
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
NRND    vadd.u16        q0,  q0,  q13
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
        vst1.8          {q15},    [r0,:128], r2
        vld1.8          {d0-d2},  [r1], r2
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
NRND    vadd.u16        q1,  q1,  q13
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
NRND    vadd.u16        q0,  q0,  q13
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
        vst1.8          {q15},    [r0,:128], r2
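
@ The pixels8* macros below are the 8-pixel-wide counterparts of the
@ pixels16* family, using d registers (one per row) instead of q
@ registers; the interpolation arithmetic is identical.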
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64], r2
        vld1.8          {d6},     [r0,:64], r2
        vld1.8          {d7},     [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #2
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
.macro  pixels8_y2      rnd=1, avg=0
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
.macro  pixels8_xy2     rnd=1, avg=0
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vld1.8          {q0},     [r1], r2
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vld1.8          {q1},     [r1], r2
        vld1.8          {d7},     [r0,:64]
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vext.8          d6,  d2,  d3,  #1
        vst1.8          {d7},     [r0,:64], r2
        vld1.8          {q0},     [r1], r2
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vld1.8          {d7},     [r0,:64]
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vst1.8          {d7},     [r0,:64], r2
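
@ pixfunc instantiates one of the macros above as an exported function,
@ first binding the avg/shrn/NRND helpers to rounding or non-rounding
@ instructions: with rnd=1, avg/shrn expand to vrhadd.u8/vrshrn.u16 and
@ NRND swallows its argument; with rnd=0 they expand to vhadd.u8/vshrn.u16
@ and NRND emits its argument verbatim.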
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           rnd=\rnd, avg=\avg
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
endfunc
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name, rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm
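
@ pixfunc2 emits both the rounding and the _no_rnd variant of a function.
@ The h264 qpel*_mc00 entry points are the full-pel (no interpolation)
@ cases, so they can fall straight through into the plain put/avg pixel
@ functions instantiated right after them.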
function ff_put_h264_qpel16_mc00_neon, export=1

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0
function ff_avg_h264_qpel16_mc00_neon, export=1

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1
function ff_put_h264_qpel8_mc00_neon, export=1

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0
function ff_avg_h264_qpel8_mc00_neon, export=1

        pixfunc         avg_, pixels8,     avg=1
        pixfunc2        avg_, pixels8_x2,  avg=1
        pixfunc2        avg_, pixels8_y2,  avg=1
        pixfunc2        avg_, pixels8_xy2, avg=1
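
@ ff_put_pixels_clamped_neon: narrow a block of int16 coefficients to
@ unsigned bytes with saturation and store them row by row.
@ Scalar sketch: dst[i] = av_clip_uint8(block[i]);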
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vld1.16         {d20-d23}, [r0,:128]!
        vld1.16         {d24-d27}, [r0,:128]!
        vld1.16         {d28-d31}, [r0,:128]!
        vst1.8          {d0},      [r1,:64], r2
        vst1.8          {d1},      [r1,:64], r2
        vst1.8          {d2},      [r1,:64], r2
        vst1.8          {d3},      [r1,:64], r2
        vst1.8          {d4},      [r1,:64], r2
        vst1.8          {d5},      [r1,:64], r2
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
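
@ ff_put_signed_pixels_clamped_neon: same idea for signed levels; the
@ bias into the unsigned 0..255 range is applied before the saturating
@ store.  Scalar sketch: dst[i] = av_clip_uint8(block[i] + 128);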
function ff_put_signed_pixels_clamped_neon, export=1
        vld1.16         {d16-d17}, [r0,:128]!
        vld1.16         {d18-d19}, [r0,:128]!
        vld1.16         {d16-d17}, [r0,:128]!
        vld1.16         {d18-d19}, [r0,:128]!
        vld1.16         {d20-d21}, [r0,:128]!
        vld1.16         {d22-d23}, [r0,:128]!
        vst1.8          {d0},      [r1,:64], r2
        vst1.8          {d1},      [r1,:64], r2
        vst1.8          {d2},      [r1,:64], r2
        vld1.16         {d24-d25}, [r0,:128]!
        vld1.16         {d26-d27}, [r0,:128]!
        vst1.8          {d3},      [r1,:64], r2
        vst1.8          {d4},      [r1,:64], r2
        vst1.8          {d5},      [r1,:64], r2
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
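
@ ff_add_pixels_clamped_neon: add int16 coefficients to the existing
@ pixels and store back with saturation, interleaving loads and stores
@ to hide memory latency.
@ Scalar sketch: dst[i] = av_clip_uint8(dst[i] + block[i]);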
function ff_add_pixels_clamped_neon, export=1
        vld1.8          {d16},   [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vld1.8          {d18},   [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vst1.8          {d0},    [r3,:64], r2
        vld1.8          {d19},   [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vst1.8          {d2},    [r3,:64], r2
        vld1.8          {d16},   [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vst1.8          {d4},    [r3,:64], r2
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vst1.8          {d6},    [r3,:64], r2
        vld1.8          {d18},   [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vst1.8          {d0},    [r3,:64], r2
        vld1.8          {d19},   [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vst1.8          {d2},    [r3,:64], r2
        vst1.8          {d4},    [r3,:64], r2
        vst1.8          {d6},    [r3,:64], r2
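
@ ff_vector_fmul_window_neon(dst, src0, src1, win, len): windowed
@ overlap-add as used by MDCT-based codecs.  C sketch of the intended
@ behaviour, with i counting up from -len and j down from len-1:
@     dst[i] = src0[i] * win[j] - src1[j] * win[i];
@     dst[j] = src0[i] * win[i] + src1[j] * win[j];
@ hence the paired pointers walking in from both ends (r5 acts as a
@ negative stride for the downward-moving r2/r4 pointers).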
function ff_vector_fmul_window_neon, export=1
        add             r2,  r2,  r5,  lsl #2
        add             r4,  r3,  r5,  lsl #3
        add             ip,  r0,  r5,  lsl #3
        vld1.32         {d0,d1},   [r1,:128]!
        vld1.32         {d2,d3},   [r2,:128], r5
        vld1.32         {d4,d5},   [r3,:128]!
        vld1.32         {d6,d7},   [r4,:128], r5
        vld1.32         {d0,d1},   [r1,:128]!
        vld1.32         {d18,d19}, [r2,:128], r5
        vld1.32         {d24,d25}, [r3,:128]!
        vld1.32         {d6,d7},   [r4,:128], r5
        vst1.32         {d20,d21}, [r0,:128]!
        vst1.32         {d22,d23}, [ip,:128], r5
2:      vmla.f32        d22, d3,  d7
        vst1.32         {d20,d21}, [r0,:128]!
        vst1.32         {d22,d23}, [ip,:128], r5

#if CONFIG_VORBIS_DECODER
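@ ff_vorbis_inverse_coupling_neon(mag, ang, blocksize): undo Vorbis
@ square-polar channel coupling.  C reference behaviour (sketch):
@     if (mag[i] > 0.0f)
@         if (ang[i] > 0.0f) { ang[i]  = mag[i] - ang[i]; }
@         else               { t = ang[i]; ang[i] = mag[i]; mag[i] += t; }
@     else
@         if (ang[i] > 0.0f) { ang[i] += mag[i]; }
@         else               { t = ang[i]; ang[i] = mag[i]; mag[i] -= t; }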
function ff_vorbis_inverse_coupling_neon, export=1
        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},   [r1,:128]!
        vld1.32         {d0-d1},   [r0,:128]!
        vst1.32         {d24-d25}, [r3, :128]!
        vst1.32         {d22-d23}, [r12,:128]!
        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
2:      vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
3:      vld1.32         {d2-d3},   [r1,:128]
        vld1.32         {d0-d1},   [r0,:128]
        vst1.32         {d2-d3},   [r0,:128]!
        vst1.32         {d0-d1},   [r1,:128]!
#endif
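
@ ff_vector_fmul_scalar_neon(dst, src, mul, len): dst[i] = src[i] * mul.
@ VFP-prefixed lines are assembled only for hard-float (VFP argument
@ passing) builds, where the scalar arrives in s0/d0 rather than in a
@ core register.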
function ff_vector_fmul_scalar_neon, export=1
VFP     vdup.32         q8,  d0[0]
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q1}, [r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2}, [r1,:128]!
        vld1.32         {q3}, [r1,:128]!
        vst1.32         {q0}, [r0,:128]!
        vst1.32         {q1}, [r0,:128]!
        vld1.32         {q0}, [r1,:128]!
        vst1.32         {q2}, [r0,:128]!
        vld1.32         {q1}, [r1,:128]!
        vst1.32         {q3}, [r0,:128]!
2:      vst1.32         {q2}, [r0,:128]!
        vst1.32         {q3}, [r0,:128]!
3:      vld1.32         {q0}, [r1,:128]!
        vst1.32         {q0}, [r0,:128]!
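
@ ff_butterflies_float_neon(v1, v2, len): in-place butterfly.
@ C sketch:  t = v1[i] - v2[i];  v1[i] += v2[i];  v2[i] = t;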
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0}, [r0,:128]
        vld1.32         {q1}, [r1,:128]
        vst1.32         {q2}, [r1,:128]!
        vst1.32         {q1}, [r0,:128]!
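
@ ff_scalarproduct_float_neon(v1, v2, len): dot product of two float
@ vectors; partial sums are accumulated in a q register and reduced at
@ the end.  The NOVFP line moves the float result into r0 for the
@ soft-float return convention.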
function ff_scalarproduct_float_neon, export=1
1:      vld1.32         {q0}, [r0,:128]!
        vld1.32         {q1}, [r1,:128]!
NOVFP   vmov.32         r0,  d0[0]
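
@ ff_vector_fmul_reverse_neon(dst, src0, src1, len):
@     dst[i] = src0[i] * src1[len - 1 - i]
@ src1 is walked backwards with a negative stride in r12 while src0 and
@ dst advance forwards.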
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
2:      vst1.32         {q8-q9},  [r0,:128]!
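
@ ff_vector_fmul_add_neon(dst, src0, src1, src2, len):
@     dst[i] = src0[i] * src1[i] + src2[i]
@ The products land in q10-q11 and are summed with src2 (q2-q3) before
@ the combined store.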
function ff_vector_fmul_add_neon, export=1
        vld1.32         {q0-q1},   [r1,:128]!
        vld1.32         {q8-q9},   [r2,:128]!
        vld1.32         {q2-q3},   [r3,:128]!
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        vld1.32         {q0},      [r1,:128]!
        vld1.32         {q8},      [r2,:128]!
        vld1.32         {q1},      [r1,:128]!
        vld1.32         {q9},      [r2,:128]!
        vld1.32         {q2-q3},   [r3,:128]!
        vst1.32         {q12-q13}, [r0,:128]!
2:      vst1.32         {q12-q13}, [r0,:128]!
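
@ ff_vector_clipf_neon(dst, src, min, max, len): clamp each float into
@ [min, max] with vmax/vmin.
@ C sketch: dst[i] = fminf(fmaxf(src[i], min), max);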
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
        vld1.f32        {q2}, [r1,:128]!
        vld1.f32        {q3}, [r1,:128]!
1:      vmax.f32        q8,  q10, q0
        vld1.f32        {q2}, [r1,:128]!
        vld1.f32        {q3}, [r1,:128]!
        vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
2:      vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
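
@ ff_apply_window_int16_neon(dst, src, win, len): window int16 samples
@ with a symmetric window using Q15 fixed-point rounding multiplies
@ (vqrdmulh), processing the two mirrored halves in the same iteration
@ (note the swapped d6/d7 operands for the second half).
@ C sketch: dst[i] = (src[i] * win[i] + (1 << 14)) >> 15;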
function ff_apply_window_int16_neon, export=1
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        vld1.16         {q0}, [r1,:128]!
        vld1.16         {q2}, [r2,:128]!
        vld1.16         {q1}, [r4,:128], r12
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0}, [r0,:128]!
        vst1.16         {q1}, [lr,:128], r12
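
@ ff_vector_clip_int32_neon(dst, src, min, max, len): clamp each int32
@ into [min, max]; eight values (q2-q3) are processed per iteration.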
function ff_vector_clip_int32_neon, export=1
        vld1.32         {q2-q3},  [r1,:128]!
        vst1.32         {q2-q3},  [r0,:128]!