2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ Register-shuffling helper macros. Only the .macro header lines are present
@ in this listing; the bodies are not shown, so the exact permutation each
@ one performs cannot be confirmed from here.
@ transpose_8x8: eight register operands r0..r7.
23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
@ transpose_4x4: four register operands r0..r3.
38 .macro transpose_4x4 r0 r1 r2 r3
@ swap4: eight register operands r0..r7.
45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
@ transpose16_4x4: eight register operands r0..r7.
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ Generates ff_<type>_h264_chroma_mc8_neon for the given \type.
@ NOTE(review): the prologue (register saves, loading of x/y, computation of
@ r7) and the multiply-accumulate / loop-branch lines are not present in this
@ listing; the comments below only describe the visible instructions.
64 .macro h264_chroma_mc8 type
65 function ff_\type\()_h264_chroma_mc8_neon, export=1
@ Weight arithmetic on r4, r5 and r7 (r7 is presumably their product - confirm):
@   r6 = (r5 << 3) - r7
@   ip = (r4 << 3) - r7
@   r4 = r7 - (r4 << 3) - (r5 << 3)
77 rsb r6, r7, r5, lsl #3
78 rsb ip, r7, r4, lsl #3
79 sub r4, r7, r4, lsl #3
80 sub r4, r4, r5, lsl #3
@ First path: two row pointers (r1 and r5), each post-incremented by r4.
@ Each load fetches 16 bytes so a one-byte-shifted copy can be formed.
90 vld1.64 {d4, d5}, [r1], r4
92 vld1.64 {d6, d7}, [r5], r4
101 vld1.64 {d4, d5}, [r1], r4
@ d5 = the row in d4:d5 shifted left by one byte (the x+1 neighbours).
103 vext.8 d5, d4, d5, #1
@ Rounding narrowing shift by 6 turns the 16-bit sums back into bytes.
110 vrshrn.u16 d16, q8, #6
111 vld1.64 {d6, d7}, [r5], r4
113 vrshrn.u16 d17, q9, #6
@ Two existing rows are read via lr and rounding-averaged with the result.
@ Presumably assembled for the avg variant only (guard not visible here).
115 vld1.64 {d20}, [lr,:64], r2
116 vld1.64 {d21}, [lr,:64], r2
117 vrhadd.u8 q8, q8, q10
119 vext.8 d7, d6, d7, #1
@ Store two 8-byte output rows to r0 (64-bit aligned), stride r2.
120 vst1.64 {d16}, [r0,:64], r2
121 vst1.64 {d17}, [r0,:64], r2
@ Second path: only 8 bytes per row are loaded and no vext is visible.
135 vld1.64 {d4}, [r1], r4
136 vld1.64 {d6}, [r5], r4
141 vld1.64 {d4}, [r1], r4
144 vld1.64 {d6}, [r5], r4
145 vrshrn.u16 d16, q8, #6
146 vrshrn.u16 d17, q9, #6
148 vld1.64 {d20}, [lr,:64], r2
149 vld1.64 {d21}, [lr,:64], r2
150 vrhadd.u8 q8, q8, q10
154 vst1.64 {d16}, [r0,:64], r2
155 vst1.64 {d17}, [r0,:64], r2
@ Third path (label 4): rows are read consecutively from r1 with stride r2
@ and each row gets a one-byte-shifted copy via vext.
160 4: vld1.64 {d4, d5}, [r1], r2
161 vld1.64 {d6, d7}, [r1], r2
162 vext.8 d5, d4, d5, #1
163 vext.8 d7, d6, d7, #1
169 vld1.64 {d4, d5}, [r1], r2
173 vext.8 d5, d4, d5, #1
174 vrshrn.u16 d16, q8, #6
175 vrshrn.u16 d17, q9, #6
177 vld1.64 {d20}, [lr,:64], r2
178 vld1.64 {d21}, [lr,:64], r2
179 vrhadd.u8 q8, q8, q10
181 vld1.64 {d6, d7}, [r1], r2
182 vext.8 d7, d6, d7, #1
183 vst1.64 {d16}, [r0,:64], r2
184 vst1.64 {d17}, [r0,:64], r2
191 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ Generates ff_<type>_h264_chroma_mc4_neon for the given \type.
@ NOTE(review): prologue, multiplies and branches are not present in this
@ listing; comments describe only the visible instructions.
192 .macro h264_chroma_mc4 type
193 function ff_\type\()_h264_chroma_mc4_neon, export=1
@ Same weight arithmetic as the mc8 macro:
@   r6 = (r5 << 3) - r7,  ip = (r4 << 3) - r7,
@   r4 = r7 - (r4 << 3) - (r5 << 3)
205 rsb r6, r7, r5, lsl #3
206 rsb ip, r7, r4, lsl #3
207 sub r4, r7, r4, lsl #3
208 sub r4, r4, r5, lsl #3
@ First path: two row pointers r1/r5, post-incremented by r4; 8 bytes per row.
218 vld1.64 {d4}, [r1], r4
220 vld1.64 {d6}, [r5], r4
@ One-byte-shifted copies of each row.
223 vext.8 d5, d4, d5, #1
224 vext.8 d7, d6, d7, #1
234 vld1.64 {d4}, [r1], r4
235 vext.8 d5, d4, d5, #1
239 vld1.64 {d6}, [r5], r4
@ Fold the two halves of each 16-bit accumulator together, then narrow with
@ a rounding shift by 6; d16 then holds two 4-pixel rows.
240 vadd.i16 d16, d16, d17
241 vadd.i16 d17, d18, d19
242 vrshrn.u16 d16, q8, #6
@ Two existing 4-byte rows are read via lr and rounding-averaged in.
@ Presumably avg variant only (guard not visible here).
246 vld1.32 {d20[0]}, [lr,:32], r2
247 vld1.32 {d20[1]}, [lr,:32], r2
248 vrhadd.u8 d16, d16, d20
250 vext.8 d7, d6, d7, #1
@ Store two 4-byte rows to r0 (32-bit aligned), stride r2.
252 vst1.32 {d16[0]}, [r0,:32], r2
253 vst1.32 {d16[1]}, [r0,:32], r2
@ Second path: two 4-byte rows are packed into the two lanes of d4.
266 vext.32 d1, d0, d1, #1
269 vld1.32 {d4[0]}, [r1], r4
270 vld1.32 {d4[1]}, [r5], r4
274 vld1.32 {d4[0]}, [r1], r4
276 vld1.32 {d4[1]}, [r5], r4
277 vadd.i16 d16, d16, d17
278 vadd.i16 d17, d18, d19
279 vrshrn.u16 d16, q8, #6
281 vld1.32 {d20[0]}, [lr,:32], r2
282 vld1.32 {d20[1]}, [lr,:32], r2
283 vrhadd.u8 d16, d16, d20
287 vst1.32 {d16[0]}, [r0,:32], r2
288 vst1.32 {d16[1]}, [r0,:32], r2
@ Third path (label 4): rows read consecutively from r1 with stride r2.
293 4: vld1.64 {d4}, [r1], r2
294 vld1.64 {d6}, [r1], r2
295 vext.8 d5, d4, d5, #1
296 vext.8 d7, d6, d7, #1
@ Label 5 opens with a widening multiply of d4 by the weights in d0.
300 5: vmull.u8 q8, d4, d0
303 vld1.64 {d4}, [r1], r2
304 vext.8 d5, d4, d5, #1
306 vadd.i16 d16, d16, d17
307 vadd.i16 d17, d18, d19
309 vrshrn.u16 d16, q8, #6
311 vld1.32 {d20[0]}, [lr,:32], r2
312 vld1.32 {d20[1]}, [lr,:32], r2
313 vrhadd.u8 d16, d16, d20
315 vld1.64 {d6}, [r1], r2
316 vext.8 d7, d6, d7, #1
319 vst1.32 {d16[0]}, [r0,:32], r2
320 vst1.32 {d16[1]}, [r0,:32], r2
@ Generates ff_<type>_h264_chroma_mc2_neon for the given \type.
@ NOTE(review): prologue, multiplies, pointer rewinds and branches are not
@ present in this listing; comments describe only the visible instructions.
327 .macro h264_chroma_mc2 type
328 function ff_\type\()_h264_chroma_mc2_neon, export=1
@ Weight arithmetic, here on r4, lr and r5:
@   r6  = (lr << 3) - r5
@   r12 = (r4 << 3) - r5
@   r4  = r5 - (r4 << 3) - (lr << 3)
338 rsb r6, r5, lr, lsl #3
339 rsb r12, r5, r4, lsl #3
340 sub r4, r5, r4, lsl #3
341 sub r4, r4, lr, lsl #3
@ Load 4 bytes from each of three rows; the third load has no writeback,
@ so r1 stays on that row.
349 vld1.32 {d4[0]}, [r1], r2
350 vld1.32 {d4[1]}, [r1], r2
352 vld1.32 {d5[1]}, [r1]
@ q3 = q2 rotated by one byte (neighbouring samples).
353 vext.8 q3, q2, q2, #1
@ Read two existing 2-byte rows from r0. The second load has no writeback;
@ r0 is presumably rewound before the stores (not visible here).
358 vld1.16 {d18[0]}, [r0,:16], r2
359 vld1.16 {d18[1]}, [r0,:16]
363 vadd.i16 d16, d16, d17
@ Rounding narrowing shift by 6 back to bytes.
364 vrshrn.u16 d16, q8, #6
@ Rounding average with the rows read from r0 (presumably avg variant only).
366 vrhadd.u8 d16, d16, d18
@ Store two 2-byte rows.
368 vst1.16 {d16[0]}, [r0,:16], r2
369 vst1.16 {d16[1]}, [r0,:16], r2
@ Second path: two 2-byte rows are loaded straight into d16 and stored
@ (optionally averaged with the rows read from r0) with no filtering visible.
380 vld1.16 {d16[0]}, [r1], r2
381 vld1.16 {d16[1]}, [r1], r2
382 vld1.16 {d18[0]}, [r0,:16], r2
383 vld1.16 {d18[1]}, [r0,:16]
385 vrhadd.u8 d16, d16, d18
386 vst1.16 {d16[0]}, [r0,:16], r2
387 vst1.16 {d16[1]}, [r0,:16], r2
405 /* H.264 loop filter */
@ Common entry sequence used by every loop-filter function below.
@ NOTE(review): the instructions that load ip and act on the resulting flags
@ are not present in this listing.
407 .macro h264_loop_filter_start
@ ip &= ip << 16, then ip &= ip << 8 (flag-setting): this ANDs all four bytes
@ of ip together into its top byte and sets the condition flags on the result.
414 and ip, ip, ip, lsl #16
417 ands ip, ip, ip, lsl #8
@ Luma deblocking kernel operating on 16 samples per register.
@ Register roles, as given by the existing inline comments:
@   r2 = alpha, r3 = beta,
@   q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2.
@ NOTE(review): a number of instructions of this macro (mask combination,
@ clipping, final narrowing) are not present in this listing.
422 .macro h264_loop_filter_luma
423 vdup.8 q11, r2 @ alpha
425 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
427 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
429 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
@ Replicate the low 16 bits of every 32-bit lane of q12 into its high half.
430 vsli.32 q12, q12, #16
431 vclt.u8 q6, q6, q11 @ < alpha
432 vdup.8 q11, r3 @ beta
434 vclt.u8 q14, q14, q11 @ < beta
435 vclt.u8 q15, q15, q11 @ < beta
437 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
439 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
440 vclt.u8 q4, q4, q11 @ < beta
442 vclt.u8 q5, q5, q11 @ < beta
@ q14 = rounding average of p0 and q0.
446 vrhadd.u8 q14, q8, q0
@ q10 = (p2 + avg(p0,q0)) >> 1 ; q14 = (q2 + avg(p0,q0)) >> 1
449 vhadd.u8 q10, q10, q14
451 vhadd.u8 q14, q2, q14
@ Saturating p1 - q12 and q1 - q12; q12 presumably holds the tc0 clip
@ values at this point (its setup is not fully visible).
453 vqsub.u8 q11, q9, q12
456 vqsub.u8 q11, q1, q12
459 vmax.u8 q14, q14, q11
@ High-half delta computation on 16-bit lanes: subtract p0 (d17), shift
@ left by 2, add p1 (d19), subtract q1 (d3), then round-shift by 3.
@ q10 presumably holds the widened high half of q0 before this (not visible).
462 vsubw.u8 q10, q10, d17
464 vshl.i16 q10, q10, #2
466 vaddw.u8 q10, q10, d19
468 vsubw.u8 q10, q10, d3
469 vrshrn.i16 d4, q2, #3
470 vrshrn.i16 d5, q10, #3
@ Apply the signed delta in d4/d5: added to q14, subtracted from q11/q12
@ (presumably widened p0 and q0 respectively - widening not visible).
480 vaddw.s8 q14, q14, d4
482 vsubw.s8 q11, q11, d4
483 vsubw.s8 q12, q12, d5
@ Exported luma loop-filter entry that works on whole 16-byte rows.
@ r0 = pixel pointer (128-bit aligned accesses), r1 = row stride.
@ NOTE(review): register save/restore and the return are not visible here.
490 function ff_h264_v_loop_filter_luma_neon, export=1
491 h264_loop_filter_start
@ Load three rows starting at r0 into q0, q1, q2.
493 vld1.64 {d0, d1}, [r0,:128], r1
494 vld1.64 {d2, d3}, [r0,:128], r1
495 vld1.64 {d4, d5}, [r0,:128], r1
@ Step back 4 + 2 = 6 rows, i.e. to three rows before the starting row.
496 sub r0, r0, r1, lsl #2
497 sub r0, r0, r1, lsl #1
@ Load the three preceding rows into q10, q9, q8 (p2, p1, p0 in the macro).
498 vld1.64 {d20,d21}, [r0,:128], r1
499 vld1.64 {d18,d19}, [r0,:128], r1
500 vld1.64 {d16,d17}, [r0,:128], r1
504 h264_loop_filter_luma
@ Step back two rows and write four rows from q4, q8, q0, q5.
506 sub r0, r0, r1, lsl #1
507 vst1.64 {d8, d9}, [r0,:128], r1
508 vst1.64 {d16,d17}, [r0,:128], r1
509 vst1.64 {d0, d1}, [r0,:128], r1
510 vst1.64 {d10,d11}, [r0,:128]
@ Exported luma loop-filter entry that reads 8 bytes from each of 16 rows,
@ transposes them, runs the shared luma kernel and writes 4 bytes per row.
@ r0 = pixel pointer, r1 = row stride.
@ NOTE(review): the initial r0 adjustment, register save/restore and the
@ return are not visible in this listing.
516 function ff_h264_h_loop_filter_luma_neon, export=1
517 h264_loop_filter_start
@ Rows 0-7 go to the low halves of q3, q10, q9, q8, q0, q1, q2, q13.
520 vld1.64 {d6}, [r0], r1
521 vld1.64 {d20}, [r0], r1
522 vld1.64 {d18}, [r0], r1
523 vld1.64 {d16}, [r0], r1
524 vld1.64 {d0}, [r0], r1
525 vld1.64 {d2}, [r0], r1
526 vld1.64 {d4}, [r0], r1
527 vld1.64 {d26}, [r0], r1
@ Rows 8-15 go to the high halves of the same registers.
528 vld1.64 {d7}, [r0], r1
529 vld1.64 {d21}, [r0], r1
530 vld1.64 {d19}, [r0], r1
531 vld1.64 {d17}, [r0], r1
532 vld1.64 {d1}, [r0], r1
533 vld1.64 {d3}, [r0], r1
534 vld1.64 {d5}, [r0], r1
535 vld1.64 {d27}, [r0], r1
537 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
541 h264_loop_filter_luma
@ Transpose the four result registers back to row order.
543 transpose_4x4 q4, q8, q0, q5
@ Rewind r0 by the 16 rows consumed by the loads above.
545 sub r0, r0, r1, lsl #4
@ Write 4 bytes per row, one 32-bit lane of q4/q8/q0/q5 at a time.
547 vst1.32 {d8[0]}, [r0], r1
548 vst1.32 {d16[0]}, [r0], r1
549 vst1.32 {d0[0]}, [r0], r1
550 vst1.32 {d10[0]}, [r0], r1
551 vst1.32 {d8[1]}, [r0], r1
552 vst1.32 {d16[1]}, [r0], r1
553 vst1.32 {d0[1]}, [r0], r1
554 vst1.32 {d10[1]}, [r0], r1
555 vst1.32 {d9[0]}, [r0], r1
556 vst1.32 {d17[0]}, [r0], r1
557 vst1.32 {d1[0]}, [r0], r1
558 vst1.32 {d11[0]}, [r0], r1
559 vst1.32 {d9[1]}, [r0], r1
560 vst1.32 {d17[1]}, [r0], r1
561 vst1.32 {d1[1]}, [r0], r1
562 vst1.32 {d11[1]}, [r0], r1
@ Chroma deblocking kernel operating on 8 samples per register.
@ Register roles, as given by the existing inline comments:
@   r2 = alpha, r3 = beta, d18 = p1, d16 = p0, d0 = q0, d2 = q1.
@ NOTE(review): several instructions of this macro are not present in this
@ listing (delta computation, masking, final narrowing).
568 .macro h264_loop_filter_chroma
569 vdup.8 d22, r2 @ alpha
571 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
573 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
577 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
579 vclt.u8 d26, d26, d22 @ < alpha
581 vdup.8 d22, r3 @ beta
@ Rounding narrowing shift by 3 of the 16-bit delta held in q2.
582 vrshrn.i16 d4, q2, #3
583 vclt.u8 d28, d28, d22 @ < beta
584 vclt.u8 d30, d30, d22 @ < beta
@ Apply the signed delta in d4: added to q14, subtracted from q11
@ (presumably widened p0 and q0 - widening not visible here).
593 vaddw.s8 q14, q14, d4
594 vsubw.s8 q11, q11, d4
@ Exported chroma loop-filter entry that works on whole 8-byte rows.
@ r0 = pixel pointer (64-bit aligned accesses), r1 = row stride.
@ NOTE(review): the return instruction is not visible in this listing.
599 function ff_h264_v_loop_filter_chroma_neon, export=1
600 h264_loop_filter_start
@ Start two rows before r0 and load four rows: p1, p0, q0, q1.
602 sub r0, r0, r1, lsl #1
603 vld1.64 {d18}, [r0,:64], r1
604 vld1.64 {d16}, [r0,:64], r1
605 vld1.64 {d0}, [r0,:64], r1
606 vld1.64 {d2}, [r0,:64]
608 h264_loop_filter_chroma
@ The last load had no writeback, so r0 sits on the q1 row; step back two
@ rows and write the updated p0 and q0 rows.
610 sub r0, r0, r1, lsl #1
611 vst1.64 {d16}, [r0,:64], r1
612 vst1.64 {d0}, [r0,:64], r1
@ Exported chroma loop-filter entry that reads 4 bytes from each of 8 rows,
@ filters them and writes the same 4 bytes per row back.
@ r0 = pixel pointer, r1 = row stride.
@ NOTE(review): the initial r0 adjustment, the transposes around the kernel
@ and the return are not visible in this listing.
617 function ff_h264_h_loop_filter_chroma_neon, export=1
618 h264_loop_filter_start
@ Rows 0-3 fill lane 0 and rows 4-7 fill lane 1 of d18, d16, d0, d2.
621 vld1.32 {d18[0]}, [r0], r1
622 vld1.32 {d16[0]}, [r0], r1
623 vld1.32 {d0[0]}, [r0], r1
624 vld1.32 {d2[0]}, [r0], r1
625 vld1.32 {d18[1]}, [r0], r1
626 vld1.32 {d16[1]}, [r0], r1
627 vld1.32 {d0[1]}, [r0], r1
628 vld1.32 {d2[1]}, [r0], r1
635 h264_loop_filter_chroma
@ Rewind r0 by the 8 rows consumed above and store in the same lane order.
642 sub r0, r0, r1, lsl #3
643 vst1.32 {d18[0]}, [r0], r1
644 vst1.32 {d16[0]}, [r0], r1
645 vst1.32 {d0[0]}, [r0], r1
646 vst1.32 {d2[0]}, [r0], r1
647 vst1.32 {d18[1]}, [r0], r1
648 vst1.32 {d16[1]}, [r0], r1
649 vst1.32 {d0[1]}, [r0], r1
650 vst1.32 {d2[1]}, [r0], r1
@ lowpass_const takes one operand, \r. Its body is not present in this
@ listing; the qpel entry points invoke it (with r3) before filtering, so it
@ presumably sets up the d6 multipliers used by the lowpass macros - confirm.
657 .macro lowpass_const r
@ Filters two 8-pixel rows at once.
@   \r0:\r1  first input row (16 bytes across two d registers)
@   \r2:\r3  second input row
@   \d0, \d1 destinations for the two filtered rows
@   narrow   defaults to 1; the lines that test it are not visible here
@ t0/t1 are register aliases defined by lines not present in this listing.
@ d6[0] and d6[1] hold the two filter multipliers (values not visible here).
@ Clobbers at least d2-d5, d18-d21, d30, d31 as seen below.
663 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
@ Byte-shifted views of row 0: offsets 2,3 (centre pair) and 1,4 (next pair).
671 vext.8 d2, \r0, \r1, #2
672 vext.8 d3, \r0, \r1, #3
674 vext.8 d4, \r0, \r1, #1
675 vext.8 d5, \r0, \r1, #4
@ t0 = row0[x] + row0[x+5] (outer taps), widened to 16 bits.
677 vext.8 d30, \r0, \r1, #5
678 vaddl.u8 t0, \r0, d30
679 vext.8 d18, \r2, \r3, #2
@ t0 += q1 * d6[1]; q1 presumably holds the widened centre-pair sum
@ (the vaddl producing it is not visible for row 0).
680 vmla.i16 t0, q1, d6[1]
681 vext.8 d19, \r2, \r3, #3
@ q9 = row1[x+2] + row1[x+3]
682 vaddl.u8 q9, d18, d19
683 vext.8 d20, \r2, \r3, #1
@ t0 -= q2 * d6[0]
684 vmls.i16 t0, q2, d6[0]
685 vext.8 d21, \r2, \r3, #4
@ q10 = row1[x+1] + row1[x+4]
686 vaddl.u8 q10, d20, d21
687 vext.8 d31, \r2, \r3, #5
@ t1 = row1[x] + row1[x+5], then the same multiply-add/subtract as row 0.
688 vaddl.u8 t1, \r2, d31
689 vmla.i16 t1, q9, d6[1]
690 vmls.i16 t1, q10, d6[0]
@ Saturating rounding shift right by 5, narrowing to unsigned bytes.
692 vqrshrun.s16 \d0, t0, #5
693 vqrshrun.s16 \d1, t1, #5
@ Single-row counterpart of lowpass_8.
@   \r0:\r1  input row (16 bytes across two d registers)
@   \d0      destination for the filtered row
@   narrow   defaults to 1; the lines that test it are not visible here
@ t0 is a register alias defined by a line not present in this listing.
699 .macro lowpass_8_1 r0, r1, d0, narrow=1
@ Byte-shifted views at offsets 2,3 and 1,4.
705 vext.8 d2, \r0, \r1, #2
706 vext.8 d3, \r0, \r1, #3
708 vext.8 d4, \r0, \r1, #1
709 vext.8 d5, \r0, \r1, #4
@ t0 = row[x] + row[x+5], then += q1 * d6[1] and -= q2 * d6[0].
711 vext.8 d30, \r0, \r1, #5
712 vaddl.u8 t0, \r0, d30
713 vmla.i16 t0, q1, d6[1]
714 vmls.i16 t0, q2, d6[0]
@ Saturating rounding shift right by 5, narrowing to unsigned bytes.
716 vqrshrun.s16 \d0, t0, #5
@ Lowpass pass over 16-bit intermediate samples with 32-bit accumulation.
@   \r0, \r1         q registers holding the 16-bit input
@   \l0, \h0, \l1, \h1  d-register halves of \r0 and \r1
@   \d               destination operand (its use is not visible here)
@ NOTE(review): roughly half of this macro's instructions are not present
@ in this listing, so only the visible steps are described.
721 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
@ Views of the input shifted by 2, 3, 1 and 4 sixteen-bit elements.
722 vext.16 q1, \r0, \r1, #2
723 vext.16 q0, \r0, \r1, #3
725 vext.16 q2, \r0, \r1, #1
727 vext.16 q3, \r0, \r1, #4
@ q10 = low halves of the offset-1 and offset-4 views added, widened to 32 bits.
728 vaddl.s16 q10, d4, d6
@ \r1 is overwritten with the offset-5 view.
729 vext.16 \r1, \r0, \r1, #5
@ Outer-tap sums (offset 0 + offset 5), widened to 32 bits.
731 vaddl.s16 q0, \h0, \h1
732 vaddl.s16 q8, \l0, \l1
@ q10 = q10 * 5, computed as (q10 << 2) + q10.
736 vshl.i32 q15, q10, #2
738 vadd.i32 q10, q10, q15
@ Rounding shift right by 10, narrowing back to 16 bits.
752 vrshrn.s32 d18, q9, #10
753 vrshrn.s32 d19, q1, #10
@ 16-wide horizontal lowpass built from two calls to the 8-wide put routine:
@ one call via bl, then a tail branch.
@ NOTE(review): the lr save/restore and the register setup between the two
@ calls are not present in this listing.
758 function put_h264_qpel16_h_lowpass_neon_packed
762 bl put_h264_qpel8_h_lowpass_neon
@ Rewind the source pointer r1 by 16 rows of stride r2.
763 sub r1, r1, r2, lsl #4
@ Tail call: the second invocation returns directly to this function's caller.
767 b put_h264_qpel8_h_lowpass_neon
@ Generates <type>_h264_qpel16_h_lowpass_neon and
@ <type>_h264_qpel8_h_lowpass_neon; instantiated below for put and avg.
@ Register use visible here: r0 = destination (64-bit aligned, stride r3),
@ r1 = source (stride r2).
@ NOTE(review): loop counter handling, lr save/restore, .ifc guards and the
@ returns are not present in this listing.
770 .macro h264_qpel_h_lowpass type
771 function \type\()_h264_qpel16_h_lowpass_neon
774 bl \type\()_h264_qpel8_h_lowpass_neon
@ Rewind both pointers by 16 rows after the first 8-wide pass.
775 sub r0, r0, r3, lsl #4
776 sub r1, r1, r2, lsl #4
783 function \type\()_h264_qpel8_h_lowpass_neon
@ Loop body (label 1): load two 16-byte source rows.
784 1: vld1.64 {d0, d1}, [r1], r2
785 vld1.64 {d16,d17}, [r1], r2
@ Filter both rows; results land in d0 and d16.
787 lowpass_8 d0, d1, d16, d17, d0, d16
@ Read two existing destination rows and average the second with d16.
@ Presumably avg variant only; the matching average for d0 and the r0
@ rewind are not visible here.
789 vld1.8 {d2}, [r0,:64], r3
791 vld1.8 {d3}, [r0,:64]
792 vrhadd.u8 d16, d16, d3
@ Store the two filtered rows.
795 vst1.64 {d0}, [r0,:64], r3
796 vst1.64 {d16}, [r0,:64], r3
802 h264_qpel_h_lowpass put
803 h264_qpel_h_lowpass avg
@ Generates the "l2" horizontal lowpass routines (16- and 8-wide), which
@ average the filtered rows with a second source read through r3.
@ Instantiated below for put and avg.
@ Register use visible here: r0 = destination, r1 = source, r3 = second
@ source, r2 = stride applied to all three pointers.
@ NOTE(review): loop counter handling, lr save/restore, .ifc guards and the
@ returns are not present in this listing.
805 .macro h264_qpel_h_lowpass_l2 type
806 function \type\()_h264_qpel16_h_lowpass_l2_neon
809 bl \type\()_h264_qpel8_h_lowpass_l2_neon
@ Rewind all three pointers by 16 rows after the first 8-wide pass.
810 sub r0, r0, r2, lsl #4
811 sub r1, r1, r2, lsl #4
812 sub r3, r3, r2, lsl #4
820 function \type\()_h264_qpel8_h_lowpass_l2_neon
@ Loop body (label 1): two 16-byte source rows plus two 8-byte rows of the
@ second source (into q14).
821 1: vld1.64 {d0, d1}, [r1], r2
822 vld1.64 {d16,d17}, [r1], r2
823 vld1.64 {d28}, [r3], r2
824 vld1.64 {d29}, [r3], r2
@ Filter; both result rows are packed into q0 (d0, d1).
826 lowpass_8 d0, d1, d16, d17, d0, d1
@ Rounding average with the second source.
827 vrhadd.u8 q0, q0, q14
@ Read two existing destination rows (presumably avg variant only; the
@ averaging instruction itself is not visible here).
829 vld1.8 {d2}, [r0,:64], r2
831 vld1.8 {d3}, [r0,:64]
835 vst1.64 {d0}, [r0,:64], r2
836 vst1.64 {d1}, [r0,:64], r2
842 h264_qpel_h_lowpass_l2 put
843 h264_qpel_h_lowpass_l2 avg
@ 16x16 vertical lowpass built from four calls to the 8x8 put routine
@ (three via bl, the last as a tail branch).
@ NOTE(review): lr save/restore and the horizontal pointer adjustments
@ between calls are not present in this listing.
845 function put_h264_qpel16_v_lowpass_neon_packed
848 bl put_h264_qpel8_v_lowpass_neon
@ Step the source back 4 rows (stride r3) before the next block.
849 sub r1, r1, r3, lsl #2
850 bl put_h264_qpel8_v_lowpass_neon
@ Step the source back 16 + 4 = 20 rows.
851 sub r1, r1, r3, lsl #4
852 sub r1, r1, r3, lsl #2
854 bl put_h264_qpel8_v_lowpass_neon
855 sub r1, r1, r3, lsl #2
@ Tail call for the final block.
857 b put_h264_qpel8_v_lowpass_neon
@ Generates <type>_h264_qpel16_v_lowpass_neon and
@ <type>_h264_qpel8_v_lowpass_neon; instantiated below for put and avg.
@ Register use visible here: r0 = destination (64-bit aligned, stride r2),
@ r1 = source (stride r3).
@ NOTE(review): lr save/restore, horizontal pointer adjustments, .ifc guards,
@ one source-row load and the returns are not present in this listing.
860 .macro h264_qpel_v_lowpass type
861 function \type\()_h264_qpel16_v_lowpass_neon
863 bl \type\()_h264_qpel8_v_lowpass_neon
@ Step the source back 4 rows before the next 8x8 block.
864 sub r1, r1, r3, lsl #2
865 bl \type\()_h264_qpel8_v_lowpass_neon
@ Rewind destination by 16 rows and source by 16 + 4 = 20 rows.
866 sub r0, r0, r2, lsl #4
868 sub r1, r1, r3, lsl #4
869 sub r1, r1, r3, lsl #2
871 bl \type\()_h264_qpel8_v_lowpass_neon
872 sub r1, r1, r3, lsl #2
876 function \type\()_h264_qpel8_v_lowpass_neon
@ Load successive 8-byte source rows: the first eight into the low halves
@ of q4-q7 and q11-q14, the following ones into the high halves.
877 vld1.64 {d8}, [r1], r3
878 vld1.64 {d10}, [r1], r3
879 vld1.64 {d12}, [r1], r3
880 vld1.64 {d14}, [r1], r3
881 vld1.64 {d22}, [r1], r3
882 vld1.64 {d24}, [r1], r3
883 vld1.64 {d26}, [r1], r3
884 vld1.64 {d28}, [r1], r3
885 vld1.64 {d9}, [r1], r3
886 vld1.64 {d11}, [r1], r3
887 vld1.64 {d13}, [r1], r3
888 vld1.64 {d15}, [r1], r3
@ Transpose so each register pair holds one column as a row, reuse the
@ row-wise lowpass on the columns, then transpose the results back.
891 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
892 lowpass_8 d8, d9, d10, d11, d8, d10
893 lowpass_8 d12, d13, d14, d15, d12, d14
894 lowpass_8 d22, d23, d24, d25, d22, d24
895 lowpass_8 d26, d27, d28, d29, d26, d28
896 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
@ Read eight existing destination rows and rounding-average them with the
@ results. Presumably avg variant only; the average for the first row (d8)
@ is not visible here.
899 vld1.8 {d9}, [r0,:64], r2
901 vld1.8 {d11}, [r0,:64], r2
902 vrhadd.u8 d10, d10, d11
903 vld1.8 {d13}, [r0,:64], r2
904 vrhadd.u8 d12, d12, d13
905 vld1.8 {d15}, [r0,:64], r2
906 vrhadd.u8 d14, d14, d15
907 vld1.8 {d23}, [r0,:64], r2
908 vrhadd.u8 d22, d22, d23
909 vld1.8 {d25}, [r0,:64], r2
910 vrhadd.u8 d24, d24, d25
911 vld1.8 {d27}, [r0,:64], r2
912 vrhadd.u8 d26, d26, d27
913 vld1.8 {d29}, [r0,:64], r2
914 vrhadd.u8 d28, d28, d29
@ Rewind the destination by the 8 rows just read.
915 sub r0, r0, r2, lsl #3
@ Store the eight output rows.
918 vst1.64 {d8}, [r0,:64], r2
919 vst1.64 {d10}, [r0,:64], r2
920 vst1.64 {d12}, [r0,:64], r2
921 vst1.64 {d14}, [r0,:64], r2
922 vst1.64 {d22}, [r0,:64], r2
923 vst1.64 {d24}, [r0,:64], r2
924 vst1.64 {d26}, [r0,:64], r2
925 vst1.64 {d28}, [r0,:64], r2
931 h264_qpel_v_lowpass put
932 h264_qpel_v_lowpass avg
@ Generates the "l2" vertical lowpass routines (16x16 and 8x8), which average
@ the filtered block with a second source read through ip.
@ Instantiated below for put and avg.
@ Register use visible here: r0 = destination (stride r3), r1 = source
@ (stride r3), ip = second source (stride r2).
@ NOTE(review): lr save/restore, horizontal pointer adjustments, .ifc guards,
@ one source-row load, two averaging instructions and the returns are not
@ present in this listing.
934 .macro h264_qpel_v_lowpass_l2 type
935 function \type\()_h264_qpel16_v_lowpass_l2_neon
937 bl \type\()_h264_qpel8_v_lowpass_l2_neon
@ Step the source back 4 rows before the next 8x8 block.
938 sub r1, r1, r3, lsl #2
939 bl \type\()_h264_qpel8_v_lowpass_l2_neon
@ Rewind destination and second source by 16 rows, source by 20 rows.
940 sub r0, r0, r3, lsl #4
941 sub ip, ip, r2, lsl #4
944 sub r1, r1, r3, lsl #4
945 sub r1, r1, r3, lsl #2
947 bl \type\()_h264_qpel8_v_lowpass_l2_neon
948 sub r1, r1, r3, lsl #2
952 function \type\()_h264_qpel8_v_lowpass_l2_neon
@ Load successive 8-byte source rows (same layout as the non-l2 routine).
953 vld1.64 {d8}, [r1], r3
954 vld1.64 {d10}, [r1], r3
955 vld1.64 {d12}, [r1], r3
956 vld1.64 {d14}, [r1], r3
957 vld1.64 {d22}, [r1], r3
958 vld1.64 {d24}, [r1], r3
959 vld1.64 {d26}, [r1], r3
960 vld1.64 {d28}, [r1], r3
961 vld1.64 {d9}, [r1], r3
962 vld1.64 {d11}, [r1], r3
963 vld1.64 {d13}, [r1], r3
964 vld1.64 {d15}, [r1], r3
@ Transpose, filter the columns as rows (results packed pairwise into
@ q4, q6, q11, q13), then transpose back.
967 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
968 lowpass_8 d8, d9, d10, d11, d8, d9
969 lowpass_8 d12, d13, d14, d15, d12, d13
970 lowpass_8 d22, d23, d24, d25, d22, d23
971 lowpass_8 d26, d27, d28, d29, d26, d27
972 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
@ Load eight rows of the second source and rounding-average them with the
@ filtered rows (only the q2/q11 and q5/q13 averages are visible here).
974 vld1.64 {d0}, [ip], r2
975 vld1.64 {d1}, [ip], r2
976 vld1.64 {d2}, [ip], r2
977 vld1.64 {d3}, [ip], r2
978 vld1.64 {d4}, [ip], r2
980 vld1.64 {d5}, [ip], r2
982 vld1.64 {d10}, [ip], r2
983 vrhadd.u8 q2, q2, q11
984 vld1.64 {d11}, [ip], r2
985 vrhadd.u8 q5, q5, q13
@ Read eight existing destination rows and average each with its result
@ row (presumably avg variant only - guard not visible here).
988 vld1.8 {d16}, [r0,:64], r3
989 vrhadd.u8 d0, d0, d16
990 vld1.8 {d17}, [r0,:64], r3
991 vrhadd.u8 d1, d1, d17
992 vld1.8 {d16}, [r0,:64], r3
993 vrhadd.u8 d2, d2, d16
994 vld1.8 {d17}, [r0,:64], r3
995 vrhadd.u8 d3, d3, d17
996 vld1.8 {d16}, [r0,:64], r3
997 vrhadd.u8 d4, d4, d16
998 vld1.8 {d17}, [r0,:64], r3
999 vrhadd.u8 d5, d5, d17
1000 vld1.8 {d16}, [r0,:64], r3
1001 vrhadd.u8 d10, d10, d16
1002 vld1.8 {d17}, [r0,:64], r3
1003 vrhadd.u8 d11, d11, d17
@ Rewind the destination by the 8 rows just read.
1004 sub r0, r0, r3, lsl #3
@ Store the eight output rows.
1007 vst1.64 {d0}, [r0,:64], r3
1008 vst1.64 {d1}, [r0,:64], r3
1009 vst1.64 {d2}, [r0,:64], r3
1010 vst1.64 {d3}, [r0,:64], r3
1011 vst1.64 {d4}, [r0,:64], r3
1012 vst1.64 {d5}, [r0,:64], r3
1013 vst1.64 {d10}, [r0,:64], r3
1014 vst1.64 {d11}, [r0,:64], r3
1020 h264_qpel_v_lowpass_l2 put
1021 h264_qpel_v_lowpass_l2 avg
@ Shared core of the 8x8 combined horizontal+vertical lowpass.
@ Visible register use: r1 = source (stride r3), r4 = 128-bit aligned scratch
@ buffer for 16-bit intermediates, ip = stride used when re-reading r4.
@ The final 8x8 byte result is left in d12-d15, d8-d11 (see the last
@ transpose).
@ NOTE(review): the loop counter/branch, the setup of ip and r4 and the
@ return are not present in this listing.
1023 function put_h264_qpel8_hv_lowpass_neon_top
@ Horizontal pass (label 1): two source rows per iteration, filtered with
@ narrow=0 into q11/q12 and appended to the scratch buffer.
1026 1: vld1.64 {d0, d1}, [r1], r3
1027 vld1.64 {d16,d17}, [r1], r3
1029 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
1030 vst1.64 {d22-d25}, [r4,:128]!
@ One further single row, kept in q12 rather than stored.
1033 vld1.64 {d0, d1}, [r1]
1034 lowpass_8_1 d0, d1, q12, narrow=0
@ Re-read twelve intermediate rows from the scratch buffer, stepping by ip
@ (presumably a negative stride so rows come back in reverse - confirm).
1038 vld1.64 {d30,d31}, [r4,:128], ip
1039 vld1.64 {d20,d21}, [r4,:128], ip
1040 vld1.64 {d18,d19}, [r4,:128], ip
1041 vld1.64 {d16,d17}, [r4,:128], ip
1042 vld1.64 {d14,d15}, [r4,:128], ip
1043 vld1.64 {d12,d13}, [r4,:128], ip
1044 vld1.64 {d10,d11}, [r4,:128], ip
1045 vld1.64 {d8, d9}, [r4,:128], ip
1046 vld1.64 {d6, d7}, [r4,:128], ip
1047 vld1.64 {d4, d5}, [r4,:128], ip
1048 vld1.64 {d2, d3}, [r4,:128], ip
1049 vld1.64 {d0, d1}, [r4,:128]
@ Rearrange the 16-bit intermediates so columns can be filtered as rows.
1051 swap4 d1, d3, d5, d7, d8, d10, d12, d14
1052 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
1054 swap4 d17, d19, d21, d31, d24, d26, d28, d22
1055 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
@ Spill eight of the rearranged registers back to the scratch buffer; they
@ are reloaded pairwise further down.
1057 vst1.64 {d30,d31}, [r4,:128]!
1058 vst1.64 {d6, d7}, [r4,:128]!
1059 vst1.64 {d20,d21}, [r4,:128]!
1060 vst1.64 {d4, d5}, [r4,:128]!
1061 vst1.64 {d18,d19}, [r4,:128]!
1062 vst1.64 {d2, d3}, [r4,:128]!
1063 vst1.64 {d16,d17}, [r4,:128]!
1064 vst1.64 {d0, d1}, [r4,:128]
@ Second pass on the register-resident half; last operand is the
@ destination d register (d8..d11).
1066 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
1067 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
1068 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
1069 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
@ Second pass on the spilled half, reloaded two registers at a time into
@ q8/q15; results go to d12..d15.
1071 vld1.64 {d16,d17}, [r4,:128], ip
1072 vld1.64 {d30,d31}, [r4,:128], ip
1073 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
1074 vld1.64 {d16,d17}, [r4,:128], ip
1075 vld1.64 {d30,d31}, [r4,:128], ip
1076 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
1077 vld1.64 {d16,d17}, [r4,:128], ip
1078 vld1.64 {d30,d31}, [r4,:128], ip
1079 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
1080 vld1.64 {d16,d17}, [r4,:128], ip
1081 vld1.64 {d30,d31}, [r4,:128]
1082 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
@ Transpose the byte results back to row order.
1084 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
@ Generates <type>_h264_qpel8_hv_lowpass_neon: runs the shared hv core and
@ writes its 8x8 result (d12-d15, d8-d11) to r0 with stride r2.
@ Instantiated below for put and avg.
@ NOTE(review): lr save/restore, the .ifc guard around the averaging section
@ and the return are not present in this listing.
1089 .macro h264_qpel8_hv_lowpass type
1090 function \type\()_h264_qpel8_hv_lowpass_neon
1092 bl put_h264_qpel8_hv_lowpass_neon_top
@ Read eight existing destination rows and rounding-average each with its
@ result row (presumably avg variant only).
1094 vld1.8 {d0}, [r0,:64], r2
1095 vrhadd.u8 d12, d12, d0
1096 vld1.8 {d1}, [r0,:64], r2
1097 vrhadd.u8 d13, d13, d1
1098 vld1.8 {d2}, [r0,:64], r2
1099 vrhadd.u8 d14, d14, d2
1100 vld1.8 {d3}, [r0,:64], r2
1101 vrhadd.u8 d15, d15, d3
1102 vld1.8 {d4}, [r0,:64], r2
1103 vrhadd.u8 d8, d8, d4
1104 vld1.8 {d5}, [r0,:64], r2
1105 vrhadd.u8 d9, d9, d5
1106 vld1.8 {d6}, [r0,:64], r2
1107 vrhadd.u8 d10, d10, d6
1108 vld1.8 {d7}, [r0,:64], r2
1109 vrhadd.u8 d11, d11, d7
@ Rewind the destination by the 8 rows just read.
1110 sub r0, r0, r2, lsl #3
@ Store the eight output rows.
1113 vst1.64 {d12}, [r0,:64], r2
1114 vst1.64 {d13}, [r0,:64], r2
1115 vst1.64 {d14}, [r0,:64], r2
1116 vst1.64 {d15}, [r0,:64], r2
1117 vst1.64 {d8}, [r0,:64], r2
1118 vst1.64 {d9}, [r0,:64], r2
1119 vst1.64 {d10}, [r0,:64], r2
1120 vst1.64 {d11}, [r0,:64], r2
1127 h264_qpel8_hv_lowpass put
1128 h264_qpel8_hv_lowpass avg
@ Generates <type>_h264_qpel8_hv_lowpass_l2_neon: runs the shared hv core,
@ averages its result with a packed 8x8 block read through r2, and writes
@ to r0 with stride r3. Instantiated below for put and avg.
@ NOTE(review): lr save/restore, the .ifc guard around the destination
@ averaging and the return are not present in this listing.
1130 .macro h264_qpel8_hv_lowpass_l2 type
1131 function \type\()_h264_qpel8_hv_lowpass_l2_neon
1133 bl put_h264_qpel8_hv_lowpass_neon_top
@ Load 64 contiguous, 128-bit aligned bytes from r2 (eight 8-byte rows) and
@ rounding-average them with the hv result in q6, q7, q4, q5.
1135 vld1.64 {d0, d1}, [r2,:128]!
1136 vld1.64 {d2, d3}, [r2,:128]!
1137 vrhadd.u8 q0, q0, q6
1138 vld1.64 {d4, d5}, [r2,:128]!
1139 vrhadd.u8 q1, q1, q7
1140 vld1.64 {d6, d7}, [r2,:128]!
1141 vrhadd.u8 q2, q2, q4
1142 vrhadd.u8 q3, q3, q5
@ Read eight existing destination rows and average each with its result
@ row (presumably avg variant only).
1144 vld1.8 {d16}, [r0,:64], r3
1145 vrhadd.u8 d0, d0, d16
1146 vld1.8 {d17}, [r0,:64], r3
1147 vrhadd.u8 d1, d1, d17
1148 vld1.8 {d18}, [r0,:64], r3
1149 vrhadd.u8 d2, d2, d18
1150 vld1.8 {d19}, [r0,:64], r3
1151 vrhadd.u8 d3, d3, d19
1152 vld1.8 {d20}, [r0,:64], r3
1153 vrhadd.u8 d4, d4, d20
1154 vld1.8 {d21}, [r0,:64], r3
1155 vrhadd.u8 d5, d5, d21
1156 vld1.8 {d22}, [r0,:64], r3
1157 vrhadd.u8 d6, d6, d22
1158 vld1.8 {d23}, [r0,:64], r3
1159 vrhadd.u8 d7, d7, d23
@ Rewind the destination by the 8 rows just read.
1160 sub r0, r0, r3, lsl #3
@ Store the eight output rows.
1162 vst1.64 {d0}, [r0,:64], r3
1163 vst1.64 {d1}, [r0,:64], r3
1164 vst1.64 {d2}, [r0,:64], r3
1165 vst1.64 {d3}, [r0,:64], r3
1166 vst1.64 {d4}, [r0,:64], r3
1167 vst1.64 {d5}, [r0,:64], r3
1168 vst1.64 {d6}, [r0,:64], r3
1169 vst1.64 {d7}, [r0,:64], r3
1176 h264_qpel8_hv_lowpass_l2 put
1177 h264_qpel8_hv_lowpass_l2 avg
@ Generates the 16x16 hv lowpass routines (plain and l2) by running the
@ matching 8x8 routine four times: three bl calls and a final tail branch.
@ NOTE(review): lr save/restore and the horizontal pointer adjustments
@ between blocks are not present in this listing.
1179 .macro h264_qpel16_hv type
1180 function \type\()_h264_qpel16_hv_lowpass_neon
1182 bl \type\()_h264_qpel8_hv_lowpass_neon
@ Step the source back 4 rows (stride r3) before the next block.
1183 sub r1, r1, r3, lsl #2
1184 bl \type\()_h264_qpel8_hv_lowpass_neon
@ Rewind source by 16 + 4 = 20 rows and destination by 16 rows (stride r2).
1185 sub r1, r1, r3, lsl #4
1186 sub r1, r1, r3, lsl #2
1188 sub r0, r0, r2, lsl #4
1190 bl \type\()_h264_qpel8_hv_lowpass_neon
1191 sub r1, r1, r3, lsl #2
@ Tail call for the final block.
1193 b \type\()_h264_qpel8_hv_lowpass_neon
1196 function \type\()_h264_qpel16_hv_lowpass_l2_neon
1199 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1200 sub r1, r1, r3, lsl #2
1201 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
@ Same rewinds as above, except the destination stride here is r3.
1202 sub r1, r1, r3, lsl #4
1203 sub r1, r1, r3, lsl #2
1205 sub r0, r0, r3, lsl #4
1207 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1208 sub r1, r1, r3, lsl #2
@ Tail call for the final block.
1210 b \type\()_h264_qpel8_hv_lowpass_l2_neon
@ Generates the exported 8x8 quarter-pel entry points
@ ff_<type>_h264_qpel8_mcXY_neon. Several entry points share one body by
@ pushing the same register set and branching to an internal label
@ (<type>_h264_qpel8_mc01 / mc11 / mc21 / mc12).
@ Presumably X and Y are the horizontal and vertical quarter-pel offsets -
@ confirm against the caller.
@ NOTE(review): most argument setup, the stack realignment and the epilogues
@ are not present in this listing.
1217 .macro h264_qpel8 type
@ mc10, mc20, mc30: horizontal-only cases, tail-calling the h lowpass.
1218 function ff_\type\()_h264_qpel8_mc10_neon, export=1
1223 b \type\()_h264_qpel8_h_lowpass_l2_neon
1226 function ff_\type\()_h264_qpel8_mc20_neon, export=1
1231 b \type\()_h264_qpel8_h_lowpass_neon
1234 function ff_\type\()_h264_qpel8_mc30_neon, export=1
1239 b \type\()_h264_qpel8_h_lowpass_l2_neon
@ mc01 (shared with mc03): vertical lowpass averaged with a second source.
1242 function ff_\type\()_h264_qpel8_mc01_neon, export=1
1245 \type\()_h264_qpel8_mc01:
@ Move the source up two rows (stride r2) so the filter taps are available.
1248 sub r1, r1, r2, lsl #1
1250 bl \type\()_h264_qpel8_v_lowpass_l2_neon
@ mc11 (shared with mc31, mc13, mc33): horizontal put pass, then vertical
@ l2 pass.
1255 function ff_\type\()_h264_qpel8_mc11_neon, export=1
1256 push {r0, r1, r11, lr}
1257 \type\()_h264_qpel8_mc11:
1269 bl put_h264_qpel8_h_lowpass_neon
1273 sub r1, r1, r2, lsl #1
1275 bl \type\()_h264_qpel8_v_lowpass_l2_neon
@ mc21 (shared with mc23): horizontal put pass, then hv l2 pass.
1281 function ff_\type\()_h264_qpel8_mc21_neon, export=1
1282 push {r0, r1, r4, r10, r11, lr}
1283 \type\()_h264_qpel8_mc21:
@ Reserve 8*8 + 16*12 = 256 bytes of stack scratch.
1289 sub sp, sp, #(8*8+16*12)
1295 bl put_h264_qpel8_h_lowpass_neon
1298 sub r1, r1, r2, lsl #1
1302 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
@ r0/r1 pushed at entry are not popped here; presumably sp is moved past
@ them by a line not shown.
1305 pop {r4, r10, r11, pc}
1308 function ff_\type\()_h264_qpel8_mc31_neon, export=1
1310 push {r0, r1, r11, lr}
1312 b \type\()_h264_qpel8_mc11
@ mc02: vertical-only lowpass.
1315 function ff_\type\()_h264_qpel8_mc02_neon, export=1
1318 sub r1, r1, r2, lsl #1
1321 bl \type\()_h264_qpel8_v_lowpass_neon
@ mc12 (shared with mc32): vertical put pass, then hv l2 pass.
1326 function ff_\type\()_h264_qpel8_mc12_neon, export=1
1327 push {r0, r1, r4, r10, r11, lr}
1328 \type\()_h264_qpel8_mc12:
@ Reserve 256 bytes of stack scratch, then move the source up two rows.
1334 sub sp, sp, #(8*8+16*12)
1335 sub r1, r1, r2, lsl #1
1340 bl put_h264_qpel8_v_lowpass_neon
1343 sub r1, r1, r3, lsl #1
1346 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1349 pop {r4, r10, r11, pc}
@ mc22: hv lowpass only; reserves 16*12 = 192 bytes of stack scratch.
1352 function ff_\type\()_h264_qpel8_mc22_neon, export=1
1353 push {r4, r10, r11, lr}
1358 sub r1, r1, r2, lsl #1
1361 sub sp, sp, #(16*12)
1364 bl \type\()_h264_qpel8_hv_lowpass_neon
1367 pop {r4, r10, r11, pc}
@ Remaining entry points reuse the shared bodies above.
1370 function ff_\type\()_h264_qpel8_mc32_neon, export=1
1371 push {r0, r1, r4, r10, r11, lr}
1373 b \type\()_h264_qpel8_mc12
1376 function ff_\type\()_h264_qpel8_mc03_neon, export=1
1379 b \type\()_h264_qpel8_mc01
1382 function ff_\type\()_h264_qpel8_mc13_neon, export=1
1383 push {r0, r1, r11, lr}
1385 b \type\()_h264_qpel8_mc11
1388 function ff_\type\()_h264_qpel8_mc23_neon, export=1
1389 push {r0, r1, r4, r10, r11, lr}
1391 b \type\()_h264_qpel8_mc21
1394 function ff_\type\()_h264_qpel8_mc33_neon, export=1
1396 push {r0, r1, r11, lr}
1399 b \type\()_h264_qpel8_mc11
@ Generates the exported 16x16 quarter-pel entry points
@ ff_<type>_h264_qpel16_mcXY_neon, structured like the 8x8 set: several
@ entry points push the same register set and branch to a shared internal
@ label (<type>_h264_qpel16_mc01 / mc11 / mc21 / mc12).
@ NOTE(review): most argument setup, the stack realignment and the epilogues
@ are not present in this listing.
1406 .macro h264_qpel16 type
@ mc10, mc20, mc30: horizontal-only cases, tail-calling the h lowpass.
1407 function ff_\type\()_h264_qpel16_mc10_neon, export=1
1411 b \type\()_h264_qpel16_h_lowpass_l2_neon
1414 function ff_\type\()_h264_qpel16_mc20_neon, export=1
1418 b \type\()_h264_qpel16_h_lowpass_neon
1421 function ff_\type\()_h264_qpel16_mc30_neon, export=1
1425 b \type\()_h264_qpel16_h_lowpass_l2_neon
@ mc01 (shared with mc03): vertical lowpass averaged with a second source.
1428 function ff_\type\()_h264_qpel16_mc01_neon, export=1
1431 \type\()_h264_qpel16_mc01:
@ Move the source up two rows (stride r2) so the filter taps are available.
1434 sub r1, r1, r2, lsl #1
1436 bl \type\()_h264_qpel16_v_lowpass_l2_neon
@ mc11 (shared with mc31, mc13, mc33): horizontal put pass, then vertical
@ l2 pass.
1441 function ff_\type\()_h264_qpel16_mc11_neon, export=1
1442 push {r0, r1, r4, r11, lr}
1443 \type\()_h264_qpel16_mc11:
1454 bl put_h264_qpel16_h_lowpass_neon
1458 sub r1, r1, r2, lsl #1
1460 bl \type\()_h264_qpel16_v_lowpass_l2_neon
@ mc21 (shared with mc23): packed horizontal put pass, then hv l2 pass.
1466 function ff_\type\()_h264_qpel16_mc21_neon, export=1
1467 push {r0, r1, r4-r5, r9-r11, lr}
1468 \type\()_h264_qpel16_mc21:
@ Reserve 16*16 + 16*12 = 448 bytes of stack scratch.
1474 sub sp, sp, #(16*16+16*12)
1478 bl put_h264_qpel16_h_lowpass_neon_packed
1481 sub r1, r1, r2, lsl #1
1484 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
@ r0/r1 pushed at entry are not popped here; presumably sp is moved past
@ them by a line not shown.
1487 pop {r4-r5, r9-r11, pc}
1490 function ff_\type\()_h264_qpel16_mc31_neon, export=1
1492 push {r0, r1, r4, r11, lr}
1494 b \type\()_h264_qpel16_mc11
@ mc02: vertical-only lowpass.
1497 function ff_\type\()_h264_qpel16_mc02_neon, export=1
1500 sub r1, r1, r2, lsl #1
1503 bl \type\()_h264_qpel16_v_lowpass_neon
@ mc12 (shared with mc32): packed vertical put pass, then hv l2 pass.
1508 function ff_\type\()_h264_qpel16_mc12_neon, export=1
1509 push {r0, r1, r4-r5, r9-r11, lr}
1510 \type\()_h264_qpel16_mc12:
@ Reserve 448 bytes of stack scratch, then move the source up two rows.
1516 sub sp, sp, #(16*16+16*12)
1517 sub r1, r1, r2, lsl #1
1521 bl put_h264_qpel16_v_lowpass_neon_packed
1524 sub r1, r1, r3, lsl #1
1527 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1530 pop {r4-r5, r9-r11, pc}
@ mc22: hv lowpass only; reserves 16*12 = 192 bytes of stack scratch.
1533 function ff_\type\()_h264_qpel16_mc22_neon, export=1
1534 push {r4, r9-r11, lr}
1540 sub r1, r1, r2, lsl #1
1543 sub sp, sp, #(16*12)
1546 bl \type\()_h264_qpel16_hv_lowpass_neon
1549 pop {r4, r9-r11, pc}
@ Remaining entry points reuse the shared bodies above.
1552 function ff_\type\()_h264_qpel16_mc32_neon, export=1
1553 push {r0, r1, r4-r5, r9-r11, lr}
1555 b \type\()_h264_qpel16_mc12
1558 function ff_\type\()_h264_qpel16_mc03_neon, export=1
1561 b \type\()_h264_qpel16_mc01
1564 function ff_\type\()_h264_qpel16_mc13_neon, export=1
1565 push {r0, r1, r4, r11, lr}
1567 b \type\()_h264_qpel16_mc11
1570 function ff_\type\()_h264_qpel16_mc23_neon, export=1
1571 push {r0, r1, r4-r5, r9-r11, lr}
1573 b \type\()_h264_qpel16_mc21
1576 function ff_\type\()_h264_qpel16_mc33_neon, export=1
1578 push {r0, r1, r4, r11, lr}
1581 b \type\()_h264_qpel16_mc11
1588 @ Biweighted prediction
@ 16-pixel-wide biweight loop body. \macs and \macd are the multiply-
@ accumulate mnemonics supplied by biweight_func (their uses are not visible
@ in this listing).
@ Visible register use: r0 and r1 = the two inputs, r6 = output pointer,
@ r2 = stride for all three (128-bit aligned accesses), q9 = per-lane shift.
1590 .macro biweight_16 macs, macd
@ Load two 16-byte rows from each input, alternating r0 and r1.
1596 vld1.8 {d20-d21},[r0,:128], r2
1600 vld1.8 {d22-d23},[r1,:128], r2
1605 vld1.8 {d28-d29},[r0,:128], r2
1610 vld1.8 {d30-d31},[r1,:128], r2
@ Shift the 16-bit sums by the signed per-lane amounts in q9.
1618 vshl.s16 q12, q12, q9
1619 vshl.s16 q13, q13, q9
@ Saturate to unsigned bytes.
1620 vqmovun.s16 d24, q12
1621 vqmovun.s16 d25, q13
@ Store the two output rows.
1623 vst1.8 {d4- d5}, [r6,:128], r2
1625 vst1.8 {d24-d25},[r6,:128], r2
@ 8-pixel-wide biweight loop body; same register roles as biweight_16 but
@ with 64-bit aligned row accesses.
@ NOTE(review): the multiply-accumulate and narrowing lines are not present
@ in this listing.
1630 .macro biweight_8 macs, macd
@ Load two 8-byte rows from each input, alternating r0 and r1.
1636 vld1.8 {d4},[r0,:64], r2
1639 vld1.8 {d5},[r1,:64], r2
1642 vld1.8 {d6},[r0,:64], r2
1645 vld1.8 {d7},[r1,:64], r2
@ Shift the 16-bit sums by the signed per-lane amounts in q9.
1650 vshl.s16 q10, q10, q9
@ Store the two output rows.
1653 vst1.8 {d2},[r6,:64], r2
1655 vst1.8 {d4},[r6,:64], r2
@ 4-pixel-wide biweight loop body; rows are 4 bytes and two rows share one
@ d register (lanes 0 and 1). Same register roles as biweight_16 with
@ 32-bit aligned accesses.
@ NOTE(review): the multiply-accumulate, narrowing and branch lines are not
@ present in this listing.
1660 .macro biweight_4 macs, macd
@ First pair of rows from each input.
1666 vld1.32 {d4[0]},[r0,:32], r2
1667 vld1.32 {d4[1]},[r0,:32], r2
1670 vld1.32 {d5[0]},[r1,:32], r2
1671 vld1.32 {d5[1]},[r1,:32], r2
@ Second pair of rows from each input.
1675 vld1.32 {d6[0]},[r0,:32], r2
1676 vld1.32 {d6[1]},[r0,:32], r2
1679 vld1.32 {d7[0]},[r1,:32], r2
1680 vld1.32 {d7[1]},[r1,:32], r2
1685 vshl.s16 q10, q10, q9
@ Store four output rows.
1688 vst1.32 {d2[0]},[r6,:32], r2
1689 vst1.32 {d2[1]},[r6,:32], r2
1691 vst1.32 {d4[0]},[r6,:32], r2
1692 vst1.32 {d4[1]},[r6,:32], r2
@ Label 2: path that shifts and stores only two rows (presumably taken when
@ just two rows remain - the branch to it is not visible here).
1695 2: vshl.s16 q1, q1, q9
1697 vst1.32 {d2[0]},[r6,:32], r2
1698 vst1.32 {d2[1]},[r6,:32], r2
@ Generates ff_biweight_h264_pixels_<w>_neon. The body expands biweight_<w>
@ four times, once for each add/subtract combination of the two
@ multiply-accumulate mnemonics.
@ NOTE(review): argument loading and the branches that select one of the
@ four expansions are not present in this listing.
1702 .macro biweight_func w
1703 function ff_biweight_h264_pixels_\w\()_neon, export=1
@ Flag-setting XOR of lr with r5 >> 30 (the top two bits of r5).
1710 eors lr, lr, r5, lsr #30
@ Label 10: both accumulate (add, add).
1723 10: biweight_\w vmlal.u8, vmlal.u8
@ Add / subtract.
1725 biweight_\w vmlal.u8, vmlsl.u8
@ Subtract / subtract.
1728 biweight_\w vmlsl.u8, vmlsl.u8
@ Subtract / add.
1730 biweight_\w vmlsl.u8, vmlal.u8
1738 @ Weighted prediction
@ 16-pixel-wide weighted-prediction loop body. \add is the mnemonic used to
@ apply the offset (its use is not visible in this listing).
@ Visible register use: r0 = input pointer, r4 = output pointer, r1 = stride
@ for both (128-bit aligned accesses), d0 = weight bytes, q9 = per-lane shift.
1740 .macro weight_16 add
@ Load a row and multiply each half by the weight, widening to 16 bits.
1743 vld1.8 {d20-d21},[r0,:128], r1
1744 vmull.u8 q2, d0, d20
1746 vmull.u8 q3, d0, d21
1747 vld1.8 {d28-d29},[r0,:128], r1
1748 vmull.u8 q12, d0, d28
1750 vmull.u8 q13, d0, d29
@ Rounding shift by the signed per-lane amounts in q9.
1752 vrshl.s16 q2, q2, q9
1754 vrshl.s16 q3, q3, q9
1758 vrshl.s16 q12, q12, q9
1760 vrshl.s16 q13, q13, q9
@ Saturate to unsigned bytes and store the two rows.
1761 vqmovun.s16 d24, q12
1762 vqmovun.s16 d25, q13
1763 vst1.8 {d4- d5}, [r4,:128], r1
1764 vst1.8 {d24-d25},[r4,:128], r1
@ NOTE(review): the lines from here on use 64-bit and then 32-bit row
@ accesses. They presumably belong to narrower sibling macros whose .macro
@ header lines are not present in this listing - confirm in the full file.
@ 8-byte rows:
1772 vld1.8 {d4},[r0,:64], r1
1775 vld1.8 {d6},[r0,:64], r1
1776 vmull.u8 q10, d0, d6
1779 vrshl.s16 q1, q1, q9
1782 vrshl.s16 q10, q10, q9
1784 vst1.8 {d2},[r4,:64], r1
1785 vst1.8 {d4},[r4,:64], r1
@ 4-byte rows, two rows per d register:
1795 vld1.32 {d4[0]},[r0,:32], r1
1796 vld1.32 {d4[1]},[r0,:32], r1
1800 vld1.32 {d6[0]},[r0,:32], r1
1801 vld1.32 {d6[1]},[r0,:32], r1
1802 vmull.u8 q10, d0, d6
1805 vrshl.s16 q1, q1, q9
1808 vrshl.s16 q10, q10, q9
1811 vst1.32 {d2[0]},[r4,:32], r1
1812 vst1.32 {d2[1]},[r4,:32], r1
1814 vst1.32 {d4[0]},[r4,:32], r1
1815 vst1.32 {d4[1]},[r4,:32], r1
@ Path that shifts and stores only two 4-byte rows.
1819 vrshl.s16 q1, q1, q9
1821 vst1.32 {d2[0]},[r4,:32], r1
1822 vst1.32 {d2[1]},[r4,:32], r1
@ Generates ff_weight_h264_pixels_<w>_neon.
@ NOTE(review): argument loading, the weight_<w> expansions and the branches
@ are not present in this listing; only two negations of r12 are visible.
1826 .macro weight_func w
1827 function ff_weight_h264_pixels_\w\()_neon, export=1
@ r12 = -r12. Both visible occurrences carry the local label 10, so they
@ presumably sit in separate conditional sections - confirm.
1841 10: rsb r12, r12, #0
1848 10: rsb r12, r12, #0