2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of Libav.
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ Register-transpose helpers: convert between row-major and column-major
@ layouts so the horizontal filters can be reused for vertical passes.
@ NOTE(review): macro bodies are not visible in this view — presumably the
@ usual vtrn/vswp ladders; confirm against the full source.
@ 8x8 byte transpose of eight q (or d) registers.
23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
@ 4x4 transpose of four registers.
38 .macro transpose_4x4 r0 r1 r2 r3
@ Pairwise swap of four register pairs (used to pre-shuffle before a
@ 16-bit-element transpose).
45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
@ 4x4 transpose with 16-bit (not 8-bit) elements.
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 8x8 H.264 chroma motion compensation (bilinear 1/8-pel interpolation).
@ \type is "put" (store result) or "avg" (average with existing dst; the
@ [lr] loads + vrhadd below implement the averaging path).
64 .macro h264_chroma_mc8 type
65 function ff_\type\()_h264_chroma_mc8_neon, export=1
@ NOTE(review): r4/r5/r6/ip appear to be the four bilinear weights derived
@ from x (r4) and y (r5) as in A=(8-x)(8-y) etc. — confirm in full source.
75 rsb r6, r7, r5, lsl #3
76 rsb ip, r7, r4, lsl #3
77 sub r4, r7, r4, lsl #3
78 sub r4, r4, r5, lsl #3
@ Main loop, x!=0 && y!=0: load two 16-byte source rows per iteration.
88 vld1.64 {d4, d5}, [r1], r4
90 vld1.64 {d6, d7}, [r5], r4
99 vld1.64 {d4, d5}, [r1], r4
@ Shift row left by one byte to get the horizontally adjacent pixels.
101 vext.8 d5, d4, d5, #1
@ Narrow the 16-bit weighted sums with rounding: (sum + 32) >> 6.
108 vrshrn.u16 d16, q8, #6
109 vld1.64 {d6, d7}, [r5], r4
111 vrshrn.u16 d17, q9, #6
@ "avg" only: load current dst rows and halving-average into the result.
113 vld1.64 {d20}, [lr,:64], r2
114 vld1.64 {d21}, [lr,:64], r2
115 vrhadd.u8 q8, q8, q10
117 vext.8 d7, d6, d7, #1
118 vst1.64 {d16}, [r0,:64], r2
119 vst1.64 {d17}, [r0,:64], r2
@ Two-tap path (only one of x/y nonzero): single-row loads suffice.
133 vld1.64 {d4}, [r1], r4
134 vld1.64 {d6}, [r5], r4
139 vld1.64 {d4}, [r1], r4
142 vld1.64 {d6}, [r5], r4
143 vrshrn.u16 d16, q8, #6
144 vrshrn.u16 d17, q9, #6
146 vld1.64 {d20}, [lr,:64], r2
147 vld1.64 {d21}, [lr,:64], r2
148 vrhadd.u8 q8, q8, q10
152 vst1.64 {d16}, [r0,:64], r2
153 vst1.64 {d17}, [r0,:64], r2
@ Horizontal-only path (y == 0): pairs of rows filtered with vext shift.
158 4: vld1.64 {d4, d5}, [r1], r2
159 vld1.64 {d6, d7}, [r1], r2
160 vext.8 d5, d4, d5, #1
161 vext.8 d7, d6, d7, #1
167 vld1.64 {d4, d5}, [r1], r2
171 vext.8 d5, d4, d5, #1
172 vrshrn.u16 d16, q8, #6
173 vrshrn.u16 d17, q9, #6
175 vld1.64 {d20}, [lr,:64], r2
176 vld1.64 {d21}, [lr,:64], r2
177 vrhadd.u8 q8, q8, q10
179 vld1.64 {d6, d7}, [r1], r2
180 vext.8 d7, d6, d7, #1
181 vst1.64 {d16}, [r0,:64], r2
182 vst1.64 {d17}, [r0,:64], r2
189 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@ 4x4 H.264 chroma motion compensation, same structure as h264_chroma_mc8
@ but processing two 4-pixel rows per q register.
190 .macro h264_chroma_mc4 type
191 function ff_\type\()_h264_chroma_mc4_neon, export=1
@ NOTE(review): bilinear weight setup from x (r4) / y (r5); same scheme as
@ the mc8 variant — confirm against full source.
201 rsb r6, r7, r5, lsl #3
202 rsb ip, r7, r4, lsl #3
203 sub r4, r7, r4, lsl #3
204 sub r4, r4, r5, lsl #3
@ Full bilinear path (x!=0 && y!=0).
214 vld1.64 {d4}, [r1], r4
216 vld1.64 {d6}, [r5], r4
@ vext produces the +1-pixel-shifted copies for the horizontal taps.
219 vext.8 d5, d4, d5, #1
220 vext.8 d7, d6, d7, #1
230 vld1.64 {d4}, [r1], r4
231 vext.8 d5, d4, d5, #1
235 vld1.64 {d6}, [r5], r4
@ Combine partial sums of two rows, then round-narrow: (sum + 32) >> 6.
236 vadd.i16 d16, d16, d17
237 vadd.i16 d17, d18, d19
238 vrshrn.u16 d16, q8, #6
@ "avg" only: average with the two existing 4-byte dst rows.
242 vld1.32 {d20[0]}, [lr,:32], r2
243 vld1.32 {d20[1]}, [lr,:32], r2
244 vrhadd.u8 d16, d16, d20
246 vext.8 d7, d6, d7, #1
248 vst1.32 {d16[0]}, [r0,:32], r2
249 vst1.32 {d16[1]}, [r0,:32], r2
@ Two-tap path: duplicate the weight pair across lanes for 32-bit loads.
262 vext.32 d1, d0, d1, #1
265 vld1.32 {d4[0]}, [r1], r4
266 vld1.32 {d4[1]}, [r5], r4
270 vld1.32 {d4[0]}, [r1], r4
272 vld1.32 {d4[1]}, [r5], r4
273 vadd.i16 d16, d16, d17
274 vadd.i16 d17, d18, d19
275 vrshrn.u16 d16, q8, #6
277 vld1.32 {d20[0]}, [lr,:32], r2
278 vld1.32 {d20[1]}, [lr,:32], r2
279 vrhadd.u8 d16, d16, d20
283 vst1.32 {d16[0]}, [r0,:32], r2
284 vst1.32 {d16[1]}, [r0,:32], r2
@ Horizontal-only path (y == 0).
289 4: vld1.64 {d4}, [r1], r2
290 vld1.64 {d6}, [r1], r2
291 vext.8 d5, d4, d5, #1
292 vext.8 d7, d6, d7, #1
296 5: vmull.u8 q8, d4, d0
299 vld1.64 {d4}, [r1], r2
300 vext.8 d5, d4, d5, #1
302 vadd.i16 d16, d16, d17
303 vadd.i16 d17, d18, d19
305 vrshrn.u16 d16, q8, #6
307 vld1.32 {d20[0]}, [lr,:32], r2
308 vld1.32 {d20[1]}, [lr,:32], r2
309 vrhadd.u8 d16, d16, d20
311 vld1.64 {d6}, [r1], r2
312 vext.8 d7, d6, d7, #1
315 vst1.32 {d16[0]}, [r0,:32], r2
316 vst1.32 {d16[1]}, [r0,:32], r2
@ 2-pixel-wide H.264 chroma motion compensation (RV40/VC1-style mc2).
@ Same bilinear scheme as mc8/mc4, operating on 16-bit (2-byte) columns.
323 .macro h264_chroma_mc2 type
324 function ff_\type\()_h264_chroma_mc2_neon, export=1
@ NOTE(review): weight setup from the fractional offsets; note this variant
@ uses lr/r12 rather than r5/ip — confirm register roles in full source.
334 rsb r6, r5, lr, lsl #3
335 rsb r12, r5, r4, lsl #3
336 sub r4, r5, r4, lsl #3
337 sub r4, r4, lr, lsl #3
@ Load three overlapping 4-byte spans covering two output rows.
345 vld1.32 {d4[0]}, [r1], r2
346 vld1.32 {d4[1]}, [r1], r2
348 vld1.32 {d5[1]}, [r1]
@ One-byte shift supplies the horizontally adjacent taps.
349 vext.8 q3, q2, q2, #1
@ "avg" only: fetch the existing 2-byte dst rows.
354 vld1.16 {d18[0]}, [r0,:16], r2
355 vld1.16 {d18[1]}, [r0,:16]
@ Fold partial sums and round-narrow: (sum + 32) >> 6.
359 vadd.i16 d16, d16, d17
360 vrshrn.u16 d16, q8, #6
362 vrhadd.u8 d16, d16, d18
364 vst1.16 {d16[0]}, [r0,:16], r2
365 vst1.16 {d16[1]}, [r0,:16], r2
@ Copy path (x == 0 && y == 0): plain 2-byte moves, avg if requested.
376 vld1.16 {d16[0]}, [r1], r2
377 vld1.16 {d16[1]}, [r1], r2
378 vld1.16 {d18[0]}, [r0,:16], r2
379 vld1.16 {d18[1]}, [r0,:16]
381 vrhadd.u8 d16, d16, d18
382 vst1.16 {d16[0]}, [r0,:16], r2
383 vst1.16 {d16[1]}, [r0,:16], r2
401 /* H.264 loop filter */
@ Common entry for all loop-filter functions: tests the packed tc0 values
@ and takes an early exit when no edge needs filtering (ands sets flags).
403 .macro h264_loop_filter_start
409 and ip, ip, ip, lsl #16
411 ands ip, ip, ip, lsl #8
@ Save the callee-saved NEON registers d8-d15 on a 16-byte-aligned stack
@ slot (AAPCS requires preserving d8-d15 across calls).
415 .macro align_push_regs
419 vst1.64 {d12-d15}, [sp,:128]
421 vst1.64 {d8-d11}, [sp,:128]
@ Restore d8-d15 pushed by align_push_regs and release the stack space.
424 .macro align_pop_regs
425 vld1.64 {d8-d11}, [sp,:128]!
426 vld1.64 {d12-d15}, [sp,:128], ip
@ Core luma deblocking filter (H.264 8.7): inputs are two rows of pixels
@ on each side of the edge in q8..q10 (p0..p2) and q0..q2 (q0..q2),
@ alpha in r2, beta in r3, packed tc0 in ip. Produces filtered p1/p0/q0/q1.
429 .macro h264_loop_filter_luma
430 vdup.8 q11, r2 @ alpha
432 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
434 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
436 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
@ Replicate the 16-bit tc0 pairs across the 32-bit lanes.
437 vsli.32 q12, q12, #16
438 vclt.u8 q6, q6, q11 @ < alpha
439 vdup.8 q11, r3 @ beta
441 vclt.u8 q14, q14, q11 @ < beta
442 vclt.u8 q15, q15, q11 @ < beta
@ Secondary thresholds decide whether p1/q1 are also corrected (tc++).
444 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
446 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
447 vclt.u8 q4, q4, q11 @ < beta
449 vclt.u8 q5, q5, q11 @ < beta
@ p1'/q1' = clip(p1 +/- tc0, avg terms) built from (p0+q0+1)>>1.
453 vrhadd.u8 q14, q8, q0
456 vhadd.u8 q10, q10, q14
458 vhadd.u8 q14, q2, q14
460 vqsub.u8 q11, q9, q12
463 vqsub.u8 q11, q1, q12
466 vmax.u8 q14, q14, q11
@ delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
469 vsubw.u8 q10, q10, d17
471 vshl.i16 q10, q10, #2
473 vaddw.u8 q10, q10, d19
475 vsubw.u8 q10, q10, d3
476 vrshrn.i16 d4, q2, #3
477 vrshrn.i16 d5, q10, #3
@ Apply the signed delta to p0/q0.
487 vaddw.s8 q14, q14, d4
489 vsubw.s8 q11, q11, d4
490 vsubw.s8 q12, q12, d5
@ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride,
@                                      int alpha, int beta, int8_t *tc0)
@ Vertical-edge variant: rows are contiguous, so pixels load directly
@ without a transpose.
497 function ff_h264_v_loop_filter_luma_neon, export=1
498 h264_loop_filter_start
@ Load q0..q2 below the edge, then step back and load p2..p0 above it.
500 vld1.64 {d0, d1}, [r0,:128], r1
501 vld1.64 {d2, d3}, [r0,:128], r1
502 vld1.64 {d4, d5}, [r0,:128], r1
503 sub r0, r0, r1, lsl #2
504 sub r0, r0, r1, lsl #1
505 vld1.64 {d20,d21}, [r0,:128], r1
506 vld1.64 {d18,d19}, [r0,:128], r1
507 vld1.64 {d16,d17}, [r0,:128], r1
511 h264_loop_filter_luma
@ Write back the four modified rows: p1, p0, q0, q1.
513 sub r0, r0, r1, lsl #1
514 vst1.64 {d8, d9}, [r0,:128], r1
515 vst1.64 {d16,d17}, [r0,:128], r1
516 vst1.64 {d0, d1}, [r0,:128], r1
517 vst1.64 {d10,d11}, [r0,:128]
@ void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride,
@                                      int alpha, int beta, int8_t *tc0)
@ Horizontal-edge variant: gathers 16 rows of 8 pixels, transposes so the
@ edge becomes vertical, runs the shared filter, transposes back.
523 function ff_h264_h_loop_filter_luma_neon
524 h264_loop_filter_start
@ Load 16 rows x 8 bytes straddling the edge.
527 vld1.64 {d6}, [r0], r1
528 vld1.64 {d20}, [r0], r1
529 vld1.64 {d18}, [r0], r1
530 vld1.64 {d16}, [r0], r1
531 vld1.64 {d0}, [r0], r1
532 vld1.64 {d2}, [r0], r1
533 vld1.64 {d4}, [r0], r1
534 vld1.64 {d26}, [r0], r1
535 vld1.64 {d7}, [r0], r1
536 vld1.64 {d21}, [r0], r1
537 vld1.64 {d19}, [r0], r1
538 vld1.64 {d17}, [r0], r1
539 vld1.64 {d1}, [r0], r1
540 vld1.64 {d3}, [r0], r1
541 vld1.64 {d5}, [r0], r1
542 vld1.64 {d27}, [r0], r1
@ Rotate rows into columns so p3..q3 land in the filter's registers.
544 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
548 h264_loop_filter_luma
@ Transpose the four modified lines (p1,p0,q0,q1) back to row order.
550 transpose_4x4 q4, q8, q0, q5
@ Rewind 16 rows and scatter the 4-byte results back.
552 sub r0, r0, r1, lsl #4
554 vst1.32 {d8[0]}, [r0], r1
555 vst1.32 {d16[0]}, [r0], r1
556 vst1.32 {d0[0]}, [r0], r1
557 vst1.32 {d10[0]}, [r0], r1
558 vst1.32 {d8[1]}, [r0], r1
559 vst1.32 {d16[1]}, [r0], r1
560 vst1.32 {d0[1]}, [r0], r1
561 vst1.32 {d10[1]}, [r0], r1
562 vst1.32 {d9[0]}, [r0], r1
563 vst1.32 {d17[0]}, [r0], r1
564 vst1.32 {d1[0]}, [r0], r1
565 vst1.32 {d11[0]}, [r0], r1
566 vst1.32 {d9[1]}, [r0], r1
567 vst1.32 {d17[1]}, [r0], r1
568 vst1.32 {d1[1]}, [r0], r1
569 vst1.32 {d11[1]}, [r0], r1
@ Core chroma deblocking filter: single-register (8-pixel) version of the
@ luma filter above; only p0/q0 are modified. alpha in r2, beta in r3.
575 .macro h264_loop_filter_chroma
576 vdup.8 d22, r2 @ alpha
578 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
580 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
584 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
586 vclt.u8 d26, d26, d22 @ < alpha
588 vdup.8 d22, r3 @ beta
@ delta = (((q0 - p0) * 4 + p1 - q1 + 4) >> 3), computed in q2.
589 vrshrn.i16 d4, q2, #3
590 vclt.u8 d28, d28, d22 @ < beta
591 vclt.u8 d30, d30, d22 @ < beta
@ Apply clipped delta: p0 += delta, q0 -= delta.
600 vaddw.s8 q14, q14, d4
601 vsubw.s8 q11, q11, d4
@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride,
@                                        int alpha, int beta, int8_t *tc0)
@ Vertical chroma edge: load p1,p0,q0,q1 rows, filter, store p0/q0.
606 function ff_h264_v_loop_filter_chroma_neon, export=1
607 h264_loop_filter_start
609 sub r0, r0, r1, lsl #1
610 vld1.64 {d18}, [r0,:64], r1 @ p1
611 vld1.64 {d16}, [r0,:64], r1 @ p0
612 vld1.64 {d0}, [r0,:64], r1 @ q0
613 vld1.64 {d2}, [r0,:64] @ q1
615 h264_loop_filter_chroma
@ Only p0 and q0 change for chroma; write them back.
617 sub r0, r0, r1, lsl #1
618 vst1.64 {d16}, [r0,:64], r1
619 vst1.64 {d0}, [r0,:64], r1
@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride,
@                                        int alpha, int beta, int8_t *tc0)
@ Horizontal chroma edge: gather 8 rows x 4 pixels into lanes, filter
@ (transpose happens in elided code), then scatter the lanes back.
624 function ff_h264_h_loop_filter_chroma_neon
625 h264_loop_filter_start
628 vld1.32 {d18[0]}, [r0], r1
629 vld1.32 {d16[0]}, [r0], r1
630 vld1.32 {d0[0]}, [r0], r1
631 vld1.32 {d2[0]}, [r0], r1
632 vld1.32 {d18[1]}, [r0], r1
633 vld1.32 {d16[1]}, [r0], r1
634 vld1.32 {d0[1]}, [r0], r1
635 vld1.32 {d2[1]}, [r0], r1
642 h264_loop_filter_chroma
@ Rewind the 8 rows and write all four columns back.
649 sub r0, r0, r1, lsl #3
650 vst1.32 {d18[0]}, [r0], r1
651 vst1.32 {d16[0]}, [r0], r1
652 vst1.32 {d0[0]}, [r0], r1
653 vst1.32 {d2[0]}, [r0], r1
654 vst1.32 {d18[1]}, [r0], r1
655 vst1.32 {d16[1]}, [r0], r1
656 vst1.32 {d0[1]}, [r0], r1
657 vst1.32 {d2[1]}, [r0], r1
@ Load the 6-tap filter constants (taps 20 and 5) into a register for the
@ vmla/vmls multiplies below. Body elided in this view.
664 .macro lowpass_const r
@ H.264 qpel 6-tap lowpass (1,-5,20,20,-5,1) over two 8-pixel rows:
@ \r0\r1 and \r2\r3 are 16-byte input rows, results go to \d0/\d1.
@ narrow=0 leaves 16-bit intermediates for a following vertical pass.
670 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
@ vext picks the +2/+3 (center, x20) and +1/+4 (x-5) tap positions.
678 vext.8 d2, \r0, \r1, #2
679 vext.8 d3, \r0, \r1, #3
681 vext.8 d4, \r0, \r1, #1
682 vext.8 d5, \r0, \r1, #4
684 vext.8 d30, \r0, \r1, #5
685 vaddl.u8 t0, \r0, d30
686 vext.8 d18, \r2, \r3, #2
687 vmla.i16 t0, q1, d6[1]
688 vext.8 d19, \r2, \r3, #3
689 vaddl.u8 q9, d18, d19
690 vext.8 d20, \r2, \r3, #1
691 vmls.i16 t0, q2, d6[0]
692 vext.8 d21, \r2, \r3, #4
693 vaddl.u8 q10, d20, d21
694 vext.8 d31, \r2, \r3, #5
695 vaddl.u8 t1, \r2, d31
696 vmla.i16 t1, q9, d6[1]
697 vmls.i16 t1, q10, d6[0]
@ narrow: saturating round-shift (sum + 16) >> 5 back to u8.
699 vqrshrun.s16 \d0, t0, #5
700 vqrshrun.s16 \d1, t1, #5
@ Single-row variant of lowpass_8 (one 16-byte input row).
706 .macro lowpass_8_1 r0, r1, d0, narrow=1
712 vext.8 d2, \r0, \r1, #2
713 vext.8 d3, \r0, \r1, #3
715 vext.8 d4, \r0, \r1, #1
716 vext.8 d5, \r0, \r1, #4
718 vext.8 d30, \r0, \r1, #5
719 vaddl.u8 t0, \r0, d30
720 vmla.i16 t0, q1, d6[1]
721 vmls.i16 t0, q2, d6[0]
723 vqrshrun.s16 \d0, t0, #5
@ 16-bit-input variant used for the second (vertical) pass of the 2D
@ hv filter; rounds with >> 10 since two passes each contribute << 5.
728 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
729 vext.16 q1, \r0, \r1, #2
730 vext.16 q0, \r0, \r1, #3
732 vext.16 q2, \r0, \r1, #1
734 vext.16 q3, \r0, \r1, #4
735 vaddl.s16 q10, d4, d6
736 vext.16 \r1, \r0, \r1, #5
738 vaddl.s16 q0, \h0, \h1
739 vaddl.s16 q8, \l0, \l1
@ x20 = x4 + x16 via shift-and-add (q10*4 then +q10... uses q15 temp).
743 vshl.i32 q15, q10, #2
745 vadd.i32 q10, q10, q15
759 vrshrn.s32 d18, q9, #10
760 vrshrn.s32 d19, q1, #10
@ 16-wide horizontal lowpass that writes packed (contiguous) output for
@ later consumption by the hv path; runs the 8-wide helper twice.
765 function put_h264_qpel16_h_lowpass_neon_packed
769 bl put_h264_qpel8_h_lowpass_neon
770 sub r1, r1, r2, lsl #4
@ Tail call for the second half; its ret returns to our caller.
774 b put_h264_qpel8_h_lowpass_neon
@ \type = put or avg. 16-wide version chains the 8-wide one over the
@ four 8x8 quadrants (pointer rewinds between calls).
777 .macro h264_qpel_h_lowpass type
778 function \type\()_h264_qpel16_h_lowpass_neon
781 bl \type\()_h264_qpel8_h_lowpass_neon
782 sub r0, r0, r3, lsl #4
783 sub r1, r1, r2, lsl #4
@ 8-wide horizontal 6-tap filter: two rows per iteration.
790 function \type\()_h264_qpel8_h_lowpass_neon
791 1: vld1.64 {d0, d1}, [r1], r2
792 vld1.64 {d16,d17}, [r1], r2
794 lowpass_8 d0, d1, d16, d17, d0, d16
@ "avg" only: blend with the existing destination rows.
796 vld1.8 {d2}, [r0,:64], r3
798 vld1.8 {d3}, [r0,:64]
799 vrhadd.u8 d16, d16, d3
802 vst1.64 {d0}, [r0,:64], r3
803 vst1.64 {d16}, [r0,:64], r3
809 h264_qpel_h_lowpass put
810 h264_qpel_h_lowpass avg
@ "_l2" variants additionally average the filtered result with a second
@ source (r3) — used for the quarter-pel positions between two candidates.
812 .macro h264_qpel_h_lowpass_l2 type
813 function \type\()_h264_qpel16_h_lowpass_l2_neon
816 bl \type\()_h264_qpel8_h_lowpass_l2_neon
817 sub r0, r0, r2, lsl #4
818 sub r1, r1, r2, lsl #4
819 sub r3, r3, r2, lsl #4
827 function \type\()_h264_qpel8_h_lowpass_l2_neon
828 1: vld1.64 {d0, d1}, [r1], r2
829 vld1.64 {d16,d17}, [r1], r2
830 vld1.64 {d28}, [r3], r2
831 vld1.64 {d29}, [r3], r2
833 lowpass_8 d0, d1, d16, d17, d0, d1
@ Average filtered rows with the second reference (q14).
834 vrhadd.u8 q0, q0, q14
836 vld1.8 {d2}, [r0,:64], r2
838 vld1.8 {d3}, [r0,:64]
842 vst1.64 {d0}, [r0,:64], r2
843 vst1.64 {d1}, [r0,:64], r2
849 h264_qpel_h_lowpass_l2 put
850 h264_qpel_h_lowpass_l2 avg
@ 16-wide vertical lowpass with packed output: four 8x8 quadrant calls
@ with source rewinds between them; final b tail-calls the last quadrant.
852 function put_h264_qpel16_v_lowpass_neon_packed
855 bl put_h264_qpel8_v_lowpass_neon
856 sub r1, r1, r3, lsl #2
857 bl put_h264_qpel8_v_lowpass_neon
858 sub r1, r1, r3, lsl #4
859 sub r1, r1, r3, lsl #2
861 bl put_h264_qpel8_v_lowpass_neon
862 sub r1, r1, r3, lsl #2
864 b put_h264_qpel8_v_lowpass_neon
@ \type = put or avg; 16-wide wrapper over the 8-wide vertical filter.
867 .macro h264_qpel_v_lowpass type
868 function \type\()_h264_qpel16_v_lowpass_neon
870 bl \type\()_h264_qpel8_v_lowpass_neon
871 sub r1, r1, r3, lsl #2
872 bl \type\()_h264_qpel8_v_lowpass_neon
873 sub r0, r0, r2, lsl #4
875 sub r1, r1, r3, lsl #4
876 sub r1, r1, r3, lsl #2
878 bl \type\()_h264_qpel8_v_lowpass_neon
879 sub r1, r1, r3, lsl #2
@ 8-wide vertical 6-tap filter: loads 8+5 source rows, transposes so the
@ vertical filter can reuse the horizontal lowpass_8, transposes back.
883 function \type\()_h264_qpel8_v_lowpass_neon
884 vld1.64 {d8}, [r1], r3
885 vld1.64 {d10}, [r1], r3
886 vld1.64 {d12}, [r1], r3
887 vld1.64 {d14}, [r1], r3
888 vld1.64 {d22}, [r1], r3
889 vld1.64 {d24}, [r1], r3
890 vld1.64 {d26}, [r1], r3
891 vld1.64 {d28}, [r1], r3
892 vld1.64 {d9}, [r1], r3
893 vld1.64 {d11}, [r1], r3
894 vld1.64 {d13}, [r1], r3
895 vld1.64 {d15}, [r1], r3
898 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
899 lowpass_8 d8, d9, d10, d11, d8, d10
900 lowpass_8 d12, d13, d14, d15, d12, d14
901 lowpass_8 d22, d23, d24, d25, d22, d24
902 lowpass_8 d26, d27, d28, d29, d26, d28
903 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
@ "avg" only: fold in the existing destination rows one by one.
906 vld1.8 {d9}, [r0,:64], r2
908 vld1.8 {d11}, [r0,:64], r2
909 vrhadd.u8 d10, d10, d11
910 vld1.8 {d13}, [r0,:64], r2
911 vrhadd.u8 d12, d12, d13
912 vld1.8 {d15}, [r0,:64], r2
913 vrhadd.u8 d14, d14, d15
914 vld1.8 {d23}, [r0,:64], r2
915 vrhadd.u8 d22, d22, d23
916 vld1.8 {d25}, [r0,:64], r2
917 vrhadd.u8 d24, d24, d25
918 vld1.8 {d27}, [r0,:64], r2
919 vrhadd.u8 d26, d26, d27
920 vld1.8 {d29}, [r0,:64], r2
921 vrhadd.u8 d28, d28, d29
@ Rewind dst 8 rows, then store the 8 result rows.
922 sub r0, r0, r2, lsl #3
925 vst1.64 {d8}, [r0,:64], r2
926 vst1.64 {d10}, [r0,:64], r2
927 vst1.64 {d12}, [r0,:64], r2
928 vst1.64 {d14}, [r0,:64], r2
929 vst1.64 {d22}, [r0,:64], r2
930 vst1.64 {d24}, [r0,:64], r2
931 vst1.64 {d26}, [r0,:64], r2
932 vst1.64 {d28}, [r0,:64], r2
938 h264_qpel_v_lowpass put
939 h264_qpel_v_lowpass avg
@ Vertical lowpass averaged with a second reference read from [ip]
@ (the "l2" pattern used for intermediate quarter-pel positions).
941 .macro h264_qpel_v_lowpass_l2 type
942 function \type\()_h264_qpel16_v_lowpass_l2_neon
944 bl \type\()_h264_qpel8_v_lowpass_l2_neon
945 sub r1, r1, r3, lsl #2
946 bl \type\()_h264_qpel8_v_lowpass_l2_neon
947 sub r0, r0, r3, lsl #4
948 sub ip, ip, r2, lsl #4
951 sub r1, r1, r3, lsl #4
952 sub r1, r1, r3, lsl #2
954 bl \type\()_h264_qpel8_v_lowpass_l2_neon
955 sub r1, r1, r3, lsl #2
@ 8-wide version: same load/transpose/filter/transpose structure as
@ \type\()_h264_qpel8_v_lowpass_neon above.
959 function \type\()_h264_qpel8_v_lowpass_l2_neon
960 vld1.64 {d8}, [r1], r3
961 vld1.64 {d10}, [r1], r3
962 vld1.64 {d12}, [r1], r3
963 vld1.64 {d14}, [r1], r3
964 vld1.64 {d22}, [r1], r3
965 vld1.64 {d24}, [r1], r3
966 vld1.64 {d26}, [r1], r3
967 vld1.64 {d28}, [r1], r3
968 vld1.64 {d9}, [r1], r3
969 vld1.64 {d11}, [r1], r3
970 vld1.64 {d13}, [r1], r3
971 vld1.64 {d15}, [r1], r3
974 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
975 lowpass_8 d8, d9, d10, d11, d8, d9
976 lowpass_8 d12, d13, d14, d15, d12, d13
977 lowpass_8 d22, d23, d24, d25, d22, d23
978 lowpass_8 d26, d27, d28, d29, d26, d27
979 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
@ Average the filtered rows with the second reference streamed from [ip].
981 vld1.64 {d0}, [ip], r2
982 vld1.64 {d1}, [ip], r2
983 vld1.64 {d2}, [ip], r2
984 vld1.64 {d3}, [ip], r2
985 vld1.64 {d4}, [ip], r2
987 vld1.64 {d5}, [ip], r2
989 vld1.64 {d10}, [ip], r2
990 vrhadd.u8 q2, q2, q11
991 vld1.64 {d11}, [ip], r2
992 vrhadd.u8 q5, q5, q13
@ "avg" only: additionally blend with the current destination contents.
995 vld1.8 {d16}, [r0,:64], r3
996 vrhadd.u8 d0, d0, d16
997 vld1.8 {d17}, [r0,:64], r3
998 vrhadd.u8 d1, d1, d17
999 vld1.8 {d16}, [r0,:64], r3
1000 vrhadd.u8 d2, d2, d16
1001 vld1.8 {d17}, [r0,:64], r3
1002 vrhadd.u8 d3, d3, d17
1003 vld1.8 {d16}, [r0,:64], r3
1004 vrhadd.u8 d4, d4, d16
1005 vld1.8 {d17}, [r0,:64], r3
1006 vrhadd.u8 d5, d5, d17
1007 vld1.8 {d16}, [r0,:64], r3
1008 vrhadd.u8 d10, d10, d16
1009 vld1.8 {d17}, [r0,:64], r3
1010 vrhadd.u8 d11, d11, d17
1011 sub r0, r0, r3, lsl #3
1014 vst1.64 {d0}, [r0,:64], r3
1015 vst1.64 {d1}, [r0,:64], r3
1016 vst1.64 {d2}, [r0,:64], r3
1017 vst1.64 {d3}, [r0,:64], r3
1018 vst1.64 {d4}, [r0,:64], r3
1019 vst1.64 {d5}, [r0,:64], r3
1020 vst1.64 {d10}, [r0,:64], r3
1021 vst1.64 {d11}, [r0,:64], r3
1027 h264_qpel_v_lowpass_l2 put
1028 h264_qpel_v_lowpass_l2 avg
@ 2D (horizontal+vertical) 6-tap filter core for qpel8 "hv" positions.
@ Pass 1: horizontal lowpass with 16-bit (unnarrowed) output spilled to a
@ scratch buffer at [r4]. Pass 2: reload, transpose, and run the 16-bit
@ vertical lowpass (lowpass_8.16, rounding >> 10). Result rows end up in
@ d8-d15 for the caller to store/average.
1030 function put_h264_qpel8_hv_lowpass_neon_top
1033 1: vld1.64 {d0, d1}, [r1], r3
1034 vld1.64 {d16,d17}, [r1], r3
1036 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
1037 vst1.64 {d22-d25}, [r4,:128]!
@ One extra row: the vertical 6-tap needs h+5 filtered input rows.
1040 vld1.64 {d0, d1}, [r1]
1041 lowpass_8_1 d0, d1, q12, narrow=0
@ Reload the 16-bit intermediate rows (bottom-up, stride ip).
1045 vld1.64 {d30,d31}, [r4,:128], ip
1046 vld1.64 {d20,d21}, [r4,:128], ip
1047 vld1.64 {d18,d19}, [r4,:128], ip
1048 vld1.64 {d16,d17}, [r4,:128], ip
1049 vld1.64 {d14,d15}, [r4,:128], ip
1050 vld1.64 {d12,d13}, [r4,:128], ip
1051 vld1.64 {d10,d11}, [r4,:128], ip
1052 vld1.64 {d8, d9}, [r4,:128], ip
1053 vld1.64 {d6, d7}, [r4,:128], ip
1054 vld1.64 {d4, d5}, [r4,:128], ip
1055 vld1.64 {d2, d3}, [r4,:128], ip
1056 vld1.64 {d0, d1}, [r4,:128]
@ 16-bit transpose in two halves (swap4 + transpose16_4x4).
1058 swap4 d1, d3, d5, d7, d8, d10, d12, d14
1059 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
1061 swap4 d17, d19, d21, d31, d24, d26, d28, d22
1062 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
@ Spill the transposed high half back so registers free up for filtering.
1064 vst1.64 {d30,d31}, [r4,:128]!
1065 vst1.64 {d6, d7}, [r4,:128]!
1066 vst1.64 {d20,d21}, [r4,:128]!
1067 vst1.64 {d4, d5}, [r4,:128]!
1068 vst1.64 {d18,d19}, [r4,:128]!
1069 vst1.64 {d2, d3}, [r4,:128]!
1070 vst1.64 {d16,d17}, [r4,:128]!
1071 vst1.64 {d0, d1}, [r4,:128]
@ Vertical pass on the registers still live...
1073 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
1074 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
1075 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
1076 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
@ ...and on the four spilled row pairs, reloaded two at a time.
1078 vld1.64 {d16,d17}, [r4,:128], ip
1079 vld1.64 {d30,d31}, [r4,:128], ip
1080 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
1081 vld1.64 {d16,d17}, [r4,:128], ip
1082 vld1.64 {d30,d31}, [r4,:128], ip
1083 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
1084 vld1.64 {d16,d17}, [r4,:128], ip
1085 vld1.64 {d30,d31}, [r4,:128], ip
1086 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
1087 vld1.64 {d16,d17}, [r4,:128], ip
1088 vld1.64 {d30,d31}, [r4,:128]
1089 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
@ Back to row order for the caller.
1091 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
@ \type = put or avg: run the shared hv core, then store (put) or
@ average-then-store (avg) the 8 result rows held in d8-d15.
1096 .macro h264_qpel8_hv_lowpass type
1097 function \type\()_h264_qpel8_hv_lowpass_neon
1099 bl put_h264_qpel8_hv_lowpass_neon_top
@ "avg" only: blend each result row with the existing destination.
1101 vld1.8 {d0}, [r0,:64], r2
1102 vrhadd.u8 d12, d12, d0
1103 vld1.8 {d1}, [r0,:64], r2
1104 vrhadd.u8 d13, d13, d1
1105 vld1.8 {d2}, [r0,:64], r2
1106 vrhadd.u8 d14, d14, d2
1107 vld1.8 {d3}, [r0,:64], r2
1108 vrhadd.u8 d15, d15, d3
1109 vld1.8 {d4}, [r0,:64], r2
1110 vrhadd.u8 d8, d8, d4
1111 vld1.8 {d5}, [r0,:64], r2
1112 vrhadd.u8 d9, d9, d5
1113 vld1.8 {d6}, [r0,:64], r2
1114 vrhadd.u8 d10, d10, d6
1115 vld1.8 {d7}, [r0,:64], r2
1116 vrhadd.u8 d11, d11, d7
1117 sub r0, r0, r2, lsl #3
1119 vst1.64 {d12}, [r0,:64], r2
1120 vst1.64 {d13}, [r0,:64], r2
1121 vst1.64 {d14}, [r0,:64], r2
1122 vst1.64 {d15}, [r0,:64], r2
1123 vst1.64 {d8}, [r0,:64], r2
1124 vst1.64 {d9}, [r0,:64], r2
1125 vst1.64 {d10}, [r0,:64], r2
1126 vst1.64 {d11}, [r0,:64], r2
1133 h264_qpel8_hv_lowpass put
1134 h264_qpel8_hv_lowpass avg
@ hv core averaged with a second reference block streamed from [r2]
@ (packed l2 buffer) before the optional dst blend — the "l2" pattern.
1136 .macro h264_qpel8_hv_lowpass_l2 type
1137 function \type\()_h264_qpel8_hv_lowpass_l2_neon
1139 bl put_h264_qpel8_hv_lowpass_neon_top
@ Average the hv result (d8-d15 as q4-q7) with the packed second source.
1141 vld1.64 {d0, d1}, [r2,:128]!
1142 vld1.64 {d2, d3}, [r2,:128]!
1143 vrhadd.u8 q0, q0, q6
1144 vld1.64 {d4, d5}, [r2,:128]!
1145 vrhadd.u8 q1, q1, q7
1146 vld1.64 {d6, d7}, [r2,:128]!
1147 vrhadd.u8 q2, q2, q4
1148 vrhadd.u8 q3, q3, q5
@ "avg" only: further blend with the current destination rows.
1150 vld1.8 {d16}, [r0,:64], r3
1151 vrhadd.u8 d0, d0, d16
1152 vld1.8 {d17}, [r0,:64], r3
1153 vrhadd.u8 d1, d1, d17
1154 vld1.8 {d18}, [r0,:64], r3
1155 vrhadd.u8 d2, d2, d18
1156 vld1.8 {d19}, [r0,:64], r3
1157 vrhadd.u8 d3, d3, d19
1158 vld1.8 {d20}, [r0,:64], r3
1159 vrhadd.u8 d4, d4, d20
1160 vld1.8 {d21}, [r0,:64], r3
1161 vrhadd.u8 d5, d5, d21
1162 vld1.8 {d22}, [r0,:64], r3
1163 vrhadd.u8 d6, d6, d22
1164 vld1.8 {d23}, [r0,:64], r3
1165 vrhadd.u8 d7, d7, d23
1166 sub r0, r0, r3, lsl #3
1168 vst1.64 {d0}, [r0,:64], r3
1169 vst1.64 {d1}, [r0,:64], r3
1170 vst1.64 {d2}, [r0,:64], r3
1171 vst1.64 {d3}, [r0,:64], r3
1172 vst1.64 {d4}, [r0,:64], r3
1173 vst1.64 {d5}, [r0,:64], r3
1174 vst1.64 {d6}, [r0,:64], r3
1175 vst1.64 {d7}, [r0,:64], r3
1182 h264_qpel8_hv_lowpass_l2 put
1183 h264_qpel8_hv_lowpass_l2 avg
@ 16x16 hv (and hv+l2) filters built from four 8x8 quadrant calls, with
@ source/dest pointer rewinds between quadrants; the final b tail-calls.
1185 .macro h264_qpel16_hv type
1186 function \type\()_h264_qpel16_hv_lowpass_neon
1188 bl \type\()_h264_qpel8_hv_lowpass_neon
1189 sub r1, r1, r3, lsl #2
1190 bl \type\()_h264_qpel8_hv_lowpass_neon
1191 sub r1, r1, r3, lsl #4
1192 sub r1, r1, r3, lsl #2
1194 sub r0, r0, r2, lsl #4
1196 bl \type\()_h264_qpel8_hv_lowpass_neon
1197 sub r1, r1, r3, lsl #2
1199 b \type\()_h264_qpel8_hv_lowpass_neon
@ l2 version: same quadrant walk over the 8x8 hv+l2 primitive.
1202 function \type\()_h264_qpel16_hv_lowpass_l2_neon
1205 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1206 sub r1, r1, r3, lsl #2
1207 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1208 sub r1, r1, r3, lsl #4
1209 sub r1, r1, r3, lsl #2
1211 sub r0, r0, r3, lsl #4
1213 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1214 sub r1, r1, r3, lsl #2
1216 b \type\()_h264_qpel8_hv_lowpass_l2_neon
@ Public entry points for all 16 qpel8 mcXY positions (X,Y in 0..3 = the
@ quarter-pel offsets). Each sets up pointers/scratch and dispatches to
@ the h / v / hv lowpass primitives above; mcX3/mc3X variants reuse the
@ mcX1/mc1X bodies with a shifted source row.
1223 .macro h264_qpel8 type
1224 function ff_\type\()_h264_qpel8_mc10_neon, export=1
1229 b \type\()_h264_qpel8_h_lowpass_l2_neon
1232 function ff_\type\()_h264_qpel8_mc20_neon, export=1
1237 b \type\()_h264_qpel8_h_lowpass_neon
1240 function ff_\type\()_h264_qpel8_mc30_neon, export=1
1245 b \type\()_h264_qpel8_h_lowpass_l2_neon
1248 function ff_\type\()_h264_qpel8_mc01_neon, export=1
@ Local label shared with mc03 (which enters with src advanced one row).
1251 \type\()_h264_qpel8_mc01:
@ Vertical filter needs two rows of context above the block.
1254 sub r1, r1, r2, lsl #1
1256 bl \type\()_h264_qpel8_v_lowpass_l2_neon
1261 function ff_\type\()_h264_qpel8_mc11_neon, export=1
1262 push {r0, r1, r11, lr}
1263 \type\()_h264_qpel8_mc11:
@ Diagonal: horizontal pass into scratch, then vertical l2 pass averages.
1273 bl put_h264_qpel8_h_lowpass_neon
1277 sub r1, r1, r2, lsl #1
1279 bl \type\()_h264_qpel8_v_lowpass_l2_neon
1285 function ff_\type\()_h264_qpel8_mc21_neon, export=1
1286 push {r0, r1, r4, r10, r11, lr}
1287 \type\()_h264_qpel8_mc21:
@ Scratch: 8x8 h-filtered block + 16-bit hv intermediate (12 rows of 16).
1291 sub sp, sp, #(8*8+16*12)
1297 bl put_h264_qpel8_h_lowpass_neon
1300 sub r1, r1, r2, lsl #1
1304 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1307 pop {r4, r10, r11, pc}
1310 function ff_\type\()_h264_qpel8_mc31_neon, export=1
1312 push {r0, r1, r11, lr}
1314 b \type\()_h264_qpel8_mc11
1317 function ff_\type\()_h264_qpel8_mc02_neon, export=1
1320 sub r1, r1, r2, lsl #1
1323 bl \type\()_h264_qpel8_v_lowpass_neon
1328 function ff_\type\()_h264_qpel8_mc12_neon, export=1
1329 push {r0, r1, r4, r10, r11, lr}
1330 \type\()_h264_qpel8_mc12:
1334 sub sp, sp, #(8*8+16*12)
1335 sub r1, r1, r2, lsl #1
1340 bl put_h264_qpel8_v_lowpass_neon
1343 sub r1, r1, r3, lsl #1
1346 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1349 pop {r4, r10, r11, pc}
1352 function ff_\type\()_h264_qpel8_mc22_neon, export=1
1353 push {r4, r10, r11, lr}
1356 sub r1, r1, r2, lsl #1
@ Pure hv center position only needs the 16-bit intermediate buffer.
1359 sub sp, sp, #(16*12)
1362 bl \type\()_h264_qpel8_hv_lowpass_neon
1365 pop {r4, r10, r11, pc}
1368 function ff_\type\()_h264_qpel8_mc32_neon, export=1
1369 push {r0, r1, r4, r10, r11, lr}
1371 b \type\()_h264_qpel8_mc12
1374 function ff_\type\()_h264_qpel8_mc03_neon, export=1
1377 b \type\()_h264_qpel8_mc01
1380 function ff_\type\()_h264_qpel8_mc13_neon, export=1
1381 push {r0, r1, r11, lr}
1383 b \type\()_h264_qpel8_mc11
1386 function ff_\type\()_h264_qpel8_mc23_neon, export=1
1387 push {r0, r1, r4, r10, r11, lr}
1389 b \type\()_h264_qpel8_mc21
1392 function ff_\type\()_h264_qpel8_mc33_neon, export=1
1394 push {r0, r1, r11, lr}
1397 b \type\()_h264_qpel8_mc11
@ Public entry points for all 16 qpel16 mcXY positions; identical dispatch
@ structure to h264_qpel8 but over the 16-wide primitives and with larger
@ scratch buffers (16x16 block + 16-bit hv intermediate).
1404 .macro h264_qpel16 type
1405 function ff_\type\()_h264_qpel16_mc10_neon, export=1
1409 b \type\()_h264_qpel16_h_lowpass_l2_neon
1412 function ff_\type\()_h264_qpel16_mc20_neon, export=1
1416 b \type\()_h264_qpel16_h_lowpass_neon
1419 function ff_\type\()_h264_qpel16_mc30_neon, export=1
1423 b \type\()_h264_qpel16_h_lowpass_l2_neon
1426 function ff_\type\()_h264_qpel16_mc01_neon, export=1
1429 \type\()_h264_qpel16_mc01:
1432 sub r1, r1, r2, lsl #1
1434 bl \type\()_h264_qpel16_v_lowpass_l2_neon
1439 function ff_\type\()_h264_qpel16_mc11_neon, export=1
1440 push {r0, r1, r4, r11, lr}
1441 \type\()_h264_qpel16_mc11:
1450 bl put_h264_qpel16_h_lowpass_neon
1454 sub r1, r1, r2, lsl #1
1456 bl \type\()_h264_qpel16_v_lowpass_l2_neon
1462 function ff_\type\()_h264_qpel16_mc21_neon, export=1
1463 push {r0, r1, r4-r5, r9-r11, lr}
1464 \type\()_h264_qpel16_mc21:
@ Scratch: 16x16 h-filtered block + 16-bit hv intermediate.
1468 sub sp, sp, #(16*16+16*12)
1472 bl put_h264_qpel16_h_lowpass_neon_packed
1475 sub r1, r1, r2, lsl #1
1478 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1481 pop {r4-r5, r9-r11, pc}
1484 function ff_\type\()_h264_qpel16_mc31_neon, export=1
1486 push {r0, r1, r4, r11, lr}
1488 b \type\()_h264_qpel16_mc11
1491 function ff_\type\()_h264_qpel16_mc02_neon, export=1
1494 sub r1, r1, r2, lsl #1
1497 bl \type\()_h264_qpel16_v_lowpass_neon
1502 function ff_\type\()_h264_qpel16_mc12_neon, export=1
1503 push {r0, r1, r4-r5, r9-r11, lr}
1504 \type\()_h264_qpel16_mc12:
1508 sub sp, sp, #(16*16+16*12)
1509 sub r1, r1, r2, lsl #1
1513 bl put_h264_qpel16_v_lowpass_neon_packed
1516 sub r1, r1, r3, lsl #1
1519 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1522 pop {r4-r5, r9-r11, pc}
1525 function ff_\type\()_h264_qpel16_mc22_neon, export=1
1526 push {r4, r9-r11, lr}
1530 sub r1, r1, r2, lsl #1
1533 sub sp, sp, #(16*12)
1536 bl \type\()_h264_qpel16_hv_lowpass_neon
1539 pop {r4, r9-r11, pc}
1542 function ff_\type\()_h264_qpel16_mc32_neon, export=1
1543 push {r0, r1, r4-r5, r9-r11, lr}
1545 b \type\()_h264_qpel16_mc12
1548 function ff_\type\()_h264_qpel16_mc03_neon, export=1
1551 b \type\()_h264_qpel16_mc01
1554 function ff_\type\()_h264_qpel16_mc13_neon, export=1
1555 push {r0, r1, r4, r11, lr}
1557 b \type\()_h264_qpel16_mc11
1560 function ff_\type\()_h264_qpel16_mc23_neon, export=1
1561 push {r0, r1, r4-r5, r9-r11, lr}
1563 b \type\()_h264_qpel16_mc21
1566 function ff_\type\()_h264_qpel16_mc33_neon, export=1
1568 push {r0, r1, r4, r11, lr}
1571 b \type\()_h264_qpel16_mc11
1578 @ Biweighted prediction
@ Biweight inner loops: result = clip(((src*ws + dst*wd + round) >> shift)
@ + offset). \macs/\macd are vmlal.u8 or vmlsl.u8, chosen by the signs of
@ the two weights (see the 10:/…: dispatch in biweight_func).
1580 .macro biweight_16 macs, macd
1586 vld1.8 {d20-d21},[r0,:128], r2
1590 vld1.8 {d22-d23},[r1,:128], r2
1595 vld1.8 {d28-d29},[r0,:128], r2
1600 vld1.8 {d30-d31},[r1,:128], r2
@ q9 holds the (negative) log2 denominator for the arithmetic shift.
1608 vshl.s16 q12, q12, q9
1609 vshl.s16 q13, q13, q9
@ Saturating narrow back to u8.
1610 vqmovun.s16 d24, q12
1611 vqmovun.s16 d25, q13
1613 vst1.8 {d4- d5}, [r6,:128], r2
1615 vst1.8 {d24-d25},[r6,:128], r2
@ 8-wide biweight loop (two rows per iteration).
1620 .macro biweight_8 macs, macd
1626 vld1.8 {d4},[r0,:64], r2
1629 vld1.8 {d5},[r1,:64], r2
1632 vld1.8 {d6},[r0,:64], r2
1635 vld1.8 {d7},[r1,:64], r2
1640 vshl.s16 q10, q10, q9
1643 vst1.8 {d2},[r6,:64], r2
1645 vst1.8 {d4},[r6,:64], r2
@ 4-wide biweight loop (four rows per iteration; 2-row tail at 2:).
1650 .macro biweight_4 macs, macd
1656 vld1.32 {d4[0]},[r0,:32], r2
1657 vld1.32 {d4[1]},[r0,:32], r2
1660 vld1.32 {d5[0]},[r1,:32], r2
1661 vld1.32 {d5[1]},[r1,:32], r2
1665 vld1.32 {d6[0]},[r0,:32], r2
1666 vld1.32 {d6[1]},[r0,:32], r2
1669 vld1.32 {d7[0]},[r1,:32], r2
1670 vld1.32 {d7[1]},[r1,:32], r2
1675 vshl.s16 q10, q10, q9
1678 vst1.32 {d2[0]},[r6,:32], r2
1679 vst1.32 {d2[1]},[r6,:32], r2
1681 vst1.32 {d4[0]},[r6,:32], r2
1682 vst1.32 {d4[1]},[r6,:32], r2
1685 2: vshl.s16 q1, q1, q9
1687 vst1.32 {d2[0]},[r6,:32], r2
1688 vst1.32 {d2[1]},[r6,:32], r2
@ Shared driver: picks one of four weight-sign combinations and runs the
@ matching mla/mls loop (weights can be negative in H.264 bipred).
1692 .macro biweight_func w
1693 function biweight_h264_pixels_\w\()_neon
@ Combine the sign bits of the two weights into a 2-bit selector.
1699 eors lr, lr, r5, lsr #30
1712 10: biweight_\w vmlal.u8, vmlal.u8
1714 biweight_\w vmlal.u8, vmlsl.u8
1717 biweight_\w vmlsl.u8, vmlsl.u8
1719 biweight_\w vmlsl.u8, vmlal.u8
@ Exported WxH wrappers: set the row count (b=1 pushes setup; b=0 shares
@ the following function's body) and jump to the width-specific driver.
1723 .macro biweight_entry w, h, b=1
1724 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1727 b biweight_h264_pixels_\w\()_neon
1732 biweight_entry 16, 8
1733 biweight_entry 16, 16, b=0
1736 biweight_entry 8, 16
1738 biweight_entry 8, 8, b=0
1743 biweight_entry 4, 4, b=0
1746 @ Weighted prediction
@ Unidirectional weighted prediction: dst = clip(((src*w) >> log2_denom)
@ + offset), with \add selecting how the offset/rounding is folded in.
@ 16-wide loop, two rows per iteration.
1748 .macro weight_16 add
1751 vld1.8 {d20-d21},[r0,:128], r1
1752 vmull.u8 q2, d0, d20 @ src * weight (d0 = splatted weight)
1754 vmull.u8 q3, d0, d21
1755 vld1.8 {d28-d29},[r0,:128], r1
1756 vmull.u8 q12, d0, d28
1758 vmull.u8 q13, d0, d29
@ Rounding arithmetic shift by -log2_denom held in q9.
1760 vrshl.s16 q2, q2, q9
1762 vrshl.s16 q3, q3, q9
1766 vrshl.s16 q12, q12, q9
1768 vrshl.s16 q13, q13, q9
1769 vqmovun.s16 d24, q12
1770 vqmovun.s16 d25, q13
1771 vst1.8 {d4- d5}, [r4,:128], r1
1772 vst1.8 {d24-d25},[r4,:128], r1
@ 8-wide weighted-prediction loop.
1780 vld1.8 {d4},[r0,:64], r1
1783 vld1.8 {d6},[r0,:64], r1
1784 vmull.u8 q10, d0, d6
1787 vrshl.s16 q1, q1, q9
1790 vrshl.s16 q10, q10, q9
1792 vst1.8 {d2},[r4,:64], r1
1793 vst1.8 {d4},[r4,:64], r1
@ 4-wide weighted-prediction loop (four rows, 2-row tail below).
1803 vld1.32 {d4[0]},[r0,:32], r1
1804 vld1.32 {d4[1]},[r0,:32], r1
1808 vld1.32 {d6[0]},[r0,:32], r1
1809 vld1.32 {d6[1]},[r0,:32], r1
1810 vmull.u8 q10, d0, d6
1813 vrshl.s16 q1, q1, q9
1816 vrshl.s16 q10, q10, q9
1819 vst1.32 {d2[0]},[r4,:32], r1
1820 vst1.32 {d2[1]},[r4,:32], r1
1822 vst1.32 {d4[0]},[r4,:32], r1
1823 vst1.32 {d4[1]},[r4,:32], r1
@ 2-row tail for odd heights.
1827 vrshl.s16 q1, q1, q9
1829 vst1.32 {d2[0]},[r4,:32], r1
1830 vst1.32 {d2[1]},[r4,:32], r1
@ Width-specific driver (setup elided in this view).
1834 .macro weight_func w
1835 function weight_h264_pixels_\w\()_neon
@ Exported WxH wrappers, same b=0/1 fallthrough scheme as biweight_entry.
1860 .macro weight_entry w, h, b=1
1861 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1864 b weight_h264_pixels_\w\()_neon
1870 weight_entry 16, 16, b=0
1875 weight_entry 8, 8, b=0
1880 weight_entry 4, 4, b=0