2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ Register-transpose helper macros used by the horizontal loop filter and
@ the qpel code below: 8x8 and 4x4 byte transposes across q-registers, a
@ d-register pair swap, and a 16-bit-element 4x4 transpose.
@ NOTE(review): only the .macro header lines are visible in this extract;
@ the bodies and .endm terminators are elided — do not edit blind.
23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
38 .macro transpose_4x4 r0 r1 r2 r3
45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
@ 8x8 H.264 chroma motion compensation, generated for \type = put/avg.
@ Bilinear (x,y) sub-pel interpolation: vmull/vmla products (elided here)
@ are rounded-narrowed by 6 (vrshrn #6 = /64 with rounding), and the
@ vrhadd.u8 against rows loaded via [lr] is presumably the "avg" path
@ (guard directives elided — confirm against full source).
@ NOTE(review): this extract omits many interior lines (coefficient
@ vdup's, multiplies, labels, endfunc); below is the visible subset only.
63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
64 .macro h264_chroma_mc8 type
65 function ff_\type\()_h264_chroma_mc8_neon, export=1
75 rsb r6, r7, r5, lsl #3 @ bilinear weight setup — TODO confirm exact coeffs
76 rsb ip, r7, r4, lsl #3
77 sub r4, r7, r4, lsl #3
78 sub r4, r4, r5, lsl #3
88 vld1.64 {d4, d5}, [r1], r4
90 vld1.64 {d6, d7}, [r5], r4
99 vld1.64 {d4, d5}, [r1], r4
101 vext.8 d5, d4, d5, #1 @ src row shifted by one pixel (x-interp pair)
108 vrshrn.u16 d16, q8, #6
109 vld1.64 {d6, d7}, [r5], r4
111 vrshrn.u16 d17, q9, #6
113 vld1.64 {d20}, [lr,:64], r2
114 vld1.64 {d21}, [lr,:64], r2
115 vrhadd.u8 q8, q8, q10 @ average with dst rows (avg variant — verify)
117 vext.8 d7, d6, d7, #1
118 vst1.64 {d16}, [r0,:64], r2
119 vst1.64 {d17}, [r0,:64], r2
133 vld1.64 {d4}, [r1], r4
134 vld1.64 {d6}, [r5], r4
139 vld1.64 {d4}, [r1], r4
142 vld1.64 {d6}, [r5], r4
143 vrshrn.u16 d16, q8, #6
144 vrshrn.u16 d17, q9, #6
146 vld1.64 {d20}, [lr,:64], r2
147 vld1.64 {d21}, [lr,:64], r2
148 vrhadd.u8 q8, q8, q10
152 vst1.64 {d16}, [r0,:64], r2
153 vst1.64 {d17}, [r0,:64], r2
158 4: vld1.64 {d4, d5}, [r1], r2 @ special-case path (x==0 or y==0 — confirm)
159 vld1.64 {d6, d7}, [r1], r2
160 vext.8 d5, d4, d5, #1
161 vext.8 d7, d6, d7, #1
167 vld1.64 {d4, d5}, [r1], r2
171 vext.8 d5, d4, d5, #1
172 vrshrn.u16 d16, q8, #6
173 vrshrn.u16 d17, q9, #6
175 vld1.64 {d20}, [lr,:64], r2
176 vld1.64 {d21}, [lr,:64], r2
177 vrhadd.u8 q8, q8, q10
179 vld1.64 {d6, d7}, [r1], r2
180 vext.8 d7, d6, d7, #1
181 vst1.64 {d16}, [r0,:64], r2
182 vst1.64 {d17}, [r0,:64], r2
@ 4x4-wide H.264 chroma motion compensation, \type = put/avg.
@ Same bilinear scheme as the mc8 variant above but operating on 32-bit
@ (4-pixel) loads/stores; vadd.i16 pairs fold the two partial products
@ before the rounding narrow (vrshrn #6).
@ NOTE(review): many interior lines (multiplies, labels, endfunc) are
@ elided from this extract; visible subset only.
189 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
190 .macro h264_chroma_mc4 type
191 function ff_\type\()_h264_chroma_mc4_neon, export=1
201 rsb r6, r7, r5, lsl #3 @ bilinear weight setup — TODO confirm exact coeffs
202 rsb ip, r7, r4, lsl #3
203 sub r4, r7, r4, lsl #3
204 sub r4, r4, r5, lsl #3
214 vld1.64 {d4}, [r1], r4
216 vld1.64 {d6}, [r5], r4
219 vext.8 d5, d4, d5, #1
220 vext.8 d7, d6, d7, #1
230 vld1.64 {d4}, [r1], r4
231 vext.8 d5, d4, d5, #1
235 vld1.64 {d6}, [r5], r4
236 vadd.i16 d16, d16, d17 @ fold partial products before narrowing
237 vadd.i16 d17, d18, d19
238 vrshrn.u16 d16, q8, #6
242 vld1.32 {d20[0]}, [lr,:32], r2
243 vld1.32 {d20[1]}, [lr,:32], r2
244 vrhadd.u8 d16, d16, d20 @ avg variant blend with dst — verify guard
246 vext.8 d7, d6, d7, #1
248 vst1.32 {d16[0]}, [r0,:32], r2
249 vst1.32 {d16[1]}, [r0,:32], r2
262 vext.32 d1, d0, d1, #1
265 vld1.32 {d4[0]}, [r1], r4
266 vld1.32 {d4[1]}, [r5], r4
270 vld1.32 {d4[0]}, [r1], r4
272 vld1.32 {d4[1]}, [r5], r4
273 vadd.i16 d16, d16, d17
274 vadd.i16 d17, d18, d19
275 vrshrn.u16 d16, q8, #6
277 vld1.32 {d20[0]}, [lr,:32], r2
278 vld1.32 {d20[1]}, [lr,:32], r2
279 vrhadd.u8 d16, d16, d20
283 vst1.32 {d16[0]}, [r0,:32], r2
284 vst1.32 {d16[1]}, [r0,:32], r2
289 4: vld1.64 {d4}, [r1], r2 @ special-case path (x==0 or y==0 — confirm)
290 vld1.64 {d6}, [r1], r2
291 vext.8 d5, d4, d5, #1
292 vext.8 d7, d6, d7, #1
296 5: vmull.u8 q8, d4, d0
299 vld1.64 {d4}, [r1], r2
300 vext.8 d5, d4, d5, #1
302 vadd.i16 d16, d16, d17
303 vadd.i16 d17, d18, d19
305 vrshrn.u16 d16, q8, #6
307 vld1.32 {d20[0]}, [lr,:32], r2
308 vld1.32 {d20[1]}, [lr,:32], r2
309 vrhadd.u8 d16, d16, d20
311 vld1.64 {d6}, [r1], r2
312 vext.8 d7, d6, d7, #1
315 vst1.32 {d16[0]}, [r0,:32], r2
316 vst1.32 {d16[1]}, [r0,:32], r2
@ 2-pixel-wide H.264 chroma MC, \type = put/avg: 16-bit (2-pixel)
@ lane loads/stores; q3 = q2 shifted one byte supplies the x-neighbour
@ pair for the bilinear filter. The trailing copy loop (original
@ 376-383) handles the no-interpolation case — confirm against the
@ elided branch logic.
@ NOTE(review): interior lines elided from this extract.
323 .macro h264_chroma_mc2 type
324 function ff_\type\()_h264_chroma_mc2_neon, export=1
334 rsb r6, r5, lr, lsl #3 @ bilinear weight setup — TODO confirm
335 rsb r12, r5, r4, lsl #3
336 sub r4, r5, r4, lsl #3
337 sub r4, r4, lr, lsl #3
345 vld1.32 {d4[0]}, [r1], r2
346 vld1.32 {d4[1]}, [r1], r2
348 vld1.32 {d5[1]}, [r1]
349 vext.8 q3, q2, q2, #1 @ one-pixel-shifted copy for horizontal pair
354 vld1.16 {d18[0]}, [r0,:16], r2
355 vld1.16 {d18[1]}, [r0,:16]
359 vadd.i16 d16, d16, d17
360 vrshrn.u16 d16, q8, #6
362 vrhadd.u8 d16, d16, d18 @ avg variant blend with dst — verify guard
364 vst1.16 {d16[0]}, [r0,:16], r2
365 vst1.16 {d16[1]}, [r0,:16], r2
376 vld1.16 {d16[0]}, [r1], r2
377 vld1.16 {d16[1]}, [r1], r2
378 vld1.16 {d18[0]}, [r0,:16], r2
379 vld1.16 {d18[1]}, [r0,:16]
381 vrhadd.u8 d16, d16, d18
382 vst1.16 {d16[0]}, [r0,:16], r2
383 vst1.16 {d16[1]}, [r0,:16], r2
@ Loop-filter helpers.
@ h264_loop_filter_start: folds the packed per-edge strength bytes in ip
@ down with shifted self-ANDs; `ands` sets flags so the caller can skip
@ filtering entirely when no edge needs it (early-out branch elided).
@ align_push_regs / align_pop_regs: spill/restore d8-d15, which are
@ callee-saved under AAPCS, on a 16-byte-aligned stack area (alignment
@ arithmetic elided from this extract).
401 /* H.264 loop filter */
403 .macro h264_loop_filter_start
409 and ip, ip, ip, lsl #16
411 ands ip, ip, ip, lsl #8 @ Z set => nothing to filter
415 .macro align_push_regs
419 vst1.64 {d12-d15}, [sp,:128]
421 vst1.64 {d8-d11}, [sp,:128]
424 .macro align_pop_regs
425 vld1.64 {d8-d11}, [sp,:128]!
426 vld1.64 {d12-d15}, [sp,:128], ip
@ Core luma deblocking filter (16 pixels across q-registers).
@ Inputs per the existing comments: q8=p0, q9=p1, q10=p2, q0=q0, q1=q1,
@ q2=q2; r2=alpha, r3=beta, with tc0 presumably in q12 (vsli duplication)
@ — confirm against elided setup. Builds the |p0-q0|<alpha &&
@ |p1-p0|<beta && |q1-q0|<beta mask, the |p2-p0|/|q2-q0|<beta refinements,
@ then the clipped delta (vrshrn #3) applied via saturating widen/narrow.
@ NOTE(review): several mask-combining and clip lines are elided.
429 .macro h264_loop_filter_luma
430 vdup.8 q11, r2 @ alpha
432 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
434 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
436 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
437 vsli.32 q12, q12, #16 @ replicate tc0 across lanes — verify
438 vclt.u8 q6, q6, q11 @ < alpha
439 vdup.8 q11, r3 @ beta
441 vclt.u8 q14, q14, q11 @ < beta
442 vclt.u8 q15, q15, q11 @ < beta
444 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
446 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
447 vclt.u8 q4, q4, q11 @ < beta
449 vclt.u8 q5, q5, q11 @ < beta
453 vrhadd.u8 q14, q8, q0 @ (p0+q0+1)>>1
456 vhadd.u8 q10, q10, q14
458 vhadd.u8 q14, q2, q14
460 vqsub.u8 q11, q9, q12 @ p1 - tc0, saturating
463 vqsub.u8 q11, q1, q12
466 vmax.u8 q14, q14, q11 @ clamp to [x - tc0, ...]
469 vsubw.u8 q10, q10, d17
471 vshl.i16 q10, q10, #2 @ 4*(q0-p0) term of the delta
473 vaddw.u8 q10, q10, d19
475 vsubw.u8 q10, q10, d3
476 vrshrn.i16 d4, q2, #3 @ (... + 4) >> 3
477 vrshrn.i16 d5, q10, #3
487 vaddw.s8 q14, q14, d4
489 vsubw.s8 q11, q11, d4
490 vsubw.s8 q12, q12, d5
@ Vertical (horizontal-edge) luma deblock: loads three rows below and,
@ after rewinding r0 by 6*stride, three rows above the edge, runs the
@ shared luma filter, then writes the four modified rows back.
@ NOTE(review): register-spill (align_push_regs) and endfunc lines are
@ elided from this extract.
497 function ff_h264_v_loop_filter_luma_neon, export=1
498 h264_loop_filter_start
500 vld1.64 {d0, d1}, [r0,:128], r1 @ q0 row
501 vld1.64 {d2, d3}, [r0,:128], r1 @ q1 row
502 vld1.64 {d4, d5}, [r0,:128], r1 @ q2 row
503 sub r0, r0, r1, lsl #2 @ rewind 6 rows (4 + 2)
504 sub r0, r0, r1, lsl #1
505 vld1.64 {d20,d21}, [r0,:128], r1 @ p2 row
506 vld1.64 {d18,d19}, [r0,:128], r1 @ p1 row
507 vld1.64 {d16,d17}, [r0,:128], r1 @ p0 row
511 h264_loop_filter_luma
513 sub r0, r0, r1, lsl #1
514 vst1.64 {d8, d9}, [r0,:128], r1
515 vst1.64 {d16,d17}, [r0,:128], r1
516 vst1.64 {d0, d1}, [r0,:128], r1
517 vst1.64 {d10,d11}, [r0,:128]
@ Horizontal (vertical-edge) luma deblock: gathers 16 rows of 8 pixels,
@ transposes so columns become register rows, applies the shared luma
@ filter, transposes the four changed columns back (transpose_4x4) and
@ scatters them as 32-bit stores, rewinding 16 rows first.
@ NOTE(review): pointer-rewind/spill lines between the load block and
@ the filter are elided from this extract.
523 function ff_h264_h_loop_filter_luma_neon, export=1
524 h264_loop_filter_start
527 vld1.64 {d6}, [r0], r1
528 vld1.64 {d20}, [r0], r1
529 vld1.64 {d18}, [r0], r1
530 vld1.64 {d16}, [r0], r1
531 vld1.64 {d0}, [r0], r1
532 vld1.64 {d2}, [r0], r1
533 vld1.64 {d4}, [r0], r1
534 vld1.64 {d26}, [r0], r1
535 vld1.64 {d7}, [r0], r1
536 vld1.64 {d21}, [r0], r1
537 vld1.64 {d19}, [r0], r1
538 vld1.64 {d17}, [r0], r1
539 vld1.64 {d1}, [r0], r1
540 vld1.64 {d3}, [r0], r1
541 vld1.64 {d5}, [r0], r1
542 vld1.64 {d27}, [r0], r1
544 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
548 h264_loop_filter_luma
550 transpose_4x4 q4, q8, q0, q5
552 sub r0, r0, r1, lsl #4 @ rewind 16 rows
554 vst1.32 {d8[0]}, [r0], r1
555 vst1.32 {d16[0]}, [r0], r1
556 vst1.32 {d0[0]}, [r0], r1
557 vst1.32 {d10[0]}, [r0], r1
558 vst1.32 {d8[1]}, [r0], r1
559 vst1.32 {d16[1]}, [r0], r1
560 vst1.32 {d0[1]}, [r0], r1
561 vst1.32 {d10[1]}, [r0], r1
562 vst1.32 {d9[0]}, [r0], r1
563 vst1.32 {d17[0]}, [r0], r1
564 vst1.32 {d1[0]}, [r0], r1
565 vst1.32 {d11[0]}, [r0], r1
566 vst1.32 {d9[1]}, [r0], r1
567 vst1.32 {d17[1]}, [r0], r1
568 vst1.32 {d1[1]}, [r0], r1
569 vst1.32 {d11[1]}, [r0], r1
@ Chroma deblocking filter (8 pixels, d-registers): same alpha/beta
@ threshold masks as the luma filter but with the simpler chroma delta
@ (no p2/q2 refinement). d16=p0, d18=p1, d0=q0, d2=q1 per the existing
@ annotations.
@ NOTE(review): mask combination and delta-application lines elided.
575 .macro h264_loop_filter_chroma
576 vdup.8 d22, r2 @ alpha
578 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
580 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
584 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
586 vclt.u8 d26, d26, d22 @ < alpha
588 vdup.8 d22, r3 @ beta
590 vrshrn.i16 d4, q2, #3 @ delta = (... + 4) >> 3
591 vclt.u8 d28, d28, d22 @ < beta
593 vclt.u8 d30, d30, d22 @ < beta
602 vaddw.s8 q14, q14, d4
603 vsubw.s8 q11, q11, d4
@ Vertical (horizontal-edge) chroma deblock: rewind 2 rows, load
@ p1/p0/q0/q1, filter, rewind 2 rows again and store the two modified
@ rows (p0, q0). endfunc elided from this extract.
608 function ff_h264_v_loop_filter_chroma_neon, export=1
609 h264_loop_filter_start
611 sub r0, r0, r1, lsl #1 @ back up to p1
612 vld1.64 {d18}, [r0,:64], r1 @ p1
613 vld1.64 {d16}, [r0,:64], r1 @ p0
614 vld1.64 {d0}, [r0,:64], r1 @ q0
615 vld1.64 {d2}, [r0,:64] @ q1
617 h264_loop_filter_chroma
619 sub r0, r0, r1, lsl #1
620 vst1.64 {d16}, [r0,:64], r1
621 vst1.64 {d0}, [r0,:64], r1
@ Horizontal (vertical-edge) chroma deblock: gather 8 rows of 4 pixels
@ into lanes, filter, rewind 8 rows, scatter back. The 4x4 transpose
@ that must sit between the loads and the filter is elided from this
@ extract — confirm against full source.
626 function ff_h264_h_loop_filter_chroma_neon, export=1
627 h264_loop_filter_start
630 vld1.32 {d18[0]}, [r0], r1
631 vld1.32 {d16[0]}, [r0], r1
632 vld1.32 {d0[0]}, [r0], r1
633 vld1.32 {d2[0]}, [r0], r1
634 vld1.32 {d18[1]}, [r0], r1
635 vld1.32 {d16[1]}, [r0], r1
636 vld1.32 {d0[1]}, [r0], r1
637 vld1.32 {d2[1]}, [r0], r1
644 h264_loop_filter_chroma
651 sub r0, r0, r1, lsl #3 @ rewind 8 rows
652 vst1.32 {d18[0]}, [r0], r1
653 vst1.32 {d16[0]}, [r0], r1
654 vst1.32 {d0[0]}, [r0], r1
655 vst1.32 {d2[0]}, [r0], r1
656 vst1.32 {d18[1]}, [r0], r1
657 vst1.32 {d16[1]}, [r0], r1
658 vst1.32 {d0[1]}, [r0], r1
659 vst1.32 {d2[1]}, [r0], r1
@ lowpass_const: loads the 6-tap filter constants into \r (body elided).
@ lowpass_8: H.264 6-tap horizontal lowpass on two 8-pixel rows
@ (\r0:\r1 and \r2:\r3). vext builds the +/-1, +/-2, center+5 taps;
@ t0/t1 accumulate a + 20*(b) - 5*(c) style sums via vmla/vmls with the
@ d6 constants, then vqrshrun #5 rounds, shifts by 32 and saturates to
@ u8 (skipped when narrow=0 so callers can keep 16-bit intermediates).
@ NOTE(review): t0/t1 register aliases and the narrow=0 guard lines are
@ elided from this extract.
666 .macro lowpass_const r
672 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
680 vext.8 d2, \r0, \r1, #2
681 vext.8 d3, \r0, \r1, #3
683 vext.8 d4, \r0, \r1, #1
684 vext.8 d5, \r0, \r1, #4
686 vext.8 d30, \r0, \r1, #5
687 vaddl.u8 t0, \r0, d30 @ outer taps
688 vext.8 d18, \r2, \r3, #2
689 vmla.i16 t0, q1, d6[1] @ + 20 * middle taps — confirm constant lanes
690 vext.8 d19, \r2, \r3, #3
691 vaddl.u8 q9, d18, d19
692 vext.8 d20, \r2, \r3, #1
693 vmls.i16 t0, q2, d6[0] @ - 5 * inner taps — confirm constant lanes
694 vext.8 d21, \r2, \r3, #4
695 vaddl.u8 q10, d20, d21
696 vext.8 d31, \r2, \r3, #5
697 vaddl.u8 t1, \r2, d31
698 vmla.i16 t1, q9, d6[1]
699 vmls.i16 t1, q10, d6[0]
701 vqrshrun.s16 \d0, t0, #5
702 vqrshrun.s16 \d1, t1, #5
@ Single-row variant of lowpass_8: same 6-tap filter applied to one
@ 8-pixel row (\r0:\r1), result in \d0; narrow=0 keeps the 16-bit sum.
@ NOTE(review): t0 alias definition and narrow guard elided.
708 .macro lowpass_8_1 r0, r1, d0, narrow=1
714 vext.8 d2, \r0, \r1, #2
715 vext.8 d3, \r0, \r1, #3
717 vext.8 d4, \r0, \r1, #1
718 vext.8 d5, \r0, \r1, #4
720 vext.8 d30, \r0, \r1, #5
721 vaddl.u8 t0, \r0, d30
722 vmla.i16 t0, q1, d6[1]
723 vmls.i16 t0, q2, d6[0]
725 vqrshrun.s16 \d0, t0, #5
@ 16-bit-input variant of the 6-tap lowpass, used for the second
@ (vertical) pass of the 2D hv filter: operates on signed 16-bit
@ intermediates, widens to 32 bits, scales the middle taps with shifted
@ adds (<<2, +) and rounds down by 10 (5+5 from both passes) into \d.
@ NOTE(review): several accumulate/subtract lines between the widening
@ adds and the final vrshrn are elided from this extract.
730 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
731 vext.16 q1, \r0, \r1, #2
732 vext.16 q0, \r0, \r1, #3
734 vext.16 q2, \r0, \r1, #1
736 vext.16 q3, \r0, \r1, #4
737 vaddl.s16 q10, d4, d6
738 vext.16 \r1, \r0, \r1, #5
740 vaddl.s16 q0, \h0, \h1
741 vaddl.s16 q8, \l0, \l1
745 vshl.i32 q15, q10, #2 @ x4; combined with the add below gives x5
747 vadd.i32 q10, q10, q15
761 vrshrn.s32 d18, q9, #10 @ round((...)>>10): both passes' /32 folded
762 vrshrn.s32 d19, q1, #10
@ 16-wide horizontal lowpass producing packed intermediate output:
@ runs the 8-wide helper over the halves (loop/setup lines elided),
@ tail-calls it for the final half via `b`.
767 function put_h264_qpel16_h_lowpass_neon_packed
771 bl put_h264_qpel8_h_lowpass_neon
772 sub r1, r1, r2, lsl #4 @ rewind src 16 rows
776 b put_h264_qpel8_h_lowpass_neon @ tail call; its return exits here
@ \type (put/avg) horizontal qpel lowpass: the 16-wide function calls the
@ 8-wide body per quadrant; the 8-wide loop filters two rows per
@ iteration with lowpass_8 and, for avg, vrhadd's against dst
@ (the .ifc guards for put vs avg are elided — confirm).
@ Instantiated below for both types.
779 .macro h264_qpel_h_lowpass type
780 function \type\()_h264_qpel16_h_lowpass_neon
783 bl \type\()_h264_qpel8_h_lowpass_neon
784 sub r0, r0, r3, lsl #4 @ rewind dst 16 rows
785 sub r1, r1, r2, lsl #4 @ rewind src 16 rows
792 function \type\()_h264_qpel8_h_lowpass_neon
793 1: vld1.64 {d0, d1}, [r1], r2
794 vld1.64 {d16,d17}, [r1], r2
796 lowpass_8 d0, d1, d16, d17, d0, d16
798 vld1.8 {d2}, [r0,:64], r3
800 vld1.8 {d3}, [r0,:64]
801 vrhadd.u8 d16, d16, d3 @ avg path — verify guard in full source
804 vst1.64 {d0}, [r0,:64], r3
805 vst1.64 {d16}, [r0,:64], r3
811 h264_qpel_h_lowpass put
812 h264_qpel_h_lowpass avg
@ Horizontal lowpass averaged with a second prediction read from r3
@ (the "_l2" halfpel-average variants): lowpass result is vrhadd'ed with
@ the l2 rows, then (for avg) with dst. Instantiated for put and avg.
@ NOTE(review): loop-control and guard lines elided from this extract.
814 .macro h264_qpel_h_lowpass_l2 type
815 function \type\()_h264_qpel16_h_lowpass_l2_neon
818 bl \type\()_h264_qpel8_h_lowpass_l2_neon
819 sub r0, r0, r2, lsl #4
820 sub r1, r1, r2, lsl #4
821 sub r3, r3, r2, lsl #4 @ rewind the second-source pointer too
829 function \type\()_h264_qpel8_h_lowpass_l2_neon
830 1: vld1.64 {d0, d1}, [r1], r2
831 vld1.64 {d16,d17}, [r1], r2
832 vld1.64 {d28}, [r3], r2 @ second prediction rows
833 vld1.64 {d29}, [r3], r2
835 lowpass_8 d0, d1, d16, d17, d0, d1
836 vrhadd.u8 q0, q0, q14 @ blend with second prediction
838 vld1.8 {d2}, [r0,:64], r2
840 vld1.8 {d3}, [r0,:64]
844 vst1.64 {d0}, [r0,:64], r2
845 vst1.64 {d1}, [r0,:64], r2
851 h264_qpel_h_lowpass_l2 put
852 h264_qpel_h_lowpass_l2 avg
@ 16-wide vertical lowpass with packed output: four 8-wide calls with
@ src rewinds between quadrants (dst handling elided); final call is a
@ tail `b`.
854 function put_h264_qpel16_v_lowpass_neon_packed
857 bl put_h264_qpel8_v_lowpass_neon
858 sub r1, r1, r3, lsl #2
859 bl put_h264_qpel8_v_lowpass_neon
860 sub r1, r1, r3, lsl #4
861 sub r1, r1, r3, lsl #2 @ net rewind: 20 rows — matches 8+filter margin
863 bl put_h264_qpel8_v_lowpass_neon
864 sub r1, r1, r3, lsl #2
866 b put_h264_qpel8_v_lowpass_neon
@ \type (put/avg) vertical qpel lowpass. The 8-wide body loads 12+ rows,
@ transposes so the vertical filter can run as lowpass_8 on register
@ rows, transposes back, optionally averages with dst (avg guard
@ elided), rewinds dst 8 rows and stores. Instantiated for both types.
@ NOTE(review): remaining row loads after original line 897 and
@ endfunc/.endm are elided from this extract.
869 .macro h264_qpel_v_lowpass type
870 function \type\()_h264_qpel16_v_lowpass_neon
872 bl \type\()_h264_qpel8_v_lowpass_neon
873 sub r1, r1, r3, lsl #2
874 bl \type\()_h264_qpel8_v_lowpass_neon
875 sub r0, r0, r2, lsl #4
877 sub r1, r1, r3, lsl #4
878 sub r1, r1, r3, lsl #2
880 bl \type\()_h264_qpel8_v_lowpass_neon
881 sub r1, r1, r3, lsl #2
885 function \type\()_h264_qpel8_v_lowpass_neon
886 vld1.64 {d8}, [r1], r3
887 vld1.64 {d10}, [r1], r3
888 vld1.64 {d12}, [r1], r3
889 vld1.64 {d14}, [r1], r3
890 vld1.64 {d22}, [r1], r3
891 vld1.64 {d24}, [r1], r3
892 vld1.64 {d26}, [r1], r3
893 vld1.64 {d28}, [r1], r3
894 vld1.64 {d9}, [r1], r3
895 vld1.64 {d11}, [r1], r3
896 vld1.64 {d13}, [r1], r3
897 vld1.64 {d15}, [r1], r3
900 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
901 lowpass_8 d8, d9, d10, d11, d8, d10
902 lowpass_8 d12, d13, d14, d15, d12, d14
903 lowpass_8 d22, d23, d24, d25, d22, d24
904 lowpass_8 d26, d27, d28, d29, d26, d28
905 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
908 vld1.8 {d9}, [r0,:64], r2 @ avg path: read dst rows — verify guard
910 vld1.8 {d11}, [r0,:64], r2
911 vrhadd.u8 d10, d10, d11
912 vld1.8 {d13}, [r0,:64], r2
913 vrhadd.u8 d12, d12, d13
914 vld1.8 {d15}, [r0,:64], r2
915 vrhadd.u8 d14, d14, d15
916 vld1.8 {d23}, [r0,:64], r2
917 vrhadd.u8 d22, d22, d23
918 vld1.8 {d25}, [r0,:64], r2
919 vrhadd.u8 d24, d24, d25
920 vld1.8 {d27}, [r0,:64], r2
921 vrhadd.u8 d26, d26, d27
922 vld1.8 {d29}, [r0,:64], r2
923 vrhadd.u8 d28, d28, d29
924 sub r0, r0, r2, lsl #3 @ rewind dst 8 rows before stores
927 vst1.64 {d8}, [r0,:64], r2
928 vst1.64 {d10}, [r0,:64], r2
929 vst1.64 {d12}, [r0,:64], r2
930 vst1.64 {d14}, [r0,:64], r2
931 vst1.64 {d22}, [r0,:64], r2
932 vst1.64 {d24}, [r0,:64], r2
933 vst1.64 {d26}, [r0,:64], r2
934 vst1.64 {d28}, [r0,:64], r2
940 h264_qpel_v_lowpass put
941 h264_qpel_v_lowpass avg
@ Vertical lowpass blended with a second prediction streamed from ip
@ (the "_l2" variants): same transpose/filter/transpose scheme as above,
@ then vrhadd with the ip rows and (avg path) with dst, rewind 8 rows,
@ store. Instantiated for put and avg.
@ NOTE(review): guard directives and some blend lines are elided.
943 .macro h264_qpel_v_lowpass_l2 type
944 function \type\()_h264_qpel16_v_lowpass_l2_neon
946 bl \type\()_h264_qpel8_v_lowpass_l2_neon
947 sub r1, r1, r3, lsl #2
948 bl \type\()_h264_qpel8_v_lowpass_l2_neon
949 sub r0, r0, r3, lsl #4
950 sub ip, ip, r2, lsl #4 @ rewind second-source pointer 16 rows
953 sub r1, r1, r3, lsl #4
954 sub r1, r1, r3, lsl #2
956 bl \type\()_h264_qpel8_v_lowpass_l2_neon
957 sub r1, r1, r3, lsl #2
961 function \type\()_h264_qpel8_v_lowpass_l2_neon
962 vld1.64 {d8}, [r1], r3
963 vld1.64 {d10}, [r1], r3
964 vld1.64 {d12}, [r1], r3
965 vld1.64 {d14}, [r1], r3
966 vld1.64 {d22}, [r1], r3
967 vld1.64 {d24}, [r1], r3
968 vld1.64 {d26}, [r1], r3
969 vld1.64 {d28}, [r1], r3
970 vld1.64 {d9}, [r1], r3
971 vld1.64 {d11}, [r1], r3
972 vld1.64 {d13}, [r1], r3
973 vld1.64 {d15}, [r1], r3
976 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
977 lowpass_8 d8, d9, d10, d11, d8, d9
978 lowpass_8 d12, d13, d14, d15, d12, d13
979 lowpass_8 d22, d23, d24, d25, d22, d23
980 lowpass_8 d26, d27, d28, d29, d26, d27
981 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
983 vld1.64 {d0}, [ip], r2 @ second prediction rows
984 vld1.64 {d1}, [ip], r2
985 vld1.64 {d2}, [ip], r2
986 vld1.64 {d3}, [ip], r2
987 vld1.64 {d4}, [ip], r2
989 vld1.64 {d5}, [ip], r2
991 vld1.64 {d10}, [ip], r2
992 vrhadd.u8 q2, q2, q11
993 vld1.64 {d11}, [ip], r2
994 vrhadd.u8 q5, q5, q13
997 vld1.8 {d16}, [r0,:64], r3 @ avg path: blend with dst — verify guard
998 vrhadd.u8 d0, d0, d16
999 vld1.8 {d17}, [r0,:64], r3
1000 vrhadd.u8 d1, d1, d17
1001 vld1.8 {d16}, [r0,:64], r3
1002 vrhadd.u8 d2, d2, d16
1003 vld1.8 {d17}, [r0,:64], r3
1004 vrhadd.u8 d3, d3, d17
1005 vld1.8 {d16}, [r0,:64], r3
1006 vrhadd.u8 d4, d4, d16
1007 vld1.8 {d17}, [r0,:64], r3
1008 vrhadd.u8 d5, d5, d17
1009 vld1.8 {d16}, [r0,:64], r3
1010 vrhadd.u8 d10, d10, d16
1011 vld1.8 {d17}, [r0,:64], r3
1012 vrhadd.u8 d11, d11, d17
1013 sub r0, r0, r3, lsl #3 @ rewind dst 8 rows
1016 vst1.64 {d0}, [r0,:64], r3
1017 vst1.64 {d1}, [r0,:64], r3
1018 vst1.64 {d2}, [r0,:64], r3
1019 vst1.64 {d3}, [r0,:64], r3
1020 vst1.64 {d4}, [r0,:64], r3
1021 vst1.64 {d5}, [r0,:64], r3
1022 vst1.64 {d10}, [r0,:64], r3
1023 vst1.64 {d11}, [r0,:64], r3
1029 h264_qpel_v_lowpass_l2 put
1030 h264_qpel_v_lowpass_l2 avg
@ 2D (horizontal-then-vertical) 8x8 qpel lowpass, first stage: runs the
@ horizontal 6-tap filter unnarrowed (16-bit) into a scratch buffer at
@ r4, reloads, transposes with swap4/transpose16_4x4, applies the
@ 16-bit vertical filter (lowpass_8.16, rounding by 10), and leaves the
@ result transposed back in d8-d15 for the callers below.
@ NOTE(review): loop control, the r4/ip stride setup and endfunc are
@ elided from this extract.
1032 function put_h264_qpel8_hv_lowpass_neon_top
1035 1: vld1.64 {d0, d1}, [r1], r3
1036 vld1.64 {d16,d17}, [r1], r3
1038 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
1039 vst1.64 {d22-d25}, [r4,:128]! @ 16-bit intermediates to scratch
1042 vld1.64 {d0, d1}, [r1]
1043 lowpass_8_1 d0, d1, q12, narrow=0
1047 vld1.64 {d30,d31}, [r4,:128], ip @ reload scratch bottom-up (ip stride)
1048 vld1.64 {d20,d21}, [r4,:128], ip
1049 vld1.64 {d18,d19}, [r4,:128], ip
1050 vld1.64 {d16,d17}, [r4,:128], ip
1051 vld1.64 {d14,d15}, [r4,:128], ip
1052 vld1.64 {d12,d13}, [r4,:128], ip
1053 vld1.64 {d10,d11}, [r4,:128], ip
1054 vld1.64 {d8, d9}, [r4,:128], ip
1055 vld1.64 {d6, d7}, [r4,:128], ip
1056 vld1.64 {d4, d5}, [r4,:128], ip
1057 vld1.64 {d2, d3}, [r4,:128], ip
1058 vld1.64 {d0, d1}, [r4,:128]
1060 swap4 d1, d3, d5, d7, d8, d10, d12, d14
1061 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
1063 swap4 d17, d19, d21, d31, d24, d26, d28, d22
1064 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
1066 vst1.64 {d30,d31}, [r4,:128]! @ spill transposed upper half
1067 vst1.64 {d6, d7}, [r4,:128]!
1068 vst1.64 {d20,d21}, [r4,:128]!
1069 vst1.64 {d4, d5}, [r4,:128]!
1070 vst1.64 {d18,d19}, [r4,:128]!
1071 vst1.64 {d2, d3}, [r4,:128]!
1072 vst1.64 {d16,d17}, [r4,:128]!
1073 vst1.64 {d0, d1}, [r4,:128]
1075 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
1076 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
1077 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
1078 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
1080 vld1.64 {d16,d17}, [r4,:128], ip
1081 vld1.64 {d30,d31}, [r4,:128], ip
1082 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
1083 vld1.64 {d16,d17}, [r4,:128], ip
1084 vld1.64 {d30,d31}, [r4,:128], ip
1085 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
1086 vld1.64 {d16,d17}, [r4,:128], ip
1087 vld1.64 {d30,d31}, [r4,:128], ip
1088 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
1089 vld1.64 {d16,d17}, [r4,:128], ip
1090 vld1.64 {d30,d31}, [r4,:128]
1091 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
1093 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
@ \type (put/avg) 8x8 hv lowpass: compute into d8-d15 via the shared
@ _top helper, optionally average with dst (avg guard elided), rewind
@ dst 8 rows, store. Note the store order d12..d15 then d8..d11 matches
@ the transposed layout produced by _top. Instantiated for both types.
1098 .macro h264_qpel8_hv_lowpass type
1099 function \type\()_h264_qpel8_hv_lowpass_neon
1101 bl put_h264_qpel8_hv_lowpass_neon_top
1103 vld1.8 {d0}, [r0,:64], r2 @ avg path: read dst — verify guard
1104 vrhadd.u8 d12, d12, d0
1105 vld1.8 {d1}, [r0,:64], r2
1106 vrhadd.u8 d13, d13, d1
1107 vld1.8 {d2}, [r0,:64], r2
1108 vrhadd.u8 d14, d14, d2
1109 vld1.8 {d3}, [r0,:64], r2
1110 vrhadd.u8 d15, d15, d3
1111 vld1.8 {d4}, [r0,:64], r2
1112 vrhadd.u8 d8, d8, d4
1113 vld1.8 {d5}, [r0,:64], r2
1114 vrhadd.u8 d9, d9, d5
1115 vld1.8 {d6}, [r0,:64], r2
1116 vrhadd.u8 d10, d10, d6
1117 vld1.8 {d7}, [r0,:64], r2
1118 vrhadd.u8 d11, d11, d7
1119 sub r0, r0, r2, lsl #3 @ rewind dst 8 rows
1121 vst1.64 {d12}, [r0,:64], r2
1122 vst1.64 {d13}, [r0,:64], r2
1123 vst1.64 {d14}, [r0,:64], r2
1124 vst1.64 {d15}, [r0,:64], r2
1125 vst1.64 {d8}, [r0,:64], r2
1126 vst1.64 {d9}, [r0,:64], r2
1127 vst1.64 {d10}, [r0,:64], r2
1128 vst1.64 {d11}, [r0,:64], r2
1135 h264_qpel8_hv_lowpass put
1136 h264_qpel8_hv_lowpass avg
@ 8x8 hv lowpass blended with a second prediction buffer at r2
@ (post-incremented 128-bit loads): hv result in q4-q7 from the _top
@ helper is vrhadd'ed with the r2 rows, then (avg path) with dst,
@ rewound and stored. Instantiated for put and avg.
1138 .macro h264_qpel8_hv_lowpass_l2 type
1139 function \type\()_h264_qpel8_hv_lowpass_l2_neon
1141 bl put_h264_qpel8_hv_lowpass_neon_top
1143 vld1.64 {d0, d1}, [r2,:128]! @ second prediction (contiguous buffer)
1144 vld1.64 {d2, d3}, [r2,:128]!
1145 vrhadd.u8 q0, q0, q6
1146 vld1.64 {d4, d5}, [r2,:128]!
1147 vrhadd.u8 q1, q1, q7
1148 vld1.64 {d6, d7}, [r2,:128]!
1149 vrhadd.u8 q2, q2, q4
1150 vrhadd.u8 q3, q3, q5
1152 vld1.8 {d16}, [r0,:64], r3 @ avg path: blend with dst — verify guard
1153 vrhadd.u8 d0, d0, d16
1154 vld1.8 {d17}, [r0,:64], r3
1155 vrhadd.u8 d1, d1, d17
1156 vld1.8 {d18}, [r0,:64], r3
1157 vrhadd.u8 d2, d2, d18
1158 vld1.8 {d19}, [r0,:64], r3
1159 vrhadd.u8 d3, d3, d19
1160 vld1.8 {d20}, [r0,:64], r3
1161 vrhadd.u8 d4, d4, d20
1162 vld1.8 {d21}, [r0,:64], r3
1163 vrhadd.u8 d5, d5, d21
1164 vld1.8 {d22}, [r0,:64], r3
1165 vrhadd.u8 d6, d6, d22
1166 vld1.8 {d23}, [r0,:64], r3
1167 vrhadd.u8 d7, d7, d23
1168 sub r0, r0, r3, lsl #3 @ rewind dst 8 rows
1170 vst1.64 {d0}, [r0,:64], r3
1171 vst1.64 {d1}, [r0,:64], r3
1172 vst1.64 {d2}, [r0,:64], r3
1173 vst1.64 {d3}, [r0,:64], r3
1174 vst1.64 {d4}, [r0,:64], r3
1175 vst1.64 {d5}, [r0,:64], r3
1176 vst1.64 {d6}, [r0,:64], r3
1177 vst1.64 {d7}, [r0,:64], r3
1184 h264_qpel8_hv_lowpass_l2 put
1185 h264_qpel8_hv_lowpass_l2 avg
@ 16x16 hv lowpass built from four 8x8 calls, with src (and dst / l2)
@ pointer rewinds between quadrants; the last call is a tail `b` so the
@ callee's return exits the 16x16 function.
@ NOTE(review): dst/scratch pointer adjustments between some calls are
@ elided from this extract.
1187 .macro h264_qpel16_hv type
1188 function \type\()_h264_qpel16_hv_lowpass_neon
1190 bl \type\()_h264_qpel8_hv_lowpass_neon
1191 sub r1, r1, r3, lsl #2
1192 bl \type\()_h264_qpel8_hv_lowpass_neon
1193 sub r1, r1, r3, lsl #4
1194 sub r1, r1, r3, lsl #2
1196 sub r0, r0, r2, lsl #4
1198 bl \type\()_h264_qpel8_hv_lowpass_neon
1199 sub r1, r1, r3, lsl #2
1201 b \type\()_h264_qpel8_hv_lowpass_neon @ tail call
1204 function \type\()_h264_qpel16_hv_lowpass_l2_neon
1207 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1208 sub r1, r1, r3, lsl #2
1209 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1210 sub r1, r1, r3, lsl #4
1211 sub r1, r1, r3, lsl #2
1213 sub r0, r0, r3, lsl #4
1215 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1216 sub r1, r1, r3, lsl #2
1218 b \type\()_h264_qpel8_hv_lowpass_l2_neon @ tail call
@ Exported 8x8 qpel MC entry points for every (x,y) quarter-pel
@ position mcXY, generated for \type = put/avg. Each sets up pointers /
@ scratch (setup lines largely elided) and dispatches to the lowpass
@ workers above; diagonal positions share internal mc11/mc21/mc12
@ bodies via local-label fallthrough from mc31/mc13/mc33 etc.
@ NOTE(review): the argument-marshalling between each function header
@ and its branch is elided — the visible subset is mainly the control
@ flow and stack discipline.
1225 .macro h264_qpel8 type
1226 function ff_\type\()_h264_qpel8_mc10_neon, export=1
1231 b \type\()_h264_qpel8_h_lowpass_l2_neon
1234 function ff_\type\()_h264_qpel8_mc20_neon, export=1
1239 b \type\()_h264_qpel8_h_lowpass_neon
1242 function ff_\type\()_h264_qpel8_mc30_neon, export=1
1247 b \type\()_h264_qpel8_h_lowpass_l2_neon
1250 function ff_\type\()_h264_qpel8_mc01_neon, export=1
1253 \type\()_h264_qpel8_mc01: @ shared with mc03 below
1256 sub r1, r1, r2, lsl #1 @ back src up for the 6-tap vertical margin
1258 bl \type\()_h264_qpel8_v_lowpass_l2_neon
1263 function ff_\type\()_h264_qpel8_mc11_neon, export=1
1264 push {r0, r1, r11, lr}
1265 \type\()_h264_qpel8_mc11: @ shared with mc31/mc13/mc33
1275 bl put_h264_qpel8_h_lowpass_neon @ intermediate always uses put
1279 sub r1, r1, r2, lsl #1
1281 bl \type\()_h264_qpel8_v_lowpass_l2_neon
1287 function ff_\type\()_h264_qpel8_mc21_neon, export=1
1288 push {r0, r1, r4, r10, r11, lr}
1289 \type\()_h264_qpel8_mc21: @ shared with mc23
1293 sub sp, sp, #(8*8+16*12) @ scratch: 8x8 bytes + 16x12 shorts
1299 bl put_h264_qpel8_h_lowpass_neon
1302 sub r1, r1, r2, lsl #1
1306 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1309 pop {r4, r10, r11, pc}
1312 function ff_\type\()_h264_qpel8_mc31_neon, export=1
1314 push {r0, r1, r11, lr}
1316 b \type\()_h264_qpel8_mc11
1319 function ff_\type\()_h264_qpel8_mc02_neon, export=1
1322 sub r1, r1, r2, lsl #1
1325 bl \type\()_h264_qpel8_v_lowpass_neon
1330 function ff_\type\()_h264_qpel8_mc12_neon, export=1
1331 push {r0, r1, r4, r10, r11, lr}
1332 \type\()_h264_qpel8_mc12: @ shared with mc32
1336 sub sp, sp, #(8*8+16*12)
1337 sub r1, r1, r2, lsl #1
1342 bl put_h264_qpel8_v_lowpass_neon
1345 sub r1, r1, r3, lsl #1
1348 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1351 pop {r4, r10, r11, pc}
1354 function ff_\type\()_h264_qpel8_mc22_neon, export=1
1355 push {r4, r10, r11, lr}
1358 sub r1, r1, r2, lsl #1
1361 sub sp, sp, #(16*12) @ scratch for hv intermediates only
1364 bl \type\()_h264_qpel8_hv_lowpass_neon
1367 pop {r4, r10, r11, pc}
1370 function ff_\type\()_h264_qpel8_mc32_neon, export=1
1371 push {r0, r1, r4, r10, r11, lr}
1373 b \type\()_h264_qpel8_mc12
1376 function ff_\type\()_h264_qpel8_mc03_neon, export=1
1379 b \type\()_h264_qpel8_mc01
1382 function ff_\type\()_h264_qpel8_mc13_neon, export=1
1383 push {r0, r1, r11, lr}
1385 b \type\()_h264_qpel8_mc11
1388 function ff_\type\()_h264_qpel8_mc23_neon, export=1
1389 push {r0, r1, r4, r10, r11, lr}
1391 b \type\()_h264_qpel8_mc21
1394 function ff_\type\()_h264_qpel8_mc33_neon, export=1
1396 push {r0, r1, r11, lr}
1399 b \type\()_h264_qpel8_mc11
@ Exported 16x16 qpel MC entry points, structurally parallel to the
@ 8x8 set above but with larger scratch areas (16*16 bytes + 16*12
@ shorts) and the _packed intermediate helpers for diagonal positions.
@ NOTE(review): argument-marshalling lines between each header and its
@ branch are elided from this extract.
1406 .macro h264_qpel16 type
1407 function ff_\type\()_h264_qpel16_mc10_neon, export=1
1411 b \type\()_h264_qpel16_h_lowpass_l2_neon
1414 function ff_\type\()_h264_qpel16_mc20_neon, export=1
1418 b \type\()_h264_qpel16_h_lowpass_neon
1421 function ff_\type\()_h264_qpel16_mc30_neon, export=1
1425 b \type\()_h264_qpel16_h_lowpass_l2_neon
1428 function ff_\type\()_h264_qpel16_mc01_neon, export=1
1431 \type\()_h264_qpel16_mc01: @ shared with mc03
1434 sub r1, r1, r2, lsl #1
1436 bl \type\()_h264_qpel16_v_lowpass_l2_neon
1441 function ff_\type\()_h264_qpel16_mc11_neon, export=1
1442 push {r0, r1, r4, r11, lr}
1443 \type\()_h264_qpel16_mc11: @ shared with mc31/mc13/mc33
1452 bl put_h264_qpel16_h_lowpass_neon @ intermediate always uses put
1456 sub r1, r1, r2, lsl #1
1458 bl \type\()_h264_qpel16_v_lowpass_l2_neon
1464 function ff_\type\()_h264_qpel16_mc21_neon, export=1
1465 push {r0, r1, r4-r5, r9-r11, lr}
1466 \type\()_h264_qpel16_mc21: @ shared with mc23
1470 sub sp, sp, #(16*16+16*12) @ scratch: 16x16 bytes + 16x12 shorts
1474 bl put_h264_qpel16_h_lowpass_neon_packed
1477 sub r1, r1, r2, lsl #1
1480 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1483 pop {r4-r5, r9-r11, pc}
1486 function ff_\type\()_h264_qpel16_mc31_neon, export=1
1488 push {r0, r1, r4, r11, lr}
1490 b \type\()_h264_qpel16_mc11
1493 function ff_\type\()_h264_qpel16_mc02_neon, export=1
1496 sub r1, r1, r2, lsl #1
1499 bl \type\()_h264_qpel16_v_lowpass_neon
1504 function ff_\type\()_h264_qpel16_mc12_neon, export=1
1505 push {r0, r1, r4-r5, r9-r11, lr}
1506 \type\()_h264_qpel16_mc12: @ shared with mc32
1510 sub sp, sp, #(16*16+16*12)
1511 sub r1, r1, r2, lsl #1
1515 bl put_h264_qpel16_v_lowpass_neon_packed
1518 sub r1, r1, r3, lsl #1
1521 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1524 pop {r4-r5, r9-r11, pc}
1527 function ff_\type\()_h264_qpel16_mc22_neon, export=1
1528 push {r4, r9-r11, lr}
1532 sub r1, r1, r2, lsl #1
1535 sub sp, sp, #(16*12)
1538 bl \type\()_h264_qpel16_hv_lowpass_neon
1541 pop {r4, r9-r11, pc}
1544 function ff_\type\()_h264_qpel16_mc32_neon, export=1
1545 push {r0, r1, r4-r5, r9-r11, lr}
1547 b \type\()_h264_qpel16_mc12
1550 function ff_\type\()_h264_qpel16_mc03_neon, export=1
1553 b \type\()_h264_qpel16_mc01
1556 function ff_\type\()_h264_qpel16_mc13_neon, export=1
1557 push {r0, r1, r4, r11, lr}
1559 b \type\()_h264_qpel16_mc11
1562 function ff_\type\()_h264_qpel16_mc23_neon, export=1
1563 push {r0, r1, r4-r5, r9-r11, lr}
1565 b \type\()_h264_qpel16_mc21
1568 function ff_\type\()_h264_qpel16_mc33_neon, export=1
1570 push {r0, r1, r4, r11, lr}
1573 b \type\()_h264_qpel16_mc11
@ Biweighted prediction (H.264 explicit/implicit bi-pred weighting).
@ biweight_16/8/4: one block-width worth of (w0*src0 +/- w1*src1)
@ accumulation; \macs/\macd select vmlal/vmlsl so the same body handles
@ all four sign combinations of the two weights. The vshl.s16 by q9 is
@ presumably the (negative) log2_denom shift and vqmovun.s16 the final
@ saturate-to-u8 — confirm against elided setup.
@ biweight_func: shared driver; `eors` of the weight sign bits picks one
@ of the four specializations at labels 10/.../ (dispatch lines elided).
1580 @ Biweighted prediction
1582 .macro biweight_16 macs, macd
1588 vld1.8 {d20-d21},[r0,:128], r2
1592 vld1.8 {d22-d23},[r1,:128], r2
1597 vld1.8 {d28-d29},[r0,:128], r2
1602 vld1.8 {d30-d31},[r1,:128], r2
1610 vshl.s16 q12, q12, q9 @ >> log2_denom (negative shift) — verify
1611 vshl.s16 q13, q13, q9
1612 vqmovun.s16 d24, q12
1613 vqmovun.s16 d25, q13
1615 vst1.8 {d4- d5}, [r6,:128], r2
1617 vst1.8 {d24-d25},[r6,:128], r2
1622 .macro biweight_8 macs, macd
1628 vld1.8 {d4},[r0,:64], r2
1631 vld1.8 {d5},[r1,:64], r2
1634 vld1.8 {d6},[r0,:64], r2
1637 vld1.8 {d7},[r1,:64], r2
1642 vshl.s16 q10, q10, q9
1645 vst1.8 {d2},[r6,:64], r2
1647 vst1.8 {d4},[r6,:64], r2
1652 .macro biweight_4 macs, macd
1658 vld1.32 {d4[0]},[r0,:32], r2
1659 vld1.32 {d4[1]},[r0,:32], r2
1662 vld1.32 {d5[0]},[r1,:32], r2
1663 vld1.32 {d5[1]},[r1,:32], r2
1667 vld1.32 {d6[0]},[r0,:32], r2
1668 vld1.32 {d6[1]},[r0,:32], r2
1671 vld1.32 {d7[0]},[r1,:32], r2
1672 vld1.32 {d7[1]},[r1,:32], r2
1677 vshl.s16 q10, q10, q9
1680 vst1.32 {d2[0]},[r6,:32], r2
1681 vst1.32 {d2[1]},[r6,:32], r2
1683 vst1.32 {d4[0]},[r6,:32], r2
1684 vst1.32 {d4[1]},[r6,:32], r2
1687 2: vshl.s16 q1, q1, q9 @ odd-height tail (2 rows) — confirm
1689 vst1.32 {d2[0]},[r6,:32], r2
1690 vst1.32 {d2[1]},[r6,:32], r2
1694 .macro biweight_func w
1695 function biweight_h264_pixels_\w\()_neon
1701 eors lr, lr, r5, lsr #30 @ combine weight sign bits for dispatch
1714 10: biweight_\w vmlal.u8, vmlal.u8 @ both weights positive
1716 biweight_\w vmlal.u8, vmlsl.u8
1719 biweight_\w vmlsl.u8, vmlsl.u8
1721 biweight_\w vmlsl.u8, vmlal.u8
@ Exported per-size biweight entry points: each ff_ wrapper loads the
@ height (setup elided; b=1 presumably controls whether it falls
@ through to the next entry or branches — confirm) and jumps to the
@ shared width worker.
1725 .macro biweight_entry w, h, b=1
1726 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1729 b biweight_h264_pixels_\w\()_neon
1734 biweight_entry 16, 8
1735 biweight_entry 16, 16, b=0
1738 biweight_entry 8, 16
1740 biweight_entry 8, 8, b=0
1745 biweight_entry 4, 4, b=0
@ Weighted (uni-directional) prediction: weight_16 / weight_8 / weight_4
@ bodies multiply source pixels by the weight in d0 (vmull.u8), apply
@ the rounding denom shift (vrshl.s16 by q9 — negative shift values,
@ presumably; confirm against elided setup), saturate with vqmovun and
@ store through r4. The \add parameter / offset-add lines are elided.
@ NOTE(review): the .macro headers for weight_8 and weight_4 are elided
@ — original lines 1782+ and 1805+ below are their bodies.
1750 .macro weight_16 add
1753 vld1.8 {d20-d21},[r0,:128], r1
1754 vmull.u8 q2, d0, d20 @ pixel * weight
1756 vmull.u8 q3, d0, d21
1757 vld1.8 {d28-d29},[r0,:128], r1
1758 vmull.u8 q12, d0, d28
1760 vmull.u8 q13, d0, d29
1762 vrshl.s16 q2, q2, q9 @ rounding shift by -log2_denom — verify
1764 vrshl.s16 q3, q3, q9
1768 vrshl.s16 q12, q12, q9
1770 vrshl.s16 q13, q13, q9
1771 vqmovun.s16 d24, q12
1772 vqmovun.s16 d25, q13
1773 vst1.8 {d4- d5}, [r4,:128], r1
1774 vst1.8 {d24-d25},[r4,:128], r1
1782 vld1.8 {d4},[r0,:64], r1
1785 vld1.8 {d6},[r0,:64], r1
1786 vmull.u8 q10, d0, d6
1789 vrshl.s16 q1, q1, q9
1792 vrshl.s16 q10, q10, q9
1794 vst1.8 {d2},[r4,:64], r1
1795 vst1.8 {d4},[r4,:64], r1
1805 vld1.32 {d4[0]},[r0,:32], r1
1806 vld1.32 {d4[1]},[r0,:32], r1
1810 vld1.32 {d6[0]},[r0,:32], r1
1811 vld1.32 {d6[1]},[r0,:32], r1
1812 vmull.u8 q10, d0, d6
1815 vrshl.s16 q1, q1, q9
1818 vrshl.s16 q10, q10, q9
1821 vst1.32 {d2[0]},[r4,:32], r1
1822 vst1.32 {d2[1]},[r4,:32], r1
1824 vst1.32 {d4[0]},[r4,:32], r1
1825 vst1.32 {d4[1]},[r4,:32], r1
1829 vrshl.s16 q1, q1, q9 @ odd-height tail (2 rows) — confirm
1831 vst1.32 {d2[0]},[r4,:32], r1
1832 vst1.32 {d2[1]},[r4,:32], r1
@ weight_func: shared per-width driver (body elided in this extract).
@ weight_entry: exported per-size wrappers jumping to the width worker,
@ mirroring biweight_entry above.
1836 .macro weight_func w
1837 function weight_h264_pixels_\w\()_neon
1862 .macro weight_entry w, h, b=1
1863 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1866 b weight_h264_pixels_\w\()_neon
1872 weight_entry 16, 16, b=0
1877 weight_entry 8, 8, b=0
1882 weight_entry 4, 4, b=0