2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ NEON register-transpose helper macros. Only the macro headers survive in
@ this extract; the vtrn/vswp/vzip bodies and .endm lines appear to be
@ missing -- NOTE(review): confirm against the complete file before editing.
23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
38 .macro transpose_4x4 r0 r1 r2 r3
45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
64 .macro h264_chroma_mc8 type
65 function ff_\type\()_h264_chroma_mc8_neon, export=1
@ Derive bilinear interpolation weights from x and y; presumably the
@ standard A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy decomposition -- several
@ setup lines are missing from this extract, so confirm against full file.
77 rsb r6, r7, r5, lsl #3
78 rsb ip, r7, r4, lsl #3
79 sub r4, r7, r4, lsl #3
80 sub r4, r4, r5, lsl #3
@ 2-D case (x != 0 and y != 0): two source rows in flight; vext.8 #1
@ builds the one-pixel-shifted copies used for the horizontal taps.
90 vld1.64 {d4, d5}, [r1], r4
92 vld1.64 {d6, d7}, [r5], r4
101 vld1.64 {d4, d5}, [r1], r4
103 vext.8 d5, d4, d5, #1
110 vrshrn.u16 d16, q8, #6 @ round+narrow: (acc + 32) >> 6
111 vld1.64 {d6, d7}, [r5], r4
113 vrshrn.u16 d17, q9, #6
@ Destination averaging (presumably guarded by \type == avg in the full
@ file): vrhadd folds filtered rows into existing pixels read via lr.
115 vld1.64 {d20}, [lr,:64], r2
116 vld1.64 {d21}, [lr,:64], r2
117 vrhadd.u8 q8, q8, q10
119 vext.8 d7, d6, d7, #1
120 vst1.64 {d16}, [r0,:64], r2
121 vst1.64 {d17}, [r0,:64], r2
@ 1-D vertical case (x == 0): single 8-byte load per row, no vext shift.
135 vld1.64 {d4}, [r1], r4
136 vld1.64 {d6}, [r5], r4
141 vld1.64 {d4}, [r1], r4
144 vld1.64 {d6}, [r5], r4
145 vrshrn.u16 d16, q8, #6
146 vrshrn.u16 d17, q9, #6
148 vld1.64 {d20}, [lr,:64], r2
149 vld1.64 {d21}, [lr,:64], r2
150 vrhadd.u8 q8, q8, q10
154 vst1.64 {d16}, [r0,:64], r2
155 vst1.64 {d17}, [r0,:64], r2
@ 1-D horizontal case (y == 0): source advances by the caller stride r2.
160 4: vld1.64 {d4, d5}, [r1], r2
161 vld1.64 {d6, d7}, [r1], r2
162 vext.8 d5, d4, d5, #1
163 vext.8 d7, d6, d7, #1
169 vld1.64 {d4, d5}, [r1], r2
173 vext.8 d5, d4, d5, #1
174 vrshrn.u16 d16, q8, #6
175 vrshrn.u16 d17, q9, #6
177 vld1.64 {d20}, [lr,:64], r2
178 vld1.64 {d21}, [lr,:64], r2
179 vrhadd.u8 q8, q8, q10
181 vld1.64 {d6, d7}, [r1], r2
182 vext.8 d7, d6, d7, #1
183 vst1.64 {d16}, [r0,:64], r2
184 vst1.64 {d17}, [r0,:64], r2
191 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
192 .macro h264_chroma_mc4 type
@ 4-pixel-wide variant of chroma_mc8: same weight setup, but results for
@ two rows are packed into one d register and stored as 32-bit lanes.
@ NOTE(review): many interior lines are missing from this extract.
193 function ff_\type\()_h264_chroma_mc4_neon, export=1
205 rsb r6, r7, r5, lsl #3
206 rsb ip, r7, r4, lsl #3
207 sub r4, r7, r4, lsl #3
208 sub r4, r4, r5, lsl #3
@ 2-D case: vext.8 #1 produces the shifted neighbours for the x taps.
218 vld1.64 {d4}, [r1], r4
220 vld1.64 {d6}, [r5], r4
223 vext.8 d5, d4, d5, #1
224 vext.8 d7, d6, d7, #1
234 vld1.64 {d4}, [r1], r4
235 vext.8 d5, d4, d5, #1
239 vld1.64 {d6}, [r5], r4
240 vadd.i16 d16, d16, d17 @ pairwise combine the two 4-wide partial sums
241 vadd.i16 d17, d18, d19
242 vrshrn.u16 d16, q8, #6 @ (acc + 32) >> 6
@ avg path (presumably \type-guarded in the full file): 32-bit lane
@ loads/stores move one 4-pixel row each.
246 vld1.32 {d20[0]}, [lr,:32], r2
247 vld1.32 {d20[1]}, [lr,:32], r2
248 vrhadd.u8 d16, d16, d20
250 vext.8 d7, d6, d7, #1
252 vst1.32 {d16[0]}, [r0,:32], r2
253 vst1.32 {d16[1]}, [r0,:32], r2
@ 1-D vertical case (x == 0): duplicate weights across both row halves.
266 vext.32 d1, d0, d1, #1
269 vld1.32 {d4[0]}, [r1], r4
270 vld1.32 {d4[1]}, [r5], r4
274 vld1.32 {d4[0]}, [r1], r4
276 vld1.32 {d4[1]}, [r5], r4
277 vadd.i16 d16, d16, d17
278 vadd.i16 d17, d18, d19
279 vrshrn.u16 d16, q8, #6
281 vld1.32 {d20[0]}, [lr,:32], r2
282 vld1.32 {d20[1]}, [lr,:32], r2
283 vrhadd.u8 d16, d16, d20
287 vst1.32 {d16[0]}, [r0,:32], r2
288 vst1.32 {d16[1]}, [r0,:32], r2
@ 1-D horizontal case (y == 0).
293 4: vld1.64 {d4}, [r1], r2
294 vld1.64 {d6}, [r1], r2
295 vext.8 d5, d4, d5, #1
296 vext.8 d7, d6, d7, #1
300 5: vmull.u8 q8, d4, d0
303 vld1.64 {d4}, [r1], r2
304 vext.8 d5, d4, d5, #1
306 vadd.i16 d16, d16, d17
307 vadd.i16 d17, d18, d19
309 vrshrn.u16 d16, q8, #6
311 vld1.32 {d20[0]}, [lr,:32], r2
312 vld1.32 {d20[1]}, [lr,:32], r2
313 vrhadd.u8 d16, d16, d20
315 vld1.64 {d6}, [r1], r2
316 vext.8 d7, d6, d7, #1
319 vst1.32 {d16[0]}, [r0,:32], r2
320 vst1.32 {d16[1]}, [r0,:32], r2
327 .macro h264_chroma_mc2 type
@ 2-pixel-wide chroma MC: two rows are processed together in q2/q3 and
@ stored as 16-bit lanes. NOTE(review): weight setup here uses lr/r12
@ instead of r7/ip as in mc8/mc4 -- interior lines are missing, confirm.
328 function ff_\type\()_h264_chroma_mc2_neon, export=1
338 rsb r6, r5, lr, lsl #3
339 rsb r12, r5, r4, lsl #3
340 sub r4, r5, r4, lsl #3
341 sub r4, r4, lr, lsl #3
342 vld1.32 {d4[0]}, [r1], r2
@ Gather three rows, then vext.8 q3 = q2 shifted by one byte for x taps.
349 vld1.32 {d4[0]}, [r1], r2
350 vld1.32 {d4[1]}, [r1], r2
352 vld1.32 {d5[1]}, [r1]
353 vext.8 q3, q2, q2, #1
@ avg path: read two 16-bit destination pairs before averaging.
358 vld1.16 {d18[0]}, [r0,:16], r2
359 vld1.16 {d18[1]}, [r0,:16]
363 vadd.i16 d16, d16, d17
364 vrshrn.u16 d16, q8, #6
366 vrhadd.u8 d16, d16, d18
368 vst1.16 {d16[0]}, [r0,:16], r2
369 vst1.16 {d16[1]}, [r0,:16], r2
@ Copy/average-only path (x == 0 and y == 0): straight 16-bit moves.
380 vld1.16 {d16[0]}, [r1], r2
381 vld1.16 {d16[1]}, [r1], r2
382 vld1.16 {d18[0]}, [r0,:16], r2
383 vld1.16 {d18[1]}, [r0,:16]
385 vrhadd.u8 d16, d16, d18
386 vst1.16 {d16[0]}, [r0,:16], r2
387 vst1.16 {d16[1]}, [r0,:16], r2
402 /* H.264 loop filter */
404 .macro h264_loop_filter_start
@ Entry guard shared by all loop-filter functions: folds the packed
@ per-edge flags in ip down (halfword then byte) so "ands" sets Z when no
@ edge needs filtering -- presumably followed by an early return in lines
@ not visible in this extract.
411 and ip, ip, ip, lsl #16
414 ands ip, ip, ip, lsl #8
419 .macro h264_loop_filter_luma
@ Luma deblocking for one 16-pixel edge. Per the existing annotations:
@ q8=p0, q9=p1, q10=p2, q0=q0, q1=q1, q2=q2; r2=alpha, r3=beta, q12=tc0.
@ NOTE(review): the conditional-select and final-clamp lines are missing
@ from this extract.
420 vdup.8 q11, r2 @ alpha
422 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
424 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
426 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
427 vsli.32 q12, q12, #16 @ replicate tc0 across lanes
428 vclt.u8 q6, q6, q11 @ < alpha
429 vdup.8 q11, r3 @ beta
431 vclt.u8 q14, q14, q11 @ < beta
432 vclt.u8 q15, q15, q11 @ < beta
434 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
436 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
437 vclt.u8 q4, q4, q11 @ < beta
439 vclt.u8 q5, q5, q11 @ < beta
@ Strong-filter intermediates: (p0+q0+1)>>1 then halving adds toward
@ p2/q2, with tc-based clamping via saturating subtract and max.
443 vrhadd.u8 q14, q8, q0
446 vhadd.u8 q10, q10, q14
448 vhadd.u8 q14, q2, q14
450 vqsub.u8 q11, q9, q12
453 vqsub.u8 q11, q1, q12
456 vmax.u8 q14, q14, q11
@ Normal-filter delta: ((q0-p0)<<2 + (p1-q1) + 4) >> 3 built in q10/q2.
459 vsubw.u8 q10, q10, d17
461 vshl.i16 q10, q10, #2
463 vaddw.u8 q10, q10, d19
465 vsubw.u8 q10, q10, d3
466 vrshrn.i16 d4, q2, #3
467 vrshrn.i16 d5, q10, #3
@ Apply signed delta to widened p0/q0.
477 vaddw.s8 q14, q14, d4
479 vsubw.s8 q11, q11, d4
480 vsubw.s8 q12, q12, d5
487 function ff_h264_v_loop_filter_luma_neon, export=1
@ Vertical (horizontal-edge) luma deblock: rows are contiguous in memory,
@ so whole 128-bit rows load directly -- no transpose needed.
488 h264_loop_filter_start
@ Load q0..q2 below the edge, rewind, then p2..p0 above it.
490 vld1.64 {d0, d1}, [r0,:128], r1
491 vld1.64 {d2, d3}, [r0,:128], r1
492 vld1.64 {d4, d5}, [r0,:128], r1
493 sub r0, r0, r1, lsl #2
494 sub r0, r0, r1, lsl #1
495 vld1.64 {d20,d21}, [r0,:128], r1
496 vld1.64 {d18,d19}, [r0,:128], r1
497 vld1.64 {d16,d17}, [r0,:128], r1
501 h264_loop_filter_luma
@ Write back filtered p1, p0, q0, q1.
503 sub r0, r0, r1, lsl #1
504 vst1.64 {d8, d9}, [r0,:128], r1
505 vst1.64 {d16,d17}, [r0,:128], r1
506 vst1.64 {d0, d1}, [r0,:128], r1
507 vst1.64 {d10,d11}, [r0,:128]
513 function ff_h264_h_loop_filter_luma_neon, export=1
@ Horizontal (vertical-edge) luma deblock: pixels sit across rows, so
@ 16 rows of 8 bytes are gathered, transposed into the same register
@ layout the filter macro expects, filtered, transposed back, and the
@ four changed columns are scattered out as 32-bit lane stores.
514 h264_loop_filter_start
517 vld1.64 {d6}, [r0], r1
518 vld1.64 {d20}, [r0], r1
519 vld1.64 {d18}, [r0], r1
520 vld1.64 {d16}, [r0], r1
521 vld1.64 {d0}, [r0], r1
522 vld1.64 {d2}, [r0], r1
523 vld1.64 {d4}, [r0], r1
524 vld1.64 {d26}, [r0], r1
525 vld1.64 {d7}, [r0], r1
526 vld1.64 {d21}, [r0], r1
527 vld1.64 {d19}, [r0], r1
528 vld1.64 {d17}, [r0], r1
529 vld1.64 {d1}, [r0], r1
530 vld1.64 {d3}, [r0], r1
531 vld1.64 {d5}, [r0], r1
532 vld1.64 {d27}, [r0], r1
534 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
538 h264_loop_filter_luma
@ Only p1,p0,q0,q1 changed: transpose that 4x16 strip back to columns.
540 transpose_4x4 q4, q8, q0, q5
@ Rewind the full 16 rows, then store one 4-byte column slice per row.
542 sub r0, r0, r1, lsl #4
544 vst1.32 {d8[0]}, [r0], r1
545 vst1.32 {d16[0]}, [r0], r1
546 vst1.32 {d0[0]}, [r0], r1
547 vst1.32 {d10[0]}, [r0], r1
548 vst1.32 {d8[1]}, [r0], r1
549 vst1.32 {d16[1]}, [r0], r1
550 vst1.32 {d0[1]}, [r0], r1
551 vst1.32 {d10[1]}, [r0], r1
552 vst1.32 {d9[0]}, [r0], r1
553 vst1.32 {d17[0]}, [r0], r1
554 vst1.32 {d1[0]}, [r0], r1
555 vst1.32 {d11[0]}, [r0], r1
556 vst1.32 {d9[1]}, [r0], r1
557 vst1.32 {d17[1]}, [r0], r1
558 vst1.32 {d1[1]}, [r0], r1
559 vst1.32 {d11[1]}, [r0], r1
565 .macro h264_loop_filter_chroma
@ Chroma deblock for one 8-pixel edge; d registers (half-width of the
@ luma path). Per the existing annotations: d16=p0, d18=p1, d0=q0, d2=q1.
@ NOTE(review): delta computation and mask-apply lines are missing here.
566 vdup.8 d22, r2 @ alpha
568 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
570 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
574 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
576 vclt.u8 d26, d26, d22 @ < alpha
578 vdup.8 d22, r3 @ beta
579 vrshrn.i16 d4, q2, #3 @ delta = (acc + 4) >> 3
580 vclt.u8 d28, d28, d22 @ < beta
581 vclt.u8 d30, d30, d22 @ < beta
@ p0 += delta, q0 -= delta (signed widening add/sub).
590 vaddw.s8 q14, q14, d4
591 vsubw.s8 q11, q11, d4
596 function ff_h264_v_loop_filter_chroma_neon, export=1
@ Vertical (horizontal-edge) chroma deblock: p1,p0,q0,q1 are four
@ consecutive 8-byte rows straddling the edge.
597 h264_loop_filter_start
599 sub r0, r0, r1, lsl #1
600 vld1.64 {d18}, [r0,:64], r1
601 vld1.64 {d16}, [r0,:64], r1
602 vld1.64 {d0}, [r0,:64], r1
603 vld1.64 {d2}, [r0,:64]
605 h264_loop_filter_chroma
@ Only p0 and q0 are modified; store them back in place.
607 sub r0, r0, r1, lsl #1
608 vst1.64 {d16}, [r0,:64], r1
609 vst1.64 {d0}, [r0,:64], r1
614 function ff_h264_h_loop_filter_chroma_neon, export=1
@ Horizontal (vertical-edge) chroma deblock: gather 8 rows of 4 bytes
@ into 32-bit lanes (an implicit transpose via lane loads), filter, and
@ scatter back. NOTE(review): the explicit transpose step between loads
@ and filter is not visible in this extract.
615 h264_loop_filter_start
618 vld1.32 {d18[0]}, [r0], r1
619 vld1.32 {d16[0]}, [r0], r1
620 vld1.32 {d0[0]}, [r0], r1
621 vld1.32 {d2[0]}, [r0], r1
622 vld1.32 {d18[1]}, [r0], r1
623 vld1.32 {d16[1]}, [r0], r1
624 vld1.32 {d0[1]}, [r0], r1
625 vld1.32 {d2[1]}, [r0], r1
632 h264_loop_filter_chroma
@ Rewind all 8 rows and store the filtered 4-byte slices.
639 sub r0, r0, r1, lsl #3
640 vst1.32 {d18[0]}, [r0], r1
641 vst1.32 {d16[0]}, [r0], r1
642 vst1.32 {d0[0]}, [r0], r1
643 vst1.32 {d2[0]}, [r0], r1
644 vst1.32 {d18[1]}, [r0], r1
645 vst1.32 {d16[1]}, [r0], r1
646 vst1.32 {d0[1]}, [r0], r1
647 vst1.32 {d2[1]}, [r0], r1
@ Loads the 6-tap qpel filter constants into \r (presumably the 20/5
@ coefficients referenced as d6[1]/d6[0] by lowpass_8 below); macro body
@ not visible in this extract -- confirm against the full file.
654 .macro lowpass_const r
660 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
@ Horizontal 6-tap lowpass over two 8-pixel rows (\r0:\r1 and \r2:\r3).
@ vext offsets 1..5 build the shifted neighbour taps; vmla/vmls apply the
@ centre/outer coefficients held in d6 (set up by lowpass_const).
@ t0/t1 are macro-internal accumulator names (definitions not visible in
@ this extract). With narrow=1 the result is rounded down by
@ vqrshrun #5 ((acc+16)>>5, saturated to u8) into \d0/\d1.
668 vext.8 d2, \r0, \r1, #2
669 vext.8 d3, \r0, \r1, #3
671 vext.8 d4, \r0, \r1, #1
672 vext.8 d5, \r0, \r1, #4
674 vext.8 d30, \r0, \r1, #5
675 vaddl.u8 t0, \r0, d30 @ outermost tap pair, widened to 16 bit
676 vext.8 d18, \r2, \r3, #2
677 vmla.i16 t0, q1, d6[1] @ + centre pair * coeff
678 vext.8 d19, \r2, \r3, #3
679 vaddl.u8 q9, d18, d19
680 vext.8 d20, \r2, \r3, #1
681 vmls.i16 t0, q2, d6[0] @ - inner pair * coeff
682 vext.8 d21, \r2, \r3, #4
683 vaddl.u8 q10, d20, d21
684 vext.8 d31, \r2, \r3, #5
685 vaddl.u8 t1, \r2, d31
686 vmla.i16 t1, q9, d6[1]
687 vmls.i16 t1, q10, d6[0]
689 vqrshrun.s16 \d0, t0, #5
690 vqrshrun.s16 \d1, t1, #5
696 .macro lowpass_8_1 r0, r1, d0, narrow=1
@ Single-row variant of lowpass_8: one 6-tap horizontal pass over
@ \r0:\r1, result in \d0 (narrowed with rounding when narrow=1).
702 vext.8 d2, \r0, \r1, #2
703 vext.8 d3, \r0, \r1, #3
705 vext.8 d4, \r0, \r1, #1
706 vext.8 d5, \r0, \r1, #4
708 vext.8 d30, \r0, \r1, #5
709 vaddl.u8 t0, \r0, d30
710 vmla.i16 t0, q1, d6[1]
711 vmls.i16 t0, q2, d6[0]
713 vqrshrun.s16 \d0, t0, #5
718 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
@ 16-bit-input variant used by the hv (2-D) path: the vertical pass runs
@ over already-filtered 16-bit intermediates, accumulates in 32 bit, and
@ narrows with vrshrn #10 (the combined 5+5 bit shift of both passes).
@ NOTE(review): several accumulate lines are missing from this extract.
719 vext.16 q1, \r0, \r1, #2
720 vext.16 q0, \r0, \r1, #3
722 vext.16 q2, \r0, \r1, #1
724 vext.16 q3, \r0, \r1, #4
725 vaddl.s16 q10, d4, d6
726 vext.16 \r1, \r0, \r1, #5
728 vaddl.s16 q0, \h0, \h1
729 vaddl.s16 q8, \l0, \l1
@ *20 as (x + x*4)*4: shift-by-2 then add, avoiding a multiply.
733 vshl.i32 q15, q10, #2
735 vadd.i32 q10, q10, q15
749 vrshrn.s32 d18, q9, #10
750 vrshrn.s32 d19, q1, #10
755 function put_h264_qpel16_h_lowpass_neon_packed
@ 16-wide horizontal lowpass writing to a packed scratch buffer: runs the
@ 8-wide helper over the halves, rewinding the source between passes.
@ The final pass is a tail call (b, not bl) so it returns to our caller.
759 bl put_h264_qpel8_h_lowpass_neon
760 sub r1, r1, r2, lsl #4 @ rewind src by 16 rows
764 b put_h264_qpel8_h_lowpass_neon
767 .macro h264_qpel_h_lowpass type
768 function \type\()_h264_qpel16_h_lowpass_neon
@ 16x16 horizontal lowpass = four 8x8 passes; dst/src are rewound by 16
@ rows between the column passes. Tail lines are missing in this extract.
771 bl \type\()_h264_qpel8_h_lowpass_neon
772 sub r0, r0, r3, lsl #4
773 sub r1, r1, r2, lsl #4
780 function \type\()_h264_qpel8_h_lowpass_neon
@ Core 8-wide loop: two rows per iteration through lowpass_8; the
@ vld1/vrhadd pair is the avg-variant destination averaging (presumably
@ \type-guarded in the full file).
781 1: vld1.64 {d0, d1}, [r1], r2
782 vld1.64 {d16,d17}, [r1], r2
784 lowpass_8 d0, d1, d16, d17, d0, d16
786 vld1.8 {d2}, [r0,:64], r3
788 vld1.8 {d3}, [r0,:64]
789 vrhadd.u8 d16, d16, d3
792 vst1.64 {d0}, [r0,:64], r3
793 vst1.64 {d16}, [r0,:64], r3
@ Instantiate both copy (put) and averaging (avg) variants.
799 h264_qpel_h_lowpass put
800 h264_qpel_h_lowpass avg
802 .macro h264_qpel_h_lowpass_l2 type
803 function \type\()_h264_qpel16_h_lowpass_l2_neon
@ "l2" variants average the lowpass result with a second source (r3)
@ before storing -- used for the quarter-pel positions between full and
@ half sample. 16-wide version drives the 8-wide helper.
806 bl \type\()_h264_qpel8_h_lowpass_l2_neon
807 sub r0, r0, r2, lsl #4
808 sub r1, r1, r2, lsl #4
809 sub r3, r3, r2, lsl #4
817 function \type\()_h264_qpel8_h_lowpass_l2_neon
817 @ (8-wide core: lowpass two rows, then vrhadd with the d28/d29 rows
@ loaded from the second source.)
818 1: vld1.64 {d0, d1}, [r1], r2
819 vld1.64 {d16,d17}, [r1], r2
820 vld1.64 {d28}, [r3], r2
821 vld1.64 {d29}, [r3], r2
823 lowpass_8 d0, d1, d16, d17, d0, d1
824 vrhadd.u8 q0, q0, q14
826 vld1.8 {d2}, [r0,:64], r2
828 vld1.8 {d3}, [r0,:64]
832 vst1.64 {d0}, [r0,:64], r2
833 vst1.64 {d1}, [r0,:64], r2
839 h264_qpel_h_lowpass_l2 put
840 h264_qpel_h_lowpass_l2 avg
842 function put_h264_qpel16_v_lowpass_neon_packed
@ 16x16 vertical lowpass into a packed scratch buffer: four 8x8 passes
@ with source rewinds between them; final pass tail-calls (b) so its
@ return goes straight back to our caller.
845 bl put_h264_qpel8_v_lowpass_neon
846 sub r1, r1, r3, lsl #2
847 bl put_h264_qpel8_v_lowpass_neon
848 sub r1, r1, r3, lsl #4
849 sub r1, r1, r3, lsl #2
851 bl put_h264_qpel8_v_lowpass_neon
852 sub r1, r1, r3, lsl #2
854 b put_h264_qpel8_v_lowpass_neon
857 .macro h264_qpel_v_lowpass type
858 function \type\()_h264_qpel16_v_lowpass_neon
@ 16x16 vertical lowpass = four 8x8 passes with dst/src rewinds.
860 bl \type\()_h264_qpel8_v_lowpass_neon
861 sub r1, r1, r3, lsl #2
862 bl \type\()_h264_qpel8_v_lowpass_neon
863 sub r0, r0, r2, lsl #4
865 sub r1, r1, r3, lsl #4
866 sub r1, r1, r3, lsl #2
868 bl \type\()_h264_qpel8_v_lowpass_neon
869 sub r1, r1, r3, lsl #2
873 function \type\()_h264_qpel8_v_lowpass_neon
@ Vertical filtering by transposition: load 8+5 rows, transpose so the
@ columns become rows, run the horizontal lowpass_8 over them, then
@ transpose back before storing.
874 vld1.64 {d8}, [r1], r3
875 vld1.64 {d10}, [r1], r3
876 vld1.64 {d12}, [r1], r3
877 vld1.64 {d14}, [r1], r3
878 vld1.64 {d22}, [r1], r3
879 vld1.64 {d24}, [r1], r3
880 vld1.64 {d26}, [r1], r3
881 vld1.64 {d28}, [r1], r3
882 vld1.64 {d9}, [r1], r3
883 vld1.64 {d11}, [r1], r3
884 vld1.64 {d13}, [r1], r3
885 vld1.64 {d15}, [r1], r3
888 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
889 lowpass_8 d8, d9, d10, d11, d8, d10
890 lowpass_8 d12, d13, d14, d15, d12, d14
891 lowpass_8 d22, d23, d24, d25, d22, d24
892 lowpass_8 d26, d27, d28, d29, d26, d28
893 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
@ avg-variant section (presumably \type-guarded): average each output
@ row with the existing destination, then rewind for the stores.
896 vld1.8 {d9}, [r0,:64], r2
898 vld1.8 {d11}, [r0,:64], r2
899 vrhadd.u8 d10, d10, d11
900 vld1.8 {d13}, [r0,:64], r2
901 vrhadd.u8 d12, d12, d13
902 vld1.8 {d15}, [r0,:64], r2
903 vrhadd.u8 d14, d14, d15
904 vld1.8 {d23}, [r0,:64], r2
905 vrhadd.u8 d22, d22, d23
906 vld1.8 {d25}, [r0,:64], r2
907 vrhadd.u8 d24, d24, d25
908 vld1.8 {d27}, [r0,:64], r2
909 vrhadd.u8 d26, d26, d27
910 vld1.8 {d29}, [r0,:64], r2
911 vrhadd.u8 d28, d28, d29
912 sub r0, r0, r2, lsl #3
915 vst1.64 {d8}, [r0,:64], r2
916 vst1.64 {d10}, [r0,:64], r2
917 vst1.64 {d12}, [r0,:64], r2
918 vst1.64 {d14}, [r0,:64], r2
919 vst1.64 {d22}, [r0,:64], r2
920 vst1.64 {d24}, [r0,:64], r2
921 vst1.64 {d26}, [r0,:64], r2
922 vst1.64 {d28}, [r0,:64], r2
928 h264_qpel_v_lowpass put
929 h264_qpel_v_lowpass avg
931 .macro h264_qpel_v_lowpass_l2 type
932 function \type\()_h264_qpel16_v_lowpass_l2_neon
@ Vertical lowpass averaged with a second source (read via ip); 16-wide
@ driver over the 8-wide helper, rewinding pointers between passes.
934 bl \type\()_h264_qpel8_v_lowpass_l2_neon
935 sub r1, r1, r3, lsl #2
936 bl \type\()_h264_qpel8_v_lowpass_l2_neon
937 sub r0, r0, r3, lsl #4
938 sub ip, ip, r2, lsl #4
941 sub r1, r1, r3, lsl #4
942 sub r1, r1, r3, lsl #2
944 bl \type\()_h264_qpel8_v_lowpass_l2_neon
945 sub r1, r1, r3, lsl #2
949 function \type\()_h264_qpel8_v_lowpass_l2_neon
@ Same transpose-filter-transpose scheme as the plain v_lowpass...
950 vld1.64 {d8}, [r1], r3
951 vld1.64 {d10}, [r1], r3
952 vld1.64 {d12}, [r1], r3
953 vld1.64 {d14}, [r1], r3
954 vld1.64 {d22}, [r1], r3
955 vld1.64 {d24}, [r1], r3
956 vld1.64 {d26}, [r1], r3
957 vld1.64 {d28}, [r1], r3
958 vld1.64 {d9}, [r1], r3
959 vld1.64 {d11}, [r1], r3
960 vld1.64 {d13}, [r1], r3
961 vld1.64 {d15}, [r1], r3
964 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
965 lowpass_8 d8, d9, d10, d11, d8, d9
966 lowpass_8 d12, d13, d14, d15, d12, d13
967 lowpass_8 d22, d23, d24, d25, d22, d23
968 lowpass_8 d26, d27, d28, d29, d26, d27
969 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
@ ...then average with 8 rows of the second source streamed from ip.
971 vld1.64 {d0}, [ip], r2
972 vld1.64 {d1}, [ip], r2
973 vld1.64 {d2}, [ip], r2
974 vld1.64 {d3}, [ip], r2
975 vld1.64 {d4}, [ip], r2
977 vld1.64 {d5}, [ip], r2
979 vld1.64 {d10}, [ip], r2
980 vrhadd.u8 q2, q2, q11
981 vld1.64 {d11}, [ip], r2
982 vrhadd.u8 q5, q5, q13
@ avg-variant section (presumably \type-guarded): additionally fold in
@ the destination rows before the final stores.
985 vld1.8 {d16}, [r0,:64], r3
986 vrhadd.u8 d0, d0, d16
987 vld1.8 {d17}, [r0,:64], r3
988 vrhadd.u8 d1, d1, d17
989 vld1.8 {d16}, [r0,:64], r3
990 vrhadd.u8 d2, d2, d16
991 vld1.8 {d17}, [r0,:64], r3
992 vrhadd.u8 d3, d3, d17
993 vld1.8 {d16}, [r0,:64], r3
994 vrhadd.u8 d4, d4, d16
995 vld1.8 {d17}, [r0,:64], r3
996 vrhadd.u8 d5, d5, d17
997 vld1.8 {d16}, [r0,:64], r3
998 vrhadd.u8 d10, d10, d16
999 vld1.8 {d17}, [r0,:64], r3
1000 vrhadd.u8 d11, d11, d17
1001 sub r0, r0, r3, lsl #3
1004 vst1.64 {d0}, [r0,:64], r3
1005 vst1.64 {d1}, [r0,:64], r3
1006 vst1.64 {d2}, [r0,:64], r3
1007 vst1.64 {d3}, [r0,:64], r3
1008 vst1.64 {d4}, [r0,:64], r3
1009 vst1.64 {d5}, [r0,:64], r3
1010 vst1.64 {d10}, [r0,:64], r3
1011 vst1.64 {d11}, [r0,:64], r3
1017 h264_qpel_v_lowpass_l2 put
1018 h264_qpel_v_lowpass_l2 avg
1020 function put_h264_qpel8_hv_lowpass_neon_top
@ 2-D (horizontal+vertical) lowpass core: first a horizontal pass over
@ 8+5 rows kept at 16-bit precision in a scratch buffer at r4, then a
@ transpose and a vertical lowpass_8.16 pass (narrowing by #10).
@ NOTE(review): loop control and buffer-size setup lines are missing
@ from this extract; register roles below are inferred -- confirm.
1023 1: vld1.64 {d0, d1}, [r1], r3
1024 vld1.64 {d16,d17}, [r1], r3
1026 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
1027 vst1.64 {d22-d25}, [r4,:128]!
@ Final (13th) row handled outside the loop with the single-row macro.
1030 vld1.64 {d0, d1}, [r1]
1031 lowpass_8_1 d0, d1, q12, narrow=0
@ Reload the 16-bit intermediates (ip = negative stride walks backward).
1035 vld1.64 {d30,d31}, [r4,:128], ip
1036 vld1.64 {d20,d21}, [r4,:128], ip
1037 vld1.64 {d18,d19}, [r4,:128], ip
1038 vld1.64 {d16,d17}, [r4,:128], ip
1039 vld1.64 {d14,d15}, [r4,:128], ip
1040 vld1.64 {d12,d13}, [r4,:128], ip
1041 vld1.64 {d10,d11}, [r4,:128], ip
1042 vld1.64 {d8, d9}, [r4,:128], ip
1043 vld1.64 {d6, d7}, [r4,:128], ip
1044 vld1.64 {d4, d5}, [r4,:128], ip
1045 vld1.64 {d2, d3}, [r4,:128], ip
1046 vld1.64 {d0, d1}, [r4,:128]
@ 16-bit transpose in two halves (swap4 + transpose16_4x4).
1048 swap4 d1, d3, d5, d7, d8, d10, d12, d14
1049 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
1051 swap4 d17, d19, d21, d31, d24, d26, d28, d22
1052 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
@ Spill the transposed upper half back to scratch for the second pass.
1054 vst1.64 {d30,d31}, [r4,:128]!
1055 vst1.64 {d6, d7}, [r4,:128]!
1056 vst1.64 {d20,d21}, [r4,:128]!
1057 vst1.64 {d4, d5}, [r4,:128]!
1058 vst1.64 {d18,d19}, [r4,:128]!
1059 vst1.64 {d2, d3}, [r4,:128]!
1060 vst1.64 {d16,d17}, [r4,:128]!
1061 vst1.64 {d0, d1}, [r4,:128]
@ Vertical pass: results accumulate into d8-d15.
1063 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
1064 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
1065 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
1066 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
1068 vld1.64 {d16,d17}, [r4,:128], ip
1069 vld1.64 {d30,d31}, [r4,:128], ip
1070 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
1071 vld1.64 {d16,d17}, [r4,:128], ip
1072 vld1.64 {d30,d31}, [r4,:128], ip
1073 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
1074 vld1.64 {d16,d17}, [r4,:128], ip
1075 vld1.64 {d30,d31}, [r4,:128], ip
1076 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
1077 vld1.64 {d16,d17}, [r4,:128], ip
1078 vld1.64 {d30,d31}, [r4,:128]
1079 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
@ Final transpose returns the block to row order for the caller.
1081 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
1086 .macro h264_qpel8_hv_lowpass type
1087 function \type\()_h264_qpel8_hv_lowpass_neon
@ 8x8 2-D lowpass: the _top helper leaves the filtered block in d8-d15
@ (row order d12..d15,d8..d11 per its final transpose); the avg variant
@ then folds in destination rows before storing -- the vld1/vrhadd runs
@ below are presumably \type-guarded in the full file.
1089 bl put_h264_qpel8_hv_lowpass_neon_top
1091 vld1.8 {d0}, [r0,:64], r2
1092 vrhadd.u8 d12, d12, d0
1093 vld1.8 {d1}, [r0,:64], r2
1094 vrhadd.u8 d13, d13, d1
1095 vld1.8 {d2}, [r0,:64], r2
1096 vrhadd.u8 d14, d14, d2
1097 vld1.8 {d3}, [r0,:64], r2
1098 vrhadd.u8 d15, d15, d3
1099 vld1.8 {d4}, [r0,:64], r2
1100 vrhadd.u8 d8, d8, d4
1101 vld1.8 {d5}, [r0,:64], r2
1102 vrhadd.u8 d9, d9, d5
1103 vld1.8 {d6}, [r0,:64], r2
1104 vrhadd.u8 d10, d10, d6
1105 vld1.8 {d7}, [r0,:64], r2
1106 vrhadd.u8 d11, d11, d7
1107 sub r0, r0, r2, lsl #3 @ rewind dst by 8 rows
1110 vst1.64 {d12}, [r0,:64], r2
1111 vst1.64 {d13}, [r0,:64], r2
1112 vst1.64 {d14}, [r0,:64], r2
1113 vst1.64 {d15}, [r0,:64], r2
1114 vst1.64 {d8}, [r0,:64], r2
1115 vst1.64 {d9}, [r0,:64], r2
1116 vst1.64 {d10}, [r0,:64], r2
1117 vst1.64 {d11}, [r0,:64], r2
1124 h264_qpel8_hv_lowpass put
1125 h264_qpel8_hv_lowpass avg
1127 .macro h264_qpel8_hv_lowpass_l2 type
1128 function \type\()_h264_qpel8_hv_lowpass_l2_neon
@ 2-D lowpass averaged with a second 8x8 source streamed from r2; the
@ avg variant additionally averages with destination rows (the second
@ vld1/vrhadd run, presumably \type-guarded in the full file).
1130 bl put_h264_qpel8_hv_lowpass_neon_top
1132 vld1.64 {d0, d1}, [r2,:128]!
1133 vld1.64 {d2, d3}, [r2,:128]!
1134 vrhadd.u8 q0, q0, q6
1135 vld1.64 {d4, d5}, [r2,:128]!
1136 vrhadd.u8 q1, q1, q7
1137 vld1.64 {d6, d7}, [r2,:128]!
1138 vrhadd.u8 q2, q2, q4
1139 vrhadd.u8 q3, q3, q5
1141 vld1.8 {d16}, [r0,:64], r3
1142 vrhadd.u8 d0, d0, d16
1143 vld1.8 {d17}, [r0,:64], r3
1144 vrhadd.u8 d1, d1, d17
1145 vld1.8 {d18}, [r0,:64], r3
1146 vrhadd.u8 d2, d2, d18
1147 vld1.8 {d19}, [r0,:64], r3
1148 vrhadd.u8 d3, d3, d19
1149 vld1.8 {d20}, [r0,:64], r3
1150 vrhadd.u8 d4, d4, d20
1151 vld1.8 {d21}, [r0,:64], r3
1152 vrhadd.u8 d5, d5, d21
1153 vld1.8 {d22}, [r0,:64], r3
1154 vrhadd.u8 d6, d6, d22
1155 vld1.8 {d23}, [r0,:64], r3
1156 vrhadd.u8 d7, d7, d23
1157 sub r0, r0, r3, lsl #3 @ rewind dst by 8 rows
1159 vst1.64 {d0}, [r0,:64], r3
1160 vst1.64 {d1}, [r0,:64], r3
1161 vst1.64 {d2}, [r0,:64], r3
1162 vst1.64 {d3}, [r0,:64], r3
1163 vst1.64 {d4}, [r0,:64], r3
1164 vst1.64 {d5}, [r0,:64], r3
1165 vst1.64 {d6}, [r0,:64], r3
1166 vst1.64 {d7}, [r0,:64], r3
1173 h264_qpel8_hv_lowpass_l2 put
1174 h264_qpel8_hv_lowpass_l2 avg
1176 .macro h264_qpel16_hv type
1177 function \type\()_h264_qpel16_hv_lowpass_neon
@ 16x16 2-D lowpass: four 8x8 quadrant passes; last one is a tail call.
1179 bl \type\()_h264_qpel8_hv_lowpass_neon
1180 sub r1, r1, r3, lsl #2
1181 bl \type\()_h264_qpel8_hv_lowpass_neon
1182 sub r1, r1, r3, lsl #4
1183 sub r1, r1, r3, lsl #2
1185 sub r0, r0, r2, lsl #4
1187 bl \type\()_h264_qpel8_hv_lowpass_neon
1188 sub r1, r1, r3, lsl #2
1190 b \type\()_h264_qpel8_hv_lowpass_neon
1193 function \type\()_h264_qpel16_hv_lowpass_l2_neon
@ Same quadrant structure for the l2 (second-source-averaging) variant.
1196 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1197 sub r1, r1, r3, lsl #2
1198 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1199 sub r1, r1, r3, lsl #4
1200 sub r1, r1, r3, lsl #2
1202 sub r0, r0, r3, lsl #4
1204 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1205 sub r1, r1, r3, lsl #2
1207 b \type\()_h264_qpel8_hv_lowpass_l2_neon
1214 .macro h264_qpel8 type
@ Public 8x8 quarter-pel MC entry points, one per position mcXY (X,Y in
@ 0..3 quarters). Each sets up source offsets / stack scratch and then
@ calls or tail-calls the lowpass helpers. NOTE(review): most register
@ setup and the pop/return lines are missing from this extract; the
@ comments below describe only what is visible.
1215 function ff_\type\()_h264_qpel8_mc10_neon, export=1
1220 b \type\()_h264_qpel8_h_lowpass_l2_neon
1223 function ff_\type\()_h264_qpel8_mc20_neon, export=1
1228 b \type\()_h264_qpel8_h_lowpass_neon
1231 function ff_\type\()_h264_qpel8_mc30_neon, export=1
1236 b \type\()_h264_qpel8_h_lowpass_l2_neon
1239 function ff_\type\()_h264_qpel8_mc01_neon, export=1
@ Local label shared with mc03, which branches here with adjusted src.
1242 \type\()_h264_qpel8_mc01:
1245 sub r1, r1, r2, lsl #1 @ back up 2 rows for the vertical taps
1247 bl \type\()_h264_qpel8_v_lowpass_l2_neon
1252 function ff_\type\()_h264_qpel8_mc11_neon, export=1
1253 push {r0, r1, r11, lr}
@ Diagonal positions: horizontal lowpass into scratch, then a vertical
@ l2 pass averaging against it. mc13/mc31/mc33 reuse this label.
1254 \type\()_h264_qpel8_mc11:
1266 bl put_h264_qpel8_h_lowpass_neon
1270 sub r1, r1, r2, lsl #1
1272 bl \type\()_h264_qpel8_v_lowpass_l2_neon
1278 function ff_\type\()_h264_qpel8_mc21_neon, export=1
1279 push {r0, r1, r4, r10, r11, lr}
1280 \type\()_h264_qpel8_mc21:
1286 sub sp, sp, #(8*8+16*12) @ scratch: 8x8 bytes + 16x12 halfwords
1292 bl put_h264_qpel8_h_lowpass_neon
1295 sub r1, r1, r2, lsl #1
1299 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1302 pop {r4, r10, r11, pc}
1305 function ff_\type\()_h264_qpel8_mc31_neon, export=1
1307 push {r0, r1, r11, lr}
1309 b \type\()_h264_qpel8_mc11
1312 function ff_\type\()_h264_qpel8_mc02_neon, export=1
1315 sub r1, r1, r2, lsl #1
1318 bl \type\()_h264_qpel8_v_lowpass_neon
1323 function ff_\type\()_h264_qpel8_mc12_neon, export=1
1324 push {r0, r1, r4, r10, r11, lr}
1325 \type\()_h264_qpel8_mc12:
1331 sub sp, sp, #(8*8+16*12)
1332 sub r1, r1, r2, lsl #1
1337 bl put_h264_qpel8_v_lowpass_neon
1340 sub r1, r1, r3, lsl #1
1343 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
1346 pop {r4, r10, r11, pc}
1349 function ff_\type\()_h264_qpel8_mc22_neon, export=1
1350 push {r4, r10, r11, lr}
1355 sub r1, r1, r2, lsl #1
1358 sub sp, sp, #(16*12) @ halfword scratch for the hv intermediate
1361 bl \type\()_h264_qpel8_hv_lowpass_neon
1364 pop {r4, r10, r11, pc}
1367 function ff_\type\()_h264_qpel8_mc32_neon, export=1
1368 push {r0, r1, r4, r10, r11, lr}
1370 b \type\()_h264_qpel8_mc12
1373 function ff_\type\()_h264_qpel8_mc03_neon, export=1
1376 b \type\()_h264_qpel8_mc01
1379 function ff_\type\()_h264_qpel8_mc13_neon, export=1
1380 push {r0, r1, r11, lr}
1382 b \type\()_h264_qpel8_mc11
1385 function ff_\type\()_h264_qpel8_mc23_neon, export=1
1386 push {r0, r1, r4, r10, r11, lr}
1388 b \type\()_h264_qpel8_mc21
1391 function ff_\type\()_h264_qpel8_mc33_neon, export=1
1393 push {r0, r1, r11, lr}
1396 b \type\()_h264_qpel8_mc11
1403 .macro h264_qpel16 type
@ Public 16x16 quarter-pel MC entry points; mirrors the 8x8 dispatch
@ macro above, using the 16-wide lowpass helpers and larger scratch
@ areas. NOTE(review): setup and return lines are missing from this
@ extract; comments describe only what is visible.
1404 function ff_\type\()_h264_qpel16_mc10_neon, export=1
1408 b \type\()_h264_qpel16_h_lowpass_l2_neon
1411 function ff_\type\()_h264_qpel16_mc20_neon, export=1
1415 b \type\()_h264_qpel16_h_lowpass_neon
1418 function ff_\type\()_h264_qpel16_mc30_neon, export=1
1422 b \type\()_h264_qpel16_h_lowpass_l2_neon
1425 function ff_\type\()_h264_qpel16_mc01_neon, export=1
1428 \type\()_h264_qpel16_mc01:
1431 sub r1, r1, r2, lsl #1 @ back up 2 rows for the vertical taps
1433 bl \type\()_h264_qpel16_v_lowpass_l2_neon
1438 function ff_\type\()_h264_qpel16_mc11_neon, export=1
1439 push {r0, r1, r4, r11, lr}
1440 \type\()_h264_qpel16_mc11:
1451 bl put_h264_qpel16_h_lowpass_neon
1455 sub r1, r1, r2, lsl #1
1457 bl \type\()_h264_qpel16_v_lowpass_l2_neon
1463 function ff_\type\()_h264_qpel16_mc21_neon, export=1
1464 push {r0, r1, r4-r5, r9-r11, lr}
1465 \type\()_h264_qpel16_mc21:
1471 sub sp, sp, #(16*16+16*12) @ scratch: 16x16 bytes + 16x12 halfwords
1475 bl put_h264_qpel16_h_lowpass_neon_packed
1478 sub r1, r1, r2, lsl #1
1481 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1484 pop {r4-r5, r9-r11, pc}
1487 function ff_\type\()_h264_qpel16_mc31_neon, export=1
1489 push {r0, r1, r4, r11, lr}
1491 b \type\()_h264_qpel16_mc11
1494 function ff_\type\()_h264_qpel16_mc02_neon, export=1
1497 sub r1, r1, r2, lsl #1
1500 bl \type\()_h264_qpel16_v_lowpass_neon
1505 function ff_\type\()_h264_qpel16_mc12_neon, export=1
1506 push {r0, r1, r4-r5, r9-r11, lr}
1507 \type\()_h264_qpel16_mc12:
1513 sub sp, sp, #(16*16+16*12)
1514 sub r1, r1, r2, lsl #1
1518 bl put_h264_qpel16_v_lowpass_neon_packed
1521 sub r1, r1, r3, lsl #1
1524 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1527 pop {r4-r5, r9-r11, pc}
1530 function ff_\type\()_h264_qpel16_mc22_neon, export=1
1531 push {r4, r9-r11, lr}
1537 sub r1, r1, r2, lsl #1
1540 sub sp, sp, #(16*12)
1543 bl \type\()_h264_qpel16_hv_lowpass_neon
1546 pop {r4, r9-r11, pc}
1549 function ff_\type\()_h264_qpel16_mc32_neon, export=1
1550 push {r0, r1, r4-r5, r9-r11, lr}
1552 b \type\()_h264_qpel16_mc12
1555 function ff_\type\()_h264_qpel16_mc03_neon, export=1
1558 b \type\()_h264_qpel16_mc01
1561 function ff_\type\()_h264_qpel16_mc13_neon, export=1
1562 push {r0, r1, r4, r11, lr}
1564 b \type\()_h264_qpel16_mc11
1567 function ff_\type\()_h264_qpel16_mc23_neon, export=1
1568 push {r0, r1, r4-r5, r9-r11, lr}
1570 b \type\()_h264_qpel16_mc21
1573 function ff_\type\()_h264_qpel16_mc33_neon, export=1
1575 push {r0, r1, r4, r11, lr}
1578 b \type\()_h264_qpel16_mc11
1585 @ Biweighted prediction
1587 .macro biweight_16 macs, macd
@ 16-wide biweight body: \macs/\macd are vmlal.u8 or vmlsl.u8, selected
@ by the sign combination of the two weights (see the dispatch in
@ biweight_func below). Weighted sums are shifted by q9 (vshl.s16),
@ clamped to u8 (vqmovun) and stored via r6.
@ NOTE(review): the multiply/accumulate lines between the loads are
@ missing from this extract.
1593 vld1.8 {d20-d21},[r0,:128], r2
1597 vld1.8 {d22-d23},[r1,:128], r2
1602 vld1.8 {d28-d29},[r0,:128], r2
1607 vld1.8 {d30-d31},[r1,:128], r2
1615 vshl.s16 q12, q12, q9 @ >> log2_denom+1 (negative shift count)
1616 vshl.s16 q13, q13, q9
1617 vqmovun.s16 d24, q12
1618 vqmovun.s16 d25, q13
1620 vst1.8 {d4- d5}, [r6,:128], r2
1622 vst1.8 {d24-d25},[r6,:128], r2
1627 .macro biweight_8 macs, macd
@ 8-wide variant: same scheme with single d-register rows.
1633 vld1.8 {d4},[r0,:64], r2
1636 vld1.8 {d5},[r1,:64], r2
1639 vld1.8 {d6},[r0,:64], r2
1642 vld1.8 {d7},[r1,:64], r2
1647 vshl.s16 q10, q10, q9
1650 vst1.8 {d2},[r6,:64], r2
1652 vst1.8 {d4},[r6,:64], r2
1657 .macro biweight_4 macs, macd
@ 4-wide variant: two rows packed per d register via 32-bit lanes; the
@ trailing "2:" label handles a leftover two-row tail.
1663 vld1.32 {d4[0]},[r0,:32], r2
1664 vld1.32 {d4[1]},[r0,:32], r2
1667 vld1.32 {d5[0]},[r1,:32], r2
1668 vld1.32 {d5[1]},[r1,:32], r2
1672 vld1.32 {d6[0]},[r0,:32], r2
1673 vld1.32 {d6[1]},[r0,:32], r2
1676 vld1.32 {d7[0]},[r1,:32], r2
1677 vld1.32 {d7[1]},[r1,:32], r2
1682 vshl.s16 q10, q10, q9
1685 vst1.32 {d2[0]},[r6,:32], r2
1686 vst1.32 {d2[1]},[r6,:32], r2
1688 vst1.32 {d4[0]},[r6,:32], r2
1689 vst1.32 {d4[1]},[r6,:32], r2
1692 2: vshl.s16 q1, q1, q9
1694 vst1.32 {d2[0]},[r6,:32], r2
1695 vst1.32 {d2[1]},[r6,:32], r2
1699 .macro biweight_func w
1700 function ff_biweight_h264_pixels_\w\()_neon, export=1
@ Dispatch on the signs of the two weights (bits 30/31 xor-folded by
@ "eors") to pick the right mlal/mlsl combination for abs-weight math.
1707 eors lr, lr, r5, lsr #30
1720 10: biweight_\w vmlal.u8, vmlal.u8
1722 biweight_\w vmlal.u8, vmlsl.u8
1725 biweight_\w vmlsl.u8, vmlsl.u8
1727 biweight_\w vmlsl.u8, vmlal.u8
1735 @ Weighted prediction
1737 .macro weight_16 add
@ 16-wide unidirectional weight: pixels * weight (d0) accumulated in
@ 16 bit, rounded-shifted by q9 (vrshl.s16, negative count), clamped to
@ u8 and stored via r4. \add presumably injects the offset add/sub --
@ those lines are missing from this extract.
1740 vld1.8 {d20-d21},[r0,:128], r1
1741 vmull.u8 q2, d0, d20
1743 vmull.u8 q3, d0, d21
1744 vld1.8 {d28-d29},[r0,:128], r1
1745 vmull.u8 q12, d0, d28
1747 vmull.u8 q13, d0, d29
1749 vrshl.s16 q2, q2, q9
1751 vrshl.s16 q3, q3, q9
1755 vrshl.s16 q12, q12, q9
1757 vrshl.s16 q13, q13, q9
1758 vqmovun.s16 d24, q12
1759 vqmovun.s16 d25, q13
1760 vst1.8 {d4- d5}, [r4,:128], r1
1761 vst1.8 {d24-d25},[r4,:128], r1
@ 8-wide variant (macro header not visible in this extract).
1769 vld1.8 {d4},[r0,:64], r1
1772 vld1.8 {d6},[r0,:64], r1
1773 vmull.u8 q10, d0, d6
1776 vrshl.s16 q1, q1, q9
1779 vrshl.s16 q10, q10, q9
1781 vst1.8 {d2},[r4,:64], r1
1782 vst1.8 {d4},[r4,:64], r1
@ 4-wide variant: two rows per d register, with a two-row tail at the
@ end (the final vrshl/vst group).
1792 vld1.32 {d4[0]},[r0,:32], r1
1793 vld1.32 {d4[1]},[r0,:32], r1
1797 vld1.32 {d6[0]},[r0,:32], r1
1798 vld1.32 {d6[1]},[r0,:32], r1
1799 vmull.u8 q10, d0, d6
1802 vrshl.s16 q1, q1, q9
1805 vrshl.s16 q10, q10, q9
1808 vst1.32 {d2[0]},[r4,:32], r1
1809 vst1.32 {d2[1]},[r4,:32], r1
1811 vst1.32 {d4[0]},[r4,:32], r1
1812 vst1.32 {d4[1]},[r4,:32], r1
1816 vrshl.s16 q1, q1, q9
1818 vst1.32 {d2[0]},[r4,:32], r1
1819 vst1.32 {d2[1]},[r4,:32], r1
1823 .macro weight_func w
1824 function ff_weight_h264_pixels_\w\()_neon, export=1
1838 10: rsb r12, r12, #0
1845 10: rsb r12, r12, #0