2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ Register-transpose helper macros used by the horizontal loop filter and
@ the qpel vertical/2D lowpass code below.
@ NOTE(review): this excerpt is truncated — the macro bodies and their
@ .endm lines are missing; only the parameter lists survive. Names suggest
@ byte-element 8x8 / 4x4 transposes and a 16-bit-element variant — confirm
@ against the complete file.
23 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
38 .macro transpose_4x4 r0 r1 r2 r3
45 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
52 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
@ 8x-wide H.264 chroma motion compensation, emitted once per \type
@ ("put"/"avg" presumably — only \type substitution is visible here).
@ C signature per the comment below: (dst, src, stride, h, x, y).
@ NOTE(review): excerpt is truncated — the weight computation, multiply
@ (vmull/vmlal) stages, loop counters and branches are missing. Visible
@ structure: three code paths — full bilinear (both x and y non-zero,
@ two-row loads), one-tap vertical-only, and horizontal-only ("4:" label,
@ single-pointer loads with vext to form the x+1 neighbours).
63 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
64 .macro h264_chroma_mc8 type
65 function ff_\type\()_h264_chroma_mc8_neon, export=1
@ Bilinear coefficient setup: with A = r7, these compute (r5<<3)-A,
@ (r4<<3)-A, and A-(r4<<3)-(r5<<3) — presumably the 8*x/8*y weight terms.
75 rsb r6, r7, r5, lsl #3
76 rsb ip, r7, r4, lsl #3
77 sub r4, r7, r4, lsl #3
78 sub r4, r4, r5, lsl #3
@ Two-row path: load 16 source bytes per row from r1 and r5 (row+stride),
@ post-incrementing by r4.
88 vld1.64 {d4, d5}, [r1], r4
90 vld1.64 {d6, d7}, [r5], r4
99 vld1.64 {d4, d5}, [r1], r4
101 vext.8 d5, d4, d5, #1 @ d5 = src bytes shifted by 1 (x+1 taps)
108 vrshrn.u16 d16, q8, #6 @ rounded narrow: >>6 to 8 bit
109 vld1.64 {d6, d7}, [r5], r4
111 vrshrn.u16 d17, q9, #6
@ "avg"-style averaging with existing destination read via lr — TODO
@ confirm this is conditional on \type in the full file.
113 vld1.64 {d20}, [lr,:64], r2
114 vld1.64 {d21}, [lr,:64], r2
115 vrhadd.u8 q8, q8, q10
117 vext.8 d7, d6, d7, #1
118 vst1.64 {d16}, [r0,:64], r2
119 vst1.64 {d17}, [r0,:64], r2
@ Vertical-only path: 8-byte rows, no vext (x == 0 presumably).
133 vld1.64 {d4}, [r1], r4
134 vld1.64 {d6}, [r5], r4
139 vld1.64 {d4}, [r1], r4
142 vld1.64 {d6}, [r5], r4
143 vrshrn.u16 d16, q8, #6
144 vrshrn.u16 d17, q9, #6
146 vld1.64 {d20}, [lr,:64], r2
147 vld1.64 {d21}, [lr,:64], r2
148 vrhadd.u8 q8, q8, q10
152 vst1.64 {d16}, [r0,:64], r2
153 vst1.64 {d17}, [r0,:64], r2
@ Horizontal-only path (y == 0 presumably): single pointer r1, stride r2.
158 4: vld1.64 {d4, d5}, [r1], r2
159 vld1.64 {d6, d7}, [r1], r2
160 vext.8 d5, d4, d5, #1
161 vext.8 d7, d6, d7, #1
167 vld1.64 {d4, d5}, [r1], r2
171 vext.8 d5, d4, d5, #1
172 vrshrn.u16 d16, q8, #6
173 vrshrn.u16 d17, q9, #6
175 vld1.64 {d20}, [lr,:64], r2
176 vld1.64 {d21}, [lr,:64], r2
177 vrhadd.u8 q8, q8, q10
179 vld1.64 {d6, d7}, [r1], r2
180 vext.8 d7, d6, d7, #1
181 vst1.64 {d16}, [r0,:64], r2
182 vst1.64 {d17}, [r0,:64], r2
@ 4x-wide H.264 chroma motion compensation — same three-path structure as
@ h264_chroma_mc8 above but operating on 32-bit (4-pixel) lanes.
@ NOTE(review): excerpt is truncated — multiply stages, loop control and
@ the .endm are missing. Comments below mark only what is visible.
189 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
190 .macro h264_chroma_mc4 type
191 function ff_\type\()_h264_chroma_mc4_neon, export=1
@ Same coefficient arithmetic as mc8: (r5<<3)-r7, (r4<<3)-r7,
@ r7-(r4<<3)-(r5<<3).
201 rsb r6, r7, r5, lsl #3
202 rsb ip, r7, r4, lsl #3
203 sub r4, r7, r4, lsl #3
204 sub r4, r4, r5, lsl #3
@ Full bilinear path, 8-byte loads with vext #1 for the x+1 neighbours.
214 vld1.64 {d4}, [r1], r4
216 vld1.64 {d6}, [r5], r4
219 vext.8 d5, d4, d5, #1
220 vext.8 d7, d6, d7, #1
230 vld1.64 {d4}, [r1], r4
231 vext.8 d5, d4, d5, #1
235 vld1.64 {d6}, [r5], r4
@ Pairwise combine of the two accumulator halves, then rounded >>6.
236 vadd.i16 d16, d16, d17
237 vadd.i16 d17, d18, d19
238 vrshrn.u16 d16, q8, #6
242 vld1.32 {d20[0]}, [lr,:32], r2 @ read dst rows for averaging (avg type?)
243 vld1.32 {d20[1]}, [lr,:32], r2
244 vrhadd.u8 d16, d16, d20
246 vext.8 d7, d6, d7, #1
248 vst1.32 {d16[0]}, [r0,:32], r2 @ store two 4-pixel rows
249 vst1.32 {d16[1]}, [r0,:32], r2
@ Vertical-only path: 4-byte lane loads from two row pointers.
262 vext.32 d1, d0, d1, #1
265 vld1.32 {d4[0]}, [r1], r4
266 vld1.32 {d4[1]}, [r5], r4
270 vld1.32 {d4[0]}, [r1], r4
272 vld1.32 {d4[1]}, [r5], r4
273 vadd.i16 d16, d16, d17
274 vadd.i16 d17, d18, d19
275 vrshrn.u16 d16, q8, #6
277 vld1.32 {d20[0]}, [lr,:32], r2
278 vld1.32 {d20[1]}, [lr,:32], r2
279 vrhadd.u8 d16, d16, d20
283 vst1.32 {d16[0]}, [r0,:32], r2
284 vst1.32 {d16[1]}, [r0,:32], r2
@ Horizontal-only path ("4:"/"5:" labels), single pointer r1, stride r2.
289 4: vld1.64 {d4}, [r1], r2
290 vld1.64 {d6}, [r1], r2
291 vext.8 d5, d4, d5, #1
292 vext.8 d7, d6, d7, #1
296 5: vmull.u8 q8, d4, d0 @ widening multiply by coefficient vector d0
299 vld1.64 {d4}, [r1], r2
300 vext.8 d5, d4, d5, #1
302 vadd.i16 d16, d16, d17
303 vadd.i16 d17, d18, d19
305 vrshrn.u16 d16, q8, #6
307 vld1.32 {d20[0]}, [lr,:32], r2
308 vld1.32 {d20[1]}, [lr,:32], r2
309 vrhadd.u8 d16, d16, d20
311 vld1.64 {d6}, [r1], r2
312 vext.8 d7, d6, d7, #1
315 vst1.32 {d16[0]}, [r0,:32], r2
316 vst1.32 {d16[1]}, [r0,:32], r2
331 /* H.264 loop filter */
@ Common loop-filter entry: folds the per-edge strength bytes in ip
@ together ("ands" sets flags — presumably for an early-exit branch that
@ is missing from this excerpt).
333 .macro h264_loop_filter_start
339 and ip, ip, ip, lsl #16
341 ands ip, ip, ip, lsl #8
@ Spill/restore helpers for d8-d15, which are callee-saved under AAPCS.
@ NOTE(review): the sp-adjustment instructions are missing here; only the
@ 16-byte-aligned stores/loads are visible. ip appears to carry the final
@ stack correction in align_pop_regs.
345 .macro align_push_regs
349 vst1.64 {d12-d15}, [sp,:128]
351 vst1.64 {d8-d11}, [sp,:128]
354 .macro align_pop_regs
355 vld1.64 {d8-d11}, [sp,:128]!
356 vld1.64 {d12-d15}, [sp,:128], ip
@ Core luma deblocking filter on 16 pixels at once:
@ q8=p0, q9=p1, q10=p2, q0=q0, q1=q1, q2=q2 (per the abs() comments),
@ alpha in r2, beta in r3. Builds the filter-enable masks, the p2/q2
@ updates, and the clipped delta applied to p0/q0.
@ NOTE(review): excerpt is truncated — the mask combination, tc0 handling
@ and final vqmovn/mask application are missing, as is the .endm.
359 .macro h264_loop_filter_luma
360 vdup.8 q11, r2 @ alpha
362 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
364 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
366 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
367 vsli.32 q12, q12, #16 @ replicate tc0 bytes across lanes (presumably)
368 vclt.u8 q6, q6, q11 @ < alpha
369 vdup.8 q11, r3 @ beta
371 vclt.u8 q14, q14, q11 @ < beta
372 vclt.u8 q15, q15, q11 @ < beta
374 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
376 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
377 vclt.u8 q4, q4, q11 @ < beta
379 vclt.u8 q5, q5, q11 @ < beta
@ p2'/q2' style updates: (p0+q0+1)>>1 then halving-add with p2 / q2.
383 vrhadd.u8 q14, q8, q0
386 vhadd.u8 q10, q10, q14
388 vhadd.u8 q14, q2, q14
@ Clamp p1/q1-derived values against the tc bound in q12.
390 vqsub.u8 q11, q9, q12
393 vqsub.u8 q11, q1, q12
396 vmax.u8 q14, q14, q11
@ Delta computation in 16-bit: (q0-p0)*4 + p1 - q1, rounded >>3.
399 vsubw.u8 q10, q10, d17
401 vshl.i16 q10, q10, #2
403 vaddw.u8 q10, q10, d19
405 vsubw.u8 q10, q10, d3
406 vrshrn.i16 d4, q2, #3
407 vrshrn.i16 d5, q10, #3
@ Apply signed delta to widened p0/q0.
417 vaddw.s8 q14, q14, d4
419 vsubw.s8 q11, q11, d4
420 vsubw.s8 q12, q12, d5
@ Vertical-edge luma deblock: rows are contiguous, so pixels load directly
@ as 16-byte rows (q0..q2 below the edge, q8..q10 above after rewinding r0
@ by 6 rows). r0 = pix, r1 = stride; alpha/beta arrive via
@ h264_loop_filter_start's callers (registers not visible here).
@ NOTE(review): truncated — early-exit branch, align_push/pop and endfunc
@ are missing from this excerpt.
427 function ff_h264_v_loop_filter_luma_neon, export=1
428 h264_loop_filter_start
430 vld1.64 {d0, d1}, [r0,:128], r1 @ q0 row
431 vld1.64 {d2, d3}, [r0,:128], r1 @ q1 row
432 vld1.64 {d4, d5}, [r0,:128], r1 @ q2 row
433 sub r0, r0, r1, lsl #2 @ rewind 6 rows (4 + 2)
434 sub r0, r0, r1, lsl #1
435 vld1.64 {d20,d21}, [r0,:128], r1 @ p2 row
436 vld1.64 {d18,d19}, [r0,:128], r1 @ p1 row
437 vld1.64 {d16,d17}, [r0,:128], r1 @ p0 row
441 h264_loop_filter_luma
@ Store filtered p1', p0', q0', q1' back around the edge.
443 sub r0, r0, r1, lsl #1
444 vst1.64 {d8, d9}, [r0,:128], r1
445 vst1.64 {d16,d17}, [r0,:128], r1
446 vst1.64 {d0, d1}, [r0,:128], r1
447 vst1.64 {d10,d11}, [r0,:128]
@ Horizontal-edge luma deblock: pixels lie across rows, so 16 rows of
@ 8 bytes are loaded and transposed (transpose_8x8) into the same q-reg
@ layout the shared h264_loop_filter_luma macro expects, filtered, then
@ 4 result columns are transposed back and scattered as 32-bit lanes.
@ NOTE(review): truncated — the initial r0 adjustment, early-exit branch
@ and endfunc are missing from this excerpt.
453 function ff_h264_h_loop_filter_luma_neon, export=1
454 h264_loop_filter_start
457 vld1.64 {d6}, [r0], r1
458 vld1.64 {d20}, [r0], r1
459 vld1.64 {d18}, [r0], r1
460 vld1.64 {d16}, [r0], r1
461 vld1.64 {d0}, [r0], r1
462 vld1.64 {d2}, [r0], r1
463 vld1.64 {d4}, [r0], r1
464 vld1.64 {d26}, [r0], r1
465 vld1.64 {d7}, [r0], r1
466 vld1.64 {d21}, [r0], r1
467 vld1.64 {d19}, [r0], r1
468 vld1.64 {d17}, [r0], r1
469 vld1.64 {d1}, [r0], r1
470 vld1.64 {d3}, [r0], r1
471 vld1.64 {d5}, [r0], r1
472 vld1.64 {d27}, [r0], r1
474 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
478 h264_loop_filter_luma
480 transpose_4x4 q4, q8, q0, q5
@ Rewind the 16 rows just consumed, then write the 4 modified columns
@ (p1', p0', q0', q1') back one 32-bit lane per row.
482 sub r0, r0, r1, lsl #4
484 vst1.32 {d8[0]}, [r0], r1
485 vst1.32 {d16[0]}, [r0], r1
486 vst1.32 {d0[0]}, [r0], r1
487 vst1.32 {d10[0]}, [r0], r1
488 vst1.32 {d8[1]}, [r0], r1
489 vst1.32 {d16[1]}, [r0], r1
490 vst1.32 {d0[1]}, [r0], r1
491 vst1.32 {d10[1]}, [r0], r1
492 vst1.32 {d9[0]}, [r0], r1
493 vst1.32 {d17[0]}, [r0], r1
494 vst1.32 {d1[0]}, [r0], r1
495 vst1.32 {d11[0]}, [r0], r1
496 vst1.32 {d9[1]}, [r0], r1
497 vst1.32 {d17[1]}, [r0], r1
498 vst1.32 {d1[1]}, [r0], r1
499 vst1.32 {d11[1]}, [r0], r1
@ Chroma deblocking filter on 8 pixels: d16=p0, d18=p1, d0=q0, d2=q1
@ (per the abs() comments), alpha in r2, beta in r3. Narrower than the
@ luma filter — only p0/q0 are modified.
@ NOTE(review): truncated — the delta computation feeding q2, the mask
@ combination and the .endm are missing from this excerpt.
505 .macro h264_loop_filter_chroma
506 vdup.8 d22, r2 @ alpha
508 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
510 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
514 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
516 vclt.u8 d26, d26, d22 @ < alpha
518 vdup.8 d22, r3 @ beta
520 vrshrn.i16 d4, q2, #3 @ rounded delta ((...)+4)>>3
521 vclt.u8 d28, d28, d22 @ < beta
523 vclt.u8 d30, d30, d22 @ < beta
@ Apply signed delta: p0 + d, q0 - d (widened arithmetic).
532 vaddw.s8 q14, q14, d4
533 vsubw.s8 q11, q11, d4
@ Vertical-edge chroma deblock: load p1,p0,q0,q1 as contiguous 8-byte
@ rows around the edge, filter, store p0'/q0' back.
@ NOTE(review): truncated — early-exit branch and endfunc are missing.
538 function ff_h264_v_loop_filter_chroma_neon, export=1
539 h264_loop_filter_start
541 sub r0, r0, r1, lsl #1 @ back up two rows to p1
542 vld1.64 {d18}, [r0,:64], r1 @ p1
543 vld1.64 {d16}, [r0,:64], r1 @ p0
544 vld1.64 {d0}, [r0,:64], r1 @ q0
545 vld1.64 {d2}, [r0,:64] @ q1
547 h264_loop_filter_chroma
549 sub r0, r0, r1, lsl #1
550 vst1.64 {d16}, [r0,:64], r1 @ p0'
551 vst1.64 {d0}, [r0,:64], r1 @ q0'
@ Horizontal-edge chroma deblock: gather 8 rows of 4 bytes into 32-bit
@ lanes, transpose (transpose step missing from this excerpt), filter,
@ then scatter all four columns back.
@ NOTE(review): truncated — initial r0 adjustment, transpose, early-exit
@ and endfunc are missing.
556 function ff_h264_h_loop_filter_chroma_neon, export=1
557 h264_loop_filter_start
560 vld1.32 {d18[0]}, [r0], r1
561 vld1.32 {d16[0]}, [r0], r1
562 vld1.32 {d0[0]}, [r0], r1
563 vld1.32 {d2[0]}, [r0], r1
564 vld1.32 {d18[1]}, [r0], r1
565 vld1.32 {d16[1]}, [r0], r1
566 vld1.32 {d0[1]}, [r0], r1
567 vld1.32 {d2[1]}, [r0], r1
574 h264_loop_filter_chroma
@ Rewind 8 rows, then write p1,p0',q0',q1 lanes per row.
581 sub r0, r0, r1, lsl #3
582 vst1.32 {d18[0]}, [r0], r1
583 vst1.32 {d16[0]}, [r0], r1
584 vst1.32 {d0[0]}, [r0], r1
585 vst1.32 {d2[0]}, [r0], r1
586 vst1.32 {d18[1]}, [r0], r1
587 vst1.32 {d16[1]}, [r0], r1
588 vst1.32 {d0[1]}, [r0], r1
589 vst1.32 {d2[1]}, [r0], r1
@ H.264 qpel 6-tap lowpass helpers.
@ lowpass_const presumably loads the filter constants into \r (its body
@ is missing from this excerpt); d6 lanes are used as multipliers below.
596 .macro lowpass_const r
@ lowpass_8: filter two 8-pixel rows (\r0:\r1 and \r2:\r3) horizontally.
@ Taps are formed with vext offsets #1..#5 around each pixel; the sums
@ are accumulated 16-bit via vmla (d6[1]) / vmls (d6[0]) — the classic
@ a+f + K1*(c+d) - K0*(b+e) 6-tap shape, rounded >>5 when narrow=1.
@ NOTE(review): several vaddl/accumulator-setup lines and the .endm are
@ missing; t0/t1 are macro-internal names defined outside this excerpt.
602 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
610 vext.8 d2, \r0, \r1, #2
611 vext.8 d3, \r0, \r1, #3
613 vext.8 d4, \r0, \r1, #1
614 vext.8 d5, \r0, \r1, #4
616 vext.8 d30, \r0, \r1, #5
617 vaddl.u8 t0, \r0, d30 @ outermost taps a+f
618 vext.8 d18, \r2, \r3, #2
619 vmla.i16 t0, q1, d6[1] @ + K1*(c+d)
620 vext.8 d19, \r2, \r3, #3
621 vaddl.u8 q9, d18, d19
622 vext.8 d20, \r2, \r3, #1
623 vmls.i16 t0, q2, d6[0] @ - K0*(b+e)
624 vext.8 d21, \r2, \r3, #4
625 vaddl.u8 q10, d20, d21
626 vext.8 d31, \r2, \r3, #5
627 vaddl.u8 t1, \r2, d31
628 vmla.i16 t1, q9, d6[1]
629 vmls.i16 t1, q10, d6[0]
631 vqrshrun.s16 \d0, t0, #5 @ saturating rounded narrow to u8
632 vqrshrun.s16 \d1, t1, #5
@ lowpass_8_1: single-row variant of the same 6-tap filter.
638 .macro lowpass_8_1 r0, r1, d0, narrow=1
644 vext.8 d2, \r0, \r1, #2
645 vext.8 d3, \r0, \r1, #3
647 vext.8 d4, \r0, \r1, #1
648 vext.8 d5, \r0, \r1, #4
650 vext.8 d30, \r0, \r1, #5
651 vaddl.u8 t0, \r0, d30
652 vmla.i16 t0, q1, d6[1]
653 vmls.i16 t0, q2, d6[0]
655 vqrshrun.s16 \d0, t0, #5
@ 16-bit-input variant of the 6-tap lowpass, used for the second
@ (vertical) pass of the 2D hv filter: inputs are 16-bit intermediates,
@ accumulation is 32-bit, result rounded >>10 into \d.
@ NOTE(review): truncated — the multiply-by-constant steps between the
@ vaddl and the final vrshrn (other than the <<2/add pair visible below)
@ and the .endm are missing.
660 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
661 vext.16 q1, \r0, \r1, #2
662 vext.16 q0, \r0, \r1, #3
664 vext.16 q2, \r0, \r1, #1
666 vext.16 q3, \r0, \r1, #4
667 vaddl.s16 q10, d4, d6
668 vext.16 \r1, \r0, \r1, #5
670 vaddl.s16 q0, \h0, \h1
671 vaddl.s16 q8, \l0, \l1
@ q10*5 built as q10 + (q10<<2) — multiply-by-constant without vmul.
675 vshl.i32 q15, q10, #2
677 vadd.i32 q10, q10, q15
691 vrshrn.s32 d18, q9, #10
692 vrshrn.s32 d19, q1, #10
@ 16-wide horizontal lowpass emitted as two 8-wide passes, "packed"
@ destination variant. Tail-calls the 8-wide routine for the second half.
@ NOTE(review): truncated — the lr save and inter-call setup are missing.
697 function put_h264_qpel16_h_lowpass_neon_packed
701 bl put_h264_qpel8_h_lowpass_neon
702 sub r1, r1, r2, lsl #4 @ rewind src 16 rows
706 b put_h264_qpel8_h_lowpass_neon @ tail call does the final ret
@ 16-wide horizontal lowpass: run the 8-wide routine over each half,
@ rewinding dst (r0/r3 = dst stride) and src (r1/r2 = src stride) by
@ 16 rows and advancing 8 columns between passes (advance not visible).
@ NOTE(review): truncated — lr handling, the column advance and the
@ final call/return are missing from this excerpt.
709 function put_h264_qpel16_h_lowpass_neon
712 bl put_h264_qpel8_h_lowpass_neon
713 sub r0, r0, r3, lsl #4
714 sub r1, r1, r2, lsl #4
@ 8-wide horizontal 6-tap lowpass: two source rows per iteration,
@ filtered by lowpass_8, stored as 8-byte rows. r1/r2 = src/stride,
@ r0/r3 = dst/stride.
@ NOTE(review): truncated — the loop counter decrement, branch back to
@ "1:" and endfunc are missing.
721 function put_h264_qpel8_h_lowpass_neon
722 1: vld1.64 {d0, d1}, [r1], r2
723 vld1.64 {d16,d17}, [r1], r2
725 lowpass_8 d0, d1, d16, d17, d0, d16
726 vst1.64 {d0}, [r0,:64], r3
727 vst1.64 {d16}, [r0,:64], r3
@ 16-wide horizontal lowpass averaged with a second source (l2 = r3):
@ two 8-wide passes with 16-row rewinds of dst, src and the l2 pointer.
@ NOTE(review): truncated — lr handling, column advance and the second
@ call/return are missing.
732 function put_h264_qpel16_h_lowpass_l2_neon
735 bl put_h264_qpel8_h_lowpass_l2_neon
736 sub r0, r0, r2, lsl #4
737 sub r1, r1, r2, lsl #4
738 sub r3, r3, r2, lsl #4
@ 8-wide horizontal lowpass, then vrhadd-average with the l2 reference
@ rows loaded from r3 — the qpel half-sample/quarter-sample blend.
@ NOTE(review): truncated — loop control and endfunc are missing.
746 function put_h264_qpel8_h_lowpass_l2_neon
747 1: vld1.64 {d0, d1}, [r1], r2
748 vld1.64 {d16,d17}, [r1], r2
749 vld1.64 {d28}, [r3], r2 @ l2 reference rows
750 vld1.64 {d29}, [r3], r2
752 lowpass_8 d0, d1, d16, d17, d0, d1
753 vrhadd.u8 q0, q0, q14 @ rounding average with reference
754 vst1.64 {d0}, [r0,:64], r2
755 vst1.64 {d1}, [r0,:64], r2
@ 16-wide vertical lowpass, packed-destination variant: four 8-wide
@ passes with src (r1, stride r3) rewound between quadrants.
@ NOTE(review): truncated — lr save and column-advance arithmetic are
@ missing; the final b is a tail call that returns for us.
760 function put_h264_qpel16_v_lowpass_neon_packed
763 bl put_h264_qpel8_v_lowpass_neon
764 sub r1, r1, r3, lsl #2
765 bl put_h264_qpel8_v_lowpass_neon
766 sub r1, r1, r3, lsl #4
767 sub r1, r1, r3, lsl #2
769 bl put_h264_qpel8_v_lowpass_neon
770 sub r1, r1, r3, lsl #2
772 b put_h264_qpel8_v_lowpass_neon
@ 16-wide vertical lowpass into a strided destination (r0, stride r2):
@ four 8-wide passes; src rewound by 4 rows between vertically adjacent
@ passes (the filter needs 4 rows of context) and 20 rows between
@ column halves.
@ NOTE(review): truncated — lr handling, column advances and the final
@ call/return are missing.
775 function put_h264_qpel16_v_lowpass_neon
777 bl put_h264_qpel8_v_lowpass_neon
778 sub r1, r1, r3, lsl #2
779 bl put_h264_qpel8_v_lowpass_neon
780 sub r0, r0, r2, lsl #4
782 sub r1, r1, r3, lsl #4
783 sub r1, r1, r3, lsl #2
785 bl put_h264_qpel8_v_lowpass_neon
786 sub r1, r1, r3, lsl #2
@ 8-wide vertical 6-tap lowpass: loads 12 rows (8 output + 4 context),
@ transposes so columns become rows, reuses the horizontal lowpass_8 on
@ them, transposes back, and stores 8 rows. r1/r3 = src/stride,
@ r0/r2 = dst/stride.
@ NOTE(review): truncated — the lr return and endfunc are missing.
790 function put_h264_qpel8_v_lowpass_neon
791 vld1.64 {d8}, [r1], r3
792 vld1.64 {d10}, [r1], r3
793 vld1.64 {d12}, [r1], r3
794 vld1.64 {d14}, [r1], r3
795 vld1.64 {d22}, [r1], r3
796 vld1.64 {d24}, [r1], r3
797 vld1.64 {d26}, [r1], r3
798 vld1.64 {d28}, [r1], r3
799 vld1.64 {d9}, [r1], r3
800 vld1.64 {d11}, [r1], r3
801 vld1.64 {d13}, [r1], r3
802 vld1.64 {d15}, [r1], r3
805 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
806 lowpass_8 d8, d9, d10, d11, d8, d10
807 lowpass_8 d12, d13, d14, d15, d12, d14
808 lowpass_8 d22, d23, d24, d25, d22, d24
809 lowpass_8 d26, d27, d28, d29, d26, d28
810 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
812 vst1.64 {d8}, [r0,:64], r2
813 vst1.64 {d10}, [r0,:64], r2
814 vst1.64 {d12}, [r0,:64], r2
815 vst1.64 {d14}, [r0,:64], r2
816 vst1.64 {d22}, [r0,:64], r2
817 vst1.64 {d24}, [r0,:64], r2
818 vst1.64 {d26}, [r0,:64], r2
819 vst1.64 {d28}, [r0,:64], r2
@ 16-wide vertical lowpass with l2 averaging: four 8-wide l2 passes,
@ rewinding src (r1/r3), dst (r0) and the l2 pointer (ip/r2) between
@ quadrants.
@ NOTE(review): truncated — lr handling, column advances and the final
@ call/return are missing.
824 function put_h264_qpel16_v_lowpass_l2_neon
826 bl put_h264_qpel8_v_lowpass_l2_neon
827 sub r1, r1, r3, lsl #2
828 bl put_h264_qpel8_v_lowpass_l2_neon
829 sub r0, r0, r3, lsl #4
830 sub ip, ip, r2, lsl #4
833 sub r1, r1, r3, lsl #4
834 sub r1, r1, r3, lsl #2
836 bl put_h264_qpel8_v_lowpass_l2_neon
837 sub r1, r1, r3, lsl #2
@ 8-wide vertical lowpass averaged with the l2 reference at ip:
@ same transpose / lowpass_8 / transpose structure as the non-l2
@ routine, then vrhadd with 8 reference rows loaded from ip before the
@ stores. r1/r3 = src/stride(load), r0/r3 = dst, r2 = l2 stride.
@ NOTE(review): truncated — two vrhadd lines for q0/q1 and the
@ return/endfunc are missing from this excerpt.
841 function put_h264_qpel8_v_lowpass_l2_neon
842 vld1.64 {d8}, [r1], r3
843 vld1.64 {d10}, [r1], r3
844 vld1.64 {d12}, [r1], r3
845 vld1.64 {d14}, [r1], r3
846 vld1.64 {d22}, [r1], r3
847 vld1.64 {d24}, [r1], r3
848 vld1.64 {d26}, [r1], r3
849 vld1.64 {d28}, [r1], r3
850 vld1.64 {d9}, [r1], r3
851 vld1.64 {d11}, [r1], r3
852 vld1.64 {d13}, [r1], r3
853 vld1.64 {d15}, [r1], r3
856 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
857 lowpass_8 d8, d9, d10, d11, d8, d9
858 lowpass_8 d12, d13, d14, d15, d12, d13
859 lowpass_8 d22, d23, d24, d25, d22, d23
860 lowpass_8 d26, d27, d28, d29, d26, d27
861 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
@ Load l2 reference rows and blend with the filter output.
863 vld1.64 {d0}, [ip], r2
864 vld1.64 {d1}, [ip], r2
865 vld1.64 {d2}, [ip], r2
866 vld1.64 {d3}, [ip], r2
867 vld1.64 {d4}, [ip], r2
869 vld1.64 {d5}, [ip], r2
871 vld1.64 {d10}, [ip], r2
872 vrhadd.u8 q2, q2, q11
873 vld1.64 {d11}, [ip], r2
875 vst1.64 {d0}, [r0,:64], r3
876 vst1.64 {d1}, [r0,:64], r3
877 vrhadd.u8 q5, q5, q13
878 vst1.64 {d2}, [r0,:64], r3
879 vst1.64 {d3}, [r0,:64], r3
880 vst1.64 {d4}, [r0,:64], r3
881 vst1.64 {d5}, [r0,:64], r3
882 vst1.64 {d10}, [r0,:64], r3
883 vst1.64 {d11}, [r0,:64], r3
@ 2D (horizontal+vertical) lowpass core for qpel8: first pass runs the
@ horizontal 6-tap filter un-narrowed (16-bit intermediates) into a
@ scratch buffer at r4; second pass reloads, transposes the 16-bit data
@ (swap4 + transpose16_4x4), and applies lowpass_8.16 vertically.
@ Results end up in d8-d15 for the callers below.
@ NOTE(review): truncated — loop control for the first pass, several
@ buffer-pointer adjustments, and the return are missing; ip appears to
@ hold a (negative?) row stride for the scratch buffer — confirm.
888 function put_h264_qpel8_hv_lowpass_neon_top
891 1: vld1.64 {d0, d1}, [r1], r3
892 vld1.64 {d16,d17}, [r1], r3
894 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
895 vst1.64 {d22-d25}, [r4,:128]! @ spill 16-bit intermediates
898 vld1.64 {d0, d1}, [r1]
899 lowpass_8_1 d0, d1, q12, narrow=0 @ final (13th) row
@ Reload all 12 intermediate rows for the vertical pass.
903 vld1.64 {d30,d31}, [r4,:128], ip
904 vld1.64 {d20,d21}, [r4,:128], ip
905 vld1.64 {d18,d19}, [r4,:128], ip
906 vld1.64 {d16,d17}, [r4,:128], ip
907 vld1.64 {d14,d15}, [r4,:128], ip
908 vld1.64 {d12,d13}, [r4,:128], ip
909 vld1.64 {d10,d11}, [r4,:128], ip
910 vld1.64 {d8, d9}, [r4,:128], ip
911 vld1.64 {d6, d7}, [r4,:128], ip
912 vld1.64 {d4, d5}, [r4,:128], ip
913 vld1.64 {d2, d3}, [r4,:128], ip
914 vld1.64 {d0, d1}, [r4,:128]
916 swap4 d1, d3, d5, d7, d8, d10, d12, d14
917 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
919 swap4 d17, d19, d21, d31, d24, d26, d28, d22
920 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
@ Re-spill the transposed upper half; the lower half is filtered
@ directly from registers below.
922 vst1.64 {d30,d31}, [r4,:128]!
923 vst1.64 {d6, d7}, [r4,:128]!
924 vst1.64 {d20,d21}, [r4,:128]!
925 vst1.64 {d4, d5}, [r4,:128]!
926 vst1.64 {d18,d19}, [r4,:128]!
927 vst1.64 {d2, d3}, [r4,:128]!
928 vst1.64 {d16,d17}, [r4,:128]!
929 vst1.64 {d0, d1}, [r4,:128]
931 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
932 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
933 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
934 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
936 vld1.64 {d16,d17}, [r4,:128], ip
937 vld1.64 {d30,d31}, [r4,:128], ip
938 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
939 vld1.64 {d16,d17}, [r4,:128], ip
940 vld1.64 {d30,d31}, [r4,:128], ip
941 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
942 vld1.64 {d16,d17}, [r4,:128], ip
943 vld1.64 {d30,d31}, [r4,:128], ip
944 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
945 vld1.64 {d16,d17}, [r4,:128], ip
946 vld1.64 {d30,d31}, [r4,:128]
947 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
949 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
@ qpel8 2D lowpass, plain store: runs the _top core, then writes the
@ 8 result rows from d12..d15, d8..d11 to r0 with stride r2.
@ NOTE(review): truncated — lr save/restore and endfunc are missing.
954 function put_h264_qpel8_hv_lowpass_neon
956 bl put_h264_qpel8_hv_lowpass_neon_top
957 vst1.64 {d12}, [r0,:64], r2
958 vst1.64 {d13}, [r0,:64], r2
959 vst1.64 {d14}, [r0,:64], r2
960 vst1.64 {d15}, [r0,:64], r2
961 vst1.64 {d8}, [r0,:64], r2
962 vst1.64 {d9}, [r0,:64], r2
963 vst1.64 {d10}, [r0,:64], r2
964 vst1.64 {d11}, [r0,:64], r2
@ qpel8 2D lowpass averaged with an l2 reference: runs the _top core,
@ loads 8 reference rows from r2 (contiguous, :128-aligned), and stores
@ the blended rows to r0 with stride r3.
@ NOTE(review): truncated — the vrhadd averaging instructions between
@ the loads and stores, lr handling, and endfunc are missing.
970 function put_h264_qpel8_hv_lowpass_l2_neon
972 bl put_h264_qpel8_hv_lowpass_neon_top
974 vld1.64 {d0, d1}, [r2,:128]!
975 vld1.64 {d2, d3}, [r2,:128]!
977 vld1.64 {d4, d5}, [r2,:128]!
979 vld1.64 {d6, d7}, [r2,:128]!
982 vst1.64 {d0}, [r0,:64], r3
984 vst1.64 {d1}, [r0,:64], r3
985 vst1.64 {d2}, [r0,:64], r3
986 vst1.64 {d3}, [r0,:64], r3
987 vst1.64 {d4}, [r0,:64], r3
988 vst1.64 {d5}, [r0,:64], r3
989 vst1.64 {d6}, [r0,:64], r3
990 vst1.64 {d7}, [r0,:64], r3
@ 16x16 2D lowpass as four 8x8 quadrants; src rewound 4 rows between
@ vertical neighbours and 20 rows between column halves, dst rewound 16
@ rows. Last quadrant via tail call.
@ NOTE(review): truncated — lr handling and column advances are missing.
996 function put_h264_qpel16_hv_lowpass_neon
998 bl put_h264_qpel8_hv_lowpass_neon
999 sub r1, r1, r3, lsl #2
1000 bl put_h264_qpel8_hv_lowpass_neon
1001 sub r1, r1, r3, lsl #4
1002 sub r1, r1, r3, lsl #2
1004 sub r0, r0, r2, lsl #4
1006 bl put_h264_qpel8_hv_lowpass_neon
1007 sub r1, r1, r3, lsl #2
1009 b put_h264_qpel8_hv_lowpass_neon
@ 16x16 2D lowpass with l2 averaging — same quadrant structure as the
@ non-l2 routine above, calling the l2 variant per quadrant.
@ NOTE(review): truncated — lr handling and pointer advances are missing.
1012 function put_h264_qpel16_hv_lowpass_l2_neon
1015 bl put_h264_qpel8_hv_lowpass_l2_neon
1016 sub r1, r1, r3, lsl #2
1017 bl put_h264_qpel8_hv_lowpass_l2_neon
1018 sub r1, r1, r3, lsl #4
1019 sub r1, r1, r3, lsl #2
1021 sub r0, r0, r3, lsl #4
1023 bl put_h264_qpel8_hv_lowpass_l2_neon
1024 sub r1, r1, r3, lsl #2
1026 b put_h264_qpel8_hv_lowpass_l2_neon
@ ---------------------------------------------------------------------
@ Exported qpel8 quarter-pel MC entry points (mcXY = quarter-pel offset
@ X horizontal, Y vertical). Each sets up the register/stack arguments
@ expected by the lowpass workers above and branches to them; symmetric
@ positions (mc13/mc31/mc33 etc.) reuse the shared local entry labels
@ (put_h264_qpel8_mc01/mc11/mc12/mc21) with a different src offset.
@ NOTE(review): heavily truncated — most argument setup (mov/add of
@ src/dst/stride, scratch-buffer address computation) and all endfunc
@ lines are missing from this excerpt. Stack reservations of
@ (8*8+16*12) bytes are visible for the hv variants: presumably an
@ 8x8 byte buffer plus 12 rows of 16-bit intermediates — confirm.
@ ---------------------------------------------------------------------
1029 function ff_put_h264_qpel8_mc10_neon, export=1
1034 b put_h264_qpel8_h_lowpass_l2_neon
1037 function ff_put_h264_qpel8_mc20_neon, export=1
1042 b put_h264_qpel8_h_lowpass_neon
1045 function ff_put_h264_qpel8_mc30_neon, export=1
1050 b put_h264_qpel8_h_lowpass_l2_neon
1053 function ff_put_h264_qpel8_mc01_neon, export=1
1056 put_h264_qpel8_mc01: @ shared with mc03
1059 sub r1, r1, r2, lsl #1 @ back up 2 rows for vertical context
1061 bl put_h264_qpel8_v_lowpass_l2_neon
1066 function ff_put_h264_qpel8_mc11_neon, export=1
1067 push {r0, r1, r2, lr}
1068 put_h264_qpel8_mc11: @ shared with mc31/mc13/mc33
1076 bl put_h264_qpel8_h_lowpass_neon
1080 sub r1, r1, r2, lsl #1
1082 bl put_h264_qpel8_v_lowpass_l2_neon
1088 function ff_put_h264_qpel8_mc21_neon, export=1
1089 push {r0, r1, r4, r10, r11, lr}
1090 put_h264_qpel8_mc21: @ shared with mc23
1094 sub sp, sp, #(8*8+16*12) @ scratch: 8x8 bytes + 12x16 shorts
1100 bl put_h264_qpel8_h_lowpass_neon
1103 sub r1, r1, r2, lsl #1
1107 bl put_h264_qpel8_hv_lowpass_l2_neon
1110 pop {r4, r10, r11, pc}
1113 function ff_put_h264_qpel8_mc31_neon, export=1
1115 push {r0, r1, r2, lr}
1117 b put_h264_qpel8_mc11
1120 function ff_put_h264_qpel8_mc02_neon, export=1
1123 sub r1, r1, r2, lsl #1
1126 bl put_h264_qpel8_v_lowpass_neon
1131 function ff_put_h264_qpel8_mc12_neon, export=1
1132 push {r0, r1, r4, r10, r11, lr}
1133 put_h264_qpel8_mc12: @ shared with mc32
1137 sub sp, sp, #(8*8+16*12)
1138 sub r1, r1, r2, lsl #1
1143 bl put_h264_qpel8_v_lowpass_neon
1146 sub r1, r1, r3, lsl #1
1149 bl put_h264_qpel8_hv_lowpass_l2_neon
1152 pop {r4, r10, r11, pc}
1155 function ff_put_h264_qpel8_mc22_neon, export=1
1156 push {r4, r10, r11, lr}
1159 sub r1, r1, r2, lsl #1
1162 sub sp, sp, #(16*12) @ only the 16-bit intermediate buffer
1165 bl put_h264_qpel8_hv_lowpass_neon
1168 pop {r4, r10, r11, pc}
1171 function ff_put_h264_qpel8_mc32_neon, export=1
1172 push {r0, r1, r4, r10, r11, lr}
1174 b put_h264_qpel8_mc12
1177 function ff_put_h264_qpel8_mc03_neon, export=1
1180 b put_h264_qpel8_mc01
1183 function ff_put_h264_qpel8_mc13_neon, export=1
1184 push {r0, r1, r2, lr}
1186 b put_h264_qpel8_mc11
1189 function ff_put_h264_qpel8_mc23_neon, export=1
1190 push {r0, r1, r4, r10, r11, lr}
1192 b put_h264_qpel8_mc21
1195 function ff_put_h264_qpel8_mc33_neon, export=1
1197 push {r0, r1, r2, lr}
1200 b put_h264_qpel8_mc11
@ ---------------------------------------------------------------------
@ Exported qpel16 quarter-pel MC entry points — same dispatch pattern as
@ the qpel8 set above, but calling the 16-wide lowpass workers and with
@ larger scratch buffers (16*16 bytes + 16*12 shorts for hv variants).
@ NOTE(review): heavily truncated — argument setup and endfunc lines are
@ missing from this excerpt; only the structural skeleton is visible.
@ ---------------------------------------------------------------------
1203 function ff_put_h264_qpel16_mc10_neon, export=1
1207 b put_h264_qpel16_h_lowpass_l2_neon
1210 function ff_put_h264_qpel16_mc20_neon, export=1
1214 b put_h264_qpel16_h_lowpass_neon
1217 function ff_put_h264_qpel16_mc30_neon, export=1
1221 b put_h264_qpel16_h_lowpass_l2_neon
1224 function ff_put_h264_qpel16_mc01_neon, export=1
1227 put_h264_qpel16_mc01: @ shared with mc03
1230 sub r1, r1, r2, lsl #1
1232 bl put_h264_qpel16_v_lowpass_l2_neon
1237 function ff_put_h264_qpel16_mc11_neon, export=1
1238 push {r0, r1, r4, lr}
1239 put_h264_qpel16_mc11: @ shared with mc31/mc13/mc33
1246 bl put_h264_qpel16_h_lowpass_neon
1251 sub r1, r1, r2, lsl #1
1253 bl put_h264_qpel16_v_lowpass_l2_neon
1255 add sp, sp, #(256+8) @ free 16x16 temp (+ alignment pad?)
1259 function ff_put_h264_qpel16_mc21_neon, export=1
1260 push {r0, r1, r4-r5, r9-r11, lr}
1261 put_h264_qpel16_mc21: @ shared with mc23
1265 sub sp, sp, #(16*16+16*12)
1269 bl put_h264_qpel16_h_lowpass_neon_packed
1272 sub r1, r1, r2, lsl #1
1275 bl put_h264_qpel16_hv_lowpass_l2_neon
1278 pop {r4-r5, r9-r11, pc}
1281 function ff_put_h264_qpel16_mc31_neon, export=1
1283 push {r0, r1, r4, lr}
1285 b put_h264_qpel16_mc11
1288 function ff_put_h264_qpel16_mc02_neon, export=1
1291 sub r1, r1, r2, lsl #1
1294 bl put_h264_qpel16_v_lowpass_neon
1299 function ff_put_h264_qpel16_mc12_neon, export=1
1300 push {r0, r1, r4-r5, r9-r11, lr}
1301 put_h264_qpel16_mc12: @ shared with mc32
1305 sub sp, sp, #(16*16+16*12)
1306 sub r1, r1, r2, lsl #1
1310 bl put_h264_qpel16_v_lowpass_neon_packed
1313 sub r1, r1, r3, lsl #1
1316 bl put_h264_qpel16_hv_lowpass_l2_neon
1319 pop {r4-r5, r9-r11, pc}
1322 function ff_put_h264_qpel16_mc22_neon, export=1
1323 push {r4, r9-r11, lr}
1327 sub r1, r1, r2, lsl #1
1330 sub sp, sp, #(16*12)
1333 bl put_h264_qpel16_hv_lowpass_neon
1336 pop {r4, r9-r11, pc}
1339 function ff_put_h264_qpel16_mc32_neon, export=1
1340 push {r0, r1, r4-r5, r9-r11, lr}
1342 b put_h264_qpel16_mc12
1345 function ff_put_h264_qpel16_mc03_neon, export=1
1348 b put_h264_qpel16_mc01
1351 function ff_put_h264_qpel16_mc13_neon, export=1
1352 push {r0, r1, r4, lr}
1354 b put_h264_qpel16_mc11
1357 function ff_put_h264_qpel16_mc23_neon, export=1
1358 push {r0, r1, r4-r5, r9-r11, lr}
1360 b put_h264_qpel16_mc21
1363 function ff_put_h264_qpel16_mc33_neon, export=1
1365 push {r0, r1, r4, lr}
1368 b put_h264_qpel16_mc11
1371 @ Biweighted prediction
@ biweight_16/8/4: one iteration of biweighted prediction at each block
@ width. The \macs/\macd parameters are multiply-accumulate mnemonics
@ (vmlal.u8 / vmlsl.u8 per the instantiations in biweight_func), chosen
@ by the signs of the two weights. Sources: r0 and r1, dst r6, stride r2,
@ shift/offset in q9 (per the vshl.s16 by q9 below).
@ NOTE(review): heavily truncated — the weight multiplies, offset setup,
@ loop control and .endm lines are missing from this excerpt.
1373 .macro biweight_16 macs, macd
1379 vld1.8 {d20-d21},[r0,:128], r2
1383 vld1.8 {d22-d23},[r1,:128], r2
1388 vld1.8 {d28-d29},[r0,:128], r2
1393 vld1.8 {d30-d31},[r1,:128], r2
1401 vshl.s16 q12, q12, q9 @ >> log2_denom (negative shift count)
1402 vshl.s16 q13, q13, q9
1403 vqmovun.s16 d24, q12 @ saturate to u8
1404 vqmovun.s16 d25, q13
1406 vst1.8 {d4- d5}, [r6,:128], r2
1408 vst1.8 {d24-d25},[r6,:128], r2
1413 .macro biweight_8 macs, macd
1419 vld1.8 {d4},[r0,:64], r2
1422 vld1.8 {d5},[r1,:64], r2
1425 vld1.8 {d6},[r0,:64], r2
1428 vld1.8 {d7},[r1,:64], r2
1433 vshl.s16 q10, q10, q9
1436 vst1.8 {d2},[r6,:64], r2
1438 vst1.8 {d4},[r6,:64], r2
1443 .macro biweight_4 macs, macd
1449 vld1.32 {d4[0]},[r0,:32], r2
1450 vld1.32 {d4[1]},[r0,:32], r2
1453 vld1.32 {d5[0]},[r1,:32], r2
1454 vld1.32 {d5[1]},[r1,:32], r2
1458 vld1.32 {d6[0]},[r0,:32], r2
1459 vld1.32 {d6[1]},[r0,:32], r2
1462 vld1.32 {d7[0]},[r1,:32], r2
1463 vld1.32 {d7[1]},[r1,:32], r2
1468 vshl.s16 q10, q10, q9
1471 vst1.32 {d2[0]},[r6,:32], r2
1472 vst1.32 {d2[1]},[r6,:32], r2
1474 vst1.32 {d4[0]},[r6,:32], r2
1475 vst1.32 {d4[1]},[r6,:32], r2
1478 2: vshl.s16 q1, q1, q9 @ tail path for odd leftover rows (presumably)
1480 vst1.32 {d2[0]},[r6,:32], r2
1481 vst1.32 {d2[1]},[r6,:32], r2
@ biweight_func: dispatches on the sign combination of the two weights
@ (eors of the packed weights selects one of four mla/mls variants).
1485 .macro biweight_func w
1486 function biweight_h264_pixels_\w\()_neon
1492 eors lr, lr, r5, lsr #30 @ sign bits of weights pick the variant
1505 10: biweight_\w vmlal.u8, vmlal.u8 @ both weights positive
1507 biweight_\w vmlal.u8, vmlsl.u8
1510 biweight_\w vmlsl.u8, vmlsl.u8 @ both negative
1512 biweight_\w vmlsl.u8, vmlal.u8
@ biweight_entry: exported WxH wrappers; b=0 variants fall through into
@ the shared width routine instead of branching (presumably — the \b
@ conditional lines are missing from this excerpt).
1516 .macro biweight_entry w, h, b=1
1517 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1520 b biweight_h264_pixels_\w\()_neon
1525 biweight_entry 16, 8
1526 biweight_entry 16, 16, b=0
1529 biweight_entry 8, 16
1531 biweight_entry 8, 8, b=0
1536 biweight_entry 4, 4, b=0
1539 @ Weighted prediction
@ weight_16/8/4: one iteration of (single-reference) weighted prediction
@ at each block width: load from r0, widening multiply by the weight in
@ d0, rounding-shift by q9 (vrshl — shift count presumably negative
@ log2_denom), saturate, store via r4 with stride r1. The \add parameter
@ is not used in any line visible here — likely the offset-add mnemonic;
@ confirm against the complete file.
@ NOTE(review): heavily truncated — offset handling, loop control and
@ .endm lines are missing from this excerpt.
1541 .macro weight_16 add
1544 vld1.8 {d20-d21},[r0,:128], r1
1545 vmull.u8 q2, d0, d20 @ widen * weight
1547 vmull.u8 q3, d0, d21
1548 vld1.8 {d28-d29},[r0,:128], r1
1549 vmull.u8 q12, d0, d28
1551 vmull.u8 q13, d0, d29
1553 vrshl.s16 q2, q2, q9 @ rounded shift by -log2_denom (presumably)
1555 vrshl.s16 q3, q3, q9
1559 vrshl.s16 q12, q12, q9
1561 vrshl.s16 q13, q13, q9
1562 vqmovun.s16 d24, q12 @ saturate to u8
1563 vqmovun.s16 d25, q13
1564 vst1.8 {d4- d5}, [r4,:128], r1
1565 vst1.8 {d24-d25},[r4,:128], r1
@ 8-wide variant (weight_8 — macro header missing from excerpt).
1573 vld1.8 {d4},[r0,:64], r1
1576 vld1.8 {d6},[r0,:64], r1
1577 vmull.u8 q10, d0, d6
1580 vrshl.s16 q1, q1, q9
1583 vrshl.s16 q10, q10, q9
1585 vst1.8 {d2},[r4,:64], r1
1586 vst1.8 {d4},[r4,:64], r1
@ 4-wide variant (weight_4 — macro header missing from excerpt);
@ operates on 32-bit lanes, with a tail path for leftover rows.
1596 vld1.32 {d4[0]},[r0,:32], r1
1597 vld1.32 {d4[1]},[r0,:32], r1
1601 vld1.32 {d6[0]},[r0,:32], r1
1602 vld1.32 {d6[1]},[r0,:32], r1
1603 vmull.u8 q10, d0, d6
1606 vrshl.s16 q1, q1, q9
1609 vrshl.s16 q10, q10, q9
1612 vst1.32 {d2[0]},[r4,:32], r1
1613 vst1.32 {d2[1]},[r4,:32], r1
1615 vst1.32 {d4[0]},[r4,:32], r1
1616 vst1.32 {d4[1]},[r4,:32], r1
1620 vrshl.s16 q1, q1, q9 @ tail rows
1622 vst1.32 {d2[0]},[r4,:32], r1
1623 vst1.32 {d2[1]},[r4,:32], r1
@ weight_func / weight_entry: same dispatch structure as the biweight
@ equivalents above (bodies almost entirely missing from this excerpt).
1627 .macro weight_func w
1628 function weight_h264_pixels_\w\()_neon
1653 .macro weight_entry w, h, b=1
1654 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1657 b weight_h264_pixels_\w\()_neon
1663 weight_entry 16, 16, b=0
1668 weight_entry 8, 8, b=0
1673 weight_entry 4, 4, b=0