2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 /* H.264 loop filter */
@ Prologue macro invoked first by the four loop-filter entry points visible
@ below.
@ NOTE(review): only two of its instructions are visible in this listing; the
@ code that loads r12 and the code that acts on the flags are not shown.
26 .macro h264_loop_filter_start
@ r12 &= r12 << 16 (condition flags are left untouched)
33 and r12, r12, r12, lsl #16
@ r12 &= r12 << 8, this time updating the condition flags from the result
36 ands r12, r12, r12, lsl #8
@ Luma edge filter core operating on 16 pixels at once (q registers).
@ Register roles, as given by the inline comments: q8 = p0, q9 = p1, q10 = p2,
@ q0 = q0, q1 = q1, q2 = q2; r2 = alpha.
@ NOTE(review): many instructions of this macro are not visible in this
@ listing, so the comments below describe only the visible steps.
41 .macro h264_loop_filter_luma
42 vdup.8 q11, r2 @ alpha
@ Per-byte absolute differences across and on either side of the edge.
44 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
46 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
48 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
@ Each compare yields an all-ones byte where the condition holds.
50 vclt.u8 q6, q6, q11 @ < alpha
@ NOTE(review): the compares below expect q11 to hold beta; the reload of q11
@ is not visible in this listing.
53 vclt.u8 q14, q14, q11 @ < beta
54 vclt.u8 q15, q15, q11 @ < beta
56 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
58 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
59 vclt.u8 q4, q4, q11 @ < beta
61 vclt.u8 q5, q5, q11 @ < beta
@ Halving add: q10 = (q10 + q14) >> 1 per byte.
68 vhadd.u8 q10, q10, q14
@ Widening arithmetic on 16-bit lanes: subtract, then add, zero-extended bytes.
81 vsubw.u8 q10, q10, d17
85 vaddw.u8 q10, q10, d19
@ Rounding shift right by 3 and narrow back to 8-bit lanes.
89 vrshrn.i16 d5, q10, #3
@ Subtract the sign-extended bytes of d4/d5 from the 16-bit lanes of q11/q12.
101 vsubw.s8 q11, q11, d4
102 vsubw.s8 q12, q12, d5
@ Exported entry point: luma loop filter across a horizontal edge.
@ Visible register use: r0 = pixel pointer, r1 = row stride; r2 is read by
@ h264_loop_filter_luma as alpha.
@ NOTE(review): some lines of this function are not visible in this listing.
109 function ff_h264_v_loop_filter_luma_neon, export=1
110 h264_loop_filter_start
@ Load three 16-byte rows starting at r0 into q0, q1, q2; the :128 qualifier
@ asserts a 128-bit aligned address.
112 vld1.8 {d0, d1}, [r0,:128], r1
113 vld1.8 {d2, d3}, [r0,:128], r1
114 vld1.8 {d4, d5}, [r0,:128], r1
@ Step back 4 + 2 = 6 rows, i.e. to three rows above the starting row.
115 sub r0, r0, r1, lsl #2
116 sub r0, r0, r1, lsl #1
@ Load the three rows above the edge into q10, q9, q8 (p2, p1, p0 in the macro).
117 vld1.8 {d20,d21}, [r0,:128], r1
118 vld1.8 {d18,d19}, [r0,:128], r1
119 vld1.8 {d16,d17}, [r0,:128], r1
123 h264_loop_filter_luma
@ Step back two rows and write four consecutive rows from q4, q8, q0, q5.
125 sub r0, r0, r1, lsl #1
126 vst1.8 {d8, d9}, [r0,:128], r1
127 vst1.8 {d16,d17}, [r0,:128], r1
128 vst1.8 {d0, d1}, [r0,:128], r1
129 vst1.8 {d10,d11}, [r0,:128]
@ Exported entry point: luma loop filter across a vertical edge.
@ Sixteen rows of 8 bytes are loaded, transposed so that the shared
@ h264_loop_filter_luma macro can be used, then results are written back as
@ 4 bytes per row.
@ Visible register use: r0 = pixel pointer, r1 = row stride.
@ NOTE(review): some lines are not visible in this listing, and transpose_8x8
@ and transpose_4x4 are macros defined outside it.
135 function ff_h264_h_loop_filter_luma_neon, export=1
136 h264_loop_filter_start
@ Rows 0-7 fill the low halves of q3, q10, q9, q8, q0, q1, q2, q13.
139 vld1.8 {d6}, [r0], r1
140 vld1.8 {d20}, [r0], r1
141 vld1.8 {d18}, [r0], r1
142 vld1.8 {d16}, [r0], r1
143 vld1.8 {d0}, [r0], r1
144 vld1.8 {d2}, [r0], r1
145 vld1.8 {d4}, [r0], r1
146 vld1.8 {d26}, [r0], r1
@ Rows 8-15 fill the high halves of the same q registers.
147 vld1.8 {d7}, [r0], r1
148 vld1.8 {d21}, [r0], r1
149 vld1.8 {d19}, [r0], r1
150 vld1.8 {d17}, [r0], r1
151 vld1.8 {d1}, [r0], r1
152 vld1.8 {d3}, [r0], r1
153 vld1.8 {d5}, [r0], r1
154 vld1.8 {d27}, [r0], r1
156 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
160 h264_loop_filter_luma
@ Transpose the four result registers back before storing.
162 transpose_4x4 q4, q8, q0, q5
@ Rewind the pointer by the 16 rows consumed by the loads above.
164 sub r0, r0, r1, lsl #4
@ Store one 32-bit lane (4 bytes) per row, 16 rows in total.
166 vst1.32 {d8[0]}, [r0], r1
167 vst1.32 {d16[0]}, [r0], r1
168 vst1.32 {d0[0]}, [r0], r1
169 vst1.32 {d10[0]}, [r0], r1
170 vst1.32 {d8[1]}, [r0], r1
171 vst1.32 {d16[1]}, [r0], r1
172 vst1.32 {d0[1]}, [r0], r1
173 vst1.32 {d10[1]}, [r0], r1
174 vst1.32 {d9[0]}, [r0], r1
175 vst1.32 {d17[0]}, [r0], r1
176 vst1.32 {d1[0]}, [r0], r1
177 vst1.32 {d11[0]}, [r0], r1
178 vst1.32 {d9[1]}, [r0], r1
179 vst1.32 {d17[1]}, [r0], r1
180 vst1.32 {d1[1]}, [r0], r1
181 vst1.32 {d11[1]}, [r0], r1
@ Chroma edge filter core operating on 8 pixels at once (d registers).
@ Register roles, as given by the inline comments: d16 = p0, d18 = p1,
@ d0 = q0, d2 = q1; r2 = alpha, r3 = beta.
@ NOTE(review): several instructions of this macro are not visible in this
@ listing, so the comments describe only the visible steps.
187 .macro h264_loop_filter_chroma
188 vdup.8 d22, r2 @ alpha
190 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
192 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
196 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
198 vclt.u8 d26, d26, d22 @ < alpha
@ d22 is reused: alpha is no longer needed once the compare above is done.
200 vdup.8 d22, r3 @ beta
@ Rounding shift right by 3 of the 16-bit lanes in q2, narrowed into d4.
201 vrshrn.i16 d4, q2, #3
202 vclt.u8 d28, d28, d22 @ < beta
203 vclt.u8 d30, d30, d22 @ < beta
@ Add the sign-extended bytes of d4 to q14 and subtract them from q11.
212 vaddw.s8 q14, q14, d4
213 vsubw.s8 q11, q11, d4
@ Exported entry point: chroma loop filter across a horizontal edge.
@ Visible register use: r0 = pixel pointer, r1 = row stride; r2 and r3 are
@ read by h264_loop_filter_chroma as alpha and beta.
@ NOTE(review): some lines of this function are not visible in this listing.
218 function ff_h264_v_loop_filter_chroma_neon, export=1
219 h264_loop_filter_start
@ Start two rows above r0 and load four 8-byte rows into d18, d16, d0, d2
@ (p1, p0, q0, q1 in the macro). The last load does not advance r0.
221 sub r0, r0, r1, lsl #1
222 vld1.8 {d18}, [r0,:64], r1
223 vld1.8 {d16}, [r0,:64], r1
224 vld1.8 {d0}, [r0,:64], r1
225 vld1.8 {d2}, [r0,:64]
227 h264_loop_filter_chroma
@ Step back two rows and write back d16 and d0 to consecutive rows.
229 sub r0, r0, r1, lsl #1
230 vst1.8 {d16}, [r0,:64], r1
231 vst1.8 {d0}, [r0,:64], r1
@ Exported entry point: chroma loop filter across a vertical edge.
@ Eight rows of 4 bytes are gathered into 32-bit lanes, filtered with the
@ shared chroma macro, and scattered back.
@ Visible register use: r0 = pixel pointer, r1 = row stride.
@ NOTE(review): the instructions between the loads, the macro call and the
@ stores are not visible in this listing.
236 function ff_h264_h_loop_filter_chroma_neon, export=1
237 h264_loop_filter_start
@ Rows 0-3 go to lane 0 of d18, d16, d0, d2; rows 4-7 go to lane 1.
240 vld1.32 {d18[0]}, [r0], r1
241 vld1.32 {d16[0]}, [r0], r1
242 vld1.32 {d0[0]}, [r0], r1
243 vld1.32 {d2[0]}, [r0], r1
244 vld1.32 {d18[1]}, [r0], r1
245 vld1.32 {d16[1]}, [r0], r1
246 vld1.32 {d0[1]}, [r0], r1
247 vld1.32 {d2[1]}, [r0], r1
254 h264_loop_filter_chroma
@ Rewind the pointer by the 8 rows consumed above, then store in load order.
261 sub r0, r0, r1, lsl #3
262 vst1.32 {d18[0]}, [r0], r1
263 vst1.32 {d16[0]}, [r0], r1
264 vst1.32 {d0[0]}, [r0], r1
265 vst1.32 {d2[0]}, [r0], r1
266 vst1.32 {d18[1]}, [r0], r1
267 vst1.32 {d16[1]}, [r0], r1
268 vst1.32 {d0[1]}, [r0], r1
269 vst1.32 {d2[1]}, [r0], r1
@ Macro taking one register argument r.
@ NOTE(review): its body is not visible in this listing. lowpass_8 and
@ lowpass_8_1 multiply by d6[0] and d6[1]; presumably this macro loads those
@ filter constants through the given scratch register - TODO confirm.
276 .macro lowpass_const r
@ Symmetric 6-tap horizontal filter applied to two rows at once.
@ Arguments: r0:r1 = first input row (two d registers forming 16 bytes),
@ r2:r3 = second input row, d0/d1 = destinations, narrow = selects the output
@ form (default 1).
@ Visible computation per row: t = x[0] + x[5] + (x[2] + x[3]) * d6[1]
@ - (x[1] + x[4]) * d6[0], followed by a saturating rounding shift by 5.
@ NOTE(review): t0/t1 are names set up in lines not visible here, as are the
@ handling of the narrow argument and the first-row pair sums into q1/q2
@ (the matching second-row sums into q9/q10 are visible).
282 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
@ First row: byte-shifted copies at offsets 2, 3, 1, 4 and 5.
290 vext.8 d2, \r0, \r1, #2
291 vext.8 d3, \r0, \r1, #3
293 vext.8 d4, \r0, \r1, #1
294 vext.8 d5, \r0, \r1, #4
296 vext.8 d30, \r0, \r1, #5
@ Outer taps, widened to 16 bits: t0 = x[0] + x[5].
297 vaddl.u8 t0, \r0, d30
@ The second-row shuffles are interleaved with the first-row arithmetic.
298 vext.8 d18, \r2, \r3, #2
299 vmla.i16 t0, q1, d6[1]
300 vext.8 d19, \r2, \r3, #3
301 vaddl.u8 q9, d18, d19
302 vext.8 d20, \r2, \r3, #1
303 vmls.i16 t0, q2, d6[0]
304 vext.8 d21, \r2, \r3, #4
305 vaddl.u8 q10, d20, d21
306 vext.8 d31, \r2, \r3, #5
307 vaddl.u8 t1, \r2, d31
308 vmla.i16 t1, q9, d6[1]
309 vmls.i16 t1, q10, d6[0]
@ Saturating rounding shift right by 5, narrowed to unsigned 8-bit.
311 vqrshrun.s16 \d0, t0, #5
312 vqrshrun.s16 \d1, t1, #5
@ Single-row variant of lowpass_8.
@ Arguments: r0:r1 = input row (two d registers forming 16 bytes),
@ d0 = destination, narrow = selects the output form (default 1).
@ NOTE(review): t0 is a name set up in lines not visible here, as are the
@ pair sums expected in q1 and q2 and the handling of the narrow argument.
318 .macro lowpass_8_1 r0, r1, d0, narrow=1
@ Byte-shifted copies of the row at offsets 2, 3, 1, 4 and 5.
324 vext.8 d2, \r0, \r1, #2
325 vext.8 d3, \r0, \r1, #3
327 vext.8 d4, \r0, \r1, #1
328 vext.8 d5, \r0, \r1, #4
330 vext.8 d30, \r0, \r1, #5
@ t0 = x[0] + x[5], then += q1 * d6[1] and -= q2 * d6[0].
331 vaddl.u8 t0, \r0, d30
332 vmla.i16 t0, q1, d6[1]
333 vmls.i16 t0, q2, d6[0]
@ Saturating rounding shift right by 5, narrowed to unsigned 8-bit.
335 vqrshrun.s16 \d0, t0, #5
@ 6-tap filter over 16-bit intermediate values, accumulating in 32 bits.
@ Arguments: r0:r1 = input as two q registers of 16-bit lanes; l0/h0 and
@ l1/h1 = d-register halves of r0 and r1 (as passed by the callers in this
@ file); d = destination argument.
@ NOTE(review): a large part of this macro, including every use of the d
@ argument, is not visible in this listing.
340 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
@ Copies of the input shifted by 2, 3, 1 and 4 sixteen-bit elements.
341 vext.16 q1, \r0, \r1, #2
342 vext.16 q0, \r0, \r1, #3
344 vext.16 q2, \r0, \r1, #1
346 vext.16 q3, \r0, \r1, #4
@ Widening sum of the low halves of the offset-1 and offset-4 copies.
347 vaddl.s16 q10, d4, d6
@ The second input register is overwritten with the offset-5 copy.
348 vext.16 \r1, \r0, \r1, #5
350 vaddl.s16 q0, \h0, \h1
351 vaddl.s16 q8, \l0, \l1
@ q10 = q10 * 5, computed as (q10 << 2) + q10.
355 vshl.i32 q15, q10, #2
357 vadd.i32 q10, q10, q15
@ Rounding shift right by 10, narrowing the 32-bit sums to 16 bits in q9.
371 vrshrn.s32 d18, q9, #10
372 vrshrn.s32 d19, q1, #10
@ 16-wide horizontal lowpass built from two calls to the 8-wide put routine.
@ NOTE(review): the register setup around the calls is not visible in this
@ listing; only the call sequence and the source rewind are shown.
377 function put_h264_qpel16_h_lowpass_neon_packed
381 bl put_h264_qpel8_h_lowpass_neon
@ Rewind the source pointer r1 by 16 rows of stride r2.
382 sub r1, r1, r2, lsl #4
@ Tail call: the 8-wide routine returns directly to the caller of this one.
386 b put_h264_qpel8_h_lowpass_neon
@ Generates the 16-wide and 8-wide horizontal lowpass routines for a given
@ type; it is instantiated below for put and avg.
@ Visible register use: r0 = destination, r3 = destination stride,
@ r1 = source, r2 = source stride.
@ NOTE(review): several lines, including any type-specific guards and the
@ loop control, are not visible in this listing.
389 .macro h264_qpel_h_lowpass type
390 function \type\()_h264_qpel16_h_lowpass_neon
393 bl \type\()_h264_qpel8_h_lowpass_neon
@ Rewind both pointers by 16 rows of their respective strides.
394 sub r0, r0, r3, lsl #4
395 sub r1, r1, r2, lsl #4
402 function \type\()_h264_qpel8_h_lowpass_neon
@ Label 1 marks the top of the row loop; two source rows of 16 bytes are
@ loaded per pass.
403 1: vld1.8 {d0, d1}, [r1], r2
404 vld1.8 {d16,d17}, [r1], r2
@ Filter both rows; results land in d0 and d16.
406 lowpass_8 d0, d1, d16, d17, d0, d16
@ Existing destination rows are read and rounding-averaged with the result.
@ NOTE(review): presumably only the avg instantiation does this; the guard is
@ not visible here.
408 vld1.8 {d2}, [r0,:64], r3
410 vld1.8 {d3}, [r0,:64]
411 vrhadd.u8 d16, d16, d3
@ Write the two output rows.
414 vst1.8 {d0}, [r0,:64], r3
415 vst1.8 {d16}, [r0,:64], r3
421 h264_qpel_h_lowpass put
422 h264_qpel_h_lowpass avg
@ Generates the 16-wide and 8-wide horizontal lowpass routines that also
@ average the filtered result with a second source; instantiated below for
@ put and avg.
@ Visible register use: r0 = destination, r1 = source, r3 = second source,
@ r2 = stride used for all three pointers.
@ NOTE(review): several lines, including any type-specific guards and the
@ loop control, are not visible in this listing.
424 .macro h264_qpel_h_lowpass_l2 type
425 function \type\()_h264_qpel16_h_lowpass_l2_neon
428 bl \type\()_h264_qpel8_h_lowpass_l2_neon
@ Rewind all three pointers by 16 rows.
429 sub r0, r0, r2, lsl #4
430 sub r1, r1, r2, lsl #4
431 sub r3, r3, r2, lsl #4
439 function \type\()_h264_qpel8_h_lowpass_l2_neon
@ Label 1 marks the top of the row loop; load two source rows of 16 bytes
@ and two 8-byte rows of the second source (into q14).
440 1: vld1.8 {d0, d1}, [r1], r2
441 vld1.8 {d16,d17}, [r1], r2
442 vld1.8 {d28}, [r3], r2
443 vld1.8 {d29}, [r3], r2
@ Filter both rows into d0/d1, then rounding-average with the second source.
445 lowpass_8 d0, d1, d16, d17, d0, d1
446 vrhadd.u8 q0, q0, q14
@ Existing destination rows are read here.
@ NOTE(review): presumably for the avg instantiation only; the guard and the
@ averaging instructions are not visible.
448 vld1.8 {d2}, [r0,:64], r2
450 vld1.8 {d3}, [r0,:64]
@ Write the two output rows.
454 vst1.8 {d0}, [r0,:64], r2
455 vst1.8 {d1}, [r0,:64], r2
461 h264_qpel_h_lowpass_l2 put
462 h264_qpel_h_lowpass_l2 avg
@ 16x16 vertical lowpass built from four calls to the 8x8 put routine.
@ Visible register use: r1 = source pointer, r3 = source stride.
@ NOTE(review): the destination setup and the saving of the return address
@ are not visible in this listing.
464 function put_h264_qpel16_v_lowpass_neon_packed
467 bl put_h264_qpel8_v_lowpass_neon
@ Back the source up by 4 rows before the next 8x8 block.
468 sub r1, r1, r3, lsl #2
469 bl put_h264_qpel8_v_lowpass_neon
@ Back the source up by 16 + 4 = 20 rows.
470 sub r1, r1, r3, lsl #4
471 sub r1, r1, r3, lsl #2
473 bl put_h264_qpel8_v_lowpass_neon
474 sub r1, r1, r3, lsl #2
@ Tail call for the last 8x8 block.
476 b put_h264_qpel8_v_lowpass_neon
@ Generates the 16x16 and 8x8 vertical lowpass routines for a given type;
@ instantiated below for put and avg.
@ Visible register use: r0 = destination, r2 = destination stride,
@ r1 = source, r3 = source stride.
@ The 8x8 routine transposes the input so that the horizontal lowpass_8 macro
@ can filter along what were columns, then transposes back.
@ NOTE(review): several lines, including any type-specific guards, are not
@ visible in this listing; transpose_8x8 is defined outside it.
479 .macro h264_qpel_v_lowpass type
480 function \type\()_h264_qpel16_v_lowpass_neon
482 bl \type\()_h264_qpel8_v_lowpass_neon
@ Back the source up by 4 rows before the next 8x8 block.
483 sub r1, r1, r3, lsl #2
484 bl \type\()_h264_qpel8_v_lowpass_neon
@ Rewind the destination by 16 rows and the source by 16 + 4 = 20 rows.
485 sub r0, r0, r2, lsl #4
487 sub r1, r1, r3, lsl #4
488 sub r1, r1, r3, lsl #2
490 bl \type\()_h264_qpel8_v_lowpass_neon
491 sub r1, r1, r3, lsl #2
495 function \type\()_h264_qpel8_v_lowpass_neon
@ Load source rows of 8 bytes each, one d register per row.
496 vld1.8 {d8}, [r1], r3
497 vld1.8 {d10}, [r1], r3
498 vld1.8 {d12}, [r1], r3
499 vld1.8 {d14}, [r1], r3
500 vld1.8 {d22}, [r1], r3
501 vld1.8 {d24}, [r1], r3
502 vld1.8 {d26}, [r1], r3
503 vld1.8 {d28}, [r1], r3
504 vld1.8 {d9}, [r1], r3
505 vld1.8 {d11}, [r1], r3
506 vld1.8 {d13}, [r1], r3
507 vld1.8 {d15}, [r1], r3
@ Transpose, filter four row pairs, and transpose the eight results back.
510 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
511 lowpass_8 d8, d9, d10, d11, d8, d10
512 lowpass_8 d12, d13, d14, d15, d12, d14
513 lowpass_8 d22, d23, d24, d25, d22, d24
514 lowpass_8 d26, d27, d28, d29, d26, d28
515 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
@ Read the eight existing destination rows and rounding-average them with
@ the filtered rows.
@ NOTE(review): presumably only in the avg instantiation; the guard is not
@ visible here.
518 vld1.8 {d9}, [r0,:64], r2
520 vld1.8 {d11}, [r0,:64], r2
521 vrhadd.u8 d10, d10, d11
522 vld1.8 {d13}, [r0,:64], r2
523 vrhadd.u8 d12, d12, d13
524 vld1.8 {d15}, [r0,:64], r2
525 vrhadd.u8 d14, d14, d15
526 vld1.8 {d23}, [r0,:64], r2
527 vrhadd.u8 d22, d22, d23
528 vld1.8 {d25}, [r0,:64], r2
529 vrhadd.u8 d24, d24, d25
530 vld1.8 {d27}, [r0,:64], r2
531 vrhadd.u8 d26, d26, d27
532 vld1.8 {d29}, [r0,:64], r2
533 vrhadd.u8 d28, d28, d29
@ Rewind the destination by the 8 rows just read.
534 sub r0, r0, r2, lsl #3
@ Write the eight output rows.
537 vst1.8 {d8}, [r0,:64], r2
538 vst1.8 {d10}, [r0,:64], r2
539 vst1.8 {d12}, [r0,:64], r2
540 vst1.8 {d14}, [r0,:64], r2
541 vst1.8 {d22}, [r0,:64], r2
542 vst1.8 {d24}, [r0,:64], r2
543 vst1.8 {d26}, [r0,:64], r2
544 vst1.8 {d28}, [r0,:64], r2
550 h264_qpel_v_lowpass put
551 h264_qpel_v_lowpass avg
@ Generates the 16x16 and 8x8 vertical lowpass routines that also average
@ the filtered result with a second source; instantiated below for put and
@ avg.
@ Visible register use: r0 = destination, r1 = source, r3 = stride for the
@ source and destination, r12 = second source, r2 = second source stride.
@ NOTE(review): several lines, including any type-specific guards and some
@ of the averaging instructions, are not visible in this listing;
@ transpose_8x8 is defined outside it.
553 .macro h264_qpel_v_lowpass_l2 type
554 function \type\()_h264_qpel16_v_lowpass_l2_neon
556 bl \type\()_h264_qpel8_v_lowpass_l2_neon
@ Back the source up by 4 rows before the next 8x8 block.
557 sub r1, r1, r3, lsl #2
558 bl \type\()_h264_qpel8_v_lowpass_l2_neon
@ Rewind destination and second source by 16 rows, source by 16 + 4 = 20.
559 sub r0, r0, r3, lsl #4
560 sub r12, r12, r2, lsl #4
563 sub r1, r1, r3, lsl #4
564 sub r1, r1, r3, lsl #2
566 bl \type\()_h264_qpel8_v_lowpass_l2_neon
567 sub r1, r1, r3, lsl #2
571 function \type\()_h264_qpel8_v_lowpass_l2_neon
@ Load source rows of 8 bytes each, one d register per row.
572 vld1.8 {d8}, [r1], r3
573 vld1.8 {d10}, [r1], r3
574 vld1.8 {d12}, [r1], r3
575 vld1.8 {d14}, [r1], r3
576 vld1.8 {d22}, [r1], r3
577 vld1.8 {d24}, [r1], r3
578 vld1.8 {d26}, [r1], r3
579 vld1.8 {d28}, [r1], r3
580 vld1.8 {d9}, [r1], r3
581 vld1.8 {d11}, [r1], r3
582 vld1.8 {d13}, [r1], r3
583 vld1.8 {d15}, [r1], r3
@ Transpose, filter four row pairs, and transpose the eight results back
@ into d8, d9, d12, d13, d22, d23, d26, d27.
586 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
587 lowpass_8 d8, d9, d10, d11, d8, d9
588 lowpass_8 d12, d13, d14, d15, d12, d13
589 lowpass_8 d22, d23, d24, d25, d22, d23
590 lowpass_8 d26, d27, d28, d29, d26, d27
591 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
@ Load eight rows of the second source and rounding-average them with the
@ filtered rows (the averages for q0 and q1 are not visible here).
593 vld1.8 {d0}, [r12], r2
594 vld1.8 {d1}, [r12], r2
595 vld1.8 {d2}, [r12], r2
596 vld1.8 {d3}, [r12], r2
597 vld1.8 {d4}, [r12], r2
599 vld1.8 {d5}, [r12], r2
601 vld1.8 {d10}, [r12], r2
602 vrhadd.u8 q2, q2, q11
603 vld1.8 {d11}, [r12], r2
604 vrhadd.u8 q5, q5, q13
@ Read the eight existing destination rows and rounding-average them in.
@ NOTE(review): presumably only in the avg instantiation; the guard is not
@ visible here.
607 vld1.8 {d16}, [r0,:64], r3
608 vrhadd.u8 d0, d0, d16
609 vld1.8 {d17}, [r0,:64], r3
610 vrhadd.u8 d1, d1, d17
611 vld1.8 {d16}, [r0,:64], r3
612 vrhadd.u8 d2, d2, d16
613 vld1.8 {d17}, [r0,:64], r3
614 vrhadd.u8 d3, d3, d17
615 vld1.8 {d16}, [r0,:64], r3
616 vrhadd.u8 d4, d4, d16
617 vld1.8 {d17}, [r0,:64], r3
618 vrhadd.u8 d5, d5, d17
619 vld1.8 {d16}, [r0,:64], r3
620 vrhadd.u8 d10, d10, d16
621 vld1.8 {d17}, [r0,:64], r3
622 vrhadd.u8 d11, d11, d17
@ Rewind the destination by the 8 rows just read.
623 sub r0, r0, r3, lsl #3
@ Write the eight output rows.
626 vst1.8 {d0}, [r0,:64], r3
627 vst1.8 {d1}, [r0,:64], r3
628 vst1.8 {d2}, [r0,:64], r3
629 vst1.8 {d3}, [r0,:64], r3
630 vst1.8 {d4}, [r0,:64], r3
631 vst1.8 {d5}, [r0,:64], r3
632 vst1.8 {d10}, [r0,:64], r3
633 vst1.8 {d11}, [r0,:64], r3
639 h264_qpel_v_lowpass_l2 put
640 h264_qpel_v_lowpass_l2 avg
@ Shared first stage of the 8x8 two-dimensional lowpass. The eight result
@ rows are left in d12, d13, d14, d15, d8, d9, d10, d11 for the callers
@ below to store or average.
@ Visible register use: r1 = source, r3 = source stride, r4 = 128-bit
@ aligned scratch buffer for 16-bit intermediates, r12 = step applied to r4
@ when reloading from the scratch buffer.
@ NOTE(review): loop control, the setup of r4 and r12, and other lines are
@ not visible in this listing; swap4, transpose16_4x4 and transpose_8x8 are
@ macros defined outside it.
642 function put_h264_qpel8_hv_lowpass_neon_top
@ Label 1: horizontal pass. Two source rows per iteration are filtered
@ without narrowing (narrow=0) into q11/q12 and appended to the scratch
@ buffer.
645 1: vld1.8 {d0, d1}, [r1], r3
646 vld1.8 {d16,d17}, [r1], r3
648 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
649 vst1.8 {d22-d25}, [r4,:128]!
@ One further single row is filtered into q12.
652 vld1.8 {d0, d1}, [r1]
653 lowpass_8_1 d0, d1, q12, narrow=0
@ Reload twelve 16-byte rows of intermediates from the scratch buffer.
657 vld1.8 {d30,d31}, [r4,:128], r12
658 vld1.8 {d20,d21}, [r4,:128], r12
659 vld1.8 {d18,d19}, [r4,:128], r12
660 vld1.8 {d16,d17}, [r4,:128], r12
661 vld1.8 {d14,d15}, [r4,:128], r12
662 vld1.8 {d12,d13}, [r4,:128], r12
663 vld1.8 {d10,d11}, [r4,:128], r12
664 vld1.8 {d8, d9}, [r4,:128], r12
665 vld1.8 {d6, d7}, [r4,:128], r12
666 vld1.8 {d4, d5}, [r4,:128], r12
667 vld1.8 {d2, d3}, [r4,:128], r12
668 vld1.8 {d0, d1}, [r4,:128]
@ Transpose the 16-bit intermediates so the second pass can again filter
@ along registers.
670 swap4 d1, d3, d5, d7, d8, d10, d12, d14
671 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
673 swap4 d17, d19, d21, d31, d24, d26, d28, d22
674 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
@ Spill half of the transposed data back to the scratch buffer; it is
@ reloaded in pairs further down.
676 vst1.8 {d30,d31}, [r4,:128]!
677 vst1.8 {d6, d7}, [r4,:128]!
678 vst1.8 {d20,d21}, [r4,:128]!
679 vst1.8 {d4, d5}, [r4,:128]!
680 vst1.8 {d18,d19}, [r4,:128]!
681 vst1.8 {d2, d3}, [r4,:128]!
682 vst1.8 {d16,d17}, [r4,:128]!
683 vst1.8 {d0, d1}, [r4,:128]
@ Second pass over the data still in registers; the last argument names the
@ destination d register (d8 to d11).
685 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
686 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
687 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
688 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
@ Second pass over the spilled data, reloaded one pair at a time
@ (destinations d12 to d15).
690 vld1.8 {d16,d17}, [r4,:128], r12
691 vld1.8 {d30,d31}, [r4,:128], r12
692 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
693 vld1.8 {d16,d17}, [r4,:128], r12
694 vld1.8 {d30,d31}, [r4,:128], r12
695 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
696 vld1.8 {d16,d17}, [r4,:128], r12
697 vld1.8 {d30,d31}, [r4,:128], r12
698 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
699 vld1.8 {d16,d17}, [r4,:128], r12
700 vld1.8 {d30,d31}, [r4,:128]
701 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
@ Transpose the 8x8 byte result back into row order.
703 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
@ Generates the 8x8 two-dimensional lowpass routine for a given type;
@ instantiated below for put and avg. The filtering is done by
@ put_h264_qpel8_hv_lowpass_neon_top, which leaves the rows in d12-d15 and
@ d8-d11.
@ Visible register use: r0 = destination, r2 = destination stride.
@ NOTE(review): several lines, including any type-specific guards and two of
@ the averaging instructions, are not visible in this listing.
708 .macro h264_qpel8_hv_lowpass type
709 function \type\()_h264_qpel8_hv_lowpass_neon
711 bl put_h264_qpel8_hv_lowpass_neon_top
@ Read the eight existing destination rows and rounding-average them with
@ the filtered rows.
@ NOTE(review): presumably only in the avg instantiation; the guard is not
@ visible here.
713 vld1.8 {d0}, [r0,:64], r2
714 vrhadd.u8 d12, d12, d0
715 vld1.8 {d1}, [r0,:64], r2
716 vrhadd.u8 d13, d13, d1
717 vld1.8 {d2}, [r0,:64], r2
718 vrhadd.u8 d14, d14, d2
719 vld1.8 {d3}, [r0,:64], r2
720 vrhadd.u8 d15, d15, d3
721 vld1.8 {d4}, [r0,:64], r2
723 vld1.8 {d5}, [r0,:64], r2
725 vld1.8 {d6}, [r0,:64], r2
726 vrhadd.u8 d10, d10, d6
727 vld1.8 {d7}, [r0,:64], r2
728 vrhadd.u8 d11, d11, d7
@ Rewind the destination by the 8 rows just read.
729 sub r0, r0, r2, lsl #3
@ Write the eight output rows.
732 vst1.8 {d12}, [r0,:64], r2
733 vst1.8 {d13}, [r0,:64], r2
734 vst1.8 {d14}, [r0,:64], r2
735 vst1.8 {d15}, [r0,:64], r2
736 vst1.8 {d8}, [r0,:64], r2
737 vst1.8 {d9}, [r0,:64], r2
738 vst1.8 {d10}, [r0,:64], r2
739 vst1.8 {d11}, [r0,:64], r2
746 h264_qpel8_hv_lowpass put
747 h264_qpel8_hv_lowpass avg
@ Generates the 8x8 two-dimensional lowpass routine that also averages with
@ a second source; instantiated below for put and avg.
@ Visible register use: r0 = destination, r3 = destination stride,
@ r2 = second source, read as a packed 128-bit aligned buffer.
@ NOTE(review): several lines, including the averaging of the second source
@ with the filter output and any type-specific guards, are not visible in
@ this listing.
749 .macro h264_qpel8_hv_lowpass_l2 type
750 function \type\()_h264_qpel8_hv_lowpass_l2_neon
752 bl put_h264_qpel8_hv_lowpass_neon_top
@ Load 4 x 16 bytes from the second source into q0-q3, post-incrementing r2.
754 vld1.8 {d0, d1}, [r2,:128]!
755 vld1.8 {d2, d3}, [r2,:128]!
757 vld1.8 {d4, d5}, [r2,:128]!
759 vld1.8 {d6, d7}, [r2,:128]!
@ Read the eight existing destination rows and rounding-average them in.
@ NOTE(review): presumably only in the avg instantiation; the guard is not
@ visible here.
763 vld1.8 {d16}, [r0,:64], r3
764 vrhadd.u8 d0, d0, d16
765 vld1.8 {d17}, [r0,:64], r3
766 vrhadd.u8 d1, d1, d17
767 vld1.8 {d18}, [r0,:64], r3
768 vrhadd.u8 d2, d2, d18
769 vld1.8 {d19}, [r0,:64], r3
770 vrhadd.u8 d3, d3, d19
771 vld1.8 {d20}, [r0,:64], r3
772 vrhadd.u8 d4, d4, d20
773 vld1.8 {d21}, [r0,:64], r3
774 vrhadd.u8 d5, d5, d21
775 vld1.8 {d22}, [r0,:64], r3
776 vrhadd.u8 d6, d6, d22
777 vld1.8 {d23}, [r0,:64], r3
778 vrhadd.u8 d7, d7, d23
@ Rewind the destination by the 8 rows just read.
779 sub r0, r0, r3, lsl #3
@ Write the eight output rows.
781 vst1.8 {d0}, [r0,:64], r3
782 vst1.8 {d1}, [r0,:64], r3
783 vst1.8 {d2}, [r0,:64], r3
784 vst1.8 {d3}, [r0,:64], r3
785 vst1.8 {d4}, [r0,:64], r3
786 vst1.8 {d5}, [r0,:64], r3
787 vst1.8 {d6}, [r0,:64], r3
788 vst1.8 {d7}, [r0,:64], r3
795 h264_qpel8_hv_lowpass_l2 put
796 h264_qpel8_hv_lowpass_l2 avg
@ Generates the 16x16 two-dimensional lowpass routines (plain and l2) for a
@ given type, each built from four calls to the matching 8x8 routine.
@ Visible register use: r0 = destination, r1 = source, r3 = source stride;
@ r2 is the destination stride in the plain variant, while the l2 variant
@ rewinds the destination with r3.
@ NOTE(review): the pointer offsets between quadrants and the saving of the
@ return address are not visible in this listing.
798 .macro h264_qpel16_hv type
799 function \type\()_h264_qpel16_hv_lowpass_neon
801 bl \type\()_h264_qpel8_hv_lowpass_neon
@ Back the source up by 4 rows before the next 8x8 block.
802 sub r1, r1, r3, lsl #2
803 bl \type\()_h264_qpel8_hv_lowpass_neon
@ Rewind the source by 16 + 4 = 20 rows and the destination by 16 rows.
804 sub r1, r1, r3, lsl #4
805 sub r1, r1, r3, lsl #2
807 sub r0, r0, r2, lsl #4
809 bl \type\()_h264_qpel8_hv_lowpass_neon
810 sub r1, r1, r3, lsl #2
@ Tail call for the last 8x8 block.
812 b \type\()_h264_qpel8_hv_lowpass_neon
815 function \type\()_h264_qpel16_hv_lowpass_l2_neon
818 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
819 sub r1, r1, r3, lsl #2
820 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
@ Same rewinds as above, except the destination is rewound using r3.
821 sub r1, r1, r3, lsl #4
822 sub r1, r1, r3, lsl #2
824 sub r0, r0, r3, lsl #4
826 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
827 sub r1, r1, r3, lsl #2
@ Tail call for the last 8x8 block.
829 b \type\()_h264_qpel8_hv_lowpass_l2_neon
@ Generates the exported 8x8 quarter-pel entry points ff_<type>_h264_qpel8_mcXY_neon.
@ Each entry point sets up arguments and dispatches to the lowpass helpers
@ defined earlier in this file. Entry points that share a body push the same
@ register set as the owner of the body and then branch to a local label
@ placed just after that owner's push.
@ NOTE(review): most of the argument setup and the epilogues are not visible
@ in this listing, so only the visible dispatch structure is described.
836 .macro h264_qpel8 type
@ mc10, mc20, mc30: horizontal-only cases, finished with a tail call.
837 function ff_\type\()_h264_qpel8_mc10_neon, export=1
842 b \type\()_h264_qpel8_h_lowpass_l2_neon
845 function ff_\type\()_h264_qpel8_mc20_neon, export=1
850 b \type\()_h264_qpel8_h_lowpass_neon
853 function ff_\type\()_h264_qpel8_mc30_neon, export=1
858 b \type\()_h264_qpel8_h_lowpass_l2_neon
@ mc01: vertical filter averaged with a second source; mc03 jumps to the
@ local label below.
861 function ff_\type\()_h264_qpel8_mc01_neon, export=1
864 \type\()_h264_qpel8_mc01:
@ Move the source pointer up by two rows before the vertical filter.
867 sub r1, r1, r2, lsl #1
869 bl \type\()_h264_qpel8_v_lowpass_l2_neon
@ mc11: horizontal put pass followed by a vertical l2 pass; the body is
@ shared with mc31, mc13 and mc33.
874 function ff_\type\()_h264_qpel8_mc11_neon, export=1
875 push {r0, r1, r11, lr}
876 \type\()_h264_qpel8_mc11:
888 bl put_h264_qpel8_h_lowpass_neon
892 sub r1, r1, r2, lsl #1
894 bl \type\()_h264_qpel8_v_lowpass_l2_neon
@ mc21: horizontal put pass followed by a two-dimensional l2 pass; the body
@ is shared with mc23.
900 function ff_\type\()_h264_qpel8_mc21_neon, export=1
901 push {r0, r1, r4, r10, r11, lr}
902 \type\()_h264_qpel8_mc21:
@ Reserve 8*8 + 16*12 bytes of stack for intermediate data.
908 sub sp, sp, #(8*8+16*12)
914 bl put_h264_qpel8_h_lowpass_neon
917 sub r1, r1, r2, lsl #1
921 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
@ NOTE(review): r0 and r1 were pushed on entry but are not in this pop list;
@ the code that consumes those slots and restores sp is not visible here.
924 pop {r4, r10, r11, pc}
927 function ff_\type\()_h264_qpel8_mc31_neon, export=1
929 push {r0, r1, r11, lr}
931 b \type\()_h264_qpel8_mc11
@ mc02: vertical-only case.
934 function ff_\type\()_h264_qpel8_mc02_neon, export=1
937 sub r1, r1, r2, lsl #1
940 bl \type\()_h264_qpel8_v_lowpass_neon
@ mc12: vertical put pass followed by a two-dimensional l2 pass; the body is
@ shared with mc32.
945 function ff_\type\()_h264_qpel8_mc12_neon, export=1
946 push {r0, r1, r4, r10, r11, lr}
947 \type\()_h264_qpel8_mc12:
953 sub sp, sp, #(8*8+16*12)
954 sub r1, r1, r2, lsl #1
959 bl put_h264_qpel8_v_lowpass_neon
962 sub r1, r1, r3, lsl #1
965 bl \type\()_h264_qpel8_hv_lowpass_l2_neon
968 pop {r4, r10, r11, pc}
@ mc22: two-dimensional filter only.
971 function ff_\type\()_h264_qpel8_mc22_neon, export=1
972 push {r4, r10, r11, lr}
977 sub r1, r1, r2, lsl #1
983 bl \type\()_h264_qpel8_hv_lowpass_neon
986 pop {r4, r10, r11, pc}
989 function ff_\type\()_h264_qpel8_mc32_neon, export=1
990 push {r0, r1, r4, r10, r11, lr}
992 b \type\()_h264_qpel8_mc12
995 function ff_\type\()_h264_qpel8_mc03_neon, export=1
998 b \type\()_h264_qpel8_mc01
1001 function ff_\type\()_h264_qpel8_mc13_neon, export=1
1002 push {r0, r1, r11, lr}
1004 b \type\()_h264_qpel8_mc11
1007 function ff_\type\()_h264_qpel8_mc23_neon, export=1
1008 push {r0, r1, r4, r10, r11, lr}
1010 b \type\()_h264_qpel8_mc21
1013 function ff_\type\()_h264_qpel8_mc33_neon, export=1
1015 push {r0, r1, r11, lr}
1018 b \type\()_h264_qpel8_mc11
@ Generates the exported 16x16 quarter-pel entry points ff_<type>_h264_qpel16_mcXY_neon.
@ The layout mirrors the 8x8 macro: each entry point sets up arguments and
@ dispatches to the 16-wide lowpass helpers, and entry points that share a
@ body push the same register set as the owner and branch to a local label
@ placed just after that owner's push.
@ NOTE(review): most of the argument setup and the epilogues are not visible
@ in this listing, so only the visible dispatch structure is described.
1025 .macro h264_qpel16 type
@ mc10, mc20, mc30: horizontal-only cases, finished with a tail call.
1026 function ff_\type\()_h264_qpel16_mc10_neon, export=1
1030 b \type\()_h264_qpel16_h_lowpass_l2_neon
1033 function ff_\type\()_h264_qpel16_mc20_neon, export=1
1037 b \type\()_h264_qpel16_h_lowpass_neon
1040 function ff_\type\()_h264_qpel16_mc30_neon, export=1
1044 b \type\()_h264_qpel16_h_lowpass_l2_neon
@ mc01: vertical filter averaged with a second source; mc03 jumps to the
@ local label below.
1047 function ff_\type\()_h264_qpel16_mc01_neon, export=1
1050 \type\()_h264_qpel16_mc01:
@ Move the source pointer up by two rows before the vertical filter.
1053 sub r1, r1, r2, lsl #1
1055 bl \type\()_h264_qpel16_v_lowpass_l2_neon
@ mc11: horizontal put pass followed by a vertical l2 pass; the body is
@ shared with mc31, mc13 and mc33.
1060 function ff_\type\()_h264_qpel16_mc11_neon, export=1
1061 push {r0, r1, r4, r11, lr}
1062 \type\()_h264_qpel16_mc11:
1073 bl put_h264_qpel16_h_lowpass_neon
1077 sub r1, r1, r2, lsl #1
1079 bl \type\()_h264_qpel16_v_lowpass_l2_neon
@ mc21: packed horizontal put pass followed by a two-dimensional l2 pass;
@ the body is shared with mc23.
1085 function ff_\type\()_h264_qpel16_mc21_neon, export=1
1086 push {r0, r1, r4-r5, r9-r11, lr}
1087 \type\()_h264_qpel16_mc21:
@ Reserve 16*16 + 16*12 bytes of stack for intermediate data.
1093 sub sp, sp, #(16*16+16*12)
1097 bl put_h264_qpel16_h_lowpass_neon_packed
1100 sub r1, r1, r2, lsl #1
1103 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
@ NOTE(review): r0 and r1 were pushed on entry but are not in this pop list;
@ the code that consumes those slots and restores sp is not visible here.
1106 pop {r4-r5, r9-r11, pc}
1109 function ff_\type\()_h264_qpel16_mc31_neon, export=1
1111 push {r0, r1, r4, r11, lr}
1113 b \type\()_h264_qpel16_mc11
@ mc02: vertical-only case.
1116 function ff_\type\()_h264_qpel16_mc02_neon, export=1
1119 sub r1, r1, r2, lsl #1
1122 bl \type\()_h264_qpel16_v_lowpass_neon
@ mc12: packed vertical put pass followed by a two-dimensional l2 pass; the
@ body is shared with mc32.
1127 function ff_\type\()_h264_qpel16_mc12_neon, export=1
1128 push {r0, r1, r4-r5, r9-r11, lr}
1129 \type\()_h264_qpel16_mc12:
1135 sub sp, sp, #(16*16+16*12)
1136 sub r1, r1, r2, lsl #1
1140 bl put_h264_qpel16_v_lowpass_neon_packed
1143 sub r1, r1, r3, lsl #1
1146 bl \type\()_h264_qpel16_hv_lowpass_l2_neon
1149 pop {r4-r5, r9-r11, pc}
@ mc22: two-dimensional filter only, with 16*12 bytes of stack reserved.
1152 function ff_\type\()_h264_qpel16_mc22_neon, export=1
1153 push {r4, r9-r11, lr}
1159 sub r1, r1, r2, lsl #1
1162 sub sp, sp, #(16*12)
1165 bl \type\()_h264_qpel16_hv_lowpass_neon
1168 pop {r4, r9-r11, pc}
1171 function ff_\type\()_h264_qpel16_mc32_neon, export=1
1172 push {r0, r1, r4-r5, r9-r11, lr}
1174 b \type\()_h264_qpel16_mc12
1177 function ff_\type\()_h264_qpel16_mc03_neon, export=1
1180 b \type\()_h264_qpel16_mc01
1183 function ff_\type\()_h264_qpel16_mc13_neon, export=1
1184 push {r0, r1, r4, r11, lr}
1186 b \type\()_h264_qpel16_mc11
1189 function ff_\type\()_h264_qpel16_mc23_neon, export=1
1190 push {r0, r1, r4-r5, r9-r11, lr}
1192 b \type\()_h264_qpel16_mc21
1195 function ff_\type\()_h264_qpel16_mc33_neon, export=1
1197 push {r0, r1, r4, r11, lr}
1200 b \type\()_h264_qpel16_mc11
1207 @ Biweighted prediction
@ Biweighted prediction for 16-pixel-wide rows, two rows per pass.
@ Arguments: macs, macd = multiply-accumulate mnemonics supplied by
@ biweight_func.
@ Visible register use: r0 and r1 = the two inputs, r6 = output pointer,
@ r2 = stride for all three, q9 = per-lane shift amounts.
@ NOTE(review): the multiply-accumulate steps, the loop control and every use
@ of macs/macd are not visible in this listing.
1209 .macro biweight_16 macs, macd
@ Load one 16-byte row from each input, then a second row from each.
1215 vld1.8 {d20-d21},[r0,:128], r2
1219 vld1.8 {d22-d23},[r1,:128], r2
1224 vld1.8 {d28-d29},[r0,:128], r2
1229 vld1.8 {d30-d31},[r1,:128], r2
@ Shift the 16-bit sums by the signed per-lane amounts in q9, then narrow
@ with unsigned saturation to 8 bits.
1237 vshl.s16 q12, q12, q9
1238 vshl.s16 q13, q13, q9
1239 vqmovun.s16 d24, q12
1240 vqmovun.s16 d25, q13
@ Store the two output rows.
1242 vst1.8 {d4- d5}, [r6,:128], r2
1244 vst1.8 {d24-d25},[r6,:128], r2
@ Biweighted prediction for 8-pixel-wide rows, two rows per pass.
@ Arguments: macs, macd = multiply-accumulate mnemonics supplied by
@ biweight_func.
@ Visible register use: r0 and r1 = the two inputs, r6 = output pointer,
@ r2 = stride for all three, q9 = per-lane shift amounts.
@ NOTE(review): the multiply-accumulate steps, the narrowing, the loop
@ control and every use of macs/macd are not visible in this listing.
1249 .macro biweight_8 macs, macd
@ Load one 8-byte row from each input, then a second row from each.
1255 vld1.8 {d4},[r0,:64], r2
1258 vld1.8 {d5},[r1,:64], r2
1261 vld1.8 {d6},[r0,:64], r2
1264 vld1.8 {d7},[r1,:64], r2
@ Shift the 16-bit sums by the signed per-lane amounts in q9.
1269 vshl.s16 q10, q10, q9
@ Store the two output rows.
1272 vst1.8 {d2},[r6,:64], r2
1274 vst1.8 {d4},[r6,:64], r2
@ Biweighted prediction for 4-pixel-wide rows. Two rows are packed into the
@ two 32-bit lanes of one d register, so the main path handles four rows.
@ Arguments: macs, macd = multiply-accumulate mnemonics supplied by
@ biweight_func.
@ Visible register use: r0 and r1 = the two inputs, r6 = output pointer,
@ r2 = stride for all three, q9 = per-lane shift amounts.
@ NOTE(review): the multiply-accumulate steps, the narrowing, the branches
@ and every use of macs/macd are not visible in this listing.
1279 .macro biweight_4 macs, macd
@ First pair of rows from each input.
1285 vld1.32 {d4[0]},[r0,:32], r2
1286 vld1.32 {d4[1]},[r0,:32], r2
1289 vld1.32 {d5[0]},[r1,:32], r2
1290 vld1.32 {d5[1]},[r1,:32], r2
@ Second pair of rows from each input.
1294 vld1.32 {d6[0]},[r0,:32], r2
1295 vld1.32 {d6[1]},[r0,:32], r2
1298 vld1.32 {d7[0]},[r1,:32], r2
1299 vld1.32 {d7[1]},[r1,:32], r2
1304 vshl.s16 q10, q10, q9
@ Store four output rows.
1307 vst1.32 {d2[0]},[r6,:32], r2
1308 vst1.32 {d2[1]},[r6,:32], r2
1310 vst1.32 {d4[0]},[r6,:32], r2
1311 vst1.32 {d4[1]},[r6,:32], r2
@ Label 2: separate path that shifts and stores only two rows.
@ NOTE(review): presumably taken when just two rows remain; the branch to
@ this label is not visible here.
1314 2: vshl.s16 q1, q1, q9
1316 vst1.32 {d2[0]},[r6,:32], r2
1317 vst1.32 {d2[1]},[r6,:32], r2
@ Generates the exported ff_biweight_h264_pixels_<w>_neon entry point for
@ width w. The body expands biweight_<w> four times, once for each
@ combination of accumulate (vmlal.u8) and subtract (vmlsl.u8) for the two
@ multiply steps.
@ NOTE(review): argument loading and the branches that select one of the four
@ expansions are not visible in this listing; presumably the selection
@ depends on the signs of the two weights - TODO confirm.
1321 .macro biweight_func w
1322 function ff_biweight_h264_pixels_\w\()_neon, export=1
@ lr ^= r5 >> 30 (logical shift), updating the condition flags.
1329 eors lr, lr, r5, lsr #30
1342 10: biweight_\w vmlal.u8, vmlal.u8
1344 biweight_\w vmlal.u8, vmlsl.u8
1347 biweight_\w vmlsl.u8, vmlsl.u8
1349 biweight_\w vmlsl.u8, vmlal.u8
1357 @ Weighted prediction
@ Weighted prediction for 16-pixel-wide rows, two rows per pass.
@ Argument: add = instruction mnemonic supplied by the caller.
@ Visible register use: r0 = input pointer, r4 = output pointer, r1 = stride
@ for both, d0 = per-byte multiplier, q9 = per-lane shift amounts.
@ NOTE(review): every use of the add argument, the loop control and several
@ other lines are not visible in this listing.
1359 .macro weight_16 add
@ Load a 16-byte row and form widened products d0 * pixel in 16-bit lanes;
@ then do the same for a second row.
1362 vld1.8 {d20-d21},[r0,:128], r1
1363 vmull.u8 q2, d0, d20
1365 vmull.u8 q3, d0, d21
1366 vld1.8 {d28-d29},[r0,:128], r1
1367 vmull.u8 q12, d0, d28
1369 vmull.u8 q13, d0, d29
@ Rounding shift by the signed per-lane amounts in q9.
1371 vrshl.s16 q2, q2, q9
1373 vrshl.s16 q3, q3, q9
1377 vrshl.s16 q12, q12, q9
1379 vrshl.s16 q13, q13, q9
@ Narrow with unsigned saturation to 8 bits and store the two rows.
1380 vqmovun.s16 d24, q12
1381 vqmovun.s16 d25, q13
1382 vst1.8 {d4- d5}, [r4,:128], r1
1383 vst1.8 {d24-d25},[r4,:128], r1
@ NOTE(review): from here on the loads are 8 bytes wide, and further down 4
@ bytes wide. These lines presumably belong to narrower sibling macros whose
@ .macro lines are not visible in this listing - TODO confirm.
1391 vld1.8 {d4},[r0,:64], r1
1394 vld1.8 {d6},[r0,:64], r1
1395 vmull.u8 q10, d0, d6
1398 vrshl.s16 q1, q1, q9
1401 vrshl.s16 q10, q10, q9
1403 vst1.8 {d2},[r4,:64], r1
1404 vst1.8 {d4},[r4,:64], r1
@ 4-byte-wide rows: two rows share one d register through its 32-bit lanes.
1414 vld1.32 {d4[0]},[r0,:32], r1
1415 vld1.32 {d4[1]},[r0,:32], r1
1419 vld1.32 {d6[0]},[r0,:32], r1
1420 vld1.32 {d6[1]},[r0,:32], r1
1421 vmull.u8 q10, d0, d6
1424 vrshl.s16 q1, q1, q9
1427 vrshl.s16 q10, q10, q9
1430 vst1.32 {d2[0]},[r4,:32], r1
1431 vst1.32 {d2[1]},[r4,:32], r1
1433 vst1.32 {d4[0]},[r4,:32], r1
1434 vst1.32 {d4[1]},[r4,:32], r1
@ Shift and store for a two-row path.
1438 vrshl.s16 q1, q1, q9
1440 vst1.32 {d2[0]},[r4,:32], r1
1441 vst1.32 {d2[1]},[r4,:32], r1
1445 .macro weight_func w
1446 function ff_weight_h264_pixels_\w\()_neon, export=1
1460 10: rsb r12, r12, #0
1467 10: rsb r12, r12, #0