/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/arm/asm.S"
@ All public functions in this file have the following signature:
@ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
@                             const uint8_t *ref, ptrdiff_t ref_stride,
@                             int h, int mx, int my);
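
@ For orientation, a rough scalar C model of the copy and avg variants
@ below. This is an illustrative sketch only; the helper names and shape
@ are assumptions, not part of FFmpeg's sources. The avg rounding matches
@ vrhadd.u8 (round-to-nearest halving add).
@
@     static void copy_w(uint8_t *dst, ptrdiff_t dst_stride,
@                        const uint8_t *ref, ptrdiff_t ref_stride,
@                        int h, int w)
@     {
@         for (int y = 0; y < h; y++, dst += dst_stride, ref += ref_stride)
@             memcpy(dst, ref, w);   /* needs <string.h> */
@     }
@
@     static void avg_w(uint8_t *dst, ptrdiff_t dst_stride,
@                       const uint8_t *ref, ptrdiff_t ref_stride,
@                       int h, int w)
@     {
@         for (int y = 0; y < h; y++, dst += dst_stride, ref += ref_stride)
@             for (int x = 0; x < w; x++)
@                 dst[x] = (dst[x] + ref[x] + 1) >> 1;
@     }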
function ff_vp9_copy64_neon, export=1
        vld1.8          {q0,  q1},  [r2]!
        vst1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q2,  q3},  [r2], r3
        vst1.8          {q2,  q3},  [r0, :128], r1

function ff_vp9_avg64_neon, export=1
        vld1.8          {q8,  q9},  [r2]!
        vld1.8          {q0,  q1},  [r0, :128]!
        vld1.8          {q10, q11}, [r2], r3
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q2,  q3},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q9
        vrhadd.u8       q2,  q2,  q10
        vst1.8          {q0,  q1},  [lr, :128]!
        vrhadd.u8       q3,  q3,  q11
        vst1.8          {q2,  q3},  [lr, :128], r1

function ff_vp9_copy32_neon, export=1
        vld1.8          {q0,  q1},  [r2], r3
        vst1.8          {q0,  q1},  [r0, :128], r1

function ff_vp9_avg32_neon, export=1
        vld1.8          {q2,  q3},  [r2], r3
        vld1.8          {q0,  q1},  [r0, :128]
        vrhadd.u8       q0,  q0,  q2
        vrhadd.u8       q1,  q1,  q3
        vst1.8          {q0,  q1},  [r0, :128], r1

function ff_vp9_copy16_neon, export=1
        vst1.8          {q0},  [r0, :128], r1
        vst1.8          {q1},  [r4, :128], r1

function ff_vp9_avg16_neon, export=1
        vld1.8          {q2},  [r2], r3
        vld1.8          {q0},  [r0, :128], r1
        vld1.8          {q3},  [r2], r3
        vrhadd.u8       q0,  q0,  q2
        vld1.8          {q1},  [r0, :128], r1
        vrhadd.u8       q1,  q1,  q3
        vst1.8          {q0},  [lr, :128], r1
        vst1.8          {q1},  [lr, :128], r1

function ff_vp9_copy8_neon, export=1
        vld1.8          {d0},  [r2], r3
        vld1.8          {d1},  [r2], r3
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1

function ff_vp9_avg8_neon, export=1
        vld1.8          {d2},  [r2], r3
        vld1.8          {d0},  [r0, :64], r1
        vld1.8          {d3},  [r2], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d1},  [r0, :64]
        sub             r0,  r0,  r1
        vrhadd.u8       d1,  d1,  d3
        vst1.8          {d0},  [r0, :64], r1
        vst1.8          {d1},  [r0, :64], r1

function ff_vp9_copy4_neon, export=1
        vld1.32         {d0[]},   [r2], r3
        vld1.32         {d1[]},   [r2], r3
        vst1.32         {d0[0]},  [r0, :32], r1
        vld1.32         {d2[]},   [r2], r3
        vst1.32         {d1[0]},  [r0, :32], r1
        vld1.32         {d3[]},   [r2], r3
        vst1.32         {d2[0]},  [r0, :32], r1
        vst1.32         {d3[0]},  [r0, :32], r1

function ff_vp9_avg4_neon, export=1
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d0[]},   [r0, :32], r1
        vld1.32         {d5[]},   [r2], r3
        vrhadd.u8       d0,  d0,  d4
        vld1.32         {d1[]},   [r0, :32], r1
        vld1.32         {d6[]},   [r2], r3
        vrhadd.u8       d1,  d1,  d5
        vld1.32         {d2[]},   [r0, :32], r1
        vld1.32         {d7[]},   [r2], r3
        vrhadd.u8       d2,  d2,  d6
        vld1.32         {d3[]},   [r0, :32], r1
        vst1.32         {d0[0]},  [lr, :32], r1
        vrhadd.u8       d3,  d3,  d7
        vst1.32         {d1[0]},  [lr, :32], r1
        vst1.32         {d2[0]},  [lr, :32], r1
        vst1.32         {d3[0]},  [lr, :32], r1

@ Helper macros for vmul/vmla with a coefficient from either d0 or d1,
@ selected by index
.macro vmul_lane dst, src, idx
.if \idx < 4
        vmul.s16        \dst, \src, d0[\idx]
.else
        vmul.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
.macro vmla_lane dst, src, idx
.if \idx < 4
        vmla.s16        \dst, \src, d0[\idx]
.else
        vmla.s16        \dst, \src, d1[\idx - 4]
.endif
.endm
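
@ For example, "vmla_lane q1, q8, 5" expands to "vmla.s16 q1, q8, d1[1]":
@ index 5 selects lane 1 of d1, since taps 0-3 live in d0 and taps 4-7
@ in d1 of the q0 filter vector.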
@ Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
@ for size >= 16), and multiply-accumulate into dst1 and dst3 (or
@ dst1-dst2 and dst3-dst4 for size >= 16).
.macro extmla dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmla_lane       \dst1,  q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmla_lane       \dst3,  q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmla_lane       \dst2,  q5,  \offset
        vmla_lane       \dst4,  q6,  \offset
.elseif \size == 8
        vmla_lane       \dst1,  q14, \offset
        vmla_lane       \dst3,  q15, \offset
.else
        vmla_lane       \dst1d, d28, \offset
        vmla_lane       \dst3d, d30, \offset
.endif
.endm
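
@ In scalar terms, each extmla call accumulates one tap of a sliding
@ window, roughly dst[x] += filter[offset] * src[x + offset]; the vext.8
@ with #(2*offset) produces the 16-bit source vector shifted by "offset"
@ elements.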
@ The same as above, but instead of accumulating straight into the
@ destination, multiply into a temp register and accumulate into the
@ destination with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, dst1d, dst3d, src1, src2, src3, src4, src5, src6, offset, size
        vext.8          q14, \src1, \src2, #(2*\offset)
        vext.8          q15, \src4, \src5, #(2*\offset)
.if \size >= 16
        vmul_lane       q14, q14, \offset
        vext.8          q5,  \src2, \src3, #(2*\offset)
        vmul_lane       q15, q15, \offset
        vext.8          q6,  \src5, \src6, #(2*\offset)
        vmul_lane       q5,  q5,  \offset
        vmul_lane       q6,  q6,  \offset
.elseif \size == 8
        vmul_lane       q14, q14, \offset
        vmul_lane       q15, q15, \offset
.else
        vmul_lane       d28, d28, \offset
        vmul_lane       d30, d30, \offset
.endif
.if \size == 4
        vqadd.s16       \dst1d, \dst1d, d28
        vqadd.s16       \dst3d, \dst3d, d30
.else
        vqadd.s16       \dst1,  \dst1,  q14
        vqadd.s16       \dst3,  \dst3,  q15
.if \size >= 16
        vqadd.s16       \dst2,  \dst2,  q5
        vqadd.s16       \dst4,  \dst4,  q6
.endif
.endif
.endm
@ Instantiate a horizontal filter function for the given size.
@ This can work on 4, 8 or 16 pixels in parallel; for larger
@ widths it will do 16 pixels at a time and loop horizontally.
@ The actual width is passed in r5, the height in r4 and
@ the filter coefficients in r12. idx2 is the index of the largest
@ filter coefficient (3 or 4) and idx1 is the other of the two.
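@ A rough scalar model of one output pixel (an illustrative sketch only;
@ filter[] is the 8-tap coefficient row loaded into q0, src[] is assumed
@ to already point at the leftmost tap, and av_clip_uint8 stands in for
@ the final saturating narrow):
@
@     int sum = 0;
@     for (int k = 0; k < 8; k++)
@         sum += filter[k] * src[x + k];
@     dst[x] = av_clip_uint8((sum + 64) >> 7);  /* vqrshrun.s16 #7 */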
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        @ Only size >= 16 loops horizontally and needs
        @ a reduced dst stride
        @ size >= 16 loads two qwords and increments r2,
        @ for size 4/8 it's enough with one qword and no
        @ postincrement
        @ Load the filter vector
        vld1.16         {q0},  [r12,:128]
        vld1.8          {d18, d19, d20}, [r2]!
        vld1.8          {d24, d25, d26}, [r7]!
        @ Accumulate, adding idx2 last with a separate
        @ saturating add. The positive filter coefficients
        @ for all indices except idx2 must add up to less
        @ than 127 for this not to overflow.
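        @ Worked bound (illustrative): with 8-bit pixels, a positive
        @ coefficient sum of at most 126 gives at most 255 * 126 = 32130,
        @ safely below the signed 16-bit limit of 32767, so the plain vmla
        @ accumulation cannot wrap before the final saturating add of the
        @ idx2 tap.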
        vmul.s16        q1,  q8,  d0[0]
        vmul.s16        q3,  q11, d0[0]
.if \size >= 16
        vmul.s16        q2,  q9,  d0[0]
        vmul.s16        q4,  q12, d0[0]
.endif
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 1,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 2,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx1, \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 5,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 6,     \size
        extmla          q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, 7,     \size
        extmulqadd      q1,  q2,  q3,  q4,  d2,  d6,  q8,  q9,  q10, q11, q12, q13, \idx2, \size
        @ Round, shift and saturate
        vqrshrun.s16    d2,  q1,  #7
        vqrshrun.s16    d6,  q3,  #7
.if \size >= 16
        vqrshrun.s16    d3,  q2,  #7
        vqrshrun.s16    d7,  q4,  #7
.endif
.ifc \type,avg
.if \size >= 16
        vld1.8          {q14}, [r0,:128]
        vld1.8          {q15}, [r6,:128]
        vrhadd.u8       q1,  q1,  q14
        vrhadd.u8       q3,  q3,  q15
.elseif \size == 8
        vld1.8          {d28}, [r0,:64]
        vld1.8          {d30}, [r6,:64]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.else
        @ We only need d28[0], but [] is faster on some cores
        vld1.32         {d28[]}, [r0,:32]
        vld1.32         {d30[]}, [r6,:32]
        vrhadd.u8       d2,  d2,  d28
        vrhadd.u8       d6,  d6,  d30
.endif
.endif
        @ Store and loop horizontally (for size >= 16)
.if \size >= 16
        vst1.8          {q1}, [r0,:128]!
        vst1.8          {q3}, [r6,:128]!
.elseif \size == 8
        vst1.8          {d2}, [r0,:64]
        vst1.8          {d6}, [r6,:64]
.else
        vst1.32         {d2[0]}, [r0,:32]
        vst1.32         {d6[0]}, [r6,:32]
.endif
.macro do_8tap_h_size size
do_8tap_h put, \size, 3, 4
do_8tap_h avg, \size, 3, 4
do_8tap_h put, \size, 4, 3
do_8tap_h avg, \size, 4, 3
.endm

.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        movrelx         r12, X(ff_vp9_subpel_filters), r6
        add             r12, r12, 256*\offset
        cmp             r5,  #8
        add             r12, r12, r5, lsl #4
        mov             r5,  #\size
.if \size >= 16
        bge             \type\()_8tap_16h_34
        b               \type\()_8tap_16h_43
.else
        bge             \type\()_8tap_\size\()h_34
        b               \type\()_8tap_\size\()h_43
.endif
endfunc
.endm
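
@ Regarding the table lookup above: ff_vp9_subpel_filters holds three
@ banks (smooth, regular, sharp, matching the offsets used below) of
@ 16 rows x 8 int16_t coefficients, so one bank is 16 * 8 * 2 = 256
@ bytes and "r5, lsl #4" (mx * 16 bytes) selects the 16-byte coefficient
@ row for the subpel position.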
.macro do_8tap_h_filters size
do_8tap_h_func put, regular, 1, \size
do_8tap_h_func avg, regular, 1, \size
do_8tap_h_func put, sharp,   2, \size
do_8tap_h_func avg, sharp,   2, \size
do_8tap_h_func put, smooth,  0, \size
do_8tap_h_func avg, smooth,  0, \size
.endm

@ Round, shift and saturate, then store qreg1-2 over 4 lines
.macro do_store4 qreg1, dreg1, qreg2, dreg2, tmp1, tmp2, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
.ifc \type,avg
        vld1.32         {\tmp1[]},   [r0,:32], r1
        vld1.32         {\tmp2[]},   [r0,:32], r1
        vld1.32         {\tmp1[1]},  [r0,:32], r1
        vld1.32         {\tmp2[1]},  [r0,:32], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.32         {\dreg1[0]}, [r0,:32], r1
        vst1.32         {\dreg2[0]}, [r0,:32], r1
        vst1.32         {\dreg1[1]}, [r0,:32], r1
        vst1.32         {\dreg2[1]}, [r0,:32], r1
.endm
@ Round, shift and saturate, then store qreg1-4
.macro do_store qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, type
        vqrshrun.s16    \dreg1, \qreg1, #7
        vqrshrun.s16    \dreg2, \qreg2, #7
        vqrshrun.s16    \dreg3, \qreg3, #7
        vqrshrun.s16    \dreg4, \qreg4, #7
.ifc \type,avg
        vld1.8          {\tmp1},  [r0,:64], r1
        vld1.8          {\tmp2},  [r0,:64], r1
        vld1.8          {\tmp3},  [r0,:64], r1
        vld1.8          {\tmp4},  [r0,:64], r1
        vrhadd.u8       \dreg1, \dreg1, \tmp1
        vrhadd.u8       \dreg2, \dreg2, \tmp2
        vrhadd.u8       \dreg3, \dreg3, \tmp3
        vrhadd.u8       \dreg4, \dreg4, \tmp4
        sub             r0,  r0,  r1, lsl #2
.endif
        vst1.8          {\dreg1}, [r0,:64], r1
        vst1.8          {\dreg2}, [r0,:64], r1
        vst1.8          {\dreg3}, [r0,:64], r1
        vst1.8          {\dreg4}, [r0,:64], r1
.endm
@ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
@ (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
@ at the end with saturation. Indices 0 and 7 always have negative or zero
@ coefficients, so they can be accumulated into tmp1-tmp2 together with the
@ largest coefficient.
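@ Roughly, in scalar terms (an illustrative sketch; coeff[] is the filter
@ row in q0 and sat_s16 stands in for vqadd.s16):
@
@     dst = sum of coeff[k] * src[k] for k in 1..6, except k == idx2;
@     tmp = coeff[0]*src[0] + coeff[idx2]*src[idx2] + coeff[7]*src[7];
@     dst = sat_s16(dst + tmp);
@
@ Grouping taps 0 and 7 (never positive) with the largest tap keeps the
@ temp sum in range until the single saturating add.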
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        vmul.s16        \dst1, \src2, d0[1]
        vmul.s16        \dst2, \src3, d0[1]
        vmul.s16        \tmp1, \src1, d0[0]
        vmul.s16        \tmp2, \src2, d0[0]
        vmla.s16        \dst1, \src3, d0[2]
        vmla.s16        \dst2, \src4, d0[2]
.if \idx1 == 3
        vmla.s16        \dst1, \src4, d0[3]
        vmla.s16        \dst2, \src5, d0[3]
.else
        vmla.s16        \dst1, \src5, d1[0]
        vmla.s16        \dst2, \src6, d1[0]
.endif
        vmla.s16        \dst1, \src6, d1[1]
        vmla.s16        \dst2, \src7, d1[1]
        vmla.s16        \tmp1, \src8, d1[3]
        vmla.s16        \tmp2, \src9, d1[3]
        vmla.s16        \dst1, \src7, d1[2]
        vmla.s16        \dst2, \src8, d1[2]
.if \idx2 == 3
        vmla.s16        \tmp1, \src4, d0[3]
        vmla.s16        \tmp2, \src5, d0[3]
.else
        vmla.s16        \tmp1, \src5, d1[0]
        vmla.s16        \tmp2, \src6, d1[0]
.endif
        vqadd.s16       \dst1, \dst1, \tmp1
        vqadd.s16       \dst2, \dst2, \tmp2
.endm
@ Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        vld1.8          {d2},  [r2], r3
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
.ifnb \dst4
        vld1.8          {d5},  [r2], r3
.endif
        vmovl.u8        \dst1, d2
        vmovl.u8        \dst2, d3
        vmovl.u8        \dst3, d4
.ifnb \dst4
        vmovl.u8        \dst4, d5
.endif
.endm
@ Instantiate a vertical filter function for filtering 8 pixels at a time.
@ The height is passed in r4, the width in r5 and the filter coefficients
@ in r12. idx2 is the index of the largest filter coefficient (3 or 4)
@ and idx1 is the other of the two.
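@ Per output pixel this computes, roughly (illustrative only):
@     dst[y][x] = clip8((sum over k of filter[k] * src[y + k - 3][x] + 64) >> 7)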
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]
        loadl           q5,  q6,  q7
        loadl           q8,  q9,  q10, q11
        loadl           q12, q13, q14, q15
        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4,  q5
        convolve        q3,  q4,  q7,  q8,  q9,  q10, q11, q12, q13, q14, q15, \idx1, \idx2, q5,  q6
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q4,  d8,  d3,  d5,  d7,  d9,  \type

        loadl           q4,  q5,  q6,  q7
        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q4,  q5,  \idx1, \idx2, q8,  q9
        convolve        q3,  q8,  q11, q12, q13, q14, q15, q4,  q5,  q6,  q7,  \idx1, \idx2, q9,  q10
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q8,  d16, d3,  d5,  d7,  d17, \type

        loadl           q8,  q9,  q10, q11
        convolve        q1,  q2,  q13, q14, q15, q4,  q5,  q6,  q7,  q8,  q9,  \idx1, \idx2, q12, q13
        convolve        q3,  q12, q15, q4,  q5,  q6,  q7,  q8,  q9,  q10, q11, \idx1, \idx2, q13, q14
        do_store        q1,  d2,  q2,  d4,  q3,  d6,  q12, d24, d3,  d5,  d7,  d25, \type
        @ r0 -= h * dst_stride
        mls             r0,  r1,  r4,  r0
        @ r2 -= h * src_stride
        mls             r2,  r3,  r4,  r2
        @ r2 -= 8 * src_stride
        sub             r2,  r2,  r3, lsl #3
        @ r2 += 1 * src_stride
        add             r2,  r2,  r3
@ Instantiate a vertical filter function for filtering a 4 pixels wide
@ slice. The first half of each register contains one row, while the
@ second half contains the second-next row (which is also stored in the
@ first half of the register two steps ahead). The convolution produces
@ two outputs at a time; the output of q5-q12 goes into one, and q6-q13
@ into the other. The first half of the first output is the first output
@ row, and the first half of the other output is the second output row.
@ The second halves of the registers hold rows 3 and 4.
@ This is only designed to work for 4 or 8 output lines.
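@ For instance, after the vext.8 merges below, d2 = {row0, row2},
@ d3 = {row1, row3}, d4 = {row2, row4}, and so on: each vext.8 with #4
@ splices the first 4 bytes of the row two steps ahead into the upper
@ half (the vld1.32 {dN[]} loads duplicate a row into both halves).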
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub             r2,  r2,  r3, lsl #1
        sub             r2,  r2,  r3
        vld1.16         {q0},  [r12, :128]

        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vext.8          d2,  d2,  d4,  #4
        vld1.32         {d8[]},   [r2], r3
        vext.8          d3,  d3,  d5,  #4
        vld1.32         {d9[]},   [r2], r3
        vmovl.u8        q5,  d2
        vext.8          d4,  d4,  d6,  #4
        vld1.32         {d28[]},  [r2], r3
        vmovl.u8        q6,  d3
        vext.8          d5,  d5,  d7,  #4
        vld1.32         {d29[]},  [r2], r3
        vmovl.u8        q7,  d4
        vext.8          d6,  d6,  d8,  #4
        vld1.32         {d30[]},  [r2], r3
        vmovl.u8        q8,  d5
        vext.8          d7,  d7,  d9,  #4
        vmovl.u8        q9,  d6
        vext.8          d8,  d8,  d28, #4
        vmovl.u8        q10, d7
        vext.8          d9,  d9,  d29, #4
        vmovl.u8        q11, d8
        vext.8          d28, d28, d30, #4
        vmovl.u8        q12, d9
        vmovl.u8        q13, d28

        convolve        q1,  q2,  q5,  q6,  q7,  q8,  q9,  q10, q11, q12, q13, \idx1, \idx2, q4,  q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vext.8          d29, d29, d2,  #4
        vext.8          d30, d30, d3,  #4
        vld1.32         {d2[1]},  [r2], r3
        vmovl.u8        q14, d29
        vld1.32         {d3[1]},  [r2], r3
        vmovl.u8        q15, d30
        vmovl.u8        q5,  d2
        vmovl.u8        q6,  d3

        convolve        q1,  q2,  q9,  q10, q11, q12, q13, q14, q15, q5,  q6,  \idx1, \idx2, q4,  q3
        do_store4       q1,  d2,  q2,  d4,  d3,  d5,  \type
.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        movrelx         r12, X(ff_vp9_subpel_filters), r5
        add             r12, r12, 256*\offset
        cmp             r5,  #8
        add             r12, r12, r5, lsl #4
        mov             r5,  #\size
.if \size >= 8
        bge             \type\()_8tap_8v_34
        b               \type\()_8tap_8v_43
.else
        bge             \type\()_8tap_4v_34
        b               \type\()_8tap_4v_43
.endif
endfunc
.endm
.macro do_8tap_v_filters size
do_8tap_v_func put, regular, 1, \size
do_8tap_v_func avg, regular, 1, \size
do_8tap_v_func put, sharp,   2, \size
do_8tap_v_func avg, sharp,   2, \size
do_8tap_v_func put, smooth,  0, \size
do_8tap_v_func avg, smooth,  0, \size
.endm
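
@ Each do_8tap_v_filters invocation expands to six exported functions;
@ e.g. "do_8tap_v_filters 8" yields ff_vp9_put_regular8_v_neon,
@ ff_vp9_avg_regular8_v_neon, ff_vp9_put_sharp8_v_neon,
@ ff_vp9_avg_sharp8_v_neon, ff_vp9_put_smooth8_v_neon and
@ ff_vp9_avg_smooth8_v_neon.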