2 * Copyright (c) 2017 Google Inc.
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/arm/asm.S"
23 @ All public functions in this file have the following signature:
24 @ typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
25 @ const uint8_t *ref, ptrdiff_t ref_stride,
26 @ int h, int mx, int my);
28 function ff_vp9_copy128_neon, export=1
@ Copy rows of 128 bytes (64 pixels at 16 bpp) from ref to dst.
@ In: r0 = dst, r1 = dst_stride, r2 = ref, r3 = ref_stride.
@ NOTE(review): the loop label/counter and any stride pre-adjustment
@ (the post-increments below consume 96 bytes of each row before the
@ strided step) are not visible in this chunk -- confirm in full file.
34 vld1.16 {q0, q1}, [r2]!
35 vst1.16 {q0, q1}, [r0, :128]! @ :128 asserts 16-byte aligned dst
36 vld1.16 {q2, q3}, [r2]!
37 vst1.16 {q2, q3}, [r0, :128]!
38 vld1.16 {q8, q9}, [r2]!
39 vst1.16 {q8, q9}, [r0, :128]!
40 vld1.16 {q10, q11}, [r2], r3 @ last 32 bytes; step src to next row
41 vst1.16 {q10, q11}, [r0, :128], r1 @ step dst to next row
46 function ff_vp9_avg64_16_neon, export=1
@ Rounding average of 64 16-bit pixels per row: dst = (dst + ref + 1) >> 1.
@ In: r0 = dst (read cursor), r1 = dst_stride, r2 = ref, r3 = ref_stride.
@ lr is the dst write cursor; presumably initialized from r0 by prologue
@ code not visible in this chunk -- confirm.
@ Loads, vrhadd and stores are interleaved to hide NEON memory latency.
@ NOTE(review): the vrhadd of q0/q1 with q8/q9 (original lines 57/59)
@ is elided from this view.
54 vld1.16 {q8, q9}, [r2]!
55 vld1.16 {q0, q1}, [r0, :128]!
56 vld1.16 {q10, q11}, [r2]!
58 vld1.16 {q2, q3}, [r0, :128]!
60 vld1.16 {q12, q13}, [r2]!
61 vrhadd.u16 q2, q2, q10
62 vst1.16 {q0, q1}, [lr, :128]!
63 vrhadd.u16 q3, q3, q11
64 vld1.16 {q8, q9}, [r0, :128]! @ q8/q9 reused as dst data from here on
65 vst1.16 {q2, q3}, [lr, :128]!
66 vrhadd.u16 q8, q8, q12
67 vld1.16 {q14, q15}, [r2], r3 @ last ref chunk; step ref to next row
68 vrhadd.u16 q9, q9, q13
69 vld1.16 {q10, q11}, [r0, :128], r1 @ last dst chunk; step read cursor
70 vrhadd.u16 q10, q10, q14
71 vst1.16 {q8, q9}, [lr, :128]!
72 vrhadd.u16 q11, q11, q15
73 vst1.16 {q10, q11}, [lr, :128], r1 @ step write cursor to next row
78 function ff_vp9_avg32_16_neon, export=1
@ Rounding average of 32 16-bit pixels per row (64 bytes).
@ Same register usage as avg64_16 above: r0 reads dst, lr writes dst
@ (lr presumably set up by elided prologue -- confirm), r2/r3 = ref.
@ NOTE(review): the vrhadd of q0/q1 with q8/q9 (original lines 89/91)
@ is elided from this view.
86 vld1.16 {q8, q9}, [r2]!
87 vld1.16 {q0, q1}, [r0, :128]!
88 vld1.16 {q10, q11}, [r2], r3 @ step ref to next row
90 vld1.16 {q2, q3}, [r0, :128], r1 @ step dst read cursor to next row
92 vrhadd.u16 q2, q2, q10
93 vst1.16 {q0, q1}, [lr, :128]!
94 vrhadd.u16 q3, q3, q11
95 vst1.16 {q2, q3}, [lr, :128], r1 @ step dst write cursor to next row
100 function ff_vp9_avg16_16_neon, export=1
@ Rounding average of 16 16-bit pixels per row (32 bytes), done fully
@ in place through r0 (load without writeback, store with stride).
104 vld1.16 {q2, q3}, [r2], r3 @ ref row; step ref
105 vld1.16 {q0, q1}, [r0, :128] @ dst row (no post-increment)
106 vrhadd.u16 q0, q0, q2
107 vrhadd.u16 q1, q1, q3
108 vst1.16 {q0, q1}, [r0, :128], r1 @ write back, step dst
113 function ff_vp9_avg8_16_neon, export=1
@ Rounding average of 8 16-bit pixels per row, two rows per iteration.
@ r0 reads dst, lr writes dst (lr presumably initialized from r0 in
@ elided prologue -- confirm); r2/r3 = ref / ref_stride.
119 vld1.16 {q2}, [r2], r3 @ ref row 0
120 vld1.16 {q0}, [r0, :128], r1 @ dst row 0
121 vld1.16 {q3}, [r2], r3 @ ref row 1
122 vrhadd.u16 q0, q0, q2
123 vld1.16 {q1}, [r0, :128], r1 @ dst row 1
124 vrhadd.u16 q1, q1, q3
125 vst1.16 {q0}, [lr, :128], r1
126 vst1.16 {q1}, [lr, :128], r1
131 function ff_vp9_avg4_16_neon, export=1
@ Rounding average of 4 16-bit pixels per row, two rows per iteration,
@ in place through r0.
@ NOTE(review): original line 140 (between the d1 load and its vrhadd)
@ is elided here; it presumably rewinds r0 by one stride so the stores
@ below land on the two rows just read -- confirm in full file.
135 vld1.16 {d2}, [r2], r3 @ ref row 0
136 vld1.16 {d0}, [r0, :64], r1 @ dst row 0
137 vld1.16 {d3}, [r2], r3 @ ref row 1
138 vrhadd.u16 d0, d0, d2
139 vld1.16 {d1}, [r0, :64] @ dst row 1
141 vrhadd.u16 d1, d1, d3
142 vst1.16 {d0}, [r0, :64], r1
143 vst1.16 {d1}, [r0, :64], r1
148 @ Helper macros for vmull/vmlal with a constant from either d0 or d1 depending on index
149 .macro vmull_lane dst, src, idx
@ vmull.s16 by the filter coefficient number \idx: coefficients 0-3 live
@ in d0, coefficients 4-7 in d1. The .if/.else that selects between the
@ two forms based on \idx is elided from this view.
151 vmull.s16 \dst, \src, d0[\idx]
153 vmull.s16 \dst, \src, d1[\idx - 4]
156 .macro vmlal_lane dst, src, idx
@ Accumulating variant of vmull_lane above: vmlal.s16 by coefficient
@ \idx from d0 (idx 0-3) or d1 (idx 4-7). Selection .if/.else elided.
158 vmlal.s16 \dst, \src, d0[\idx]
160 vmlal.s16 \dst, \src, d1[\idx - 4]
164 @ Extract a vector from src1-src2 and src3-src4, and multiply-accumulate
165 @ into dst1 and dst3 (or dst1-dst2 and dst3-dst4 for size >= 8)
166 .macro extmlal dst1, dst2, dst3, dst4, src1, src2, src3, src4, offset, size
@ Extract vectors starting \offset 16-bit elements into the (src1:src2)
@ and (src3:src4) pairs, and multiply-accumulate them with filter
@ coefficient \offset. q14/q15 serve as extraction scratch.
167 vext.8 q14, \src1, \src2, #(2*\offset) @ 2*offset: byte offset of u16
168 vext.8 q15, \src3, \src4, #(2*\offset)
169 vmlal_lane \dst1, d28, \offset
170 vmlal_lane \dst3, d30, \offset
@ Second halves, only accumulated for size >= 8 per the header comment;
@ the guarding .if directive is elided from this view.
172 vmlal_lane \dst2, d29, \offset
173 vmlal_lane \dst4, d31, \offset
178 @ Instantiate a horizontal filter function for the given size.
179 @ This can work on 4 or 8 pixels in parallel; for larger
180 @ widths it will do 8 pixels at a time and loop horizontally.
181 @ The actual width (in bytes) is passed in r5, the height in r4 and
182 @ the filter coefficients in r12.
183 .macro do_8tap_h type, size
184 function \type\()_8tap_\size\()h
@ Horizontal 8-tap filter core at 16 bpp. Processes two rows at once:
@ r0/r2 address row 0 dst/src, r6/r7 presumably address row 1 (set up
@ by elided prologue -- confirm). Width (bytes) in r5, height in r4,
@ filter pointer in r12, q3 = max pixel value for clamping.
@ NOTE(review): the loop labels, size-dependent .if/.else branches and
@ loop control are elided from this view; only one of each alternative
@ load/store sequence below is taken per instantiation.
190 @ Only size >= 8 loops horizontally and needs
195 @ size >= 8 loads two qwords and increments r2,
196 @ for size 4 it's enough with three dwords and no
202 @ Load the filter vector
203 vld1.16 {q0}, [r12,:128]
@ size >= 8 load variant (two qwords per row, post-incremented):
210 vld1.16 {q8, q9}, [r2]!
211 vld1.16 {q10, q11}, [r7]!
@ size 4 load variant (three dwords per row, no increment):
213 vld1.16 {d16, d17, d18}, [r2]
214 vld1.16 {d20, d21, d22}, [r7]
@ Tap 0: plain vmull with coefficient d0[0]; remaining taps are
@ accumulated via extmlal below. q1/q2 = row 0, q12/q13 = row 1.
218 vmull.s16 q1, d16, d0[0]
219 vmull.s16 q12, d20, d0[0]
221 vmull.s16 q2, d17, d0[0]
222 vmull.s16 q13, d21, d0[0]
224 extmlal q1, q2, q12, q13, q8, q9, q10, q11, 1, \size
225 extmlal q1, q2, q12, q13, q8, q9, q10, q11, 2, \size
226 extmlal q1, q2, q12, q13, q8, q9, q10, q11, 3, \size
227 extmlal q1, q2, q12, q13, q8, q9, q10, q11, 4, \size
228 extmlal q1, q2, q12, q13, q8, q9, q10, q11, 5, \size
229 extmlal q1, q2, q12, q13, q8, q9, q10, q11, 6, \size
230 extmlal q1, q2, q12, q13, q8, q9, q10, q11, 7, \size
232 @ Round, shift and saturate.
233 @ The vqrshrun takes care of clamping negative values to zero, but
234 @ we manually need to do vmin with the max pixel value.
235 vqrshrun.s32 d2, q1, #7
236 vqrshrun.s32 d24, q12, #7
238 vqrshrun.s32 d3, q2, #7
239 vqrshrun.s32 d25, q13, #7
@ Clamp to max pixel value: q-register form for size >= 8, d-register
@ form for size 4 (the vmin of q1/d2 is elided from this view).
241 vmin.u16 q12, q12, q3
244 vmin.u16 d24, d24, d6
@ avg-type only (guarding .ifc elided): average with existing dst.
249 vld1.16 {q14}, [r0,:128]
250 vld1.16 {q15}, [r6,:128]
251 vrhadd.u16 q1, q1, q14
252 vrhadd.u16 q12, q12, q15
254 vld1.16 {d28}, [r0,:64]
255 vld1.16 {d30}, [r6,:64]
256 vrhadd.u16 d2, d2, d28
257 vrhadd.u16 d24, d24, d30
260 @ Store and loop horizontally (for size >= 8)
263 vst1.16 {q1}, [r0,:128]!
264 vst1.16 {q12}, [r6,:128]!
@ size 4 store variant (no post-increment):
272 vst1.16 {d2}, [r0,:64]
273 vst1.16 {d24}, [r6,:64]
@ Instantiate the put/avg horizontal filter cores for one size.
@ The macro body (and .endm) is elided from this view.
288 .macro do_8tap_h_size size
296 .macro do_8tap_h_func type, filter, offset, size, bpp
@ Public entry point: select the subpel filter row and the pixel-value
@ clamp, then dispatch to the shared \type\()_8tap_*h core (dispatch
@ code elided from this view).
297 function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
@ q3 = max pixel value, (1 << bpp) - 1, built by inverting the mask of
@ bits at or above bpp.
301 vmvn.u16 q3, #((0xffff << \bpp) & 0xffff)
@ r12 = &ff_vp9_subpel_filters[\offset] (256 bytes per filter set,
@ 16 bytes per coefficient row); r5 presumably holds mx here, loaded
@ from the stack by elided prologue -- confirm.
302 movrelx r12, X(ff_vp9_subpel_filters), r6
303 add r12, r12, 256*\offset
304 add r12, r12, r5, lsl #4
314 .macro do_8tap_h_filters size, bpp
@ Emit put/avg entry points for the three VP9 filter types; the second
@ argument of do_8tap_h_func is the index into ff_vp9_subpel_filters
@ (smooth=0, regular=1, sharp=2).
315 do_8tap_h_func put, regular, 1, \size, \bpp
316 do_8tap_h_func avg, regular, 1, \size, \bpp
317 do_8tap_h_func put, sharp, 2, \size, \bpp
318 do_8tap_h_func avg, sharp, 2, \size, \bpp
319 do_8tap_h_func put, smooth, 0, \size, \bpp
320 do_8tap_h_func avg, smooth, 0, \size, \bpp
323 .macro do_8tap_h_filters_bpp bpp
@ Emit horizontal filter entry points for all block sizes at one bpp.
324 do_8tap_h_filters 64, \bpp
325 do_8tap_h_filters 32, \bpp
326 do_8tap_h_filters 16, \bpp
327 do_8tap_h_filters 8, \bpp
328 do_8tap_h_filters 4, \bpp
@ Instantiate all horizontal functions for 10- and 12-bit content.
331 do_8tap_h_filters_bpp 10
332 do_8tap_h_filters_bpp 12
338 @ Round, shift and saturate and store qreg1-4
339 .macro do_store4 qreg1, dreg1, qreg2, dreg2, qreg3, dreg3, qreg4, dreg4, tmp1, tmp2, tmp3, tmp4, minreg, type
@ Narrow four 32-bit accumulator rows to 16 bpp (round, >>7, clamp to 0
@ below and \minreg above) and store four 4-pixel rows through r0.
340 vqrshrun.s32 \dreg1, \qreg1, #7
341 vqrshrun.s32 \dreg2, \qreg2, #7
342 vqrshrun.s32 \dreg3, \qreg3, #7
343 vqrshrun.s32 \dreg4, \qreg4, #7
@ avg-type only (guarding .ifc elided): load existing dst rows via r6
@ and average with the filtered result.
345 vld1.16 {\tmp1}, [r6,:64], r1
346 vld1.16 {\tmp2}, [r6,:64], r1
347 vld1.16 {\tmp3}, [r6,:64], r1
348 vld1.16 {\tmp4}, [r6,:64], r1
350 vmin.u16 \dreg1, \dreg1, \minreg
351 vmin.u16 \dreg2, \dreg2, \minreg
352 vmin.u16 \dreg3, \dreg3, \minreg
353 vmin.u16 \dreg4, \dreg4, \minreg
355 vrhadd.u16 \dreg1, \dreg1, \tmp1
356 vrhadd.u16 \dreg2, \dreg2, \tmp2
357 vrhadd.u16 \dreg3, \dreg3, \tmp3
358 vrhadd.u16 \dreg4, \dreg4, \tmp4
360 vst1.16 {\dreg1}, [r0,:64], r1
361 vst1.16 {\dreg2}, [r0,:64], r1
362 vst1.16 {\dreg3}, [r0,:64], r1
363 vst1.16 {\dreg4}, [r0,:64], r1
366 @ Round, shift and saturate and store qreg1-4
367 @ qreg1-2 belong to one line and qreg3-4 to the second line.
368 @ dreg1-2 == qreg1, dreg3-4 == qreg2.
369 .macro do_store8 qreg1, qreg2, qreg3, qreg4, dreg1, dreg2, dreg3, dreg4, minreg, type
@ Narrow two 8-pixel rows (four 32-bit accumulators) to 16 bpp and
@ store them. Narrowing into dreg1-4 lands the results in qreg1/qreg2
@ (dreg1-2 == qreg1, dreg3-4 == qreg2 per the header comment), which
@ frees qreg3/qreg4 to be reused as dst-load scratch below.
370 vqrshrun.s32 \dreg1, \qreg1, #7
371 vqrshrun.s32 \dreg2, \qreg2, #7
372 vqrshrun.s32 \dreg3, \qreg3, #7
373 vqrshrun.s32 \dreg4, \qreg4, #7
@ avg-type only (guarding .ifc elided): average with existing dst rows.
375 vld1.16 {\qreg3}, [r6,:128], r1
376 vld1.16 {\qreg4}, [r6,:128], r1
378 vmin.u16 \qreg1, \qreg1, \minreg
379 vmin.u16 \qreg2, \qreg2, \minreg
381 vrhadd.u16 \qreg1, \qreg1, \qreg3
382 vrhadd.u16 \qreg2, \qreg2, \qreg4
384 vst1.16 {\qreg1}, [r0,:128], r1
385 vst1.16 {\qreg2}, [r0,:128], r1
388 @ Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
389 @ (src1-src8 into dst1, src2-src9 into dst2).
390 .macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
@ 8-tap filter over 4-pixel rows, two output rows in parallel:
@ dst1 = filter(src1..src8), dst2 = filter(src2..src9), with the
@ coefficients in d0[0..3] and d1[0..3] (widening s16*s16 -> s32).
@ Even taps accumulate into dst1/dst2 and odd taps into tmp1/tmp2,
@ summed at the end -- presumably to split the accumulator dependency
@ chain / bound partial sums; confirm against the full file.
391 vmull.s16 \dst1, \src1, d0[0]
392 vmull.s16 \dst2, \src2, d0[0]
393 vmull.s16 \tmp1, \src2, d0[1]
394 vmull.s16 \tmp2, \src3, d0[1]
395 vmlal.s16 \dst1, \src3, d0[2]
396 vmlal.s16 \dst2, \src4, d0[2]
397 vmlal.s16 \tmp1, \src4, d0[3]
398 vmlal.s16 \tmp2, \src5, d0[3]
399 vmlal.s16 \dst1, \src5, d1[0]
400 vmlal.s16 \dst2, \src6, d1[0]
401 vmlal.s16 \tmp1, \src6, d1[1]
402 vmlal.s16 \tmp2, \src7, d1[1]
403 vmlal.s16 \dst1, \src7, d1[2]
404 vmlal.s16 \dst2, \src8, d1[2]
405 vmlal.s16 \tmp1, \src8, d1[3]
406 vmlal.s16 \tmp2, \src9, d1[3]
407 vadd.s32 \dst1, \dst1, \tmp1 @ merge odd-tap partial sums
408 vadd.s32 \dst2, \dst2, \tmp2
411 @ Evaluate the filter twice in parallel. This does the same as convolve4 above,
412 @ but with double width (two input/output registers per row).
413 .macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10, src11, src12, src13, src14, src15, src16, src17, src18
@ 8-tap filter over 8-pixel rows, two output rows in parallel; each row
@ spans two d-register sources and two q-register accumulators
@ (dst1-dst2 = row from src1.., dst3-dst4 = row from src3..).
@ Coefficients in d0[0..3]/d1[0..3]; widening s16*s16 -> s32 accumulate.
414 vmull.s16 \dst1, \src1, d0[0]
415 vmull.s16 \dst2, \src2, d0[0]
416 vmull.s16 \dst3, \src3, d0[0]
417 vmull.s16 \dst4, \src4, d0[0]
418 vmlal.s16 \dst1, \src3, d0[1]
419 vmlal.s16 \dst2, \src4, d0[1]
420 vmlal.s16 \dst3, \src5, d0[1]
421 vmlal.s16 \dst4, \src6, d0[1]
422 vmlal.s16 \dst1, \src5, d0[2]
423 vmlal.s16 \dst2, \src6, d0[2]
424 vmlal.s16 \dst3, \src7, d0[2]
425 vmlal.s16 \dst4, \src8, d0[2]
426 vmlal.s16 \dst1, \src7, d0[3]
427 vmlal.s16 \dst2, \src8, d0[3]
428 vmlal.s16 \dst3, \src9, d0[3]
429 vmlal.s16 \dst4, \src10, d0[3]
430 vmlal.s16 \dst1, \src9, d1[0]
431 vmlal.s16 \dst2, \src10, d1[0]
432 vmlal.s16 \dst3, \src11, d1[0]
433 vmlal.s16 \dst4, \src12, d1[0]
434 vmlal.s16 \dst1, \src11, d1[1]
435 vmlal.s16 \dst2, \src12, d1[1]
436 vmlal.s16 \dst3, \src13, d1[1]
437 vmlal.s16 \dst4, \src14, d1[1]
438 vmlal.s16 \dst1, \src13, d1[2]
439 vmlal.s16 \dst2, \src14, d1[2]
440 vmlal.s16 \dst3, \src15, d1[2]
441 vmlal.s16 \dst4, \src16, d1[2]
442 vmlal.s16 \dst1, \src15, d1[3]
443 vmlal.s16 \dst2, \src16, d1[3]
444 vmlal.s16 \dst3, \src17, d1[3]
445 vmlal.s16 \dst4, \src18, d1[3]
448 @ Instantiate a vertical filter function for filtering 8 pixels at a time.
449 @ The height is passed in r4, the width in r5 and the filter coefficients
451 .macro do_8tap_8v type
452 function \type\()_8tap_8v
@ Vertical 8-tap filter, 8 pixels per row. Height in r4, width in r5,
@ filter coefficients at r12 (per the header comment). Keeps an 8-row
@ sliding window in q5-q15 and emits two rows per convolve8 call,
@ rotating the window by 4 rows between the three phases below.
@ NOTE(review): loop labels, row-count bookkeeping and branches between
@ the phases are elided from this view.
453 sub r2, r2, r3, lsl #1 @ back src up 2 rows: filter needs 3 rows above
455 vld1.16 {q0}, [r12, :128] @ load the 8 filter coefficients
@ Prime the window with the first 7 source rows.
462 vld1.16 {q5}, [r2], r3
463 vld1.16 {q6}, [r2], r3
464 vld1.16 {q7}, [r2], r3
465 vld1.16 {q8}, [r2], r3
466 vld1.16 {q9}, [r2], r3
467 vld1.16 {q10}, [r2], r3
468 vld1.16 {q11}, [r2], r3
@ Phase 1: fetch 4 more rows, emit 4 output rows from q5..q15.
470 vld1.16 {q12}, [r2], r3
471 vld1.16 {q13}, [r2], r3
472 vld1.16 {q14}, [r2], r3
473 vld1.16 {q15}, [r2], r3
474 convolve8 q2, q3, q4, q5, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27
475 do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type @ q1 = pixel max
476 convolve8 q2, q3, q4, q5, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
477 do_store8 q2, q3, q4, q5, d4, d5, d6, d7, q1, \type
@ Phase 2: window now starts at q9; refill q4-q7.
482 vld1.16 {q4}, [r2], r3
483 vld1.16 {q5}, [r2], r3
484 vld1.16 {q6}, [r2], r3
485 vld1.16 {q7}, [r2], r3
486 convolve8 q2, q3, q8, q9, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11
487 do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type
488 convolve8 q2, q3, q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15
489 do_store8 q2, q3, q8, q9, d4, d5, d6, d7, q1, \type
@ Phase 3: window starts at q13; refill q8-q11, then wrap to phase 1.
494 vld1.16 {q8}, [r2], r3
495 vld1.16 {q9}, [r2], r3
496 vld1.16 {q10}, [r2], r3
497 vld1.16 {q11}, [r2], r3
498 convolve8 q2, q3, q12, q13, d26, d27, d28, d29, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19
499 do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type
500 convolve8 q2, q3, q12, q13, d30, d31, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23
501 do_store8 q2, q3, q12, q13, d4, d5, d6, d7, q1, \type
@ Horizontal-slice advance: rewind both pointers to the top of the next
@ 8-pixel-wide column (the arithmetic lines between these comments are
@ elided from this view).
509 @ r0 -= h * dst_stride
511 @ r2 -= h * src_stride
513 @ r2 -= 8 * src_stride
514 sub r2, r2, r3, lsl #3
515 @ r2 += 1 * src_stride
530 @ Instantiate a vertical filter function for filtering a 4 pixels wide
531 @ slice. This only is designed to work for 4 or 8 output lines.
532 .macro do_8tap_4v type
533 function \type\()_8tap_4v
@ Vertical 8-tap filter for a 4-pixel-wide slice; per the header comment
@ it only handles 4 or 8 output lines. Source rows are kept one per
@ d-register (d16..d30).
@ NOTE(review): the branch taken after the first 4 output rows (for the
@ 4-line case) is elided from this view.
534 sub r2, r2, r3, lsl #1 @ back src up 2 rows: filter needs 3 rows above
536 vld1.16 {q0}, [r12, :128] @ load the 8 filter coefficients
@ Load the 11 rows needed for the first 4 output lines.
541 vld1.16 {d16}, [r2], r3
542 vld1.16 {d17}, [r2], r3
543 vld1.16 {d18}, [r2], r3
544 vld1.16 {d19}, [r2], r3
545 vld1.16 {d20}, [r2], r3
546 vld1.16 {d21}, [r2], r3
547 vld1.16 {d22}, [r2], r3
548 vld1.16 {d23}, [r2], r3
549 vld1.16 {d24}, [r2], r3
550 vld1.16 {d25}, [r2], r3
551 vld1.16 {d26}, [r2], r3
552 convolve4 q2, q3, d16, d17, d18, d19, d20, d21, d22, d23, d24, q14, q15
553 convolve4 q14, q15, d18, d19, d20, d21, d22, d23, d24, d25, d26, q8, q9
554 do_store4 q2, d4, q3, d6, q14, d28, q15, d30, d5, d7, d29, d31, d2, \type @ d2 = pixel max
@ 8-line case: 4 more source rows give the remaining 4 output lines.
559 vld1.16 {d27}, [r2], r3
560 vld1.16 {d28}, [r2], r3
561 vld1.16 {d29}, [r2], r3
562 vld1.16 {d30}, [r2], r3
563 convolve4 q2, q3, d20, d21, d22, d23, d24, d25, d26, d27, d28, q8, q9
564 convolve4 q8, q9, d22, d23, d24, d25, d26, d27, d28, d29, d30, q10, q11
565 do_store4 q2, d4, q3, d6, q8, d16, q9, d18, d5, d7, d17, d19, d2, \type
576 .macro do_8tap_v_func type, filter, offset, size, bpp
@ Public vertical-filter entry point: select the subpel filter row and
@ the pixel-value clamp, then dispatch to \type\()_8tap_8v/_4v
@ (dispatch code elided from this view).
577 function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
@ q1 = max pixel value, (1 << bpp) - 1 (the vertical cores use q1/d2
@ for clamping, unlike the horizontal ones which use q3).
584 vmvn.u16 q1, #((0xffff << \bpp) & 0xffff)
@ r12 = &ff_vp9_subpel_filters[\offset] (256 bytes per filter set,
@ 16 bytes per coefficient row); r5 presumably holds my here, loaded
@ from the stack by elided prologue -- confirm.
585 movrelx r12, X(ff_vp9_subpel_filters), r6
586 add r12, r12, 256*\offset
587 add r12, r12, r5, lsl #4
597 .macro do_8tap_v_filters size, bpp
@ Emit put/avg vertical entry points for the three VP9 filter types;
@ third argument is the ff_vp9_subpel_filters index (smooth=0,
@ regular=1, sharp=2).
598 do_8tap_v_func put, regular, 1, \size, \bpp
599 do_8tap_v_func avg, regular, 1, \size, \bpp
600 do_8tap_v_func put, sharp, 2, \size, \bpp
601 do_8tap_v_func avg, sharp, 2, \size, \bpp
602 do_8tap_v_func put, smooth, 0, \size, \bpp
603 do_8tap_v_func avg, smooth, 0, \size, \bpp
606 .macro do_8tap_v_filters_bpp bpp
@ Emit vertical filter entry points for all block sizes at one bpp.
607 do_8tap_v_filters 64, \bpp
608 do_8tap_v_filters 32, \bpp
609 do_8tap_v_filters 16, \bpp
610 do_8tap_v_filters 8, \bpp
611 do_8tap_v_filters 4, \bpp
@ Instantiate all vertical functions for 10- and 12-bit content.
614 do_8tap_v_filters_bpp 10
615 do_8tap_v_filters_bpp 12