2 * Copyright (c) 2017 Google Inc.
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/aarch64/asm.S"
23 // All public functions in this file have the following signature:
24 // typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
25 // const uint8_t *ref, ptrdiff_t ref_stride,
26 // int h, int mx, int my);
// Copy a 128-byte-wide block (64 pixels at 16 bpp) from ref (x2) to dst (x0)
// using paired 64-bit GPR loads/stores.
// NOTE(review): this chunk shows only part of the routine — the loads/stores
// for byte offsets 0..31 and 64..95, the pointer/row-count updates and the
// ret are not visible here; confirm against the full file.
28 function ff_vp9_copy128_aarch64, export=1
33 ldp x9, x10, [x2, #32]              // copy bytes 32..47
36 ldp x11, x12, [x2, #48]             // copy bytes 48..63
37 stp x9, x10, [x0, #32]
38 stp x11, x12, [x0, #48]
42 ldp x9, x10, [x2, #96]              // copy bytes 96..111
44 ldp x11, x12, [x2, #112]            // copy bytes 112..127
45 stp x9, x10, [x0, #96]
46 stp x11, x12, [x0, #112]
// Rounding average of a 64-pixel-wide 16 bpp row into dst:
// dst = (dst + ref + 1) >> 1 per 16-bit lane (urhadd).
// Each row is 128 bytes, processed as two halves of 4 q-registers each;
// loads/arithmetic/stores are interleaved for dual issue.
// NOTE(review): x5 is the store pointer and the x3/x1 post-increments after
// the second half presumably account for the #64 consumed by the first half
// (stride pre-adjustment and the loop control are not visible in this chunk
// — confirm against the full file).
53 function ff_vp9_avg64_16_neon, export=1
58 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64    // ref, first 32 pixels
59 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64    // dst, first 32 pixels
60 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3 // ref, second 32 pixels
61 urhadd v0.8h, v0.8h, v4.8h                     // (dst + ref + 1) >> 1
62 urhadd v1.8h, v1.8h, v5.8h
63 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1 // dst, second 32 pixels
64 urhadd v2.8h, v2.8h, v6.8h
65 urhadd v3.8h, v3.8h, v7.8h
67 urhadd v16.8h, v16.8h, v20.8h
68 urhadd v17.8h, v17.8h, v21.8h
69 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64    // store first half
70 urhadd v18.8h, v18.8h, v22.8h
71 urhadd v19.8h, v19.8h, v23.8h
72 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1 // store second half
// Rounding average of a 32-pixel-wide 16 bpp block: two rows per pass,
// dst = (dst + ref + 1) >> 1 per 16-bit lane (urhadd).
// NOTE(review): x5 is the store pointer; its setup and the loop
// counter/branch are not visible in this chunk.
77 function ff_vp9_avg32_16_neon, export=1
80 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3     // ref row 0
81 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1     // dst row 0
82 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3 // ref row 1
83 urhadd v0.8h, v0.8h, v4.8h                     // (dst + ref + 1) >> 1
84 urhadd v1.8h, v1.8h, v5.8h
85 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1 // dst row 1
86 urhadd v2.8h, v2.8h, v6.8h
87 urhadd v3.8h, v3.8h, v7.8h
89 urhadd v16.8h, v16.8h, v20.8h
90 urhadd v17.8h, v17.8h, v21.8h
91 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1     // store row 0
92 urhadd v18.8h, v18.8h, v22.8h
93 urhadd v19.8h, v19.8h, v23.8h
94 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1 // store row 1
// Rounding average of a 16-pixel-wide 16 bpp row, one row per pass.
// Unlike the wider variants this stores back through x0 directly
// (load without post-increment, store with stride x1).
// NOTE(review): loop control is not visible in this chunk.
99 function ff_vp9_avg16_16_neon, export=1
101 ld1 {v2.8h, v3.8h}, [x2], x3       // ref row
102 ld1 {v0.8h, v1.8h}, [x0]           // dst row (pointer advanced at store)
103 urhadd v0.8h, v0.8h, v2.8h         // (dst + ref + 1) >> 1
104 urhadd v1.8h, v1.8h, v3.8h
106 st1 {v0.8h, v1.8h}, [x0], x1
// Rounding average of an 8-pixel-wide 16 bpp block, two rows per pass.
// NOTE(review): x5 is the store pointer; its setup and the loop
// counter/branch are not visible in this chunk.
111 function ff_vp9_avg8_16_neon, export=1
114 ld1 {v2.8h}, [x2], x3              // ref row 0
115 ld1 {v0.8h}, [x0], x1              // dst row 0
116 ld1 {v3.8h}, [x2], x3              // ref row 1
117 urhadd v0.8h, v0.8h, v2.8h         // (dst + ref + 1) >> 1
118 ld1 {v1.8h}, [x0], x1              // dst row 1
119 urhadd v1.8h, v1.8h, v3.8h
121 st1 {v0.8h}, [x5], x1              // store row 0
122 st1 {v1.8h}, [x5], x1              // store row 1
// Rounding average of a 4-pixel-wide 16 bpp block, two rows per pass:
// dst = (dst + ref + 1) >> 1 per 16-bit lane (urhadd).
// NOTE(review): x5 is the store pointer; its setup and the loop
// counter/branch are not visible in this chunk.
127 function ff_vp9_avg4_16_neon, export=1
130 ld1 {v2.4h}, [x2], x3              // ref row 0
131 ld1 {v0.4h}, [x0], x1              // dst row 0
132 ld1 {v3.4h}, [x2], x3              // ref row 1
133 urhadd v0.4h, v0.4h, v2.4h         // (dst + ref + 1) >> 1
134 ld1 {v1.4h}, [x0], x1              // dst row 1
135 urhadd v1.4h, v1.4h, v3.4h
137 st1 {v0.4h}, [x5], x1              // store row 0
// Fix: was "st1 {v1.8b}" — byte-element store of 16-bit pixels. Identical
// memory image on little-endian, but wrong halfword byte order on
// big-endian, and inconsistent with the .4h store of row 0 above.
138 st1 {v1.4h}, [x5], x1              // store row 1
144 // Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
145 // for size >= 16), and multiply-accumulate into dst1 and dst5 (or
146 // dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
// for size >= 16), using filter tap v0.h[\offset]. The ext byte offset is
// 2*\offset because each pixel is 2 bytes at 16 bpp.
// NOTE(review): the .if \size guards and .endm that presumably select
// between these variants are not visible in this chunk.
148 .macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
149 ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
150 ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
151 smlal \dst1\().4s, v20.4h, v0.h[\offset]      // widening MAC, 16x16->32
152 smlal \dst5\().4s, v22.4h, v0.h[\offset]
154 ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
155 ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
158 smlal2 \dst2\().4s, v20.8h, v0.h[\offset]     // high halves
159 smlal2 \dst6\().4s, v22.8h, v0.h[\offset]
162 smlal \dst3\().4s, v21.4h, v0.h[\offset]
163 smlal \dst7\().4s, v23.4h, v0.h[\offset]
164 smlal2 \dst4\().4s, v21.8h, v0.h[\offset]
165 smlal2 \dst8\().4s, v23.8h, v0.h[\offset]
170 // Instantiate a horizontal filter function for the given size.
171 // This can work on 4, 8 or 16 pixels in parallel; for larger
172 // widths it will do 16 pixels at a time and loop horizontally.
173 // The actual width (in bytes) is passed in x5, the height in w4 and
174 // the filter coefficients in x9.
// NOTE(review): this chunk is heavily truncated — the .if \size guards,
// loop labels, counter updates and branches that select between the
// size-specific paths below are not visible; the interleaved instruction
// pairs (vN / v2N+...) process two output rows in parallel (x2/x0 vs
// x7/x6 pointer pairs, presumably row N and row N+1).
175 .macro do_8tap_h type, size
176 function \type\()_8tap_\size\()h
182 // Only size >= 16 loops horizontally and needs
183 // reduced dst stride
187 // size >= 16 loads two qwords and increments r2,
188 // for size 4/8 it's enough with one qword and no
194 // Load the filter vector
202 ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48          // size >= 16 source load
203 ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48
205 ld1 {v5.8h, v6.8h}, [x2]                      // size 4/8 source load
206 ld1 {v16.8h, v17.8h}, [x7]
// First tap: widening multiply by v0.h[0]; the remaining 7 taps are
// accumulated by the extmlal invocations below.
210 smull v1.4s, v5.4h, v0.h[0]
211 smull v24.4s, v16.4h, v0.h[0]
213 smull2 v2.4s, v5.8h, v0.h[0]
214 smull2 v25.4s, v16.8h, v0.h[0]
217 smull v3.4s, v6.4h, v0.h[0]
218 smull v26.4s, v17.4h, v0.h[0]
219 smull2 v4.4s, v6.8h, v0.h[0]
220 smull2 v27.4s, v17.8h, v0.h[0]
222 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size
223 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size
224 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size
225 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size
226 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size
227 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size
228 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size
230 // Round, shift and saturate
231 // The sqrshrun takes care of clamping negative values to zero, but
232 // we manually need to do umin with the max pixel value.
// v31 holds the per-lane max pixel value (set up by do_8tap_h_func).
233 sqrshrun v1.4h, v1.4s, #7
234 sqrshrun v24.4h, v24.4s, #7
236 sqrshrun2 v1.8h, v2.4s, #7
237 sqrshrun2 v24.8h, v25.4s, #7
238 umin v1.8h, v1.8h, v31.8h
239 umin v24.8h, v24.8h, v31.8h
241 sqrshrun v2.4h, v3.4s, #7                     // size >= 16: second 8 px
242 sqrshrun v25.4h, v26.4s, #7
243 sqrshrun2 v2.8h, v4.4s, #7
244 sqrshrun2 v25.8h, v27.4s, #7
245 umin v2.8h, v2.8h, v31.8h
246 umin v25.8h, v25.8h, v31.8h
249 umin v1.4h, v1.4h, v31.4h                     // size 4 path
250 umin v24.4h, v24.4h, v31.4h
// avg type: rounding-average the filtered result with existing dst
// ((a + b + 1) >> 1); put type presumably skips this (guards not visible).
255 ld1 {v3.8h, v4.8h}, [x0]
256 ld1 {v29.8h, v30.8h}, [x6]
257 urhadd v1.8h, v1.8h, v3.8h
258 urhadd v2.8h, v2.8h, v4.8h
259 urhadd v24.8h, v24.8h, v29.8h
260 urhadd v25.8h, v25.8h, v30.8h
264 urhadd v1.8h, v1.8h, v3.8h                    // size 8 avg variant
265 urhadd v24.8h, v24.8h, v4.8h
269 urhadd v1.4h, v1.4h, v3.4h                    // size 4 avg variant
270 urhadd v24.4h, v24.4h, v4.4h
273 // Store and loop horizontally (for size >= 16)
276 st1 {v1.8h, v2.8h}, [x0], #32
277 st1 {v24.8h, v25.8h}, [x6], #32
// Reload source for the next 16-pixel column chunk; the previously loaded
// trailing qword carries over as the new leading data.
281 ld1 {v6.8h, v7.8h}, [x2], #32
282 ld1 {v17.8h, v18.8h}, [x7], #32
// Instantiate the put/avg horizontal filter pair for one size.
// NOTE(review): the macro body and .endm are not visible in this chunk —
// presumably it invokes do_8tap_h for both types; confirm in the full file.
303 .macro do_8tap_h_size size
// Public entry point for one (type, filter, size, bpp) horizontal
// combination. Sets up the clamp vector and filter pointer, then tail-jumps
// to the shared size-specific worker.
312 .macro do_8tap_h_func type, filter, offset, size, bpp
313 function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
// v31 = per-lane max pixel value (2^bpp - 1), built via inverted-move:
// e.g. bpp=10 -> mvni #0xfc, lsl #8 -> 0x03ff in every 16-bit lane.
314 mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
// x6 -> start of the \offset'th filter set (256 bytes per set)
315 movrel x6, X(ff_vp9_subpel_filters), 256*\offset
// x9 -> coefficients for subpixel position mx (w5): 8 taps * 2 bytes = 16
317 add x9, x6, w5, uxtw #4
322 b \type\()_8tap_\size\()h
// Instantiate put/avg entry points for all three filter kinds at one
// size/bpp. The numeric argument is the index into ff_vp9_subpel_filters
// (smooth=0, regular=1, sharp=2).
// NOTE(review): the closing .endm is not visible in this chunk.
327 .macro do_8tap_h_filters size, bpp
328 do_8tap_h_func put, regular, 1, \size, \bpp
329 do_8tap_h_func avg, regular, 1, \size, \bpp
330 do_8tap_h_func put, sharp, 2, \size, \bpp
331 do_8tap_h_func avg, sharp, 2, \size, \bpp
332 do_8tap_h_func put, smooth, 0, \size, \bpp
333 do_8tap_h_func avg, smooth, 0, \size, \bpp
// Instantiate all horizontal filter functions (every size) for one bpp,
// then expand for the two high-bitdepth modes VP9 supports (10 and 12).
// NOTE(review): the closing .endm is not visible in this chunk.
336 .macro do_8tap_h_filters_bpp bpp
337 do_8tap_h_filters 64, \bpp
338 do_8tap_h_filters 32, \bpp
339 do_8tap_h_filters 16, \bpp
340 do_8tap_h_filters 8, \bpp
341 do_8tap_h_filters 4, \bpp
344 do_8tap_h_filters_bpp 10
345 do_8tap_h_filters_bpp 12
350 // Round, shift and saturate and store reg1-reg4
// Each reg holds 4 lanes of 32-bit accumulator; sqrshrun narrows with
// rounding shift #7 and clamps negatives to zero, umin clamps to the max
// pixel value in \minreg.
// NOTE(review): the ld1/urhadd section applies to the avg \type (rounding
// average with existing dst read via x7); the .ifc \type guards around it
// are not visible in this chunk — confirm in the full file.
351 .macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
352 sqrshrun \reg1\().4h, \reg1\().4s, #7
353 sqrshrun \reg2\().4h, \reg2\().4s, #7
354 sqrshrun \reg3\().4h, \reg3\().4s, #7
355 sqrshrun \reg4\().4h, \reg4\().4s, #7
357 ld1 {\tmp1\().4h}, [x7], x1                   // existing dst rows (avg)
358 ld1 {\tmp2\().4h}, [x7], x1
359 ld1 {\tmp3\().4h}, [x7], x1
360 ld1 {\tmp4\().4h}, [x7], x1
362 umin \reg1\().4h, \reg1\().4h, \minreg\().4h  // clamp to max pixel
363 umin \reg2\().4h, \reg2\().4h, \minreg\().4h
364 umin \reg3\().4h, \reg3\().4h, \minreg\().4h
365 umin \reg4\().4h, \reg4\().4h, \minreg\().4h
367 urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h  // (a + b + 1) >> 1
368 urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h
369 urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h
370 urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h
372 st1 {\reg1\().4h}, [x0], x1                   // store 4 rows
373 st1 {\reg2\().4h}, [x0], x1
374 st1 {\reg3\().4h}, [x0], x1
375 st1 {\reg4\().4h}, [x0], x1
378 // Round, shift and saturate and store reg1-8, where
379 // reg1-2, reg3-4 etc pairwise correspond to 4 rows.
// After narrowing, reg1-reg4 each hold one full 8-lane row, freeing
// reg5-reg8 to be reused as dst-load temporaries.
// NOTE(review): the ld1/urhadd section applies to the avg \type; the
// .ifc \type guards around it are not visible in this chunk.
380 .macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
381 sqrshrun \reg1\().4h, \reg1\().4s, #7         // round/narrow, clamp < 0
382 sqrshrun2 \reg1\().8h, \reg2\().4s, #7
383 sqrshrun \reg2\().4h, \reg3\().4s, #7
384 sqrshrun2 \reg2\().8h, \reg4\().4s, #7
385 sqrshrun \reg3\().4h, \reg5\().4s, #7
386 sqrshrun2 \reg3\().8h, \reg6\().4s, #7
387 sqrshrun \reg4\().4h, \reg7\().4s, #7
388 sqrshrun2 \reg4\().8h, \reg8\().4s, #7
390 ld1 {\reg5\().8h}, [x7], x1                   // existing dst rows (avg)
391 ld1 {\reg6\().8h}, [x7], x1
392 ld1 {\reg7\().8h}, [x7], x1
393 ld1 {\reg8\().8h}, [x7], x1
395 umin \reg1\().8h, \reg1\().8h, \minreg\().8h  // clamp to max pixel
396 umin \reg2\().8h, \reg2\().8h, \minreg\().8h
397 umin \reg3\().8h, \reg3\().8h, \minreg\().8h
398 umin \reg4\().8h, \reg4\().8h, \minreg\().8h
400 urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h  // (a + b + 1) >> 1
401 urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h
402 urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h
403 urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h
405 st1 {\reg1\().8h}, [x0], x1                   // store 4 rows
406 st1 {\reg2\().8h}, [x0], x1
407 st1 {\reg3\().8h}, [x0], x1
408 st1 {\reg4\().8h}, [x0], x1
411 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
412 // (src1-src8 into dst1, src2-src9 into dst2).
// 8-tap filter over 4 lanes: taps alternate between the dst accumulator
// (even taps via smull/smlal) and a tmp accumulator (odd taps), summed at
// the end — splitting the dependency chain across two accumulators.
// Coefficients come from v0.h[0..7].
413 .macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
414 smull \dst1\().4s, \src1\().4h, v0.h[0]
415 smull \dst2\().4s, \src2\().4h, v0.h[0]
416 smull \tmp1\().4s, \src2\().4h, v0.h[1]
417 smull \tmp2\().4s, \src3\().4h, v0.h[1]
418 smlal \dst1\().4s, \src3\().4h, v0.h[2]
419 smlal \dst2\().4s, \src4\().4h, v0.h[2]
420 smlal \tmp1\().4s, \src4\().4h, v0.h[3]
421 smlal \tmp2\().4s, \src5\().4h, v0.h[3]
422 smlal \dst1\().4s, \src5\().4h, v0.h[4]
423 smlal \dst2\().4s, \src6\().4h, v0.h[4]
424 smlal \tmp1\().4s, \src6\().4h, v0.h[5]
425 smlal \tmp2\().4s, \src7\().4h, v0.h[5]
426 smlal \dst1\().4s, \src7\().4h, v0.h[6]
427 smlal \dst2\().4s, \src8\().4h, v0.h[6]
428 smlal \tmp1\().4s, \src8\().4h, v0.h[7]
429 smlal \tmp2\().4s, \src9\().4h, v0.h[7]
430 add \dst1\().4s, \dst1\().4s, \tmp1\().4s     // merge odd-tap partial sums
431 add \dst2\().4s, \dst2\().4s, \tmp2\().4s
434 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
435 // (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
// 8-lane version of convolve4: each output row needs a low (smull/smlal,
// .4h) and a high (smull2/smlal2, .8h upper half) 32-bit accumulator.
// Coefficients come from v0.h[0..7]; all 8 taps accumulate directly.
436 .macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
437 smull \dst1\().4s, \src1\().4h, v0.h[0]
438 smull2 \dst2\().4s, \src1\().8h, v0.h[0]
439 smull \dst3\().4s, \src2\().4h, v0.h[0]
440 smull2 \dst4\().4s, \src2\().8h, v0.h[0]
441 smlal \dst1\().4s, \src2\().4h, v0.h[1]
442 smlal2 \dst2\().4s, \src2\().8h, v0.h[1]
443 smlal \dst3\().4s, \src3\().4h, v0.h[1]
444 smlal2 \dst4\().4s, \src3\().8h, v0.h[1]
445 smlal \dst1\().4s, \src3\().4h, v0.h[2]
446 smlal2 \dst2\().4s, \src3\().8h, v0.h[2]
447 smlal \dst3\().4s, \src4\().4h, v0.h[2]
448 smlal2 \dst4\().4s, \src4\().8h, v0.h[2]
449 smlal \dst1\().4s, \src4\().4h, v0.h[3]
450 smlal2 \dst2\().4s, \src4\().8h, v0.h[3]
451 smlal \dst3\().4s, \src5\().4h, v0.h[3]
452 smlal2 \dst4\().4s, \src5\().8h, v0.h[3]
453 smlal \dst1\().4s, \src5\().4h, v0.h[4]
454 smlal2 \dst2\().4s, \src5\().8h, v0.h[4]
455 smlal \dst3\().4s, \src6\().4h, v0.h[4]
456 smlal2 \dst4\().4s, \src6\().8h, v0.h[4]
457 smlal \dst1\().4s, \src6\().4h, v0.h[5]
458 smlal2 \dst2\().4s, \src6\().8h, v0.h[5]
459 smlal \dst3\().4s, \src7\().4h, v0.h[5]
460 smlal2 \dst4\().4s, \src7\().8h, v0.h[5]
461 smlal \dst1\().4s, \src7\().4h, v0.h[6]
462 smlal2 \dst2\().4s, \src7\().8h, v0.h[6]
463 smlal \dst3\().4s, \src8\().4h, v0.h[6]
464 smlal2 \dst4\().4s, \src8\().8h, v0.h[6]
465 smlal \dst1\().4s, \src8\().4h, v0.h[7]
466 smlal2 \dst2\().4s, \src8\().8h, v0.h[7]
467 smlal \dst3\().4s, \src9\().4h, v0.h[7]
468 smlal2 \dst4\().4s, \src9\().8h, v0.h[7]
471 // Instantiate a vertical filter function for filtering 8 pixels at a time.
472 // The height is passed in x4, the width in x5 and the filter coefficients
// in x6 (per the do_8tap_v_func setup visible below).
// Maintains a sliding window of source rows in v16-v27; each step loads
// 4 new rows, convolves 4 output rows (two convolve8 calls) and stores
// them via do_store8. The three unrolled sections rotate through the
// register window.
// NOTE(review): loop labels, branches and several window-priming loads are
// not visible in this chunk — confirm control flow in the full file.
474 .macro do_8tap_8v type
475 function \type\()_8tap_8v
476 sub x2, x2, x3, lsl #1                        // back up 2 rows (taps above)
485 ld1 {v17.8h}, [x2], x3                        // prime the row window
486 ld1 {v18.8h}, [x2], x3
487 ld1 {v19.8h}, [x2], x3
488 ld1 {v20.8h}, [x2], x3
489 ld1 {v21.8h}, [x2], x3
490 ld1 {v22.8h}, [x2], x3
491 ld1 {v23.8h}, [x2], x3
493 ld1 {v24.8h}, [x2], x3
494 ld1 {v25.8h}, [x2], x3
495 ld1 {v26.8h}, [x2], x3
496 ld1 {v27.8h}, [x2], x3
498 convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25
499 convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
500 do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type
505 ld1 {v16.8h}, [x2], x3                        // next 4 rows
506 ld1 {v17.8h}, [x2], x3
507 ld1 {v18.8h}, [x2], x3
508 ld1 {v19.8h}, [x2], x3
509 convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17
510 convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
511 do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type
516 ld1 {v20.8h}, [x2], x3                        // next 4 rows
517 ld1 {v21.8h}, [x2], x3
518 ld1 {v22.8h}, [x2], x3
519 ld1 {v23.8h}, [x2], x3
520 convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21
521 convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
522 do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type
// Rewind pointers to the next 8-pixel-wide column slice:
530 // x0 -= h * dst_stride
532 // x2 -= h * src_stride
534 // x2 -= 8 * src_stride
535 sub x2, x2, x3, lsl #3
536 // x2 += 1 * src_stride
550 // Instantiate a vertical filter function for filtering a 4 pixels wide
551 // slice. This only is designed to work for 4 or 8 output lines.
// Same sliding-window scheme as do_8tap_8v but with .4h rows and
// convolve4; the first section produces 4 output rows, the second (for
// 8-line blocks) loads 4 more rows and produces the remaining 4.
// NOTE(review): the branch that exits after 4 lines and the function
// epilogue are not visible in this chunk.
552 .macro do_8tap_4v type
553 function \type\()_8tap_4v
554 sub x2, x2, x3, lsl #1                        // back up 2 rows (taps above)
561 ld1 {v16.4h}, [x2], x3                        // prime the row window
562 ld1 {v17.4h}, [x2], x3
563 ld1 {v18.4h}, [x2], x3
564 ld1 {v19.4h}, [x2], x3
565 ld1 {v20.4h}, [x2], x3
566 ld1 {v21.4h}, [x2], x3
567 ld1 {v22.4h}, [x2], x3
568 ld1 {v23.4h}, [x2], x3
569 ld1 {v24.4h}, [x2], x3
570 ld1 {v25.4h}, [x2], x3
571 ld1 {v26.4h}, [x2], x3
573 convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
574 convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
575 do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type
580 ld1 {v27.4h}, [x2], x3                        // rows for lines 4..7
581 ld1 {v28.4h}, [x2], x3
582 ld1 {v29.4h}, [x2], x3
583 ld1 {v30.4h}, [x2], x3
585 convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
586 convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
587 do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type
// Public entry point for one (type, filter, size, bpp) vertical
// combination: set up clamp vector and filter pointer.
// NOTE(review): the dispatch to the 4/8-wide worker and .endm are not
// visible in this chunk.
598 .macro do_8tap_v_func type, filter, offset, size, bpp
599 function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
// v1 = per-lane max pixel value (2^bpp - 1), e.g. 0x03ff for bpp=10
601 mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
// x5 -> start of the \offset'th filter set (256 bytes per set)
602 movrel x5, X(ff_vp9_subpel_filters), 256*\offset
// x6 -> coefficients for subpixel position my (w6): 8 taps * 2 bytes = 16
603 add x6, x5, w6, uxtw #4
// Instantiate put/avg vertical entry points for all three filter kinds at
// one size/bpp (smooth=0, regular=1, sharp=2 index into
// ff_vp9_subpel_filters), then for all sizes at one bpp.
// NOTE(review): the closing .endm directives are not visible in this chunk.
613 .macro do_8tap_v_filters size, bpp
614 do_8tap_v_func put, regular, 1, \size, \bpp
615 do_8tap_v_func avg, regular, 1, \size, \bpp
616 do_8tap_v_func put, sharp, 2, \size, \bpp
617 do_8tap_v_func avg, sharp, 2, \size, \bpp
618 do_8tap_v_func put, smooth, 0, \size, \bpp
619 do_8tap_v_func avg, smooth, 0, \size, \bpp
622 .macro do_8tap_v_filters_bpp bpp
623 do_8tap_v_filters 64, \bpp
624 do_8tap_v_filters 32, \bpp
625 do_8tap_v_filters 16, \bpp
626 do_8tap_v_filters 8, \bpp
627 do_8tap_v_filters 4, \bpp
// Expand every vertical filter function for the two high-bitdepth modes.
630 do_8tap_v_filters_bpp 10
631 do_8tap_v_filters_bpp 12