2 * Copyright (c) 2017 Google Inc.
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/aarch64/asm.S"
23 // All public functions in this file have the following signature:
24 // typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
25 // const uint8_t *ref, ptrdiff_t ref_stride,
26 // int h, int mx, int my);
// void ff_vp9_avg64_16_neon(uint8_t *dst, ptrdiff_t dst_stride,
//                           const uint8_t *ref, ptrdiff_t ref_stride,
//                           int h, int mx, int my)
// Rounding average of a 64-pixel-wide block of 16-bit pixels:
// dst[i] = (dst[i] + ref[i] + 1) >> 1 (urhadd), one row per iteration,
// handled as two 32-pixel (64-byte) halves.
// NOTE(review): the loop label, the setup of x5 (dst store pointer), the
// row-count decrement/branch and ret/endfunc fall outside the visible
// lines; the strides x1/x3 are presumably pre-decremented by 64 to pair
// with the #64 post-increments — confirm in the full file.
28 function ff_vp9_avg64_16_neon, export=1
// First half of the row: ref into v4-v7, dst into v0-v3 (64 bytes each).
33 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
34 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
// Second half of the row from ref; stride advance moves x2 to the next row.
35 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
36 urhadd v0.8h, v0.8h, v4.8h
37 urhadd v1.8h, v1.8h, v5.8h
// Second half of the row from dst, interleaved with the first-half averages.
38 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
39 urhadd v2.8h, v2.8h, v6.8h
40 urhadd v3.8h, v3.8h, v7.8h
42 urhadd v16.8h, v16.8h, v20.8h
43 urhadd v17.8h, v17.8h, v21.8h
// Store first half, then second half with stride advance to the next row.
44 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64
45 urhadd v18.8h, v18.8h, v22.8h
46 urhadd v19.8h, v19.8h, v23.8h
47 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
// void ff_vp9_avg32_16_neon(uint8_t *dst, ptrdiff_t dst_stride,
//                           const uint8_t *ref, ptrdiff_t ref_stride,
//                           int h, int mx, int my)
// Rounding average of a 32-pixel-wide block of 16-bit pixels; each
// iteration processes two full rows (64 bytes = one 4-register ld1 each).
// NOTE(review): loop label, x5 setup, counter update and ret/endfunc are
// outside the visible lines — confirm in the full file.
52 function ff_vp9_avg32_16_neon, export=1
// Row N: ref -> v4-v7, dst -> v0-v3.
55 ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3
56 ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
// Row N+1: ref -> v20-v23, loaded early to hide latency.
57 ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
58 urhadd v0.8h, v0.8h, v4.8h
59 urhadd v1.8h, v1.8h, v5.8h
// Row N+1: dst -> v16-v19.
60 ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
61 urhadd v2.8h, v2.8h, v6.8h
62 urhadd v3.8h, v3.8h, v7.8h
64 urhadd v16.8h, v16.8h, v20.8h
65 urhadd v17.8h, v17.8h, v21.8h
66 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1
67 urhadd v18.8h, v18.8h, v22.8h
68 urhadd v19.8h, v19.8h, v23.8h
69 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
// void ff_vp9_avg16_16_neon(uint8_t *dst, ptrdiff_t dst_stride,
//                           const uint8_t *ref, ptrdiff_t ref_stride,
//                           int h, int mx, int my)
// Rounding average of a 16-pixel-wide block of 16-bit pixels, one row
// (32 bytes) per iteration. dst is loaded without writeback and stored
// with writeback, so x0 advances exactly one dst row per iteration.
// NOTE(review): loop label, counter decrement/branch and ret/endfunc are
// outside the visible lines — confirm in the full file.
74 function ff_vp9_avg16_16_neon, export=1
76 ld1 {v2.8h, v3.8h}, [x2], x3
77 ld1 {v0.8h, v1.8h}, [x0]
78 urhadd v0.8h, v0.8h, v2.8h
79 urhadd v1.8h, v1.8h, v3.8h
81 st1 {v0.8h, v1.8h}, [x0], x1
// void ff_vp9_avg8_16_neon(uint8_t *dst, ptrdiff_t dst_stride,
//                          const uint8_t *ref, ptrdiff_t ref_stride,
//                          int h, int mx, int my)
// Rounding average of an 8-pixel-wide block of 16-bit pixels, two rows
// per iteration (v0/v1 = dst rows, v2/v3 = ref rows).
// NOTE(review): the loads into v0-v3, the stores, the row counter and the
// loop/ret/endfunc are all outside the visible lines — confirm there.
86 function ff_vp9_avg8_16_neon, export=1
92 urhadd v0.8h, v0.8h, v2.8h
94 urhadd v1.8h, v1.8h, v3.8h
// void ff_vp9_avg4_16_neon(uint8_t *dst, ptrdiff_t dst_stride,
//                          const uint8_t *ref, ptrdiff_t ref_stride,
//                          int h, int mx, int my)
// Rounding average of a 4-pixel-wide block of 16-bit pixels, two rows
// (8 bytes each) per iteration.
// NOTE(review): loop label, x5 setup, counter update and ret/endfunc are
// outside the visible lines — confirm in the full file.
102 function ff_vp9_avg4_16_neon, export=1
105 ld1 {v2.4h}, [x2], x3
106 ld1 {v0.4h}, [x0], x1
107 ld1 {v3.4h}, [x2], x3
108 urhadd v0.4h, v0.4h, v2.4h
109 ld1 {v1.4h}, [x0], x1
110 urhadd v1.4h, v1.4h, v3.4h
112 st1 {v0.4h}, [x5], x1
// Fixed: was "st1 {v1.8b}" — same 8 bytes stored, but the .8b byte
// arrangement was inconsistent with the 16-bit pixel data and with the
// matching .4h store of v0 above.
113 st1 {v1.4h}, [x5], x1
119 // Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
120 // for size >= 16), and multiply-accumulate into dst1 and dst5 (or
121 // dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
// for size >= 16), using the filter coefficient v0.h[\offset].
// Used by the horizontal 8-tap filter: the ext instructions form the
// input vector shifted by \offset 16-bit pixels (2*\offset bytes), and
// smlal/smlal2 widen-accumulate into the 32-bit dst accumulators.
// Clobbers v20-v23 as scratch.
// NOTE(review): the size-conditional (.if \size >= ...) lines that gate
// the wider accumulations are outside the visible lines, as is .endm.
123 .macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
// First extracted vector for each of the two parallel rows.
124 ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
125 ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
126 smlal \dst1\().4s, v20.4h, v0.h[\offset]
127 smlal \dst5\().4s, v22.4h, v0.h[\offset]
// Second extracted vector (the next 8 pixels) for sizes that need it.
129 ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
130 ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
// High halves of the first extracted vectors.
133 smlal2 \dst2\().4s, v20.8h, v0.h[\offset]
134 smlal2 \dst6\().4s, v22.8h, v0.h[\offset]
137 smlal \dst3\().4s, v21.4h, v0.h[\offset]
138 smlal \dst7\().4s, v23.4h, v0.h[\offset]
139 smlal2 \dst4\().4s, v21.8h, v0.h[\offset]
140 smlal2 \dst8\().4s, v23.8h, v0.h[\offset]
145 // Instantiate a horizontal filter function for the given size.
146 // This can work on 4, 8 or 16 pixels in parallel; for larger
147 // widths it will do 16 pixels at a time and loop horizontally.
148 // The actual width (in bytes) is passed in x5, the height in w4 and
149 // the filter coefficients in x9.
// Two output rows are produced per iteration: x2/x0 address one row,
// x7/x6 the other; accumulators v1-v4 belong to row 0, v24-v27 to row 1.
// v31 holds the max pixel value for the bpp (set by the caller wrapper),
// v0 holds the 8 filter taps as 16-bit lanes.
// NOTE(review): the loop labels, size-conditional (.if/.ifc) directives,
// counter updates, branches and ret/endfunc are outside the visible
// lines — the fragments below are the straight-line filter core only.
150 .macro do_8tap_h type, size
151 function \type\()_8tap_\size\()h
157 // Only size >= 16 loops horizontally and needs
158 // reduced dst stride
162 // size >= 16 loads two qwords and increments r2,
163 // for size 4/8 it's enough with one qword and no
// increment (NOTE(review): comment truncated in this view).
169 // Load the filter vector
// size >= 16 path: load 24 source pixels per row (v5-v7 / v16-v18).
177 ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48
178 ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48
// size 4/8 path: 16 source pixels per row are enough.
180 ld1 {v5.8h, v6.8h}, [x2]
181 ld1 {v16.8h, v17.8h}, [x7]
// Tap 0 starts the accumulation (smull); taps 1-7 are added via extmlal.
185 smull v1.4s, v5.4h, v0.h[0]
186 smull v24.4s, v16.4h, v0.h[0]
188 smull2 v2.4s, v5.8h, v0.h[0]
189 smull2 v25.4s, v16.8h, v0.h[0]
192 smull v3.4s, v6.4h, v0.h[0]
193 smull v26.4s, v17.4h, v0.h[0]
194 smull2 v4.4s, v6.8h, v0.h[0]
195 smull2 v27.4s, v17.8h, v0.h[0]
197 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size
198 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size
199 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size
200 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size
201 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size
202 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size
203 extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size
205 // Round, shift and saturate
206 // The sqrshrun takes care of clamping negative values to zero, but
207 // we manually need to do umin with the max pixel value.
208 sqrshrun v1.4h, v1.4s, #7
209 sqrshrun v24.4h, v24.4s, #7
211 sqrshrun2 v1.8h, v2.4s, #7
212 sqrshrun2 v24.8h, v25.4s, #7
213 umin v1.8h, v1.8h, v31.8h
214 umin v24.8h, v24.8h, v31.8h
// size >= 16: narrow and clamp the second 8 pixels as well.
216 sqrshrun v2.4h, v3.4s, #7
217 sqrshrun v25.4h, v26.4s, #7
218 sqrshrun2 v2.8h, v4.4s, #7
219 sqrshrun2 v25.8h, v27.4s, #7
220 umin v2.8h, v2.8h, v31.8h
221 umin v25.8h, v25.8h, v31.8h
// size 4: only 4 output pixels per row need clamping.
224 umin v1.4h, v1.4h, v31.4h
225 umin v24.4h, v24.4h, v31.4h
// avg variant: load existing dst pixels and rounding-average into them.
230 ld1 {v3.8h, v4.8h}, [x0]
231 ld1 {v29.8h, v30.8h}, [x6]
232 urhadd v1.8h, v1.8h, v3.8h
233 urhadd v2.8h, v2.8h, v4.8h
234 urhadd v24.8h, v24.8h, v29.8h
235 urhadd v25.8h, v25.8h, v30.8h
// avg, size 8 (NOTE(review): the matching loads are outside this view).
239 urhadd v1.8h, v1.8h, v3.8h
240 urhadd v24.8h, v24.8h, v4.8h
// avg, size 4.
244 urhadd v1.4h, v1.4h, v3.4h
245 urhadd v24.4h, v24.4h, v4.4h
248 // Store and loop horizontally (for size >= 16)
251 st1 {v1.8h, v2.8h}, [x0], #32
252 st1 {v24.8h, v25.8h}, [x6], #32
// Reload the next 16 input pixels, reusing the last 8 already loaded.
256 ld1 {v6.8h, v7.8h}, [x2], #32
257 ld1 {v17.8h, v18.8h}, [x7], #32
// Instantiate the do_8tap_h filter cores (put and avg) for one size.
// NOTE(review): the macro body and .endm are outside the visible lines.
278 .macro do_8tap_h_size size
// Emit the public entry point ff_vp9_<type>_<filter><size>_h_<bpp>_neon.
// Sets up the per-bpp max pixel value and the filter coefficient pointer,
// then tail-jumps to the shared size-specialized core.
287 .macro do_8tap_h_func type, filter, offset, size, bpp
288 function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
// v31 = max pixel value for bpp: mvni builds ~(imm << 8) per lane,
// e.g. 0x03ff for 10 bpp, 0x0fff for 12 bpp.
289 mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
// x6 -> the filter bank for this \filter (256 bytes per bank).
290 movrel x6, X(ff_vp9_subpel_filters), 256*\offset
// x9 -> coefficients for subpixel position mx (w5); 16 bytes per entry.
292 add x9, x6, w5, uxtw #4
// NOTE(review): additional setup lines are missing from this view.
297 b \type\()_8tap_\size\()h
// Instantiate all six horizontal entry points (put/avg x
// regular/sharp/smooth) for one size and bpp. The offset selects the
// matching bank within ff_vp9_subpel_filters.
302 .macro do_8tap_h_filters size, bpp
303 do_8tap_h_func put, regular, 1, \size, \bpp
304 do_8tap_h_func avg, regular, 1, \size, \bpp
305 do_8tap_h_func put, sharp, 2, \size, \bpp
306 do_8tap_h_func avg, sharp, 2, \size, \bpp
307 do_8tap_h_func put, smooth, 0, \size, \bpp
308 do_8tap_h_func avg, smooth, 0, \size, \bpp
// Instantiate the horizontal filters for every block size at one bpp,
// then expand the whole set for 10- and 12-bit content.
311 .macro do_8tap_h_filters_bpp bpp
312 do_8tap_h_filters 64, \bpp
313 do_8tap_h_filters 32, \bpp
314 do_8tap_h_filters 16, \bpp
315 do_8tap_h_filters 8, \bpp
316 do_8tap_h_filters 4, \bpp
// NOTE(review): the .endm closing this macro is outside the visible lines.
319 do_8tap_h_filters_bpp 10
320 do_8tap_h_filters_bpp 12
325 // Round, shift and saturate and store reg1-reg4
// Each reg holds 4 pixels of one output row as 32-bit accumulators.
// sqrshrun rounds, shifts right by 7 and saturates to unsigned 16-bit
// (clamping negatives to zero); umin clamps to the bpp max in \minreg.
// NOTE(review): the ld1/urhadd lines below are presumably guarded by an
// .ifc \type,avg conditional that is outside the visible lines — for the
// avg variant they average with the existing dst rows read via x7.
326 .macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
327 sqrshrun \reg1\().4h, \reg1\().4s, #7
328 sqrshrun \reg2\().4h, \reg2\().4s, #7
329 sqrshrun \reg3\().4h, \reg3\().4s, #7
330 sqrshrun \reg4\().4h, \reg4\().4s, #7
// avg: load the 4 existing dst rows.
332 ld1 {\tmp1\().4h}, [x7], x1
333 ld1 {\tmp2\().4h}, [x7], x1
334 ld1 {\tmp3\().4h}, [x7], x1
335 ld1 {\tmp4\().4h}, [x7], x1
// Clamp to the maximum pixel value for the bit depth.
337 umin \reg1\().4h, \reg1\().4h, \minreg\().4h
338 umin \reg2\().4h, \reg2\().4h, \minreg\().4h
339 umin \reg3\().4h, \reg3\().4h, \minreg\().4h
340 umin \reg4\().4h, \reg4\().4h, \minreg\().4h
// avg: rounding average with the existing dst pixels.
342 urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h
343 urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h
344 urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h
345 urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h
347 st1 {\reg1\().4h}, [x0], x1
348 st1 {\reg2\().4h}, [x0], x1
349 st1 {\reg3\().4h}, [x0], x1
350 st1 {\reg4\().4h}, [x0], x1
353 // Round, shift and saturate and store reg1-8, where
354 // reg1-2, reg3-4 etc pairwise correspond to 4 rows.
// After narrowing, reg1-reg4 each hold one full 8-pixel row; reg5-reg8
// are then reused as scratch for the avg dst loads.
// NOTE(review): the ld1/urhadd lines are presumably inside an
// .ifc \type,avg conditional outside the visible lines.
355 .macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
// Round (+64), shift right by 7 and saturate each row to unsigned 16-bit.
356 sqrshrun \reg1\().4h, \reg1\().4s, #7
357 sqrshrun2 \reg1\().8h, \reg2\().4s, #7
358 sqrshrun \reg2\().4h, \reg3\().4s, #7
359 sqrshrun2 \reg2\().8h, \reg4\().4s, #7
360 sqrshrun \reg3\().4h, \reg5\().4s, #7
361 sqrshrun2 \reg3\().8h, \reg6\().4s, #7
362 sqrshrun \reg4\().4h, \reg7\().4s, #7
363 sqrshrun2 \reg4\().8h, \reg8\().4s, #7
// avg: load the 4 existing dst rows (reg5-reg8 are free after narrowing).
365 ld1 {\reg5\().8h}, [x7], x1
366 ld1 {\reg6\().8h}, [x7], x1
367 ld1 {\reg7\().8h}, [x7], x1
368 ld1 {\reg8\().8h}, [x7], x1
// Clamp to the maximum pixel value for the bit depth.
370 umin \reg1\().8h, \reg1\().8h, \minreg\().8h
371 umin \reg2\().8h, \reg2\().8h, \minreg\().8h
372 umin \reg3\().8h, \reg3\().8h, \minreg\().8h
373 umin \reg4\().8h, \reg4\().8h, \minreg\().8h
// avg: rounding average with the existing dst pixels.
375 urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h
376 urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h
377 urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h
378 urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h
380 st1 {\reg1\().8h}, [x0], x1
381 st1 {\reg2\().8h}, [x0], x1
382 st1 {\reg3\().8h}, [x0], x1
383 st1 {\reg4\().8h}, [x0], x1
386 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
387 // (src1-src8 into dst1, src2-src9 into dst2).
// 8-tap vertical filter over 4-pixel rows. Even taps (0,2,4,6)
// accumulate into dst1/dst2 and odd taps (1,3,5,7) into tmp1/tmp2, then
// the two halves are summed; the coefficients live in v0.h[0..7].
// Widening smull/smlal keeps the accumulation in 32 bits.
388 .macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
389 smull \dst1\().4s, \src1\().4h, v0.h[0]
390 smull \dst2\().4s, \src2\().4h, v0.h[0]
391 smull \tmp1\().4s, \src2\().4h, v0.h[1]
392 smull \tmp2\().4s, \src3\().4h, v0.h[1]
393 smlal \dst1\().4s, \src3\().4h, v0.h[2]
394 smlal \dst2\().4s, \src4\().4h, v0.h[2]
395 smlal \tmp1\().4s, \src4\().4h, v0.h[3]
396 smlal \tmp2\().4s, \src5\().4h, v0.h[3]
397 smlal \dst1\().4s, \src5\().4h, v0.h[4]
398 smlal \dst2\().4s, \src6\().4h, v0.h[4]
399 smlal \tmp1\().4s, \src6\().4h, v0.h[5]
400 smlal \tmp2\().4s, \src7\().4h, v0.h[5]
401 smlal \dst1\().4s, \src7\().4h, v0.h[6]
402 smlal \dst2\().4s, \src8\().4h, v0.h[6]
403 smlal \tmp1\().4s, \src8\().4h, v0.h[7]
404 smlal \tmp2\().4s, \src9\().4h, v0.h[7]
// Combine the even-tap and odd-tap partial sums.
405 add \dst1\().4s, \dst1\().4s, \tmp1\().4s
406 add \dst2\().4s, \dst2\().4s, \tmp2\().4s
409 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
410 // (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
// 8-tap vertical filter over 8-pixel rows: dst1/dst2 receive the low and
// high 4-lane halves of the first output row, dst3/dst4 of the second.
// All 8 taps (v0.h[0..7]) accumulate directly via widening smlal/smlal2.
411 .macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
412 smull \dst1\().4s, \src1\().4h, v0.h[0]
413 smull2 \dst2\().4s, \src1\().8h, v0.h[0]
414 smull \dst3\().4s, \src2\().4h, v0.h[0]
415 smull2 \dst4\().4s, \src2\().8h, v0.h[0]
416 smlal \dst1\().4s, \src2\().4h, v0.h[1]
417 smlal2 \dst2\().4s, \src2\().8h, v0.h[1]
418 smlal \dst3\().4s, \src3\().4h, v0.h[1]
419 smlal2 \dst4\().4s, \src3\().8h, v0.h[1]
420 smlal \dst1\().4s, \src3\().4h, v0.h[2]
421 smlal2 \dst2\().4s, \src3\().8h, v0.h[2]
422 smlal \dst3\().4s, \src4\().4h, v0.h[2]
423 smlal2 \dst4\().4s, \src4\().8h, v0.h[2]
424 smlal \dst1\().4s, \src4\().4h, v0.h[3]
425 smlal2 \dst2\().4s, \src4\().8h, v0.h[3]
426 smlal \dst3\().4s, \src5\().4h, v0.h[3]
427 smlal2 \dst4\().4s, \src5\().8h, v0.h[3]
428 smlal \dst1\().4s, \src5\().4h, v0.h[4]
429 smlal2 \dst2\().4s, \src5\().8h, v0.h[4]
430 smlal \dst3\().4s, \src6\().4h, v0.h[4]
431 smlal2 \dst4\().4s, \src6\().8h, v0.h[4]
432 smlal \dst1\().4s, \src6\().4h, v0.h[5]
433 smlal2 \dst2\().4s, \src6\().8h, v0.h[5]
434 smlal \dst3\().4s, \src7\().4h, v0.h[5]
435 smlal2 \dst4\().4s, \src7\().8h, v0.h[5]
436 smlal \dst1\().4s, \src7\().4h, v0.h[6]
437 smlal2 \dst2\().4s, \src7\().8h, v0.h[6]
438 smlal \dst3\().4s, \src8\().4h, v0.h[6]
439 smlal2 \dst4\().4s, \src8\().8h, v0.h[6]
440 smlal \dst1\().4s, \src8\().4h, v0.h[7]
441 smlal2 \dst2\().4s, \src8\().8h, v0.h[7]
442 smlal \dst3\().4s, \src9\().4h, v0.h[7]
443 smlal2 \dst4\().4s, \src9\().8h, v0.h[7]
446 // Instantiate a vertical filter function for filtering 8 pixels at a time.
447 // The height is passed in x4, the width in x5 and the filter coefficients
// in x6 (NOTE(review): comment truncated in this view — confirm register).
// Maintains a sliding window of source rows in v16-v27 and emits 4
// output rows per convolve8/do_store8 pair. The three visible segments
// are one software-pipelined loop body unrolled three times, rotating
// the register window; the loop labels, counter updates and branches
// between them are outside the visible lines.
449 .macro do_8tap_8v type
450 function \type\()_8tap_8v
// Back up the source pointer to include rows above the filtered row
// (8-tap needs 3 rows of context before; NOTE(review): only a 2-row
// adjustment is visible here — the remainder is outside this view).
451 sub x2, x2, x3, lsl #1
// Prime the pipeline: load the first 7 context rows...
460 ld1 {v17.8h}, [x2], x3
461 ld1 {v18.8h}, [x2], x3
462 ld1 {v19.8h}, [x2], x3
463 ld1 {v20.8h}, [x2], x3
464 ld1 {v21.8h}, [x2], x3
465 ld1 {v22.8h}, [x2], x3
466 ld1 {v23.8h}, [x2], x3
// ...then 4 fresh rows per iteration.
468 ld1 {v24.8h}, [x2], x3
469 ld1 {v25.8h}, [x2], x3
470 ld1 {v26.8h}, [x2], x3
471 ld1 {v27.8h}, [x2], x3
// Rows v17..v25 -> 2 output rows, rows v19..v27 -> 2 more.
473 convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25
474 convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
475 do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type
// Second unrolled segment: window has rotated to v21..v27,v16..v19.
480 ld1 {v16.8h}, [x2], x3
481 ld1 {v17.8h}, [x2], x3
482 ld1 {v18.8h}, [x2], x3
483 ld1 {v19.8h}, [x2], x3
484 convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17
485 convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
486 do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type
// Third unrolled segment: window rotated to v25..v27,v16..v23.
491 ld1 {v20.8h}, [x2], x3
492 ld1 {v21.8h}, [x2], x3
493 ld1 {v22.8h}, [x2], x3
494 ld1 {v23.8h}, [x2], x3
495 convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21
496 convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
497 do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type
// Advance to the next 8-pixel column (horizontal loop bookkeeping).
505 // x0 -= h * dst_stride
507 // x2 -= h * src_stride
509 // x2 -= 8 * src_stride
510 sub x2, x2, x3, lsl #3
511 // x2 += 1 * src_stride
525 // Instantiate a vertical filter function for filtering a 4 pixels wide
526 // slice. This only is designed to work for 4 or 8 output lines.
// Same pipelined structure as do_8tap_8v but on 4-pixel rows using
// convolve4/do_store4; the loop labels, counter handling and
// ret/endfunc are outside the visible lines.
527 .macro do_8tap_4v type
// Back up the source pointer for the rows of filter context above
// (NOTE(review): only a 2-row adjustment is visible; 8-tap needs 3 —
// the remaining sub is presumably outside this view).
528 sub x2, x2, x3, lsl #1
529 function \type\()_8tap_4v
// NOTE(review): the function/sub ordering above reflects the visible
// lines only; remaining setup is outside this view.
// Load the 11 rows needed for the first 4 output rows.
536 ld1 {v16.4h}, [x2], x3
537 ld1 {v17.4h}, [x2], x3
538 ld1 {v18.4h}, [x2], x3
539 ld1 {v19.4h}, [x2], x3
540 ld1 {v20.4h}, [x2], x3
541 ld1 {v21.4h}, [x2], x3
542 ld1 {v22.4h}, [x2], x3
543 ld1 {v23.4h}, [x2], x3
544 ld1 {v24.4h}, [x2], x3
545 ld1 {v25.4h}, [x2], x3
546 ld1 {v26.4h}, [x2], x3
// Rows v16..v24 -> 2 output rows, v18..v26 -> 2 more.
548 convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
549 convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
550 do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type
// For 8 output lines: load 4 more rows and filter the second half.
555 ld1 {v27.4h}, [x2], x3
556 ld1 {v28.4h}, [x2], x3
557 ld1 {v29.4h}, [x2], x3
558 ld1 {v30.4h}, [x2], x3
560 convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
561 convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
562 do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type
// Emit the public entry point ff_vp9_<type>_<filter><size>_v_<bpp>_neon.
// Sets up the per-bpp max pixel value (v1) and the coefficient pointer,
// then dispatches to the 8- or 4-pixel vertical core.
// NOTE(review): the dispatch/branch and endfunc are outside the visible
// lines.
573 .macro do_8tap_v_func type, filter, offset, size, bpp
574 function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
// v1 = max pixel value for bpp (same mvni trick as the horizontal path).
576 mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
// x5 -> the filter bank for this \filter (256 bytes per bank).
577 movrel x5, X(ff_vp9_subpel_filters), 256*\offset
// x6 -> coefficients for subpixel position my (w6); 16 bytes per entry.
578 add x6, x5, w6, uxtw #4
// Instantiate all six vertical entry points (put/avg x
// regular/sharp/smooth) for one size and bpp. The offset selects the
// matching bank within ff_vp9_subpel_filters.
588 .macro do_8tap_v_filters size, bpp
589 do_8tap_v_func put, regular, 1, \size, \bpp
590 do_8tap_v_func avg, regular, 1, \size, \bpp
591 do_8tap_v_func put, sharp, 2, \size, \bpp
592 do_8tap_v_func avg, sharp, 2, \size, \bpp
593 do_8tap_v_func put, smooth, 0, \size, \bpp
594 do_8tap_v_func avg, smooth, 0, \size, \bpp
// Instantiate the vertical filters for every block size at one bpp,
// then expand the whole set for 10- and 12-bit content.
597 .macro do_8tap_v_filters_bpp bpp
598 do_8tap_v_filters 64, \bpp
599 do_8tap_v_filters 32, \bpp
600 do_8tap_v_filters 16, \bpp
601 do_8tap_v_filters 8, \bpp
602 do_8tap_v_filters 4, \bpp
// NOTE(review): the .endm closing this macro is outside the visible lines.
605 do_8tap_v_filters_bpp 10
606 do_8tap_v_filters_bpp 12