2 * Copyright (c) 2016 Google Inc.
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/aarch64/asm.S"
23 // All public functions in this file have the following signature:
24 // typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
25 // const uint8_t *ref, ptrdiff_t ref_stride,
26 // int h, int mx, int my);
// Exported full-pel copy for 64-byte-wide rows, following the vp9_mc_func
// signature documented above (x0 = dst, x2 = ref under AAPCS64). The
// instructions shown here use general-purpose register pairs, not NEON.
// NOTE(review): only the transfer of bytes 32..63 of a row is visible in
// this excerpt; the transfer of bytes 0..31, the stride advance and the
// row loop are presumably in lines not shown - confirm.
28 function ff_vp9_copy64_aarch64, export=1
// x9/x10 = bytes 32..47 of the current reference row
33 ldp x9, x10, [x2, #32]
// x11/x12 = bytes 48..63 of the current reference row
36 ldp x11, x12, [x2, #48]
// Write both pairs to the same offsets within the destination row
37 stp x9, x10, [x0, #32]
38 stp x11, x12, [x0, #48]
// Exported 64-byte-wide averaging function (vp9_mc_func signature): each
// output byte is the unsigned rounding average (a + b + 1) >> 1 (urhadd) of
// a reference byte and the byte already present in the destination.
//   x0 = dst read pointer   (post-incremented by x1 = dst_stride)
//   x2 = ref pointer        (post-incremented by x3 = ref_stride)
//   x5 = dst write pointer  (post-incremented by x1)
// Two rows are handled per pass; the loads for the second row are
// interleaved with the averages of the first.
// NOTE(review): the initialisation of x5 (presumably a copy of x0), the row
// counter update and the loop branch are not visible in this excerpt -
// confirm.
45 function ff_vp9_avg64_neon, export=1
// Row 0: 64 reference bytes -> v4-v7, 64 destination bytes -> v0-v3
48 ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
49 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
// Row 1: 64 reference bytes -> v20-v23
50 ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
51 urhadd v0.16b, v0.16b, v4.16b
52 urhadd v1.16b, v1.16b, v5.16b
// Row 1: 64 destination bytes -> v16-v19
53 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
54 urhadd v2.16b, v2.16b, v6.16b
55 urhadd v3.16b, v3.16b, v7.16b
57 urhadd v16.16b, v16.16b, v20.16b
58 urhadd v17.16b, v17.16b, v21.16b
// Store averaged row 0 through the separate write pointer
59 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1
60 urhadd v18.16b, v18.16b, v22.16b
61 urhadd v19.16b, v19.16b, v23.16b
// Store averaged row 1
62 st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
// Exported entry point following the vp9_mc_func signature.
// NOTE(review): judging by its name this is the 32-byte-wide plain copy;
// its body is not visible in this excerpt - confirm.
67 function ff_vp9_copy32_aarch64, export=1
// Exported 32-byte-wide averaging function (vp9_mc_func signature): each
// output byte is the unsigned rounding average (a + b + 1) >> 1 of the
// reference byte and the existing destination byte. One row is shown per
// pass; the destination is read and written through the same pointer x0.
// NOTE(review): the row counter update and loop branch are not visible in
// this excerpt - confirm.
80 function ff_vp9_avg32_neon, export=1
// 32 reference bytes -> v2/v3, then step ref by ref_stride (x3)
82 ld1 {v2.16b, v3.16b}, [x2], x3
// 32 destination bytes -> v0/v1; no post-increment, x0 is reused for the store
83 ld1 {v0.16b, v1.16b}, [x0]
84 urhadd v0.16b, v0.16b, v2.16b
85 urhadd v1.16b, v1.16b, v3.16b
// Write the averaged row back and step dst by dst_stride (x1)
87 st1 {v0.16b, v1.16b}, [x0], x1
// Exported 16-byte-wide copy (vp9_mc_func signature), four rows per pass.
// Rows are read alternately through x2 and x6 and written alternately
// through x0 and x5; every pointer is post-incremented by x3 (source side)
// or x1 (destination side).
// NOTE(review): the setup is not visible in this excerpt. Presumably x6/x5
// start one row below x2/x0 and x3/x1 hold doubled strides, so the two
// pointer pairs cover odd and even rows - confirm. The row counter and loop
// branch are likewise not shown.
92 function ff_vp9_copy16_neon, export=1
// Load four 16-byte rows, alternating between the two source pointers
98 ld1 {v0.16b}, [x2], x3
99 ld1 {v1.16b}, [x6], x3
100 ld1 {v2.16b}, [x2], x3
101 ld1 {v3.16b}, [x6], x3
// Store them in the same order, alternating between the two dst pointers
103 st1 {v0.16b}, [x0], x1
104 st1 {v1.16b}, [x5], x1
105 st1 {v2.16b}, [x0], x1
106 st1 {v3.16b}, [x5], x1
// Exported 16-byte-wide averaging function (vp9_mc_func signature), two rows
// per pass. Each output byte is the unsigned rounding average
// (a + b + 1) >> 1 of the reference byte and the existing destination byte.
//   x0 = dst read pointer, x5 = dst write pointer (both stepped by x1)
//   x2 = ref pointer (stepped by x3)
// NOTE(review): the initialisation of x5 (presumably a copy of x0), the row
// counter update and the loop branch are not visible in this excerpt -
// confirm.
111 function ff_vp9_avg16_neon, export=1
// Row 0: reference -> v2, destination -> v0
114 ld1 {v2.16b}, [x2], x3
115 ld1 {v0.16b}, [x0], x1
// Row 1: reference -> v3, destination -> v1 (interleaved with the row 0 average)
116 ld1 {v3.16b}, [x2], x3
117 urhadd v0.16b, v0.16b, v2.16b
118 ld1 {v1.16b}, [x0], x1
119 urhadd v1.16b, v1.16b, v3.16b
// Store both averaged rows through the write pointer
121 st1 {v0.16b}, [x5], x1
122 st1 {v1.16b}, [x5], x1
// Exported 8-byte-wide copy (vp9_mc_func signature), two rows per pass:
// x2 = ref (stepped by x3 = ref_stride), x0 = dst (stepped by x1 = dst_stride).
// NOTE(review): the row counter update and loop branch are not visible in
// this excerpt - confirm.
127 function ff_vp9_copy8_neon, export=1
// Load two 8-byte rows from the reference
129 ld1 {v0.8b}, [x2], x3
130 ld1 {v1.8b}, [x2], x3
// Store them to two consecutive destination rows
132 st1 {v0.8b}, [x0], x1
133 st1 {v1.8b}, [x0], x1
// Exported 8-byte-wide averaging function (vp9_mc_func signature), two rows
// per pass. Each output byte is the unsigned rounding average
// (a + b + 1) >> 1 of the reference byte and the existing destination byte.
//   x0 = dst read pointer, x5 = dst write pointer (both stepped by x1)
//   x2 = ref pointer (stepped by x3)
// NOTE(review): the initialisation of x5 (presumably a copy of x0), the row
// counter update and the loop branch are not visible in this excerpt -
// confirm.
138 function ff_vp9_avg8_neon, export=1
// Row 0: reference -> v2, destination -> v0
141 ld1 {v2.8b}, [x2], x3
142 ld1 {v0.8b}, [x0], x1
// Row 1: reference -> v3, destination -> v1 (interleaved with the row 0 average)
143 ld1 {v3.8b}, [x2], x3
144 urhadd v0.8b, v0.8b, v2.8b
145 ld1 {v1.8b}, [x0], x1
146 urhadd v1.8b, v1.8b, v3.8b
// Store both averaged rows through the write pointer
148 st1 {v0.8b}, [x5], x1
149 st1 {v1.8b}, [x5], x1
// Exported 4-byte-wide copy (vp9_mc_func signature), four rows per pass.
// Each row travels through 32-bit lane 0 of its own vector register; loads
// (from x2, stepped by x3) and stores (to x0, stepped by x1) are interleaved.
// NOTE(review): the row counter update and loop branch are not visible in
// this excerpt - confirm.
154 function ff_vp9_copy4_neon, export=1
// Rows 0 and 1 in
156 ld1 {v0.s}[0], [x2], x3
157 ld1 {v1.s}[0], [x2], x3
// Row 0 out, row 2 in
158 st1 {v0.s}[0], [x0], x1
159 ld1 {v2.s}[0], [x2], x3
// Row 1 out, row 3 in
160 st1 {v1.s}[0], [x0], x1
161 ld1 {v3.s}[0], [x2], x3
// Rows 2 and 3 out
163 st1 {v2.s}[0], [x0], x1
164 st1 {v3.s}[0], [x0], x1
// Exported 4-byte-wide averaging function (vp9_mc_func signature), four rows
// per pass. Two 4-byte rows are packed into the two 32-bit lanes of one
// 64-bit vector so that a single urhadd (unsigned rounding average,
// (a + b + 1) >> 1) handles two rows at once.
//   x0 = dst read pointer, x5 = dst write pointer (both stepped by x1)
//   x2 = ref pointer (stepped by x3)
// NOTE(review): the initialisation of x5 (presumably a copy of x0), the row
// counter update and the loop branch are not visible in this excerpt -
// confirm.
169 function ff_vp9_avg4_neon, export=1
// Gather: ref rows 0/1 -> v2 lanes 0/1, dst rows 0/1 -> v0 lanes 0/1
172 ld1 {v2.s}[0], [x2], x3
173 ld1 {v0.s}[0], [x0], x1
174 ld1 {v2.s}[1], [x2], x3
175 ld1 {v0.s}[1], [x0], x1
// Gather: ref rows 2/3 -> v3 lanes 0/1, dst rows 2/3 -> v1 lanes 0/1
176 ld1 {v3.s}[0], [x2], x3
177 ld1 {v1.s}[0], [x0], x1
178 ld1 {v3.s}[1], [x2], x3
179 ld1 {v1.s}[1], [x0], x1
// Average rows 0/1 and rows 2/3
181 urhadd v0.8b, v0.8b, v2.8b
182 urhadd v1.8b, v1.8b, v3.8b
// Scatter the four averaged rows back, one 32-bit lane per row
183 st1 {v0.s}[0], [x5], x1
184 st1 {v0.s}[1], [x5], x1
185 st1 {v1.s}[0], [x5], x1
186 st1 {v1.s}[1], [x5], x1
192 // Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
193 // for size >= 16), and multiply-accumulate into dst1 and dst3 (or
194 // dst1-dst2 and dst3-dst4 for size >= 16)
// Parameters:
//   dst1-dst4  accumulators (16-bit lanes); dst1/dst2 belong to one row,
//              dst3/dst4 to the other
//   src1-src6  consecutive source vectors, src1-src3 for one row and
//              src4-src6 for the other
//   offset     tap index: the ext shifts the window by \offset 16-bit
//              elements (2*\offset bytes) and the product uses lane
//              \offset of v0 (presumably the filter taps - confirm)
//   size       block width, selects how many lanes are processed
// Clobbers v20-v23 as temporaries.
// NOTE(review): the three mla groups below differ only in lane count and in
// whether dst2/dst4 are used. The conditional-assembly directives that pick
// one group per \size are not visible in this excerpt - confirm.
195 .macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
// Shifted windows for the first 8 outputs of each row
196 ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
197 ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
// Variant using all four accumulators (16 outputs per row), with the second
// pair of windows extracted between the multiply-accumulates
199 mla \dst1\().8h, v20.8h, v0.h[\offset]
200 ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
201 mla \dst3\().8h, v22.8h, v0.h[\offset]
202 ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
203 mla \dst2\().8h, v21.8h, v0.h[\offset]
204 mla \dst4\().8h, v23.8h, v0.h[\offset]
// Variant for 8 outputs per row
206 mla \dst1\().8h, v20.8h, v0.h[\offset]
207 mla \dst3\().8h, v22.8h, v0.h[\offset]
// Variant for 4 outputs per row
209 mla \dst1\().4h, v20.4h, v0.h[\offset]
210 mla \dst3\().4h, v22.4h, v0.h[\offset]
213 // The same as above, but don't accumulate straight into the
214 // destination, but use a temp register and accumulate with saturation.
// Parameters have the same meaning as for extmla. The product for tap
// \offset is formed with a plain mul in v20-v23 and then added to the
// accumulators with sqadd (signed saturating add).
// Clobbers v20-v23.
// NOTE(review): the mul groups and the sqadd groups below each come in
// variants that differ in lane count and in whether dst2/dst4 are used. The
// conditional-assembly directives that select among them by \size are not
// visible in this excerpt - confirm.
215 .macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
// Shifted windows for the first 8 outputs of each row
216 ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
217 ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
// Products for the variant with 16 outputs per row (v20-v23)
219 mul v20.8h, v20.8h, v0.h[\offset]
220 ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
221 mul v22.8h, v22.8h, v0.h[\offset]
222 ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
223 mul v21.8h, v21.8h, v0.h[\offset]
224 mul v23.8h, v23.8h, v0.h[\offset]
// Products for the variant with 8 outputs per row
226 mul v20.8h, v20.8h, v0.h[\offset]
227 mul v22.8h, v22.8h, v0.h[\offset]
// Products for the variant with 4 outputs per row
229 mul v20.4h, v20.4h, v0.h[\offset]
230 mul v22.4h, v22.4h, v0.h[\offset]
// Saturating accumulate, 4-lane form
233 sqadd \dst1\().4h, \dst1\().4h, v20.4h
234 sqadd \dst3\().4h, \dst3\().4h, v22.4h
// Saturating accumulate, 8-lane form
236 sqadd \dst1\().8h, \dst1\().8h, v20.8h
237 sqadd \dst3\().8h, \dst3\().8h, v22.8h
// Saturating accumulate of the second accumulator pair
239 sqadd \dst2\().8h, \dst2\().8h, v21.8h
240 sqadd \dst4\().8h, \dst4\().8h, v23.8h
246 // Instantiate a horizontal filter function for the given size.
247 // This can work on 4, 8 or 16 pixels in parallel; for larger
248 // widths it will do 16 pixels at a time and loop horizontally.
249 // The actual width is passed in x5, the height in w4 and the
250 // filter coefficients in x9. idx2 is the index of the largest
251 // filter coefficient (3 or 4) and idx1 is the other one of them.
// Two rows are filtered per pass: one is read through x2 into v4-v6 and
// written through x0, the other is read through x7 into v16-v18 and written
// through x6.
// NOTE(review): large parts of this macro are not visible in this excerpt:
// the setup of x6/x7 and the strides, the filter load into v0, the widening
// of the source bytes to 16 bits, the loads of the existing destination
// into v2/v3 for the avg type, the conditional-assembly directives on \size
// and \type, and all loop control. Confirm anything below that depends on
// them.
252 .macro do_8tap_h type, size, idx1, idx2
253 function \type\()_8tap_\size\()h_\idx1\idx2
259 // Only size >= 16 loops horizontally and needs
260 // reduced dst stride
264 // size >= 16 loads two qwords and increments x2,
265 // for size 4/8 it's enough with one qword and no post-increment
271 // Load the filter vector
// Source load with post-increment (24 bytes per row)
279 ld1 {v4.8b, v5.8b, v6.8b}, [x2], #24
280 ld1 {v16.8b, v17.8b, v18.8b}, [x7], #24
// Source load without post-increment (16 bytes per row)
282 ld1 {v4.8b, v5.8b}, [x2]
283 ld1 {v16.8b, v17.8b}, [x7]
295 // Accumulate, adding idx2 last with a separate
296 // saturating add. The positive filter coefficients
297 // for all indices except idx2 must add up to less
298 // than 127 for this not to overflow.
// Seed the accumulators with the tap 0 products (v1/v24 = first 8 outputs
// of each row, v2/v25 = next 8 outputs)
299 mul v1.8h, v4.8h, v0.h[0]
300 mul v24.8h, v16.8h, v0.h[0]
302 mul v2.8h, v5.8h, v0.h[0]
303 mul v25.8h, v17.8h, v0.h[0]
// Taps 1, 2, idx1, 5, 6 and 7 with wrapping multiply-accumulate; the idx2
// tap is added last with saturation
305 extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size
306 extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size
307 extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size
308 extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size
309 extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size
310 extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size
311 extmulqadd v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size
313 // Round, shift and saturate
// sqrshrun #7: rounding right shift by 7 bits, saturated to unsigned 8 bit
314 sqrshrun v1.8b, v1.8h, #7
315 sqrshrun v24.8b, v24.8h, #7
// Second 8 outputs of each row go into the upper halves of v1/v24
317 sqrshrun2 v1.16b, v2.8h, #7
318 sqrshrun2 v24.16b, v25.8h, #7
// Rounding average with v2/v3, in a 16-byte form and two 8-byte forms.
// NOTE(review): presumably assembled only for the avg type, with v2/v3
// holding the existing destination pixels - confirm.
325 urhadd v1.16b, v1.16b, v2.16b
326 urhadd v24.16b, v24.16b, v3.16b
330 urhadd v1.8b, v1.8b, v2.8b
331 urhadd v24.8b, v24.8b, v3.8b
335 urhadd v1.8b, v1.8b, v2.8b
336 urhadd v24.8b, v24.8b, v3.8b
339 // Store and loop horizontally (for size >= 16)
342 st1 {v1.16b}, [x0], #16
343 st1 {v24.16b}, [x6], #16
// Fetch the next 16 source bytes of each row for the following iteration
347 ld1 {v6.16b}, [x2], #16
348 ld1 {v18.16b}, [x7], #16
// Widen the upper 8 bytes of v18 to 16-bit lanes in place
352 uxtl2 v18.8h, v18.16b
// Instantiate the four horizontal filter bodies for one block size: the put
// and avg types, each once with idx1/idx2 = 3/4 and once with 4/3.
373 .macro do_8tap_h_size size
374 do_8tap_h put, \size, 3, 4
375 do_8tap_h avg, \size, 3, 4
376 do_8tap_h put, \size, 4, 3
377 do_8tap_h avg, \size, 4, 3
// Emit one exported horizontal filter entry point named
// ff_vp9_<type>_<filter><size>_h_neon. It locates the filter coefficients
// and branches to one of the internal <type>_8tap_<size>h_34 / _43 bodies.
//   type    put or avg
//   filter  filter name used in the symbol (regular, sharp, smooth)
//   offset  selects a 256-byte sub-table of ff_vp9_subpel_filters
//   size    block width used in the symbol name
// NOTE(review): the compare that sets the flags for b.ge, the
// conditional-assembly directives choosing between the two branch pairs, and
// any further register setup are not visible in this excerpt - confirm.
384 .macro do_8tap_h_func type, filter, offset, size
385 function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
// movrel is provided by asm.S; presumably x6 = &ff_vp9_subpel_filters + 256*\offset
386 movrel x6, X(ff_vp9_subpel_filters), 256*\offset
// x9 = x6 + 16 * w5 (w5 is mx in the vp9_mc_func signature, zero-extended)
388 add x9, x6, w5, uxtw #4
// Branch pair targeting the 16-pixel bodies
391 b.ge \type\()_8tap_16h_34
392 b \type\()_8tap_16h_43
// Branch pair targeting the bodies for this exact size
394 b.ge \type\()_8tap_\size\()h_34
395 b \type\()_8tap_\size\()h_43
// Emit the exported horizontal entry points for one block size: the put and
// avg types for each of the regular, sharp and smooth filters. The numeric
// argument (1, 2, 0) is the \offset passed on to do_8tap_h_func.
400 .macro do_8tap_h_filters size
401 do_8tap_h_func put, regular, 1, \size
402 do_8tap_h_func avg, regular, 1, \size
403 do_8tap_h_func put, sharp, 2, \size
404 do_8tap_h_func avg, sharp, 2, \size
405 do_8tap_h_func put, smooth, 0, \size
406 do_8tap_h_func avg, smooth, 0, \size
418 // Round, shift and saturate and store reg1-reg2 over 4 lines
// Each of reg1/reg2 carries two 4-pixel rows in its two 32-bit lanes:
// reg1 holds rows 0 and 2, reg2 holds rows 1 and 3 (see the store order).
//   reg1, reg2  16-bit accumulators, narrowed in place to 8 bit
//   tmp1, tmp2  scratch registers for the values loaded through x7
//   type        put or avg
// Writes through x0 and reads through x7, both stepped by x1.
// NOTE(review): the ld1/urhadd group is presumably assembled only for the
// avg type, with x7 walking the existing destination rows; the directive
// guarding it is not visible in this excerpt - confirm.
419 .macro do_store4 reg1, reg2, tmp1, tmp2, type
// Rounding right shift by 7 bits, saturated to unsigned 8 bit
420 sqrshrun \reg1\().8b, \reg1\().8h, #7
421 sqrshrun \reg2\().8b, \reg2\().8h, #7
// Load four 4-byte rows in the same interleaved lane layout, then take the
// rounding average
423 ld1 {\tmp1\().s}[0], [x7], x1
424 ld1 {\tmp2\().s}[0], [x7], x1
425 ld1 {\tmp1\().s}[1], [x7], x1
426 ld1 {\tmp2\().s}[1], [x7], x1
427 urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
428 urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
// Store rows 0..3 in order
430 st1 {\reg1\().s}[0], [x0], x1
431 st1 {\reg2\().s}[0], [x0], x1
432 st1 {\reg1\().s}[1], [x0], x1
433 st1 {\reg2\().s}[1], [x0], x1
436 // Round, shift and saturate and store reg1-4
// Stores four rows of 8 pixels, one row per register.
//   reg1-reg4  16-bit accumulators, narrowed in place to 8 bit
//   tmp1-tmp4  scratch registers for the values loaded through x7
//   type       put or avg
// Writes through x0 and reads through x7, both stepped by x1.
// NOTE(review): the ld1/urhadd group is presumably assembled only for the
// avg type, with x7 walking the existing destination rows; the directive
// guarding it is not visible in this excerpt - confirm.
437 .macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
// Rounding right shift by 7 bits, saturated to unsigned 8 bit
438 sqrshrun \reg1\().8b, \reg1\().8h, #7
439 sqrshrun \reg2\().8b, \reg2\().8h, #7
440 sqrshrun \reg3\().8b, \reg3\().8h, #7
441 sqrshrun \reg4\().8b, \reg4\().8h, #7
// Load four 8-byte rows and take the rounding average with the new pixels
443 ld1 {\tmp1\().8b}, [x7], x1
444 ld1 {\tmp2\().8b}, [x7], x1
445 ld1 {\tmp3\().8b}, [x7], x1
446 ld1 {\tmp4\().8b}, [x7], x1
447 urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
448 urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
449 urhadd \reg3\().8b, \reg3\().8b, \tmp3\().8b
450 urhadd \reg4\().8b, \reg4\().8b, \tmp4\().8b
// Store the four rows
452 st1 {\reg1\().8b}, [x0], x1
453 st1 {\reg2\().8b}, [x0], x1
454 st1 {\reg3\().8b}, [x0], x1
455 st1 {\reg4\().8b}, [x0], x1
458 // Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
459 // (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
460 // at the end with saturation. Indices 0 and 7 always have negative or zero
461 // coefficients, so they can be accumulated into tmp1-tmp2 together with the
462 // largest coefficient.
// The eight taps are read from lanes 0-7 of v0. dst1/dst2 accumulate taps
// 1, 2, 5, 6 and idx1 with wrapping arithmetic; tmp1/tmp2 accumulate taps
// 0, 7 and idx2; the two sums are then combined with sqadd (signed
// saturating add). Clobbers tmp1 and tmp2.
// NOTE(review): both the v0.h[3] and the v0.h[4] multiply-accumulates appear
// in each group below. Presumably only one pair per group is assembled,
// selected by \idx1/\idx2; the conditional-assembly directives are not
// visible in this excerpt - confirm.
463 .macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
// Seed dst with tap 1 and tmp with tap 0
464 mul \dst1\().8h, \src2\().8h, v0.h[1]
465 mul \dst2\().8h, \src3\().8h, v0.h[1]
466 mul \tmp1\().8h, \src1\().8h, v0.h[0]
467 mul \tmp2\().8h, \src2\().8h, v0.h[0]
468 mla \dst1\().8h, \src3\().8h, v0.h[2]
469 mla \dst2\().8h, \src4\().8h, v0.h[2]
// idx1 tap into dst: tap 3 form
471 mla \dst1\().8h, \src4\().8h, v0.h[3]
472 mla \dst2\().8h, \src5\().8h, v0.h[3]
// idx1 tap into dst: tap 4 form
474 mla \dst1\().8h, \src5\().8h, v0.h[4]
475 mla \dst2\().8h, \src6\().8h, v0.h[4]
477 mla \dst1\().8h, \src6\().8h, v0.h[5]
478 mla \dst2\().8h, \src7\().8h, v0.h[5]
// Tap 7 goes into tmp, tap 6 into dst
479 mla \tmp1\().8h, \src8\().8h, v0.h[7]
480 mla \tmp2\().8h, \src9\().8h, v0.h[7]
481 mla \dst1\().8h, \src7\().8h, v0.h[6]
482 mla \dst2\().8h, \src8\().8h, v0.h[6]
// idx2 tap into tmp: tap 3 form
484 mla \tmp1\().8h, \src4\().8h, v0.h[3]
485 mla \tmp2\().8h, \src5\().8h, v0.h[3]
// idx2 tap into tmp: tap 4 form
487 mla \tmp1\().8h, \src5\().8h, v0.h[4]
488 mla \tmp2\().8h, \src6\().8h, v0.h[4]
// Combine the two partial sums with saturation
490 sqadd \dst1\().8h, \dst1\().8h, \tmp1\().8h
491 sqadd \dst2\().8h, \dst2\().8h, \tmp2\().8h
494 // Load pixels and extend them to 16 bit
// Reads consecutive 8-byte rows through x2 (stepped by x3) into v1-v4 and
// zero-extends them into 16-bit lanes of dst1-dst4. Clobbers v1-v4 and
// advances x2 by one stride per row loaded.
// NOTE(review): as shown, four rows are loaded. The load and widening of the
// fourth row may be conditional on dst4 being supplied; any such directive
// is not visible in this excerpt - confirm.
495 .macro loadl dst1, dst2, dst3, dst4
496 ld1 {v1.8b}, [x2], x3
497 ld1 {v2.8b}, [x2], x3
498 ld1 {v3.8b}, [x2], x3
500 ld1 {v4.8b}, [x2], x3
// Zero-extend each byte to a 16-bit lane
502 uxtl \dst1\().8h, v1.8b
503 uxtl \dst2\().8h, v2.8b
504 uxtl \dst3\().8h, v3.8b
506 uxtl \dst4\().8h, v4.8b
510 // Instantiate a vertical filter function for filtering 8 pixels at a time.
511 // The height is passed in x4, the width in x5 and the filter coefficients
512 // in x6. idx2 is the index of the largest filter coefficient (3 or 4)
513 // and idx1 is the other one of them.
// The widened source rows live in a rotating window of registers v16-v27.
// Each step loads four new rows with loadl, runs convolve twice to produce
// four output rows in v1-v4, and writes them with do_store. The three steps
// below are the same operation with the window rotated by four registers.
// NOTE(review): the initial row loads into v16-v19, the filter load into v0,
// the setup of x7, the row/column counters and all branches are not visible
// in this excerpt - confirm.
514 .macro do_8tap_8v type, idx1, idx2
515 function \type\()_8tap_8v_\idx1\idx2
// Step the source pointer back by two rows (x3 << 1)
516 sub x2, x2, x3, lsl #1
527 loadl v20, v21, v22, v23
// Step A: oldest rows start at v17, newest rows are v24-v27
529 loadl v24, v25, v26, v27
530 convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6
531 convolve v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6
532 do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
// Step B: oldest rows start at v21, newest rows are v16-v19
537 loadl v16, v17, v18, v19
538 convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6
539 convolve v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6
540 do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
// Step C: oldest rows start at v25, newest rows are v20-v23
545 loadl v20, v21, v22, v23
546 convolve v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6
547 convolve v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6
548 do_store v1, v2, v3, v4, v5, v6, v7, v28, \type
// Pointer rewind described by the comments below; of its instructions only
// the 8-row source rewind is visible in this excerpt.
556 // x0 -= h * dst_stride
558 // x2 -= h * src_stride
560 // x2 -= 8 * src_stride
561 sub x2, x2, x3, lsl #3
562 // x2 += 1 * src_stride
578 // Instantiate a vertical filter function for filtering a 4 pixels wide
579 // slice. The first half of the registers contain one row, while the second
580 // half of a register contains the second-next row (also stored in the first
581 // half of the register two steps ahead). The convolution does two outputs
582 // at a time; the output of v17-v24 into one, and v18-v25 into another one.
583 // The first half of first output is the first output row, the first half
584 // of the other output is the second output row. The second halves of the
585 // registers are rows 3 and 4.
586 // This is only designed to work for 4 or 8 output lines.
// NOTE(review): the filter load into v0, the widening of v1-v7/v26-v29 into
// the 16-bit registers v17-v29 consumed by convolve, the setup of x7, the
// row counter and all branches are not visible in this excerpt - confirm.
587 .macro do_8tap_4v type, idx1, idx2
588 function \type\()_8tap_4v_\idx1\idx2
// Step the source pointer back by two rows (x3 << 1)
589 sub x2, x2, x3, lsl #1
// Load 4-byte rows into lane 0 of successive registers. Each trn1 then
// copies lane 0 of the register two rows ahead into lane 1, giving the
// row n / row n+2 layout described above.
596 ld1 {v1.s}[0], [x2], x3
597 ld1 {v2.s}[0], [x2], x3
598 ld1 {v3.s}[0], [x2], x3
599 ld1 {v4.s}[0], [x2], x3
600 ld1 {v5.s}[0], [x2], x3
601 ld1 {v6.s}[0], [x2], x3
602 trn1 v1.2s, v1.2s, v3.2s
603 ld1 {v7.s}[0], [x2], x3
604 trn1 v2.2s, v2.2s, v4.2s
605 ld1 {v26.s}[0], [x2], x3
607 trn1 v3.2s, v3.2s, v5.2s
608 ld1 {v27.s}[0], [x2], x3
610 trn1 v4.2s, v4.2s, v6.2s
611 ld1 {v28.s}[0], [x2], x3
613 trn1 v5.2s, v5.2s, v7.2s
614 ld1 {v29.s}[0], [x2], x3
616 trn1 v6.2s, v6.2s, v26.2s
618 trn1 v7.2s, v7.2s, v27.2s
620 trn1 v26.2s, v26.2s, v28.2s
622 trn1 v27.2s, v27.2s, v29.2s
// First four output rows
626 convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4
627 do_store4 v1, v2, v5, v6, \type
// Load further rows: the first two complete the upper lanes of v28/v29, the
// next two go into lane 1 of v1/v2
632 ld1 {v1.s}[0], [x2], x3
633 ld1 {v2.s}[0], [x2], x3
634 trn1 v28.2s, v28.2s, v1.2s
635 trn1 v29.2s, v29.2s, v2.2s
636 ld1 {v1.s}[1], [x2], x3
638 ld1 {v2.s}[1], [x2], x3
// Second group of four output rows, with the register window moved on by four
643 convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4
644 do_store4 v1, v2, v5, v6, \type
// Emit one exported vertical filter entry point named
// ff_vp9_<type>_<filter><size>_v_neon. It locates the filter coefficients
// and branches to one of the internal 8-pixel or 4-pixel vertical bodies
// (<type>_8tap_8v_34/_43 or <type>_8tap_4v_34/_43).
//   type    put or avg
//   filter  filter name used in the symbol (regular, sharp, smooth)
//   offset  selects a 256-byte sub-table of ff_vp9_subpel_filters
//   size    block width used in the symbol name
// NOTE(review): the compare that sets the flags for b.ge, the
// conditional-assembly directives choosing between the 8v and 4v branch
// pairs, and any further register setup are not visible in this excerpt -
// confirm.
657 .macro do_8tap_v_func type, filter, offset, size
658 function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
// movrel is provided by asm.S; presumably x5 = &ff_vp9_subpel_filters + 256*\offset
660 movrel x5, X(ff_vp9_subpel_filters), 256*\offset
// x6 = x5 + 16 * w6 (w6 is my in the vp9_mc_func signature, zero-extended)
662 add x6, x5, w6, uxtw #4
// Branch pair targeting the 8-pixel-wide bodies
665 b.ge \type\()_8tap_8v_34
666 b \type\()_8tap_8v_43
// Branch pair targeting the 4-pixel-wide bodies
668 b.ge \type\()_8tap_4v_34
669 b \type\()_8tap_4v_43
// Emit the exported vertical entry points for one block size: the put and
// avg types for each of the regular, sharp and smooth filters. The numeric
// argument (1, 2, 0) is the \offset passed on to do_8tap_v_func.
674 .macro do_8tap_v_filters size
675 do_8tap_v_func put, regular, 1, \size
676 do_8tap_v_func avg, regular, 1, \size
677 do_8tap_v_func put, sharp, 2, \size
678 do_8tap_v_func avg, sharp, 2, \size
679 do_8tap_v_func put, smooth, 0, \size
680 do_8tap_v_func avg, smooth, 0, \size