X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Faarch64%2Fvp8dsp_neon.S;h=4bbf16d1a4c7f1db75bec8fa8785e03510b8d1a2;hb=419d2524a8239a8f00b4c1702c91065b259615a2;hp=c19ab0de0fd4ad2225c37bd1bc81a364a609befd;hpb=85bfaa4949f4afcde19061def3e8a18988964858;p=ffmpeg diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index c19ab0de0fd..4bbf16d1a4c 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -4,27 +4,84 @@ * Copyright (c) 2010 Rob Clark * Copyright (c) 2011 Mans Rullgard * Copyright (c) 2018 Magnus Röös + * Copyright (c) 2019 Martin Storsjo * - * This file is part of Libav. + * This file is part of FFmpeg. * - * Libav is free software; you can redistribute it and/or + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * - * Libav is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "libavutil/aarch64/asm.S" #include "neon.S" +function ff_vp8_luma_dc_wht_neon, export=1 + ld1 {v0.4h - v3.4h}, [x1] + movi v30.8h, #0 + + add v4.4h, v0.4h, v3.4h + add v6.4h, v1.4h, v2.4h + st1 {v30.8h}, [x1], #16 + sub v7.4h, v1.4h, v2.4h + sub v5.4h, v0.4h, v3.4h + st1 {v30.8h}, [x1] + add v0.4h, v4.4h, v6.4h + add v1.4h, v5.4h, v7.4h + sub v2.4h, v4.4h, v6.4h + sub v3.4h, v5.4h, v7.4h + + movi v16.4h, #3 + + transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7 + + add v0.4h, v0.4h, v16.4h + + add v4.4h, v0.4h, v3.4h + add v6.4h, v1.4h, v2.4h + sub v7.4h, v1.4h, v2.4h + sub v5.4h, v0.4h, v3.4h + add v0.4h, v4.4h, v6.4h + add v1.4h, v5.4h, v7.4h + sub v2.4h, v4.4h, v6.4h + sub v3.4h, v5.4h, v7.4h + + sshr v0.4h, v0.4h, #3 + sshr v1.4h, v1.4h, #3 + sshr v2.4h, v2.4h, #3 + sshr v3.4h, v3.4h, #3 + + mov x3, #32 + st1 {v0.h}[0], [x0], x3 + st1 {v1.h}[0], [x0], x3 + st1 {v2.h}[0], [x0], x3 + st1 {v3.h}[0], [x0], x3 + st1 {v0.h}[1], [x0], x3 + st1 {v1.h}[1], [x0], x3 + st1 {v2.h}[1], [x0], x3 + st1 {v3.h}[1], [x0], x3 + st1 {v0.h}[2], [x0], x3 + st1 {v1.h}[2], [x0], x3 + st1 {v2.h}[2], [x0], x3 + st1 {v3.h}[2], [x0], x3 + st1 {v0.h}[3], [x0], x3 + st1 {v1.h}[3], [x0], x3 + st1 {v2.h}[3], [x0], x3 + st1 {v3.h}[3], [x0], x3 + + ret +endfunc + function ff_vp8_idct_add_neon, export=1 ld1 {v0.8b - v3.8b}, [x1] mov w4, #20091 @@ -35,8 +92,8 @@ function ff_vp8_idct_add_neon, export=1 smull v27.4s, v3.4h, v4.h[0] sqdmulh v20.4h, v1.4h, v4.h[1] sqdmulh v23.4h, v3.4h, v4.h[1] - sqshrn v21.4h, v26.4s, #16 - sqshrn v22.4h, v27.4s, #16 + shrn v21.4h, v26.4s, #16 + shrn v22.4h, v27.4s, #16 add v21.4h, v21.4h, v1.4h add v22.4h, v22.4h, v3.4h @@ -60,44 +117,97 @@ function ff_vp8_idct_add_neon, export=1 st1 {v29.16b}, [x1] sqdmulh v21.4h, v1.4h, v4.h[1] sqdmulh v23.4h, v3.4h, v4.h[1] - sqshrn v20.4h, v26.4s, #16 - sqshrn v22.4h, v27.4s, #16 + shrn v20.4h, v26.4s, #16 + shrn v22.4h, v27.4s, #16 add v20.4h, v20.4h, v1.4h add v22.4h, v22.4h, v3.4h add v16.4h, v0.4h, v2.4h sub v17.4h, v0.4h, v2.4h add v18.4h, v20.4h, v23.4h - ld1 {v24.d}[0], [x0], x2 - zip1 v16.2d, v16.2d, v17.2d - sub v19.4h, v21.4h, v22.4h - ld1 {v25.d}[0], [x0], x2 - zip1 v18.2d, v18.2d, v19.2d - add v0.8h, v16.8h, v18.8h - ld1 {v25.d}[1], [x0], x2 - sub v1.8h, v16.8h, v18.8h - ld1 {v24.d}[1], [x0], x2 - srshr v0.8h, v0.8h, #3 - trn1 v24.4s, v24.4s, v25.4s - srshr v1.8h, v1.8h, #3 + ld1 {v24.s}[0], [x0], x2 + sub v19.4h, v21.4h, v22.4h + ld1 {v25.s}[0], [x0], x2 + add v0.4h, v16.4h, v18.4h + add v1.4h, v17.4h, v19.4h + ld1 {v26.s}[0], [x0], x2 + sub v3.4h, v16.4h, v18.4h + sub v2.4h, v17.4h, v19.4h + ld1 {v27.s}[0], [x0], x2 + srshr v0.4h, v0.4h, #3 + srshr v1.4h, v1.4h, #3 + srshr v2.4h, v2.4h, #3 + srshr v3.4h, v3.4h, #3 + sub x0, x0, x2, lsl #2 - ext v1.16b, v1.16b, v1.16b, #8 - trn1 v3.2d, v0.2d, v1.2d - trn2 v0.2d, v0.2d, v1.2d - trn1 v1.8h, v3.8h, v0.8h - trn2 v3.8h, v3.8h, v0.8h - uzp1 v0.4s, v1.4s, v3.4s - uzp2 v1.4s, v3.4s, v1.4s + transpose_4x4H v0, v1, v2, v3, v5, v6, v7, v16 uaddw v0.8h, v0.8h, v24.8b - uaddw2 v1.8h, v1.8h, v24.16b + uaddw v1.8h, v1.8h, v25.8b + uaddw v2.8h, v2.8h, v26.8b + uaddw v3.8h, v3.8h, v27.8b sqxtun v0.8b, v0.8h - sqxtun2 v0.16b, v1.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v0.s}[0], [x0], x2 - st1 {v0.s}[1], [x0], x2 - st1 {v0.s}[3], [x0], x2 - st1 {v0.s}[2], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v2.s}[0], [x0], x2 + st1 {v3.s}[0], [x0], x2 + + ret +endfunc + +function ff_vp8_idct_dc_add4uv_neon, export=1 + movi v0.4h, #0 + mov x3, #32 + ld1r {v16.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v17.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v18.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v19.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + mov x3, x0 + srshr v16.8h, v16.8h, #3 // dc >>= 3 + ld1 {v0.8b}, [x0], x2 + srshr v18.8h, v18.8h, #3 + ld1 {v1.8b}, [x0], x2 + uaddw v20.8h, v16.8h, v0.8b + ld1 {v2.8b}, [x0], x2 + uaddw v0.8h, v16.8h, v1.8b + ld1 {v3.8b}, [x0], x2 + uaddw v22.8h, v16.8h, v2.8b + ld1 {v4.8b}, [x0], x2 + uaddw v2.8h, v16.8h, v3.8b + ld1 {v5.8b}, [x0], x2 + uaddw v24.8h, v18.8h, v4.8b + ld1 {v6.8b}, [x0], x2 + uaddw v4.8h, v18.8h, v5.8b + ld1 {v7.8b}, [x0], x2 + uaddw v26.8h, v18.8h, v6.8b + sqxtun v20.8b, v20.8h + uaddw v6.8h, v18.8h, v7.8b + sqxtun v21.8b, v0.8h + sqxtun v22.8b, v22.8h + st1 {v20.8b}, [x3], x2 + sqxtun v23.8b, v2.8h + st1 {v21.8b}, [x3], x2 + sqxtun v24.8b, v24.8h + st1 {v22.8b}, [x3], x2 + sqxtun v25.8b, v4.8h + st1 {v23.8b}, [x3], x2 + sqxtun v26.8b, v26.8h + st1 {v24.8b}, [x3], x2 + sqxtun v27.8b, v6.8h + st1 {v25.8b}, [x3], x2 + st1 {v26.8b}, [x3], x2 + st1 {v27.8b}, [x3], x2 ret endfunc @@ -660,23 +770,6 @@ endfunc sqrshrun2 \d0\().16b, v22.8h, #7 .endm -.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5 - uxtl \s2\().8h, \s2\().8b - uxtl \s3\().8h, \s3\().8b - uxtl \s1\().8h, \s1\().8b - uxtl \s4\().8h, \s4\().8b - uxtl \s0\().8h, \s0\().8b - uxtl \s5\().8h, \s5\().8b - mul \s2\().8h, \s2\().8h, v0.h[2] - mul \s3\().8h, \s3\().8h, v0.h[3] - mls \s2\().8h, \s1\().8h, v0.h[1] - mls \s3\().8h, \s4\().8h, v0.h[4] - mla \s2\().8h, \s0\().8h, v0.h[0] - mla \s3\().8h, \s5\().8h, v0.h[5] - sqadd \s3\().8h, \s2\().8h, \s3\().8h - sqrshrun \d0\().8b, \s3\().8h, #7 -.endm - .macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 uxtl \s0\().8h, \s0\().8b uxtl \s3\().8h, \s3\().8b @@ -743,7 +836,7 @@ endfunc // note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit -// arithmatic can be used to apply filters +// arithmetic can be used to apply filters const subpel_filters, align=4 .short 0, 6, 123, 12, 1, 0, 0, 0 .short 2, 11, 108, 36, 8, 1, 0, 0 @@ -833,21 +926,69 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 2: ld1 {v1.8b - v4.8b}, [x7], #32 ld1 {v16.8b - v19.8b}, [x7], #32 - ld1 {v20.8b - v23.8b}, [x7] - sub x7, x7, #48 + ld1 {v20.8b - v23.8b}, [x7], #32 + ld1 {v24.8b - v25.8b}, [x7] + sub x7, x7, #64 - vp8_epel8_v6 v5, v1, v3, v16, v18, v20, v22 - vp8_epel8_v6 v2, v2, v4, v17, v19, v21, v23 - trn1 v2.2d, v5.2d, v2.2d + vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24 + vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25 + trn1 v1.2d, v1.2d, v2.2d + trn1 v3.2d, v3.2d, v4.2d - st1 {v2.16b}, [x0], x1 - subs x4, x4, #1 + st1 {v1.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + subs x4, x4, #2 b.ne 2b add sp, sp, #336+16 ret endfunc +function ff_put_vp8_epel8_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 + ld1 {v4.8b}, [x2], x3 + ld1 {v5.8b}, [x2], x3 + ld1 {v6.8b}, [x2], x3 + ld1 {v7.8b}, [x2], x3 + ld1 {v28.8b}, [x2] + + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28 + + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + subs w4, w4, #2 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel8_h6_neon, export=1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b, v3.8b}, [x2], x3 + + vp8_epel8_h6 v2, v2, v3 + + st1 {v2.8b}, [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + function ff_put_vp8_epel8_h6v6_neon, export=1 sub x2, x2, x3, lsl #1 sub x2, x2, #2 @@ -894,6 +1035,48 @@ function ff_put_vp8_epel8_h6v6_neon, export=1 ret endfunc +function ff_put_vp8_epel8_v4_neon, export=1 + sub x2, x2, x3 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 + ld1 {v4.8b}, [x2], x3 + ld1 {v5.8b}, [x2], x3 + ld1 {v6.8b}, [x2] + sub x2, x2, x3, lsl #1 + + vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6 + + st1 {v2.d}[0], [x0], x1 + st1 {v2.d}[1], [x0], x1 + subs w4, w4, #2 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel8_h4_neon, export=1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + + vp8_epel8_h4 v2, v2, v3 + + st1 {v2.8b}, [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + function ff_put_vp8_epel8_h4v6_neon, export=1 sub x2, x2, x3, lsl #1 sub x2, x2, #1 @@ -1029,3 +1212,579 @@ function ff_put_vp8_epel8_h6v4_neon, export=1 add sp, sp, #168+16 ret endfunc + +function ff_put_vp8_epel4_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1r {v2.2s}, [x2], x3 + ld1r {v3.2s}, [x2], x3 + ld1r {v4.2s}, [x2], x3 + ld1r {v5.2s}, [x2], x3 + ld1r {v6.2s}, [x2], x3 + ld1r {v7.2s}, [x2], x3 + ld1r {v28.2s}, [x2] + sub x2, x2, x3, lsl #2 + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + ld1 {v4.s}[1], [x2], x3 + ld1 {v5.s}[1], [x2], x3 + ld1 {v6.s}[1], [x2], x3 + ld1 {v7.s}[1], [x2], x3 + ld1 {v28.s}[1], [x2] + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28 + + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_h6_neon, export=1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_h6v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #52 + add w8, w4, #5 + mov x9, sp +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1 {v6.8b}, [x9], #8 + ld1r {v28.2s}, [x9] + sub x9, x9, #16 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v7.8b}, [x9], #8 + ld1 {v28.s}[1], [x9] + sub x9, x9, #16 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + trn1 v3.2s, v6.2s, v7.2s + trn2 v7.2s, v6.2s, v7.2s + vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #52 + ret +endfunc + +function ff_put_vp8_epel4_h4v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #52 + add w8, w4, #5 + mov x9, sp +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v2 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1 {v6.8b}, [x9], #8 + ld1r {v28.2s}, [x9] + sub x9, x9, #16 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v7.8b}, [x9], #8 + ld1 {v28.s}[1], [x9] + sub x9, x9, #16 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + trn1 v3.2s, v6.2s, v7.2s + trn2 v7.2s, v6.2s, v7.2s + vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #52 + ret +endfunc + +function ff_put_vp8_epel4_h6v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #44 + add w8, w4, #3 + mov x9, sp +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1r {v6.2s}, [x9] + sub x9, x9, #8 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v6.s}[1], [x9] + sub x9, x9, #8 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[1], [x0], x1 + st1 {v1.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #44 + ret +endfunc + +function ff_put_vp8_epel4_h4_neon, export=1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v2 + st1 {v2.s}[0], [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_v4_neon, export=1 + sub x2, x2, x3 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1r {v2.2s}, [x2], x3 + ld1r {v3.2s}, [x2], x3 + ld1r {v4.2s}, [x2], x3 + ld1r {v5.2s}, [x2], x3 + ld1r {v6.2s}, [x2] + sub x2, x2, x3, lsl #1 + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + ld1 {v4.s}[1], [x2], x3 + ld1 {v5.s}[1], [x2], x3 + ld1 {v6.s}[1], [x2] + sub x2, x2, x3, lsl #1 + + vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6 + + st1 {v2.s}[0], [x0], x1 + st1 {v2.s}[2], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v2.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_h4v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #44 + add w8, w4, #3 + mov x9, sp +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1r {v6.2s}, [x9] + sub x9, x9, #8 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v6.s}[1], [x9] + sub x9, x9, #8 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[1], [x0], x1 + st1 {v1.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #44 + ret +endfunc + +/* Bilinear MC */ + +function ff_put_vp8_bilin16_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b,v3.8b,v4.8b}, [x2], x3 + ext v5.8b, v3.8b, v4.8b, #1 + ext v4.8b, v2.8b, v3.8b, #1 + umull v16.8h, v2.8b, v1.8b + umlal v16.8h, v4.8b, v0.8b + ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 + umull v6.8h, v3.8b, v1.8b + umlal v6.8h, v5.8b, v0.8b + ext v21.8b, v19.8b, v20.8b, #1 + ext v20.8b, v18.8b, v19.8b, #1 + umull v22.8h, v18.8b, v1.8b + umlal v22.8h, v20.8b, v0.8b + umull v24.8h, v19.8b, v1.8b + umlal v24.8h, v21.8b, v0.8b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v6.8h, #3 + rshrn v6.8b, v22.8h, #3 + rshrn2 v6.16b, v24.8h, #3 + st1 {v4.16b}, [x0], x1 + st1 {v6.16b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin16_v_neon, export=1 + mov w7, #8 + dup v0.16b, w6 + sub w6, w7, w6 + dup v1.16b, w6 + + ld1 {v2.16b}, [x2], x3 +1: + subs w4, w4, #2 + ld1 {v4.16b}, [x2], x3 + umull v6.8h, v2.8b, v1.8b + umlal v6.8h, v4.8b, v0.8b + umull2 v16.8h, v2.16b, v1.16b + umlal2 v16.8h, v4.16b, v0.16b + ld1 {v2.16b}, [x2], x3 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v2.8b, v0.8b + umull2 v20.8h, v4.16b, v1.16b + umlal2 v20.8h, v2.16b, v0.16b + rshrn v4.8b, v6.8h, #3 + rshrn2 v4.16b, v16.8h, #3 + rshrn v6.8b, v18.8h, #3 + rshrn2 v6.16b, v20.8h, #3 + st1 {v4.16b}, [x0], x1 + st1 {v6.16b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin16_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.16b, w6 // my + sub w6, w7, w6 + dup v3.16b, w6 + + ld1 {v4.8b,v5.8b,v6.8b}, [x2], x3 + + ext v7.8b, v5.8b, v6.8b, #1 + ext v6.8b, v4.8b, v5.8b, #1 + umull v16.8h, v4.8b, v1.8b + umlal v16.8h, v6.8b, v0.8b + umull v18.8h, v5.8b, v1.8b + umlal v18.8h, v7.8b, v0.8b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 + ext v21.8b, v19.8b, v20.8b, #1 + ext v20.8b, v18.8b, v19.8b, #1 + umull v22.8h, v18.8b, v1.8b + umlal v22.8h, v20.8b, v0.8b + ld1 {v26.8b,v27.8b,v28.8b}, [x2], x3 + umull v24.8h, v19.8b, v1.8b + umlal v24.8h, v21.8b, v0.8b + ext v29.8b, v27.8b, v28.8b, #1 + ext v28.8b, v26.8b, v27.8b, #1 + umull v16.8h, v26.8b, v1.8b + umlal v16.8h, v28.8b, v0.8b + umull v18.8h, v27.8b, v1.8b + umlal v18.8h, v29.8b, v0.8b + rshrn v6.8b, v22.8h, #3 + rshrn2 v6.16b, v24.8h, #3 + umull v24.8h, v4.8b, v3.8b + umlal v24.8h, v6.8b, v2.8b + umull2 v30.8h, v4.16b, v3.16b + umlal2 v30.8h, v6.16b, v2.16b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v18.8h, #3 + umull v20.8h, v6.8b, v3.8b + umlal v20.8h, v4.8b, v2.8b + umull2 v22.8h, v6.16b, v3.16b + umlal2 v22.8h, v4.16b, v2.16b + rshrn v24.8b, v24.8h, #3 + rshrn2 v24.16b, v30.8h, #3 + st1 {v24.16b}, [x0], x1 + rshrn v20.8b, v20.8h, #3 + rshrn2 v20.16b, v22.8h, #3 + st1 {v20.16b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin8_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b,v3.8b}, [x2], x3 + ext v3.8b, v2.8b, v3.8b, #1 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + ld1 {v6.8b,v7.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + rshrn v16.8b, v16.8h, #3 + st1 {v4.8b}, [x0], x1 + st1 {v16.8b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin8_v_neon, export=1 + mov w7, #8 + dup v0.8b, w6 + sub w6, w7, w6 + dup v1.8b, w6 + + ld1 {v2.8b}, [x2], x3 +1: + subs w4, w4, #2 + ld1 {v3.8b}, [x2], x3 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + ld1 {v2.8b}, [x2], x3 + umull v6.8h, v3.8b, v1.8b + umlal v6.8h, v2.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + rshrn v6.8b, v6.8h, #3 + st1 {v4.8b}, [x0], x1 + st1 {v6.8b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin8_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.8b, w6 // my + sub w6, w7, w6 + dup v3.8b, w6 + + ld1 {v4.8b,v5.8b}, [x2], x3 + ext v5.8b, v4.8b, v5.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v22.8b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v6.8b,v7.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + ld1 {v4.8b,v5.8b}, [x2], x3 + ext v5.8b, v4.8b, v5.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v16.8b, v16.8h, #3 + umull v20.8h, v22.8b, v3.8b + umlal v20.8h, v16.8b, v2.8b + rshrn v22.8b, v18.8h, #3 + umull v24.8h, v16.8b, v3.8b + umlal v24.8h, v22.8b, v2.8b + rshrn v20.8b, v20.8h, #3 + st1 {v20.8b}, [x0], x1 + rshrn v23.8b, v24.8h, #3 + st1 {v23.8b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin4_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b}, [x2], x3 + ext v3.8b, v2.8b, v3.8b, #1 + ld1 {v6.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + trn1 v2.2s, v2.2s, v6.2s + trn1 v3.2s, v3.2s, v7.2s + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin4_v_neon, export=1 + mov w7, #8 + dup v0.8b, w6 + sub w6, w7, w6 + dup v1.8b, w6 + + ld1r {v2.2s}, [x2], x3 +1: + ld1r {v3.2s}, [x2] + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + trn2 v2.2s, v3.2s, v2.2s + rshrn v4.8b, v4.8h, #3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + subs w4, w4, #2 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin4_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.8b, w6 // my + sub w6, w7, w6 + dup v3.8b, w6 + + ld1 {v4.8b}, [x2], x3 + ext v5.8b, v4.8b, v4.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v22.8b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v6.8b}, [x2], x3 + ext v7.8b, v6.8b, v6.8b, #1 + ld1 {v4.8b}, [x2], x3 + ext v5.8b, v4.8b, v4.8b, #1 + trn1 v6.2s, v6.2s, v4.2s + trn1 v7.2s, v7.2s, v5.2s + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + rshrn v16.8b, v16.8h, #3 + umull v20.8h, v16.8b, v2.8b + trn1 v22.2s, v22.2s, v16.2s + umlal v20.8h, v22.8b, v3.8b + rev64 v22.2s, v16.2s + rshrn v20.8b, v20.8h, #3 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x0], x1 + b.gt 1b + + ret +endfunc