* Copyright (c) 2010 Rob Clark <rob@ti.com>
* Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
+ * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
*
- * This file is part of Libav.
+ * This file is part of FFmpeg.
*
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#include "neon.S"
+function ff_vp8_luma_dc_wht_neon, export=1
+ ld1 {v0.4h - v3.4h}, [x1]
+ movi v30.8h, #0
+
+ add v4.4h, v0.4h, v3.4h
+ add v6.4h, v1.4h, v2.4h
+ st1 {v30.8h}, [x1], #16
+ sub v7.4h, v1.4h, v2.4h
+ sub v5.4h, v0.4h, v3.4h
+ st1 {v30.8h}, [x1]
+ add v0.4h, v4.4h, v6.4h
+ add v1.4h, v5.4h, v7.4h
+ sub v2.4h, v4.4h, v6.4h
+ sub v3.4h, v5.4h, v7.4h
+
+ movi v16.4h, #3
+
+ transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
+
+ add v0.4h, v0.4h, v16.4h
+
+ add v4.4h, v0.4h, v3.4h
+ add v6.4h, v1.4h, v2.4h
+ sub v7.4h, v1.4h, v2.4h
+ sub v5.4h, v0.4h, v3.4h
+ add v0.4h, v4.4h, v6.4h
+ add v1.4h, v5.4h, v7.4h
+ sub v2.4h, v4.4h, v6.4h
+ sub v3.4h, v5.4h, v7.4h
+
+ sshr v0.4h, v0.4h, #3
+ sshr v1.4h, v1.4h, #3
+ sshr v2.4h, v2.4h, #3
+ sshr v3.4h, v3.4h, #3
+
+ mov x3, #32
+ st1 {v0.h}[0], [x0], x3
+ st1 {v1.h}[0], [x0], x3
+ st1 {v2.h}[0], [x0], x3
+ st1 {v3.h}[0], [x0], x3
+ st1 {v0.h}[1], [x0], x3
+ st1 {v1.h}[1], [x0], x3
+ st1 {v2.h}[1], [x0], x3
+ st1 {v3.h}[1], [x0], x3
+ st1 {v0.h}[2], [x0], x3
+ st1 {v1.h}[2], [x0], x3
+ st1 {v2.h}[2], [x0], x3
+ st1 {v3.h}[2], [x0], x3
+ st1 {v0.h}[3], [x0], x3
+ st1 {v1.h}[3], [x0], x3
+ st1 {v2.h}[3], [x0], x3
+ st1 {v3.h}[3], [x0], x3
+
+ ret
+endfunc
+
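+// Inverse 4x4 DCT of a residual block, added to dst, per vp8dsp.h:
+// void ff_vp8_idct_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
+//   x0 = dst, x1 = block (zeroed on return), x2 = stride
+// 20091/2^16 approximates sqrt(2)*cos(pi/8) - 1, the odd-stage
+// multiplier of the VP8 transform.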
function ff_vp8_idct_add_neon, export=1
ld1 {v0.8b - v3.8b}, [x1]
mov w4, #20091
smull v27.4s, v3.4h, v4.h[0]
sqdmulh v20.4h, v1.4h, v4.h[1]
sqdmulh v23.4h, v3.4h, v4.h[1]
- sqshrn v21.4h, v26.4s, #16
- sqshrn v22.4h, v27.4s, #16
+ shrn v21.4h, v26.4s, #16
+ shrn v22.4h, v27.4s, #16
add v21.4h, v21.4h, v1.4h
add v22.4h, v22.4h, v3.4h
st1 {v29.16b}, [x1]
sqdmulh v21.4h, v1.4h, v4.h[1]
sqdmulh v23.4h, v3.4h, v4.h[1]
- sqshrn v20.4h, v26.4s, #16
- sqshrn v22.4h, v27.4s, #16
+ shrn v20.4h, v26.4s, #16
+ shrn v22.4h, v27.4s, #16
add v20.4h, v20.4h, v1.4h
add v22.4h, v22.4h, v3.4h
add v16.4h, v0.4h, v2.4h
sub v17.4h, v0.4h, v2.4h
add v18.4h, v20.4h, v23.4h
- ld1 {v24.d}[0], [x0], x2
- zip1 v16.2d, v16.2d, v17.2d
- sub v19.4h, v21.4h, v22.4h
- ld1 {v25.d}[0], [x0], x2
- zip1 v18.2d, v18.2d, v19.2d
- add v0.8h, v16.8h, v18.8h
- ld1 {v25.d}[1], [x0], x2
- sub v1.8h, v16.8h, v18.8h
- ld1 {v24.d}[1], [x0], x2
- srshr v0.8h, v0.8h, #3
- trn1 v24.4s, v24.4s, v25.4s
- srshr v1.8h, v1.8h, #3
+ ld1 {v24.s}[0], [x0], x2
+ sub v19.4h, v21.4h, v22.4h
+ ld1 {v25.s}[0], [x0], x2
+ add v0.4h, v16.4h, v18.4h
+ add v1.4h, v17.4h, v19.4h
+ ld1 {v26.s}[0], [x0], x2
+ sub v3.4h, v16.4h, v18.4h
+ sub v2.4h, v17.4h, v19.4h
+ ld1 {v27.s}[0], [x0], x2
+ srshr v0.4h, v0.4h, #3
+ srshr v1.4h, v1.4h, #3
+ srshr v2.4h, v2.4h, #3
+ srshr v3.4h, v3.4h, #3
+
sub x0, x0, x2, lsl #2
- ext v1.16b, v1.16b, v1.16b, #8
- trn1 v3.2d, v0.2d, v1.2d
- trn2 v0.2d, v0.2d, v1.2d
- trn1 v1.8h, v3.8h, v0.8h
- trn2 v3.8h, v3.8h, v0.8h
- uzp1 v0.4s, v1.4s, v3.4s
- uzp2 v1.4s, v3.4s, v1.4s
+ transpose_4x4H v0, v1, v2, v3, v5, v6, v7, v16
uaddw v0.8h, v0.8h, v24.8b
- uaddw2 v1.8h, v1.8h, v24.16b
+ uaddw v1.8h, v1.8h, v25.8b
+ uaddw v2.8h, v2.8h, v26.8b
+ uaddw v3.8h, v3.8h, v27.8b
sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
+ sqxtun v1.8b, v1.8h
+ sqxtun v2.8b, v2.8h
+ sqxtun v3.8b, v3.8h
+
st1 {v0.s}[0], [x0], x2
- st1 {v0.s}[1], [x0], x2
- st1 {v0.s}[3], [x0], x2
- st1 {v0.s}[2], [x0], x2
+ st1 {v1.s}[0], [x0], x2
+ st1 {v2.s}[0], [x0], x2
+ st1 {v3.s}[0], [x0], x2
+
+ ret
+endfunc
+
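+// DC-only IDCT of four chroma blocks, added to an 8x8 area of dst,
+// per vp8dsp.h:
+// void ff_vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
+//   x0 = dst, x1 = block (DC values zeroed on return), x2 = stride
+// Blocks 0-1 cover the first four rows, blocks 2-3 the last four.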
+function ff_vp8_idct_dc_add4uv_neon, export=1
+ movi v0.4h, #0
+ mov x3, #32
+ ld1r {v16.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v17.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v18.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ld1r {v19.4h}, [x1]
+ st1 {v0.h}[0], [x1], x3
+ ins v16.d[1], v17.d[0]
+ ins v18.d[1], v19.d[0]
+ mov x3, x0
+ srshr v16.8h, v16.8h, #3 // dc >>= 3
+ ld1 {v0.8b}, [x0], x2
+ srshr v18.8h, v18.8h, #3
+ ld1 {v1.8b}, [x0], x2
+ uaddw v20.8h, v16.8h, v0.8b
+ ld1 {v2.8b}, [x0], x2
+ uaddw v0.8h, v16.8h, v1.8b
+ ld1 {v3.8b}, [x0], x2
+ uaddw v22.8h, v16.8h, v2.8b
+ ld1 {v4.8b}, [x0], x2
+ uaddw v2.8h, v16.8h, v3.8b
+ ld1 {v5.8b}, [x0], x2
+ uaddw v24.8h, v18.8h, v4.8b
+ ld1 {v6.8b}, [x0], x2
+ uaddw v4.8h, v18.8h, v5.8b
+ ld1 {v7.8b}, [x0], x2
+ uaddw v26.8h, v18.8h, v6.8b
+ sqxtun v20.8b, v20.8h
+ uaddw v6.8h, v18.8h, v7.8b
+ sqxtun v21.8b, v0.8h
+ sqxtun v22.8b, v22.8h
+ st1 {v20.8b}, [x3], x2
+ sqxtun v23.8b, v2.8h
+ st1 {v21.8b}, [x3], x2
+ sqxtun v24.8b, v24.8h
+ st1 {v22.8b}, [x3], x2
+ sqxtun v25.8b, v4.8h
+ st1 {v23.8b}, [x3], x2
+ sqxtun v26.8b, v26.8h
+ st1 {v24.8b}, [x3], x2
+ sqxtun v27.8b, v6.8h
+ st1 {v25.8b}, [x3], x2
+ st1 {v26.8b}, [x3], x2
+ st1 {v27.8b}, [x3], x2
ret
endfunc
sqrshrun2 \d0\().16b, v22.8h, #7
.endm
-.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
- uxtl \s2\().8h, \s2\().8b
- uxtl \s3\().8h, \s3\().8b
- uxtl \s1\().8h, \s1\().8b
- uxtl \s4\().8h, \s4\().8b
- uxtl \s0\().8h, \s0\().8b
- uxtl \s5\().8h, \s5\().8b
- mul \s2\().8h, \s2\().8h, v0.h[2]
- mul \s3\().8h, \s3\().8h, v0.h[3]
- mls \s2\().8h, \s1\().8h, v0.h[1]
- mls \s3\().8h, \s4\().8h, v0.h[4]
- mla \s2\().8h, \s0\().8h, v0.h[0]
- mla \s3\().8h, \s5\().8h, v0.h[5]
- sqadd \s3\().8h, \s2\().8h, \s3\().8h
- sqrshrun \d0\().8b, \s3\().8h, #7
-.endm
-
.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
uxtl \s0\().8h, \s0\().8b
uxtl \s3\().8h, \s3\().8b
// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
-// arithmatic can be used to apply filters
+// arithmetic can be used to apply filters
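+// (each filter's taps sum to 128, so 128 * 255 = 0x7f80; the final
+// sqrshrun #7 then divides by 128 with rounding and saturates to u8)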
const subpel_filters, align=4
.short 0, 6, 123, 12, 1, 0, 0, 0
.short 2, 11, 108, 36, 8, 1, 0, 0
2:
ld1 {v1.8b - v4.8b}, [x7], #32
ld1 {v16.8b - v19.8b}, [x7], #32
- ld1 {v20.8b - v23.8b}, [x7]
- sub x7, x7, #48
+ ld1 {v20.8b - v23.8b}, [x7], #32
+ ld1 {v24.8b - v25.8b}, [x7]
+ sub x7, x7, #64
- vp8_epel8_v6 v5, v1, v3, v16, v18, v20, v22
- vp8_epel8_v6 v2, v2, v4, v17, v19, v21, v23
- trn1 v2.2d, v5.2d, v2.2d
+ vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
+ vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
+ trn1 v1.2d, v1.2d, v2.2d
+ trn1 v3.2d, v3.2d, v4.2d
- st1 {v2.16b}, [x0], x1
- subs x4, x4, #1
+ st1 {v1.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ subs x4, x4, #2
b.ne 2b
add sp, sp, #336+16
ret
endfunc
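+// The subpel and bilinear MC functions below all share the vp8_mc_func
+// signature from vp8dsp.h (x/y are the subpel offsets, called mx/my here):
+// void mc(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src,
+//         ptrdiff_t srcstride, int h, int mx, int my)
+//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+//   w4 = h, w5 = mx, w6 = my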
+function ff_put_vp8_epel8_v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+
+ movrel x7, subpel_filters, -16
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+1:
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v4.8b}, [x2], x3
+ ld1 {v5.8b}, [x2], x3
+ ld1 {v6.8b}, [x2], x3
+ ld1 {v7.8b}, [x2], x3
+ ld1 {v28.8b}, [x2]
+
+ sub x2, x2, x3, lsl #2
+
+ vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
+
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ subs w4, w4, #2
+ b.ne 1b
+
+ ret
+endfunc
+
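+// 8-pixel wide 6-tap horizontal filter, one row per iteration; the 16
+// bytes loaded from src - 2 cover the six taps of all 8 output pixels.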
+function ff_put_vp8_epel8_h6_neon, export=1
+ sub x2, x2, #2
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+1:
+ ld1 {v2.8b, v3.8b}, [x2], x3
+
+ vp8_epel8_h6 v2, v2, v3
+
+ st1 {v2.8b}, [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+
+ ret
+endfunc
+
function ff_put_vp8_epel8_h6v6_neon, export=1
sub x2, x2, x3, lsl #1
sub x2, x2, #2
ret
endfunc
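+// 8-pixel wide 4-tap vertical filter, two output rows per iteration;
+// five input rows are loaded and x2 then steps back, so the loop
+// advances two source rows net, reloading the three overlapping rows.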
+function ff_put_vp8_epel8_v4_neon, export=1
+ sub x2, x2, x3
+
+ movrel x7, subpel_filters, -16
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+1:
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v4.8b}, [x2], x3
+ ld1 {v5.8b}, [x2], x3
+ ld1 {v6.8b}, [x2]
+ sub x2, x2, x3, lsl #1
+
+ vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
+
+ st1 {v2.d}[0], [x0], x1
+ st1 {v2.d}[1], [x0], x1
+ subs w4, w4, #2
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel8_h4_neon, export=1
+ sub x2, x2, #1
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+1:
+ ld1 {v2.8b,v3.8b}, [x2], x3
+
+ vp8_epel8_h4 v2, v2, v3
+
+ st1 {v2.8b}, [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+
+ ret
+endfunc
+
function ff_put_vp8_epel8_h4v6_neon, export=1
sub x2, x2, x3, lsl #1
sub x2, x2, #1
add sp, sp, #168+16
ret
endfunc
+
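+// 4-pixel wide 6-tap vertical filter: rows are packed two per .2s
+// register (lane 1 holding the row two lines below lane 0), so one pass
+// of the 8-wide y2 macro yields four output rows per iteration.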
+function ff_put_vp8_epel4_v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+
+ movrel x7, subpel_filters, -16
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+1:
+ ld1r {v2.2s}, [x2], x3
+ ld1r {v3.2s}, [x2], x3
+ ld1r {v4.2s}, [x2], x3
+ ld1r {v5.2s}, [x2], x3
+ ld1r {v6.2s}, [x2], x3
+ ld1r {v7.2s}, [x2], x3
+ ld1r {v28.2s}, [x2]
+ sub x2, x2, x3, lsl #2
+ ld1 {v2.s}[1], [x2], x3
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v4.s}[1], [x2], x3
+ ld1 {v5.s}[1], [x2], x3
+ ld1 {v6.s}[1], [x2], x3
+ ld1 {v7.s}[1], [x2], x3
+ ld1 {v28.s}[1], [x2]
+ sub x2, x2, x3, lsl #2
+
+ vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28
+
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[1], [x0], x1
+ st1 {v3.s}[1], [x0], x1
+ subs w4, w4, #4
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h6_neon, export=1
+ sub x2, x2, #2
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+1:
+ ld1 {v2.8b,v3.8b}, [x2], x3
+ vp8_epel8_h6 v2, v2, v3
+ st1 {v2.s}[0], [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+
+ ret
+endfunc
+
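+// Two-pass 6-tap/6-tap filter for 4-pixel wide blocks: the first loop
+// filters h + 5 rows horizontally into a 4-byte-per-row stack buffer
+// ((h + 5) * 4 <= 52 bytes for h up to 8), the second loop filters that
+// buffer vertically, four output rows per iteration.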
+function ff_put_vp8_epel4_h6v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, #2
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+
+ sub sp, sp, #52
+ add w8, w4, #5
+ mov x9, sp
+1:
+ ld1 {v2.8b,v3.8b}, [x2], x3
+ vp8_epel8_h6 v2, v2, v3
+ st1 {v2.s}[0], [x9], #4
+ subs w8, w8, #1
+ b.ne 1b
+
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+ mov x9, sp
+2:
+ ld1 {v2.8b,v3.8b}, [x9], #16
+ ld1 {v6.8b}, [x9], #8
+ ld1r {v28.2s}, [x9]
+ sub x9, x9, #16
+ ld1 {v4.8b,v5.8b}, [x9], #16
+ ld1 {v7.8b}, [x9], #8
+ ld1 {v28.s}[1], [x9]
+ sub x9, x9, #16
+ trn1 v1.2s, v2.2s, v4.2s
+ trn2 v4.2s, v2.2s, v4.2s
+ trn1 v2.2s, v3.2s, v5.2s
+ trn2 v5.2s, v3.2s, v5.2s
+ trn1 v3.2s, v6.2s, v7.2s
+ trn2 v7.2s, v6.2s, v7.2s
+ vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[1], [x0], x1
+ st1 {v3.s}[1], [x0], x1
+ subs w4, w4, #4
+ b.ne 2b
+
+ add sp, sp, #52
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h4v6_neon, export=1
+ sub x2, x2, x3, lsl #1
+ sub x2, x2, #1
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+
+ sub sp, sp, #52
+ add w8, w4, #5
+ mov x9, sp
+1:
+ ld1 {v2.8b}, [x2], x3
+ vp8_epel8_h4 v2, v2, v2
+ st1 {v2.s}[0], [x9], #4
+ subs w8, w8, #1
+ b.ne 1b
+
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+ mov x9, sp
+2:
+ ld1 {v2.8b,v3.8b}, [x9], #16
+ ld1 {v6.8b}, [x9], #8
+ ld1r {v28.2s}, [x9]
+ sub x9, x9, #16
+ ld1 {v4.8b,v5.8b}, [x9], #16
+ ld1 {v7.8b}, [x9], #8
+ ld1 {v28.s}[1], [x9]
+ sub x9, x9, #16
+ trn1 v1.2s, v2.2s, v4.2s
+ trn2 v4.2s, v2.2s, v4.2s
+ trn1 v2.2s, v3.2s, v5.2s
+ trn2 v5.2s, v3.2s, v5.2s
+ trn1 v3.2s, v6.2s, v7.2s
+ trn2 v7.2s, v6.2s, v7.2s
+ vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+ st1 {v2.s}[1], [x0], x1
+ st1 {v3.s}[1], [x0], x1
+ subs w4, w4, #4
+ b.ne 2b
+
+ add sp, sp, #52
+ ret
+endfunc
+
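+// Same two-pass scheme with a 4-tap vertical filter: only h + 3
+// horizontally filtered rows are needed, hence the smaller 44-byte buffer.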
+function ff_put_vp8_epel4_h6v4_neon, export=1
+ sub x2, x2, x3
+ sub x2, x2, #2
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+
+ sub sp, sp, #44
+ add w8, w4, #3
+ mov x9, sp
+1:
+ ld1 {v2.8b,v3.8b}, [x2], x3
+ vp8_epel8_h6 v2, v2, v3
+ st1 {v2.s}[0], [x9], #4
+ subs w8, w8, #1
+ b.ne 1b
+
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+ mov x9, sp
+2:
+ ld1 {v2.8b,v3.8b}, [x9], #16
+ ld1r {v6.2s}, [x9]
+ sub x9, x9, #8
+ ld1 {v4.8b,v5.8b}, [x9], #16
+ ld1 {v6.s}[1], [x9]
+ sub x9, x9, #8
+ trn1 v1.2s, v2.2s, v4.2s
+ trn2 v4.2s, v2.2s, v4.2s
+ trn1 v2.2s, v3.2s, v5.2s
+ trn2 v5.2s, v3.2s, v5.2s
+ vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[2], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+ st1 {v1.s}[3], [x0], x1
+ subs w4, w4, #4
+ b.ne 2b
+
+ add sp, sp, #44
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h4_neon, export=1
+ sub x2, x2, #1
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+1:
+ ld1 {v2.8b}, [x2], x3
+ vp8_epel8_h4 v2, v2, v2
+ st1 {v2.s}[0], [x0], x1
+ subs w4, w4, #1
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel4_v4_neon, export=1
+ sub x2, x2, x3
+
+ movrel x7, subpel_filters, -16
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+1:
+ ld1r {v2.2s}, [x2], x3
+ ld1r {v3.2s}, [x2], x3
+ ld1r {v4.2s}, [x2], x3
+ ld1r {v5.2s}, [x2], x3
+ ld1r {v6.2s}, [x2]
+ sub x2, x2, x3, lsl #1
+ ld1 {v2.s}[1], [x2], x3
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v4.s}[1], [x2], x3
+ ld1 {v5.s}[1], [x2], x3
+ ld1 {v6.s}[1], [x2]
+ sub x2, x2, x3, lsl #1
+
+ vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6
+
+ st1 {v2.s}[0], [x0], x1
+ st1 {v2.s}[2], [x0], x1
+ st1 {v2.s}[1], [x0], x1
+ st1 {v2.s}[3], [x0], x1
+ subs w4, w4, #4
+ b.ne 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_epel4_h4v4_neon, export=1
+ sub x2, x2, x3
+ sub x2, x2, #1
+
+ movrel x7, subpel_filters, -16
+ add x5, x7, w5, uxtw #4
+ ld1 {v0.8h}, [x5]
+
+ sub sp, sp, #44
+ add w8, w4, #3
+ mov x9, sp
+1:
+ ld1 {v2.8b}, [x2], x3
+ vp8_epel8_h4 v2, v2, v2
+ st1 {v2.s}[0], [x9], #4
+ subs w8, w8, #1
+ b.ne 1b
+
+ add x6, x7, w6, uxtw #4
+ ld1 {v0.8h}, [x6]
+ mov x9, sp
+2:
+ ld1 {v2.8b,v3.8b}, [x9], #16
+ ld1r {v6.2s}, [x9]
+ sub x9, x9, #8
+ ld1 {v4.8b,v5.8b}, [x9], #16
+ ld1 {v6.s}[1], [x9]
+ sub x9, x9, #8
+ trn1 v1.2s, v2.2s, v4.2s
+ trn2 v4.2s, v2.2s, v4.2s
+ trn1 v2.2s, v3.2s, v5.2s
+ trn2 v5.2s, v3.2s, v5.2s
+ vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
+ st1 {v1.s}[0], [x0], x1
+ st1 {v1.s}[2], [x0], x1
+ st1 {v1.s}[1], [x0], x1
+ st1 {v1.s}[3], [x0], x1
+ subs w4, w4, #4
+ b.ne 2b
+
+ add sp, sp, #44
+ ret
+endfunc
+
+/* Bilinear MC */
+
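+// Per-pixel reference for the bilinear filters below (a scalar sketch of
+// what the umull/umlal pairs implement; mx/my are 1/8-pel offsets, 0..7):
+//   t[x]   = (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3   // horizontal
+//   dst[x] = (t0[x] * (8 - my) + t1[x] * my + 4) >> 3         // vertical
+// The + 4 rounding is folded into rshrn #3.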
+function ff_put_vp8_bilin16_h_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5
+ sub w5, w7, w5
+ dup v1.8b, w5
+1:
+ subs w4, w4, #2
+ ld1 {v2.8b,v3.8b,v4.8b}, [x2], x3
+ ext v5.8b, v3.8b, v4.8b, #1
+ ext v4.8b, v2.8b, v3.8b, #1
+ umull v16.8h, v2.8b, v1.8b
+ umlal v16.8h, v4.8b, v0.8b
+ ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3
+ umull v6.8h, v3.8b, v1.8b
+ umlal v6.8h, v5.8b, v0.8b
+ ext v21.8b, v19.8b, v20.8b, #1
+ ext v20.8b, v18.8b, v19.8b, #1
+ umull v22.8h, v18.8b, v1.8b
+ umlal v22.8h, v20.8b, v0.8b
+ umull v24.8h, v19.8b, v1.8b
+ umlal v24.8h, v21.8b, v0.8b
+ rshrn v4.8b, v16.8h, #3
+ rshrn2 v4.16b, v6.8h, #3
+ rshrn v6.8b, v22.8h, #3
+ rshrn2 v6.16b, v24.8h, #3
+ st1 {v4.16b}, [x0], x1
+ st1 {v6.16b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin16_v_neon, export=1
+ mov w7, #8
+ dup v0.16b, w6
+ sub w6, w7, w6
+ dup v1.16b, w6
+
+ ld1 {v2.16b}, [x2], x3
+1:
+ subs w4, w4, #2
+ ld1 {v4.16b}, [x2], x3
+ umull v6.8h, v2.8b, v1.8b
+ umlal v6.8h, v4.8b, v0.8b
+ umull2 v16.8h, v2.16b, v1.16b
+ umlal2 v16.8h, v4.16b, v0.16b
+ ld1 {v2.16b}, [x2], x3
+ umull v18.8h, v4.8b, v1.8b
+ umlal v18.8h, v2.8b, v0.8b
+ umull2 v20.8h, v4.16b, v1.16b
+ umlal2 v20.8h, v2.16b, v0.16b
+ rshrn v4.8b, v6.8h, #3
+ rshrn2 v4.16b, v16.8h, #3
+ rshrn v6.8b, v18.8h, #3
+ rshrn2 v6.16b, v20.8h, #3
+ st1 {v4.16b}, [x0], x1
+ st1 {v6.16b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
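+// The H and V passes are pipelined: v4 carries the horizontally filtered
+// previous row across iterations, so each input row is H-filtered once.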
+function ff_put_vp8_bilin16_hv_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5 // mx
+ sub w5, w7, w5
+ dup v1.8b, w5
+ dup v2.16b, w6 // my
+ sub w6, w7, w6
+ dup v3.16b, w6
+
+ ld1 {v4.8b,v5.8b,v6.8b}, [x2], x3
+
+ ext v7.8b, v5.8b, v6.8b, #1
+ ext v6.8b, v4.8b, v5.8b, #1
+ umull v16.8h, v4.8b, v1.8b
+ umlal v16.8h, v6.8b, v0.8b
+ umull v18.8h, v5.8b, v1.8b
+ umlal v18.8h, v7.8b, v0.8b
+ rshrn v4.8b, v16.8h, #3
+ rshrn2 v4.16b, v18.8h, #3
+1:
+ subs w4, w4, #2
+ ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3
+ ext v21.8b, v19.8b, v20.8b, #1
+ ext v20.8b, v18.8b, v19.8b, #1
+ umull v22.8h, v18.8b, v1.8b
+ umlal v22.8h, v20.8b, v0.8b
+ ld1 {v26.8b,v27.8b,v28.8b}, [x2], x3
+ umull v24.8h, v19.8b, v1.8b
+ umlal v24.8h, v21.8b, v0.8b
+ ext v29.8b, v27.8b, v28.8b, #1
+ ext v28.8b, v26.8b, v27.8b, #1
+ umull v16.8h, v26.8b, v1.8b
+ umlal v16.8h, v28.8b, v0.8b
+ umull v18.8h, v27.8b, v1.8b
+ umlal v18.8h, v29.8b, v0.8b
+ rshrn v6.8b, v22.8h, #3
+ rshrn2 v6.16b, v24.8h, #3
+ umull v24.8h, v4.8b, v3.8b
+ umlal v24.8h, v6.8b, v2.8b
+ umull2 v30.8h, v4.16b, v3.16b
+ umlal2 v30.8h, v6.16b, v2.16b
+ rshrn v4.8b, v16.8h, #3
+ rshrn2 v4.16b, v18.8h, #3
+ umull v20.8h, v6.8b, v3.8b
+ umlal v20.8h, v4.8b, v2.8b
+ umull2 v22.8h, v6.16b, v3.16b
+ umlal2 v22.8h, v4.16b, v2.16b
+ rshrn v24.8b, v24.8h, #3
+ rshrn2 v24.16b, v30.8h, #3
+ st1 {v24.16b}, [x0], x1
+ rshrn v20.8b, v20.8h, #3
+ rshrn2 v20.16b, v22.8h, #3
+ st1 {v20.16b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin8_h_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5
+ sub w5, w7, w5
+ dup v1.8b, w5
+1:
+ subs w4, w4, #2
+ ld1 {v2.8b,v3.8b}, [x2], x3
+ ext v3.8b, v2.8b, v3.8b, #1
+ umull v4.8h, v2.8b, v1.8b
+ umlal v4.8h, v3.8b, v0.8b
+ ld1 {v6.8b,v7.8b}, [x2], x3
+ ext v7.8b, v6.8b, v7.8b, #1
+ umull v16.8h, v6.8b, v1.8b
+ umlal v16.8h, v7.8b, v0.8b
+ rshrn v4.8b, v4.8h, #3
+ rshrn v16.8b, v16.8h, #3
+ st1 {v4.8b}, [x0], x1
+ st1 {v16.8b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin8_v_neon, export=1
+ mov w7, #8
+ dup v0.8b, w6
+ sub w6, w7, w6
+ dup v1.8b, w6
+
+ ld1 {v2.8b}, [x2], x3
+1:
+ subs w4, w4, #2
+ ld1 {v3.8b}, [x2], x3
+ umull v4.8h, v2.8b, v1.8b
+ umlal v4.8h, v3.8b, v0.8b
+ ld1 {v2.8b}, [x2], x3
+ umull v6.8h, v3.8b, v1.8b
+ umlal v6.8h, v2.8b, v0.8b
+ rshrn v4.8b, v4.8h, #3
+ rshrn v6.8b, v6.8h, #3
+ st1 {v4.8b}, [x0], x1
+ st1 {v6.8b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
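+// As in the 16-pixel version, v22 carries the H-filtered previous row
+// between iterations.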
+function ff_put_vp8_bilin8_hv_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5 // mx
+ sub w5, w7, w5
+ dup v1.8b, w5
+ dup v2.8b, w6 // my
+ sub w6, w7, w6
+ dup v3.8b, w6
+
+ ld1 {v4.8b,v5.8b}, [x2], x3
+ ext v5.8b, v4.8b, v5.8b, #1
+ umull v18.8h, v4.8b, v1.8b
+ umlal v18.8h, v5.8b, v0.8b
+ rshrn v22.8b, v18.8h, #3
+1:
+ subs w4, w4, #2
+ ld1 {v6.8b,v7.8b}, [x2], x3
+ ext v7.8b, v6.8b, v7.8b, #1
+ umull v16.8h, v6.8b, v1.8b
+ umlal v16.8h, v7.8b, v0.8b
+ ld1 {v4.8b,v5.8b}, [x2], x3
+ ext v5.8b, v4.8b, v5.8b, #1
+ umull v18.8h, v4.8b, v1.8b
+ umlal v18.8h, v5.8b, v0.8b
+ rshrn v16.8b, v16.8h, #3
+ umull v20.8h, v22.8b, v3.8b
+ umlal v20.8h, v16.8b, v2.8b
+ rshrn v22.8b, v18.8h, #3
+ umull v24.8h, v16.8b, v3.8b
+ umlal v24.8h, v22.8b, v2.8b
+ rshrn v20.8b, v20.8h, #3
+ st1 {v20.8b}, [x0], x1
+ rshrn v23.8b, v24.8h, #3
+ st1 {v23.8b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin4_h_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5
+ sub w5, w7, w5
+ dup v1.8b, w5
+1:
+ subs w4, w4, #2
+ ld1 {v2.8b}, [x2], x3
+ ext v3.8b, v2.8b, v3.8b, #1
+ ld1 {v6.8b}, [x2], x3
+ ext v7.8b, v6.8b, v7.8b, #1
+ trn1 v2.2s, v2.2s, v6.2s
+ trn1 v3.2s, v3.2s, v7.2s
+ umull v4.8h, v2.8b, v1.8b
+ umlal v4.8h, v3.8b, v0.8b
+ rshrn v4.8b, v4.8h, #3
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
+
+function ff_put_vp8_bilin4_v_neon, export=1
+ mov w7, #8
+ dup v0.8b, w6
+ sub w6, w7, w6
+ dup v1.8b, w6
+
+ ld1r {v2.2s}, [x2], x3
+1:
+ ld1r {v3.2s}, [x2]
+ ld1 {v2.s}[1], [x2], x3
+ ld1 {v3.s}[1], [x2], x3
+ umull v4.8h, v2.8b, v1.8b
+ umlal v4.8h, v3.8b, v0.8b
+ trn2 v2.2s, v3.2s, v2.2s
+ rshrn v4.8b, v4.8h, #3
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x0], x1
+ subs w4, w4, #2
+ b.gt 1b
+
+ ret
+endfunc
+
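+// Two rows are H-filtered into v16 per iteration; the trn1/rev64 pair
+// lines the previous row (v22) up with them so a single umull/umlal
+// performs both vertical blends at once.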
+function ff_put_vp8_bilin4_hv_neon, export=1
+ mov w7, #8
+ dup v0.8b, w5 // mx
+ sub w5, w7, w5
+ dup v1.8b, w5
+ dup v2.8b, w6 // my
+ sub w6, w7, w6
+ dup v3.8b, w6
+
+ ld1 {v4.8b}, [x2], x3
+ ext v5.8b, v4.8b, v4.8b, #1
+ umull v18.8h, v4.8b, v1.8b
+ umlal v18.8h, v5.8b, v0.8b
+ rshrn v22.8b, v18.8h, #3
+1:
+ subs w4, w4, #2
+ ld1 {v6.8b}, [x2], x3
+ ext v7.8b, v6.8b, v6.8b, #1
+ ld1 {v4.8b}, [x2], x3
+ ext v5.8b, v4.8b, v4.8b, #1
+ trn1 v6.2s, v6.2s, v4.2s
+ trn1 v7.2s, v7.2s, v5.2s
+ umull v16.8h, v6.8b, v1.8b
+ umlal v16.8h, v7.8b, v0.8b
+ rshrn v16.8b, v16.8h, #3
+ umull v20.8h, v16.8b, v2.8b
+ trn1 v22.2s, v22.2s, v16.2s
+ umlal v20.8h, v22.8b, v3.8b
+ rev64 v22.2s, v16.2s
+ rshrn v20.8b, v20.8h, #3
+ st1 {v20.s}[0], [x0], x1
+ st1 {v20.s}[1], [x0], x1
+ b.gt 1b
+
+ ret
+endfunc