// Tail of a 4x4 inverse-transform + add function (looks like a VP8 IDCT-add
// — function header is above this chunk; confirm against full file).
// Unresolved diff markers removed: the '+' (post-patch) lines are kept, the
// '-' (pre-patch) lines dropped. Post-patch the second transform pass works
// on 4x4H registers (v0-v3) instead of packed 8H pairs, and the destination
// rows are loaded one .s lane each into v24-v27.
        smull           v27.4s,  v3.4h,  v4.h[0]
        sqdmulh         v20.4h,  v1.4h,  v4.h[1]
        sqdmulh         v23.4h,  v3.4h,  v4.h[1]
        // sqdmulh of a Q15 constant doubles the product, so a plain
        // (non-saturating) narrowing shift by 16 is the matching correction.
        shrn            v21.4h,  v26.4s, #16
        shrn            v22.4h,  v27.4s, #16
        add             v21.4h,  v21.4h, v1.4h
        add             v22.4h,  v22.4h, v3.4h
        st1             {v29.16b}, [x1]
        sqdmulh         v21.4h,  v1.4h,  v4.h[1]
        sqdmulh         v23.4h,  v3.4h,  v4.h[1]
        shrn            v20.4h,  v26.4s, #16
        shrn            v22.4h,  v27.4s, #16
        add             v20.4h,  v20.4h, v1.4h
        add             v22.4h,  v22.4h, v3.4h

        // Second (vertical) butterfly pass, one 4H register per output row.
        add             v16.4h,  v0.4h,  v2.4h
        sub             v17.4h,  v0.4h,  v2.4h
        add             v18.4h,  v20.4h, v23.4h
        ld1             {v24.s}[0], [x0], x2    // preload dst rows (4 bytes each)
        sub             v19.4h,  v21.4h, v22.4h
        ld1             {v25.s}[0], [x0], x2
        add             v0.4h,   v16.4h, v18.4h
        add             v1.4h,   v17.4h, v19.4h
        ld1             {v26.s}[0], [x0], x2
        sub             v3.4h,   v16.4h, v18.4h
        sub             v2.4h,   v17.4h, v19.4h
        ld1             {v27.s}[0], [x0], x2
        srshr           v0.4h,   v0.4h,  #3     // rounding shift: (x + 4) >> 3
        srshr           v1.4h,   v1.4h,  #3
        srshr           v2.4h,   v2.4h,  #3
        srshr           v3.4h,   v3.4h,  #3

        sub             x0,  x0,  x2, lsl #2    // rewind dst by 4 rows
        // Transpose the 4x4 block of .4h rows; helper macro defined elsewhere
        // in the file (v5-v7, v16 are scratch).
        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16
        // Add residual to destination pixels, saturate back to u8.
        uaddw           v0.8h,   v0.8h,  v24.8b
        uaddw           v1.8h,   v1.8h,  v25.8b
        uaddw           v2.8h,   v2.8h,  v26.8b
        uaddw           v3.8h,   v3.8h,  v27.8b
        sqxtun          v0.8b,   v0.8h
        sqxtun          v1.8b,   v1.8h
        sqxtun          v2.8b,   v2.8h
        sqxtun          v3.8b,   v3.8h

        st1             {v0.s}[0], [x0], x2     // store 4 result rows in order
        st1             {v1.s}[0], [x0], x2
        st1             {v2.s}[0], [x0], x2
        st1             {v3.s}[0], [x0], x2
        ret
endfunc
sqrshrun2 \d0\().16b, v22.8h, #7
.endm
// Resolved hunk: the single-row vp8_epel8_v6 macro was deleted by this patch
// (its '-' lines removed); the visible callers below now use the two-row
// vp8_epel8_v6_y2 variant instead. NOTE(review): verify no other caller of
// vp8_epel8_v6 remains elsewhere in the file before applying.
//
// Start of the two-row 6-tap vertical filter macro (body continues beyond
// this chunk).
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b    // widen u8 source rows to s16
        uxtl            \s3\().8h, \s3\().8b
// Interior of a vertical 6-tap epel loop, resolved to its post-patch state:
// the loop now produces TWO output rows per iteration via vp8_epel8_v6_y2,
// so it loads one extra pair of source rows (v24/v25), rewinds the temp
// pointer by 64 instead of 48, and decrements the row counter by 2.
2:
        ld1             {v1.8b - v4.8b},   [x7], #32
        ld1             {v16.8b - v19.8b}, [x7], #32
        ld1             {v20.8b - v23.8b}, [x7], #32
        ld1             {v24.8b - v25.8b}, [x7]
        sub             x7,  x7,  #64           // rewind: keep 6-row sliding window

        // Two-row vertical filter on the even/odd 8-byte halves.
        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25

        // Recombine halves into full 16-byte output rows.
        trn1            v1.2d, v1.2d, v2.2d
        trn1            v3.2d, v3.2d, v4.2d

        st1             {v1.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        subs            x4,  x4,  #2            // two rows per iteration now
        b.ne            2b

        add             sp,  sp,  #336+16       // free temp buffer (size set in prologue)