.ifc \txfm1\()_\txfm2,idct_idct
movrel x4, idct_coeffs
.else
movrel x4, iadst8_coeffs
ld1 {v1.8h}, [x4], #16
.endif
ld1 {v0.8h}, [x4]
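// The iadst variants reuse a few of the idct coefficients, which is
// presumably why the shared idct_coeffs vector ends up in v0 on both
// paths, with the iadst8-specific set loaded into v1 first.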
function idct16x16_dc_add_neon
movrel x4, idct_coeffs
ld1 {v0.4h}, [x4]
movi v1.4h, #0
ld1 {v2.h}[0], [x2]
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
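// DC-only shortcut: the single input coefficient is scaled by
// v0.h[0] (the first idct coefficient, cospi_16_64) once per pass,
// with rshrn applying the rounding right shift by 14 that the full
// transform would perform after each pass.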
dup v2.8h, v2.h[0]
st1 {v1.h}[0], [x2]
srshr v2.8h, v2.8h, #6
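// srshr by 6 applies the final output rounding, (dc + 32) >> 6, so
// v2 now holds the constant to add to every destination pixel.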
mov x3, x0
mov x4, #16
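// x3 keeps a second copy of the destination pointer, presumably so
// the stores can trail the loads through the 16 rows that x4 counts
// down.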
1:
// Loop to add the constant from v2 into all 16x16 outputs
subs x4, x4, #2
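// Each iteration handles two rows, hence the decrement by 2.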
.ifc \txfm1,idct
ld1 {v0.8h,v1.8h}, [x10]
.endif
mov x9, #32
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #10
idct16_partial half
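// w3 appears to carry the eob: a small eob means only the top-left
// coefficients are nonzero, so a partial (quarter/half) idct16 over
// the reduced coefficient set is sufficient.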
function idct32x32_dc_add_neon
movrel x4, idct_coeffs
ld1 {v0.4h}, [x4]
movi v1.4h, #0
ld1 {v2.h}[0], [x2]
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
smull v2.4s, v2.4h, v0.h[0]
rshrn v2.4h, v2.4s, #14
dup v2.8h, v2.h[0]
st1 {v1.h}[0], [x2]
srshr v0.8h, v2.8h, #6
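// Same dc-only shortcut as the 16x16 case above; here the rounded
// constant is kept in v0 for the store loop below.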
mov x3, x0
mov x4, #32
1:
// Loop to add the constant v0 into all 32x32 outputs
subs x4, x4, #2
// x9 = double input stride
function idct32_1d_8x32_pass1\suffix\()_neon
mov x14, x30
movi v2.8h, #0
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.ifb \suffix
.endif
add x2, x2, #64
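// 64 bytes is one line of 32 int16 coefficients, which would step x2
// from the even-numbered input lines read above to the odd-numbered
// ones read below.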
movi v2.8h, #0
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
movrel r12, idct_coeffs
vld1.16 {d0}, [r12,:64]
vmov.i16 q2, #0
vld1.16 {d16[]}, [r2,:16]
vmull.s16 q8, d16, d0[0]
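@ The 32 bit arm version of the dc-only path: the lone coefficient is
@ widened and scaled by d0[0], the first idct coefficient
@ (cospi_16_64), mirroring the aarch64 code above.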
push {lr}
mov r12, #32
vmov.s16 q2, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
vld1.16 {d\i}, [r2,:64]
vst1.16 {d4}, [r2,:64], r12
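@ d4 (q2) is zero, so this store clears the coefficients right after
@ they have been read, leaving that part of the input buffer zeroed
@ for the next block.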
movrel r12, idct_coeffs
vld1.16 {d0}, [r12,:64]
vmov.i16 q2, #0
vld1.16 {d16[]}, [r2,:16]
vmull.s16 q8, d16, d0[0]
@ Double stride of the input, since we only read every other line
mov r12, #128
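@ 32 coefficients per line, 2 bytes each, times 2 lines = 128.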
vmov.s16 d4, #0
@ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
.ifb \suffix
.endif
add r2, r2, #64
vmov.s16 d8, #0
@ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
.endif
vld1.32 {d12[]}, [r0,:32], r1
vld1.32 {d12[1]}, [r0,:32], r1
vrshr.s16 q4, q4, #6
vld1.32 {d13[]}, [r0,:32], r1
vrshr.s16 q5, q5, #6
vld1.32 {d13[1]}, [r0,:32], r1
sub r0, r0, r1, lsl #2
vaddw.u8 q4, q4, d12
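@ Add-and-store epilogue: the residuals are rounded with vrshr #6,
@ four pairs of destination rows are gathered into d12/d13, and
@ vaddw.u8 widens the pixels and adds the residual; r0 was rewound
@ above so the results can later be written back over the same rows.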
endfunc
function vp9_loop_filter_h_16_neon
sub r12, r0, #8
vld1.8 {d16}, [r12,:64], r1
vld1.8 {d24}, [r0, :64], r1
vld1.8 {d17}, [r12,:64], r1
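@ For a horizontal loop filter the edge is vertical, so r12 points
@ 8 pixels to the left of the edge; alternating loads from r12 and r0
@ gather 8 pixels on each side of the edge for every row, presumably
@ to be transposed before filtering.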