- ld1 {v2.8b}, [x2]
- ldcol.8 v3, x3, x1
- transpose v0.2s, v1.2s, v2.2s, v3.2s
- uaddlp v0.4h, v0.8b // s0, s2
- uaddlp v1.4h, v1.8b // s1, s3
- addp v0.4h, v0.4h, v1.4h // s0, s2, s1, s3
- addp v1.4h, v0.4h, v0.4h
- rshrn v2.8b, v0.8h, #2
+ sub x2, x0, #FDEC_STRIDE // x2 -> row directly above x0 (top neighbours)
+ ldrb w10, [x0, #0 * FDEC_STRIDE - 1] // l0: byte left of row 0
+ ldrb w11, [x0, #1 * FDEC_STRIDE - 1] // l1: byte left of row 1
+ ldrb w12, [x0, #2 * FDEC_STRIDE - 1] // l2: byte left of row 2
+ ldrb w13, [x0, #3 * FDEC_STRIDE - 1] // l3: byte left of row 3
+ add w10, w10, w11 // w10 = l0+l1
+ ldrb w4, [x0, #4 * FDEC_STRIDE - 1] // l4: byte left of row 4
+ ldrb w5, [x0, #5 * FDEC_STRIDE - 1] // l5: byte left of row 5
+ add w12, w12, w13 // w12 = l2+l3
+ ldrb w6, [x0, #6 * FDEC_STRIDE - 1] // l6: byte left of row 6
+ ldrb w7, [x0, #7 * FDEC_STRIDE - 1] // l7: byte left of row 7
+ add w4, w4, w5 // w4 = l4+l5
+ add w6, w6, w7 // w6 = l6+l7
+ add w10, w10, w12, lsl #16 // w10 = (l2+l3)<<16 | (l0+l1)
+ add w4, w4, w6, lsl #16 // w4  = (l6+l7)<<16 | (l4+l5)
+ ld1 {v0.8b}, [x2] // v0 = 8 top-neighbour pixels t0..t7
+ add x10, x10, x4, lsl #32 // x10 = {l0+l1, l2+l3, l4+l5, l6+l7} as 4x u16
+ uaddlp v0.4h, v0.8b // s0, s1 (halves: t0+t1, t2+t3, t4+t5, t6+t7)
+ mov v1.d[0], x10 // s2, s3 (halves: the four left pair sums)
+ add v3.4h, v0.4h, v1.4h // elementwise top-pair + left-pair sums
+ addp v0.4h, v0.4h, v1.4h // s0, s1, s2, s3 (each a 4-pixel neighbour sum)
+ addp v1.4h, v3.4h, v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
+ uzp2 v0.4h, v0.4h, v0.4h // s1, s3, s1, s3 (odd lanes of v0)
+ uzp1 v1.2d, v1.2d, v1.2d // duplicate d[0] into both halves of v1
+ uzp1 v0.2d, v0.2d, v0.2d // duplicate d[0] into both halves of v0; presumably averaged/rounded downstream — tail not visible here