and waq, ~(mmsize*2-1)
jmp .end_v
.loop_v:
- movu m0, [src2q+iq]
- movu m1, [src2q+iq+mmsize]
- paddb m0, [src1q+iq]
- paddb m1, [src1q+iq+mmsize]
- movu [dstq+iq ], m0
- movu [dstq+iq+mmsize], m1
+ mova m0, [src1q+iq]
+ mova m1, [src1q+iq+mmsize]
+ paddb m0, [src2q+iq]
+ paddb m1, [src2q+iq+mmsize]
+ mova [dstq+iq ], m0
+ mova [dstq+iq+mmsize], m1
add iq, mmsize*2
.end_v:
cmp iq, waq
and waq, ~7
jmp .end_l
.loop_l:
- movq mm0, [src2q+iq]
- paddb mm0, [src1q+iq]
+ movq mm0, [src1q+iq]
+ paddb mm0, [src2q+iq]
movq [dstq+iq ], mm0
add iq, 8
.end_l: