lea r2, [r2 + r3*4]
lea r0, [r0 + r3*8]
neg r3
+ jz .loop1
.loop4:
- movq m0, [r2 + 4*r3 + 0]
- movq m1, [r2 + 4*r3 + 8]
- movq m2, [r1 + 0*STEP]
- movq m3, [r1 + 2*STEP]
+ movlps m0, [r2 + 4*r3 + 0]
+ movlps m1, [r2 + 4*r3 + 8]
+ movlps m2, [r1 + 0*STEP]
+ movlps m3, [r1 + 2*STEP]
movhps m2, [r1 + 1*STEP]
movhps m3, [r1 + 3*STEP]
- punpckldq m0, m0
- punpckldq m1, m1
+ unpcklps m0, m0
+ unpcklps m1, m1
mulps m0, m2
mulps m1, m3
movu [r0 + 8*r3 + 0], m0
jz .end
.loop1: ; element 0 and 1 can be computed at the same time
movss m0, [r2]
- movq m2, [r1]
- punpckldq m0, m0
+ movlps m2, [r1]
+ unpcklps m0, m0
mulps m2, m0
- movq [r0], m2
+ movlps [r0], m2
add r0, 8
add r2, 4
add r1, STEP