movhps [r0+r3 ], m3
%endif
%assign %%i %%i+64
+%endrep
+
+ pxor m0, m0
+%assign %%offset 0
+%rep 128/mmsize
+ mova [r2+%%offset], m0
+%assign %%offset %%offset+mmsize
%endrep
RET
movhps [r0+r1], m0
%endif
lea r0, [r0+r1*2]
+%assign %%offset 0
+%rep 32/mmsize
+ mova [r2+%%offset], m4
+%assign %%offset %%offset+mmsize
+%endrep
add r2, 32
dec r3
jg .loop
paddusb m2, m0
movq m4, [r0+r1*2]
paddusb m3, m0
- movq m5, [r0+r3 ]
+ movq m5, [r0+r2 ]
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
movq [r0+r1 ], m3
psubusb m5, m1
movq [r0+r1*2], m4
- movq [r0+r3 ], m5
+ movq [r0+r2 ], m5
%endmacro
INIT_MMX mmxext
%if ARCH_X86_64
movsxd r1, r1d
%endif
- lea r3, [r1*3]
- movsx r2, word [r2]
- add r2, 15
- sar r2, 5
- movd m0, r2d
+ movsx r3, word [r2]
+ mov word [r2], 0
+ lea r2, [r1*3]
+ add r3, 15
+ sar r3, 5
+ movd m0, r3d
pshufw m0, m0, 0x0
pxor m1, m1
psubw m1, m0