%else
%define rfilterq %2
%endif
- movdqu m0, [rfilterq ] ;load 128bit of x
+%if (%1 == 8 && %4 <= 4)
+%define %%load movd
+%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
+%define %%load movq
+%else
+%define %%load movdqu
+%endif
+
+ %%load m0, [rfilterq ]
%ifnum %3
- movdqu m1, [rfilterq+ %3] ;load 128bit of x+stride
- movdqu m2, [rfilterq+2*%3] ;load 128bit of x+2*stride
- movdqu m3, [rfilterq+3*%3] ;load 128bit of x+3*stride
+ %%load m1, [rfilterq+ %3]
+ %%load m2, [rfilterq+2*%3]
+ %%load m3, [rfilterq+3*%3]
%else
- movdqu m1, [rfilterq+ %3q] ;load 128bit of x+stride
- movdqu m2, [rfilterq+2*%3q] ;load 128bit of x+2*stride
- movdqu m3, [rfilterq+r3srcq] ;load 128bit of x+2*stride
+ %%load m1, [rfilterq+ %3q]
+ %%load m2, [rfilterq+2*%3q]
+ %%load m3, [rfilterq+r3srcq]
%endif
%if %1 == 8