+ __asm__ volatile( /* copy 128 bytes from in to in+512 through xmm0-xmm3, 64 bytes per pass */
+ "movaps 0(%0), %%xmm0 \n\t" /* pass 1: load source bytes 0..63; movaps needs 16-byte-aligned addresses */ \
+ "movaps 16(%0), %%xmm1 \n\t" \
+ "movaps 32(%0), %%xmm2 \n\t" \
+ "movaps 48(%0), %%xmm3 \n\t" \
+ "movaps %%xmm0, 0(%1) \n\t" /* pass 1: store to destination bytes 0..63 */ \
+ "movaps %%xmm1, 16(%1) \n\t" \
+ "movaps %%xmm2, 32(%1) \n\t" \
+ "movaps %%xmm3, 48(%1) \n\t" \
+ "movaps 64(%0), %%xmm0 \n\t" /* pass 2: load source bytes 64..127, reusing the same four registers */ \
+ "movaps 80(%0), %%xmm1 \n\t" \
+ "movaps 96(%0), %%xmm2 \n\t" \
+ "movaps 112(%0), %%xmm3 \n\t" \
+ "movaps %%xmm0, 64(%1) \n\t" /* pass 2: store to destination bytes 64..127 */ \
+ "movaps %%xmm1, 80(%1) \n\t" \
+ "movaps %%xmm2, 96(%1) \n\t" \
+ "movaps %%xmm3, 112(%1) \n\t"
+ ::"r"(in), "r"(in+512) /* no outputs; %0 = source address (in), %1 = destination address (in+512) */
+ :"memory", "xmm0", "xmm1", "xmm2", "xmm3" /* template overwrites xmm0-xmm3 and writes memory the compiler cannot see */
+ );