::"r"(-4*k), "r"(4*k),
"r"(input+n4), "r"(tcos+n8), "r"(tsin+n8)
);
-#ifdef ARCH_X86_64
+#if ARCH_X86_64
// if we have enough regs, don't let gcc make the luts latency-bound
// but if not, latency is faster than spilling
__asm__("movlps %%xmm0, %0 \n"