+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+ // GCC should ideally figure out by itself that a conditional move is better here,
+ // but for various reasons it doesn't. This is pretty important for speed, so we
+ // hard-code the cmp/cmov sequence with inline asm.
+ asm("cmp %1, %0 ; cmovl %2, %0"
+ : "+r" (val)
+ : "g" (bit_thresholds[bits]),
+ "r" (val + (-1 << bits) + 1)
+ : "cc");
+ return val;
+#else