-#define X264_CPU_MMX 0x000001 /* mmx */
-#define X264_CPU_MMXEXT 0x000002 /* mmx-ext*/
-#define X264_CPU_SSE 0x000004 /* sse */
-#define X264_CPU_SSE2 0x000008 /* sse 2 */
-#define X264_CPU_3DNOW 0x000010 /* 3dnow! */
-#define X264_CPU_3DNOWEXT 0x000020 /* 3dnow! ext */
-#define X264_CPU_ALTIVEC 0x000040 /* altivec */
-#define X264_CPU_SSE3 0x000080 /* sse 3 */
-#define X264_CPU_SSSE3 0x000100 /* ssse 3 */
+#define X264_CPU_CACHELINE_32 0x000001 /* avoid memory loads that span the border between two cachelines */
+#define X264_CPU_CACHELINE_64 0x000002 /* 32/64 is the size of a cacheline in bytes */
+#define X264_CPU_ALTIVEC 0x000004 /* AltiVec */
+#define X264_CPU_MMX 0x000008 /* MMX */
+#define X264_CPU_MMXEXT 0x000010 /* MMX2 aka MMXEXT aka ISSE */
+#define X264_CPU_SSE 0x000020 /* SSE */
+#define X264_CPU_SSE2 0x000040 /* SSE2 */
+#define X264_CPU_SSE2_IS_SLOW 0x000080 /* avoid most SSE2 functions on Athlon64 */
+#define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
+#define X264_CPU_SSE3 0x000200 /* SSE3 */
+#define X264_CPU_SSSE3 0x000400 /* SSSE3 */
+#define X264_CPU_SHUFFLE_IS_FAST 0x000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
+#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
+#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
+#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
+#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
+#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
+#define X264_CPU_ARMV6 0x020000 /* ARMv6 */
+#define X264_CPU_NEON 0x040000 /* ARM NEON */
+#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */