store " %%xmm4, 48(%[dst])\n" \
: : [dst]"r"(dstp), [src]"r"(srcp) : "memory")
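+/* When the compiler does not already target a given instruction set, fall
+ * back to testing the run-time CPU capability flags (this assumes a local
+ * "cpu" variable holding the flags is in scope at each call site). */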
+#ifndef __SSE4A__
+# undef vlc_CPU_SSE4A
+# define vlc_CPU_SSE4A() ((cpu & VLC_CPU_SSE4A) != 0)
+#endif
+
+#ifndef __SSSE3__
+# undef vlc_CPU_SSSE3
+# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
+#endif
+
/* Execute the instruction op only if SSE2 is supported. */
#ifdef CAN_COMPILE_SSE2
-# define ASM_SSE2(cpu, op) do { \
- if (cpu & CPU_CAPABILITY_SSE2) \
- asm volatile (op); \
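+/* If the build itself targets SSE2 (__SSE2__ defined), the instruction can
+ * be emitted unconditionally; otherwise both ASM_SSE2() and vlc_CPU_SSE2()
+ * test the run-time capability flags. */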
+# ifdef __SSE2__
+# define ASM_SSE2(cpu, op) asm volatile (op)
+# else
+# define ASM_SSE2(cpu, op) do { \
+ if (cpu & VLC_CPU_SSE2) \
+ asm volatile (op); \
} while (0)
+# undef vlc_CPU_SSE2
+# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
+# endif
#else
-# define ASM_SSE2(cpu, op)
+# define ASM_SSE2(cpu, op)
#endif
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
dst[x] = src[x];
#ifdef CAN_COMPILE_SSE4_1
- if (cpu & CPU_CAPABILITY_SSE4_1) {
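+    /* movntdqa (SSE4.1) is a non-temporal, streaming load: the fast way to
+       read back from write-combining (USWC) memory. */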
+ if (vlc_CPU_SSE4_1()) {
if (!unaligned) {
for (; x+63 < width; x += 64)
COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
} else
#endif
#ifdef CAN_COMPILE_SSE2
- if (cpu & CPU_CAPABILITY_SSE2) {
+ if (vlc_CPU_SSE2()) {
if (!unaligned) {
for (; x+63 < width; x += 64)
COPY64(&dst[x], &src[x], "movdqa", "movdqa");
bool unaligned = ((intptr_t)dst & 0x0f) != 0;
#ifdef CAN_COMPILE_SSE2
- if (cpu & CPU_CAPABILITY_SSE2) {
+ if (vlc_CPU_SSE2()) {
if (!unaligned) {
for (; x+63 < width; x += 64)
COPY64(&dst[x], &src[x], "movdqa", "movntdq");
"movhpd %%xmm3, 24(%[dst2])\n"
#ifdef CAN_COMPILE_SSSE3
- if (cpu & CPU_CAPABILITY_SSSE3) {
+    if (vlc_CPU_SSSE3()) {
for (x = 0; x < (width & ~31); x += 32) {
asm volatile (
"movdqu (%[shuffle]), %%xmm7\n"
} else
#endif
#ifdef CAN_COMPILE_SSE2
- if (cpu & CPU_CAPABILITY_SSE2) {
+ if (vlc_CPU_SSE2()) {
for (x = 0; x < (width & ~31); x += 32) {
asm volatile (
"movdqu (%[mask]), %%xmm7\n"