The problem this workaround addresses is possibly specific to gcc on Windows. See e.g.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412#c25
On Linux with gcc 8, this patch brings a speedup of roughly 8%.
However, it probably needs some testing in the wild.
Also includes a workaround for an old msys make (3.81) installation (fixes #2984).
No functional change
build: config-sanity
$(MAKE) ARCH=$(ARCH) COMP=$(COMP) all
-profile-build: config-sanity objclean profileclean net
+profile-build: net config-sanity objclean profileclean
@echo ""
@echo "Step 1/4. Building instrumented executable ..."
$(MAKE) ARCH=$(ARCH) COMP=$(COMP) $(profile_make)
// compiled with older g++ crashes because the output memory is not aligned
// even though alignas is specified.
#if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32)
#define _mm256_loadA_si256 _mm256_loadu_si256
#define _mm256_storeA_si256 _mm256_storeu_si256
#else
#endif
#if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9)
+#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32)
#define _mm512_loadA_si512 _mm512_loadu_si512
#define _mm512_storeA_si512 _mm512_storeu_si512
#else