It seems that icc was using our fallback version of popcount.
Use the _mm_popcnt_u64() intrinsic instead.
icc version 16.0.2 (gcc version 5.3.0 compatibility)
bmi2 compile
uname -r 4.5.1-1-ARCH
20 runs of bench show a nice speedup:
./stockfish-icc-master
2161515 +- 34462
./stockfish-icc-sse42
2260857 +- 50349
union { Bitboard bb; uint16_t u[4]; } v = { b };
return PopCnt16[v.u[0]] + PopCnt16[v.u[1]] + PopCnt16[v.u[2]] + PopCnt16[v.u[3]];
union { Bitboard bb; uint16_t u[4]; } v = { b };
return PopCnt16[v.u[0]] + PopCnt16[v.u[1]] + PopCnt16[v.u[2]] + PopCnt16[v.u[3]];
-#elif defined(_MSC_VER) && defined(__INTEL_COMPILER)
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
return _mm_popcnt_u64(b);
return _mm_popcnt_u64(b);
-#if defined(USE_POPCNT) && defined(__INTEL_COMPILER) && defined(_MSC_VER)
+#if defined(USE_POPCNT) && (defined(__INTEL_COMPILER) || defined(_MSC_VER))
# include <nmmintrin.h> // Intel header for _mm_popcnt_u64() intrinsic
#endif
# include <nmmintrin.h> // Intel header for _mm_popcnt_u64() intrinsic
#endif