#endif
// This is the number of TTEntry slots for each position
-static const int ClusterSize = 5;
+static const int ClusterSize = 4;
// The main transposition table
TranspositionTable TT;
/// blocking function and do not stalls the CPU waiting for data
/// to be loaded from RAM, that can be very slow. When we will
/// subsequently call retrieve() the TT data will be already
-/// quickly accessible in L1/l2 CPU cache.
+/// quickly accessible in L1/L2 CPU cache.
void TranspositionTable::prefetch(const Key posKey) const {
#if defined(_MSC_VER)
- _mm_prefetch((char*)first_entry(posKey), _MM_HINT_T0);
+ char* addr = (char*)first_entry(posKey);
+ _mm_prefetch(addr, _MM_HINT_T0);
+ _mm_prefetch(addr+64, _MM_HINT_T0);
#else
- __builtin_prefetch((const void*)first_entry(posKey), 0, 3);
+ // We need to force an asm volatile here because gcc builtin
+ // is optimized away by Intel compiler.
+ //
+ // The memory operand has to be the pointee (*addr). Passing the
+ // pointer variable itself ("m" (addr)) would make prefetcht0 target
+ // the local variable's own storage instead of the entry returned by
+ // first_entry(posKey).
+ //
+ // NOTE(review): the _MSC_VER branch also prefetches addr+64, this
+ // branch touches a single line only; confirm the asymmetry is intended.
+ char* addr = (char*)first_entry(posKey);
+ asm volatile("prefetcht0 %0" :: "m" (*addr));
#endif
}