+ uint32_t posKey32 = posKey >> 32;
+ TTEntry* tte = first_entry(posKey);
+
+ for (int i = 0; i < ClusterSize; i++, tte++)
+ if (tte->key() == posKey32)
+ return tte;
+
+ return NULL;
+}
+
+
+/// TranspositionTable::prefetch looks up the current position in the
+/// transposition table and loads it into the L1/L2 cache. This is a
+/// non-blocking function and does not stall the CPU waiting for data
+/// to be loaded from RAM, which can be very slow. When we subsequently
+/// call retrieve(), the TT data will already be quickly accessible
+/// in the L1/L2 CPU cache.
+
+void TranspositionTable::prefetch(const Key posKey) const {
+
+#if defined(__INTEL_COMPILER) || defined(__ICL)
+ // This hack prevents prefetches from being optimized away by the
+ // Intel compiler. Both MSVC and gcc seem not to be affected.
+ __asm__ ("");
+#endif
+
+ char const* addr = (char*)first_entry(posKey);
+ _mm_prefetch(addr, _MM_HINT_T2);
+ _mm_prefetch(addr+64, _MM_HINT_T2); // 64 bytes ahead