#endif
+#if !defined(NO_PREFETCH)
+# include <xmmintrin.h>
+#endif
+
#include <cassert>
#include <cstdio>
#include <iomanip>
return 0;
}
}
+
+/// prefetch() preloads the given address in L1/L2 cache. This is a non
+/// blocking function and do not stalls the CPU waiting for data to be
+/// loaded from RAM, that can be very slow.
+#if defined(NO_PREFETCH)
+void prefetch(char*) {}
+#else
+
+void prefetch(char* addr) {
+
+#if defined(__INTEL_COMPILER) || defined(__ICL)
+ // This hack prevents prefetches to be optimized away by
+ // Intel compiler. Both MSVC and gcc seems not affected.
+ __asm__ ("");
+#endif
+
+ _mm_prefetch(addr, _MM_HINT_T2);
+ _mm_prefetch(addr+64, _MM_HINT_T2); // 64 bytes ahead
+}
+
+#endif
+
#endif
extern int get_system_time();
extern int cpu_count();
extern int Bioskey();
+extern void prefetch(char* addr);
////
}
// Prefetch TT access as soon as we know key is updated
- TT.prefetch(key);
+ prefetch((char*)TT.first_entry(key));
// Move the piece
Bitboard move_bb = make_move_bb(from, to);
st->key ^= zobEp[st->epSquare];
st->key ^= zobSideToMove;
- TT.prefetch(st->key);
+ prefetch((char*)TT.first_entry(st->key));
sideToMove = opposite_color(sideToMove);
st->epSquare = SQ_NONE;
#include <cassert>
#include <cmath>
#include <cstring>
-#if !defined(NO_PREFETCH)
-# include <xmmintrin.h>
-#endif
#include "movegen.h"
#include "tt.h"
}
-/// TranspositionTable::first_entry returns a pointer to the first
-/// entry of a cluster given a position. The low 32 bits of the key
-/// are used to get the index in the table.
-
-inline TTEntry* TranspositionTable::first_entry(const Key posKey) const {
-
- return entries[uint32_t(posKey) & (size - 1)].data;
-}
-
-
/// TranspositionTable::store writes a new entry containing a position,
/// a value, a value type, a search depth, and a best move to the
/// transposition table. Transposition table is organized in clusters of
}
-/// TranspositionTable::prefetch looks up the current position in the
-/// transposition table and load it in L1/L2 cache. This is a non
-/// blocking function and do not stalls the CPU waiting for data
-/// to be loaded from RAM, that can be very slow. When we will
-/// subsequently call retrieve() the TT data will be already
-/// quickly accessible in L1/L2 CPU cache.
-#if defined(NO_PREFETCH)
-void TranspositionTable::prefetch(const Key) const {}
-#else
-
-void TranspositionTable::prefetch(const Key posKey) const {
-
-#if defined(__INTEL_COMPILER) || defined(__ICL)
- // This hack prevents prefetches to be optimized away by
- // Intel compiler. Both MSVC and gcc seems not affected.
- __asm__ ("");
-#endif
-
- char const* addr = (char*)first_entry(posKey);
- _mm_prefetch(addr, _MM_HINT_T2);
- _mm_prefetch(addr+64, _MM_HINT_T2); // 64 bytes ahead
-}
-
-#endif
-
/// TranspositionTable::new_search() is called at the beginning of every new
/// search. It increments the "generation" variable, which is used to
/// distinguish transposition table entries from previous searches from
void clear();
void store(const Key posKey, Value v, ValueType type, Depth d, Move m, Value statV, Value kingD);
TTEntry* retrieve(const Key posKey) const;
- void prefetch(const Key posKey) const;
void new_search();
void insert_pv(const Position& pos, Move pv[]);
void extract_pv(const Position& pos, Move pv[], const int PLY_MAX);
int full() const;
+ TTEntry* first_entry(const Key posKey) const;
private:
- inline TTEntry* first_entry(const Key posKey) const;
-
// Be sure 'writes' is at least one cache line away
// from read only variables.
unsigned char pad_before[64 - sizeof(unsigned)];
extern TranspositionTable TT;
+
+/// TranspositionTable::first_entry returns a pointer to the first
+/// entry of a cluster given a position. The low 32 bits of the key
+/// are used to get the index in the table.
+
+inline TTEntry* TranspositionTable::first_entry(const Key posKey) const {
+
+ return entries[uint32_t(posKey) & (size - 1)].data;
+}
+
#endif // !defined(TT_H_INCLUDED)