From cd4604b05c2d61928b26ab50c5864c36ef1d3785 Mon Sep 17 00:00:00 2001 From: Marco Costalba Date: Sun, 9 Aug 2009 13:44:55 +0100 Subject: [PATCH] Add TT prefetching support TT.retrieve() is the most time-consuming function because it almost always involves a very slow RAM access. The TT table is so big that it is never cached. This patch prefetches TT data just after a move is done, so that the subsequent TT.retrieve() will be very fast. Profiling with VTune shows that TT.retrieve() times are almost cut in half! No functional change. Signed-off-by: Marco Costalba --- src/search.cpp | 5 +++++ src/tt.cpp | 12 ++++++++++++ src/tt.h | 1 + 3 files changed, 18 insertions(+) diff --git a/src/search.cpp b/src/search.cpp index dbbfa022..61475a2f 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -1126,6 +1126,7 @@ namespace { // Make and search the move StateInfo st; pos.do_move(move, st, dcCandidates); + TT.prefetch(pos.get_key()); if (moveCount == 1) // The first move in list is the PV value = -search_pv(pos, ss, -beta, -alpha, newDepth, ply+1, threadID); @@ -1296,6 +1297,8 @@ namespace { StateInfo st; pos.do_null_move(st); + TT.prefetch(pos.get_key()); + int R = (depth >= 5 * OnePly ? 4 : 3); // Null move dynamic reduction Value nullValue = -search(pos, ss, -(beta-1), depth-R*OnePly, ply+1, false, threadID); @@ -1410,6 +1413,7 @@ namespace { // Make and search the move StateInfo st; pos.do_move(move, st, dcCandidates); + TT.prefetch(pos.get_key()); // Try to reduce non-pv search depth by one ply if move seems not problematic, // if the move fails high will be re-searched at full depth. @@ -1619,6 +1623,7 @@ namespace { // Make and search the move. 
StateInfo st; pos.do_move(move, st, dcCandidates); + TT.prefetch(pos.get_key()); Value value = -qsearch(pos, ss, -beta, -alpha, depth-OnePly, ply+1, threadID); pos.undo_move(move); diff --git a/src/tt.cpp b/src/tt.cpp index e9b6c175..f2313eab 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -25,6 +25,7 @@ #include #include #include +#include <xmmintrin.h> #include "movegen.h" #include "tt.h" @@ -153,6 +154,17 @@ TTEntry* TranspositionTable::retrieve(const Key posKey) const { return NULL; } +/// TranspositionTable::prefetch looks up the current position in the +/// transposition table and loads it into the L1/L2 cache. This is a +/// non-blocking function and does not stall the CPU waiting for data +/// to be loaded from RAM, which can be very slow. When we +/// subsequently call retrieve() the TT data will already be +/// quickly accessible in the L1/L2 CPU cache. + +void TranspositionTable::prefetch(const Key posKey) const { + + _mm_prefetch((char*)first_entry(posKey), _MM_HINT_T0); +} /// TranspositionTable::first_entry returns a pointer to the first /// entry of a cluster given a position. The low 32 bits of the key diff --git a/src/tt.h b/src/tt.h index 4a8699fb..e778a372 100644 --- a/src/tt.h +++ b/src/tt.h @@ -85,6 +85,7 @@ public: void clear(); void store(const Key posKey, Value v, ValueType type, Depth d, Move m); TTEntry* retrieve(const Key posKey) const; + void prefetch(const Key posKey) const; void new_search(); void insert_pv(const Position& pos, Move pv[]); void extract_pv(const Position& pos, Move pv[]); -- 2.39.2