fixes minor conflicts.
needed for optimal play and, in addition, being able to take the 50-move rule
into account.
+## Stockfish on distributed memory systems
+
+The cluster branch allows for running Stockfish on a cluster of servers (nodes)
+that are connected with a high-speed, low-latency network, using the Message
+Passing Interface (MPI). In this case, one MPI process should be run per node,
+and UCI options can be used to set the number of threads/hash per node as usual.
+Typically, the engine will be invoked as
+```
+mpirun -np N /path/to/stockfish
+```
+where ```N``` stands for the number of MPI processes used (```mpiexec``` and ```srun```
+are common alternatives to ```mpirun```). Use one MPI rank per node, and set the number
+of threads according to the number of cores per node. To build the cluster
+branch, it is sufficient to specify ```COMPILER=mpicxx``` on the make command line,
+and do a clean build:
+```
+make -j ARCH=x86-64-modern clean build COMPILER=mpicxx
+```
+If the name of the compiler wrapper (typically ```mpicxx```, but sometimes e.g. ```CC```)
+does not contain the string ```mpi```, an edit to the Makefile is required. Make sure
+that the MPI installation is configured to support ```MPI_THREAD_MULTIPLE```; this might
+require adding system-specific compiler options to the Makefile. Stockfish employs
+non-blocking (asynchronous) communication, and benefits from an MPI implementation that
+supports this efficiently. Some MPI implementations might benefit from leaving one
+core/thread free for these asynchronous communications, and might require setting
+additional environment variables. ```mpirun``` should forward stdin/stdout to
+```rank 0``` only (e.g. ```srun --input=0 --output=0```).
+Refer to your MPI documentation for more info.
+
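+To verify that the MPI installation provides the required ```MPI_THREAD_MULTIPLE```
+support before launching the engine, a minimal stand-alone probe such as the following
+can be compiled with the same compiler wrapper (an illustrative sketch, not part of
+this branch):
+```
+// probe_mpi.cpp: report whether MPI_THREAD_MULTIPLE is provided
+#include <iostream>
+#include <mpi.h>
+
+int main(int argc, char* argv[]) {
+    int provided;
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (rank == 0)
+        std::cout << (provided >= MPI_THREAD_MULTIPLE
+                         ? "MPI_THREAD_MULTIPLE supported"
+                         : "insufficient thread support")
+                  << std::endl;
+    MPI_Finalize();
+    return 0;
+}
+```
+Build and run it with e.g. ```mpicxx -o probe_mpi probe_mpi.cpp``` followed by
+```mpirun -np 2 ./probe_mpi```.
+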
## Large Pages
Stockfish supports large pages on Linux and Windows. Large pages make
PGOBENCH = ./$(EXE) bench
### Source and object files
-SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
+SRCS = benchmark.cpp bitbase.cpp bitboard.cpp cluster.cpp endgame.cpp evaluate.cpp main.cpp \
material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
nnue/evaluate_nnue.cpp nnue/features/half_kp.cpp
# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256
# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512
# neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture
+# mpi = yes/no --- -DUSE_MPI --- Use Message Passing Interface
#
# Note that Makefile is space sensitive, so when adding new architectures
# or modifying existing flags, you have to make sure there are no extra spaces
vnni256 = no
vnni512 = no
neon = no
+mpi = no
STRIP = strip
### 2.2 Architecture specific
LDFLAGS += -fPIE -pie
endif
+### 3.10 MPI
+ifneq (,$(findstring mpi, $(CXX)))
+ mpi = yes
+ CXXFLAGS += -DUSE_MPI -Wno-cast-qual -fexceptions
+ DEPENDFLAGS += -DUSE_MPI
+endif
+
### ==========================================================================
### Section 4. Public Targets
### ==========================================================================
@echo "vnni256: '$(vnni256)'"
@echo "vnni512: '$(vnni512)'"
@echo "neon: '$(neon)'"
+ @echo "mpi: '$(mpi)'"
@echo ""
@echo "Flags:"
@echo "CXX: $(CXX)"
--- /dev/null
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifdef USE_MPI
+
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <istream>
+#include <map>
+#include <mpi.h>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "cluster.h"
+#include "thread.h"
+#include "tt.h"
+#include "timeman.h"
+
+namespace Cluster {
+
+// Total number of ranks and rank within the communicator
+static int world_rank = MPI_PROC_NULL;
+static int world_size = 0;
+
+// Ranks exchange basic info (signals) using a dedicated communicator
+static MPI_Comm signalsComm = MPI_COMM_NULL;
+static MPI_Request reqSignals = MPI_REQUEST_NULL;
+static uint64_t signalsCallCounter = 0;
+
+// Signals are the number of nodes searched, stop, table base hits, transposition table saves
+enum Signals : int { SIG_NODES = 0, SIG_STOP = 1, SIG_TB = 2, SIG_TTS = 3, SIG_NB = 4};
+static uint64_t signalsSend[SIG_NB] = {};
+static uint64_t signalsRecv[SIG_NB] = {};
+static uint64_t nodesSearchedOthers = 0;
+static uint64_t tbHitsOthers = 0;
+static uint64_t TTsavesOthers = 0;
+static uint64_t stopSignalsPosted = 0;
+
+// The UCI threads of the ranks exchange input using a dedicated communicator
+static MPI_Comm InputComm = MPI_COMM_NULL;
+
+// bestMove requires MoveInfo communicators and data types
+static MPI_Comm MoveComm = MPI_COMM_NULL;
+static MPI_Datatype MIDatatype = MPI_DATATYPE_NULL;
+
+// TT entries are communicated with a dedicated communicator.
+// The receive buffer is used to gather information from all ranks.
+// The TTCacheCounter tracks the number of local elements that are ready to be sent.
+static MPI_Comm TTComm = MPI_COMM_NULL;
+static std::array<std::vector<KeyedTTEntry>, 2> TTSendRecvBuffs;
+static std::array<MPI_Request, 2> reqsTTSendRecv = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
+static uint64_t sendRecvPosted = 0;
+static std::atomic<uint64_t> TTCacheCounter = {};
+
+/// Initialize MPI and associated data types. Note that the MPI library must be configured
+/// to support MPI_THREAD_MULTIPLE, since multiple threads access MPI simultaneously.
+void init() {
+
+ int thread_support;
+ MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &thread_support);
+ if (thread_support < MPI_THREAD_MULTIPLE)
+ {
+ std::cerr << "Stockfish requires support for MPI_THREAD_MULTIPLE."
+ << std::endl;
+ std::exit(EXIT_FAILURE);
+ }
+
+ MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
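+  // Describe the MoveInfo struct to MPI as five ints at their respective
+  // offsets, so a MoveInfo can be sent as a single element of MIDatatype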
+ const std::array<MPI_Aint, 5> MIdisps = {offsetof(MoveInfo, move),
+ offsetof(MoveInfo, ponder),
+ offsetof(MoveInfo, depth),
+ offsetof(MoveInfo, score),
+ offsetof(MoveInfo, rank)};
+ MPI_Type_create_hindexed_block(5, 1, MIdisps.data(), MPI_INT, &MIDatatype);
+ MPI_Type_commit(&MIDatatype);
+
+ MPI_Comm_dup(MPI_COMM_WORLD, &InputComm);
+ MPI_Comm_dup(MPI_COMM_WORLD, &TTComm);
+ MPI_Comm_dup(MPI_COMM_WORLD, &MoveComm);
+ MPI_Comm_dup(MPI_COMM_WORLD, &signalsComm);
+}
+
+/// Finalize MPI and free the associated data types.
+void finalize() {
+
+ MPI_Type_free(&MIDatatype);
+
+ MPI_Comm_free(&InputComm);
+ MPI_Comm_free(&TTComm);
+ MPI_Comm_free(&MoveComm);
+ MPI_Comm_free(&signalsComm);
+
+ MPI_Finalize();
+}
+
+/// Return the total number of ranks
+int size() {
+
+ return world_size;
+}
+
+/// Return the rank (index) of the process
+int rank() {
+
+ return world_rank;
+}
+
+/// The send/receive buffers depend on the number of MPI ranks and threads; resize as needed
+void ttSendRecvBuff_resize(size_t nThreads) {
+
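+  // Each rank contributes TTCacheSize entries per thread, so each buffer
+  // holds TTCacheSize * nThreads slots for every one of the world_size ranks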
+ for (int i : {0, 1})
+ {
+ TTSendRecvBuffs[i].resize(TTCacheSize * world_size * nThreads);
+ std::fill(TTSendRecvBuffs[i].begin(), TTSendRecvBuffs[i].end(), KeyedTTEntry());
+ }
+}
+
+/// As input is only received by the root (rank 0) of the cluster, this input must be relayed
+/// to the UCI threads of all ranks, so that they can set up the position, etc. We do this
+/// with a dedicated getline implementation, where the root broadcasts the received
+/// information to all other ranks.
+bool getline(std::istream& input, std::string& str) {
+
+ int size;
+ std::vector<char> vec;
+ bool state;
+
+ if (is_root())
+ {
+ state = static_cast<bool>(std::getline(input, str));
+ vec.assign(str.begin(), str.end());
+ size = vec.size();
+ }
+
+   // Some MPI implementations busy-wait while polling, whereas we want to yield,
+   // as otherwise the idle UCI thread on the non-root ranks would consume resources.
+ static MPI_Request reqInput = MPI_REQUEST_NULL;
+ MPI_Ibcast(&size, 1, MPI_INT, 0, InputComm, &reqInput);
+ if (is_root())
+ MPI_Wait(&reqInput, MPI_STATUS_IGNORE);
+ else
+ {
+ while (true)
+ {
+ int flag;
+ MPI_Test(&reqInput, &flag, MPI_STATUS_IGNORE);
+ if (flag)
+ break;
+ else
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
+ }
+ }
+
+ // Broadcast received string
+ if (!is_root())
+ vec.resize(size);
+ MPI_Bcast(vec.data(), size, MPI_CHAR, 0, InputComm);
+ if (!is_root())
+ str.assign(vec.begin(), vec.end());
+ MPI_Bcast(&state, 1, MPI_CXX_BOOL, 0, InputComm);
+
+ return state;
+}
+
+/// Sending part of the signal communication loop
+void signals_send() {
+
+ signalsSend[SIG_NODES] = Threads.nodes_searched();
+ signalsSend[SIG_TB] = Threads.tb_hits();
+ signalsSend[SIG_TTS] = Threads.TT_saves();
+ signalsSend[SIG_STOP] = Threads.stop;
+ MPI_Iallreduce(signalsSend, signalsRecv, SIG_NB, MPI_UINT64_T,
+ MPI_SUM, signalsComm, &reqSignals);
+ ++signalsCallCounter;
+}
+
+/// Processing part of the signal communication loop.
+/// For some counters (e.g. nodes) we only keep their sum over the other ranks,
+/// allowing local counters to be added at any time for more fine-grained progress,
+/// which is useful to indicate progress during early iterations, and to have
+/// node counts that exactly match the non-MPI code in the single rank case.
+/// This call also propagates the stop signal between ranks.
+void signals_process() {
+
+ nodesSearchedOthers = signalsRecv[SIG_NODES] - signalsSend[SIG_NODES];
+ tbHitsOthers = signalsRecv[SIG_TB] - signalsSend[SIG_TB];
+ TTsavesOthers = signalsRecv[SIG_TTS] - signalsSend[SIG_TTS];
+ stopSignalsPosted = signalsRecv[SIG_STOP];
+ if (signalsRecv[SIG_STOP] > 0)
+ Threads.stop = true;
+}
+
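+/// Post the next round of the ring exchange of TT buffers: receive the buffer
+/// of the previous rank while sending our buffer to the next rank. The two
+/// buffers alternate (sendRecvPosted % 2), so one can be filled locally while
+/// the other is in flight.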
+void sendrecv_post() {
+
+ ++sendRecvPosted;
+ MPI_Irecv(TTSendRecvBuffs[sendRecvPosted % 2].data(),
+ TTSendRecvBuffs[sendRecvPosted % 2].size() * sizeof(KeyedTTEntry), MPI_BYTE,
+ (rank() + size() - 1) % size(), 42, TTComm, &reqsTTSendRecv[0]);
+ MPI_Isend(TTSendRecvBuffs[(sendRecvPosted + 1) % 2].data(),
+ TTSendRecvBuffs[(sendRecvPosted + 1) % 2].size() * sizeof(KeyedTTEntry), MPI_BYTE,
+ (rank() + 1 ) % size(), 42, TTComm, &reqsTTSendRecv[1]);
+}
+
+/// During search, most message passing is asynchronous, but at the end of
+/// search it makes sense to bring them to a common, finalized state.
+void signals_sync() {
+
+ while(stopSignalsPosted < uint64_t(size()))
+ signals_poll();
+
+ // Finalize outstanding messages of the signal loops.
+ // We might have issued one call less than needed on some ranks.
+ uint64_t globalCounter;
+ MPI_Allreduce(&signalsCallCounter, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
+ if (signalsCallCounter < globalCounter)
+ {
+ MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
+ signals_send();
+ }
+ assert(signalsCallCounter == globalCounter);
+ MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
+ signals_process();
+
+ // Finalize outstanding messages in the sendRecv loop
+ MPI_Allreduce(&sendRecvPosted, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
+ while (sendRecvPosted < globalCounter)
+ {
+ MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
+ sendrecv_post();
+ }
+ assert(sendRecvPosted == globalCounter);
+ MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
+
+}
+
+/// Initialize signal counters to zero.
+void signals_init() {
+
+ stopSignalsPosted = tbHitsOthers = TTsavesOthers = nodesSearchedOthers = 0;
+
+ signalsSend[SIG_NODES] = signalsRecv[SIG_NODES] = 0;
+ signalsSend[SIG_TB] = signalsRecv[SIG_TB] = 0;
+ signalsSend[SIG_TTS] = signalsRecv[SIG_TTS] = 0;
+ signalsSend[SIG_STOP] = signalsRecv[SIG_STOP] = 0;
+
+}
+
+/// Poll the signal loop, and start next round as needed.
+void signals_poll() {
+
+ int flag;
+ MPI_Test(&reqSignals, &flag, MPI_STATUS_IGNORE);
+ if (flag)
+ {
+ signals_process();
+ signals_send();
+ }
+}
+
+/// Provide basic info related to cluster performance: the number of signals sent and
+/// signals per second (sps), the number of sendRecvs posted and the entries sent/received
+/// per second (srpps), and the number of TT saves and TT saves per second (TTSavesps).
+/// If srpps is approximately equal to TTSavesps, the send/recv loop has enough bandwidth.
+void cluster_info(Depth depth) {
+
+ TimePoint elapsed = Time.elapsed() + 1;
+ uint64_t TTSaves = TT_saves();
+
+ sync_cout << "info depth " << depth << " cluster "
+ << " signals " << signalsCallCounter << " sps " << signalsCallCounter * 1000 / elapsed
+ << " sendRecvs " << sendRecvPosted << " srpps " << TTSendRecvBuffs[0].size() * sendRecvPosted * 1000 / elapsed
+ << " TTSaves " << TTSaves << " TTSavesps " << TTSaves * 1000 / elapsed
+ << sync_endl;
+}
+
+/// When a TT entry is saved, additional steps are taken if the entry is of sufficient depth.
+/// If sufficient entries have been collected, a communication is initiated.
+/// If a communication has been completed, the received results are saved to the TT.
+void save(Thread* thread, TTEntry* tte,
+ Key k, Value v, bool PvHit, Bound b, Depth d, Move m, Value ev) {
+
+ // Standard save to the TT
+ tte->save(k, v, PvHit, b, d, m, ev);
+
+ // If the entry is of sufficient depth to be worth communicating, take action.
+ if (d > 3)
+ {
+      // Count the TT saves for information: this should be relatively similar
+      // to the number of entries we can send/recv.
+ thread->TTsaves.fetch_add(1, std::memory_order_relaxed);
+
+      // Add to the thread's send buffer; the locking here avoids races when the main
+      // thread prepares the send buffer.
+ {
+ std::lock_guard<std::mutex> lk(thread->ttCache.mutex);
+ thread->ttCache.buffer.replace(KeyedTTEntry(k,*tte));
+ ++TTCacheCounter;
+ }
+
+ size_t recvBuffPerRankSize = Threads.size() * TTCacheSize;
+
+      // Communicate on the main search thread, as soon as the threads combined have collected
+ // sufficient data to fill the send buffers.
+ if (thread == Threads.main() && TTCacheCounter > recvBuffPerRankSize)
+ {
+ // Test communication status
+ int flag;
+ MPI_Testall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), &flag, MPI_STATUSES_IGNORE);
+
+ // Current communication is complete
+ if (flag)
+ {
+ // Save all received entries to TT, and store our TTCaches, ready for the next round of communication
+ for (size_t irank = 0; irank < size_t(size()) ; ++irank)
+ {
+ if (irank == size_t(rank())) // this is our part, fill the part of the buffer for sending
+ {
+ // Copy from the thread caches to the right spot in the buffer
+ size_t i = irank * recvBuffPerRankSize;
+ for (auto&& th : Threads)
+ {
+ std::lock_guard<std::mutex> lk(th->ttCache.mutex);
+
+ for (auto&& e : th->ttCache.buffer)
+ TTSendRecvBuffs[sendRecvPosted % 2][i++] = e;
+
+ // Reset thread's send buffer
+ th->ttCache.buffer = {};
+ }
+
+ TTCacheCounter = 0;
+ }
+ else // process data received from the corresponding rank.
+ for (size_t i = irank * recvBuffPerRankSize; i < (irank + 1) * recvBuffPerRankSize; ++i)
+ {
+ auto&& e = TTSendRecvBuffs[sendRecvPosted % 2][i];
+ bool found;
+ TTEntry* replace_tte;
+ replace_tte = TT.probe(e.first, found);
+ replace_tte->save(e.first, e.second.value(), e.second.is_pv(), e.second.bound(), e.second.depth(),
+ e.second.move(), e.second.eval());
+ }
+ }
+
+ // Start next communication
+ sendrecv_post();
+
+         // Force a check of time on the next occasion; the above actions might have taken some time.
+ static_cast<MainThread*>(thread)->callsCnt = 0;
+
+ }
+ }
+ }
+}
+
+/// Pick the bestMove across ranks, and send the associated info and PV to the root of the cluster.
+/// Note that this bestMove and PV must be output by the root, to guarantee proper ordering of output.
+/// TODO update to the scheme in master.. can this use aggregation of votes?
+void pick_moves(MoveInfo& mi, std::string& PVLine) {
+
+ MoveInfo* pMoveInfo = NULL;
+ if (is_root())
+ {
+ pMoveInfo = (MoveInfo*)malloc(sizeof(MoveInfo) * size());
+ }
+ MPI_Gather(&mi, 1, MIDatatype, pMoveInfo, 1, MIDatatype, 0, MoveComm);
+
+ if (is_root())
+ {
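+      // Votes for each candidate move are weighted by the rank's score margin
+      // above the cluster-wide minimum score plus the search depth; the move
+      // with the highest accumulated vote is chosen.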
+ std::map<int, int> votes;
+ int minScore = pMoveInfo[0].score;
+ for (int i = 0; i < size(); ++i)
+ {
+ minScore = std::min(minScore, pMoveInfo[i].score);
+ votes[pMoveInfo[i].move] = 0;
+ }
+ for (int i = 0; i < size(); ++i)
+ {
+ votes[pMoveInfo[i].move] += pMoveInfo[i].score - minScore + pMoveInfo[i].depth;
+ }
+ int bestVote = votes[pMoveInfo[0].move];
+ for (int i = 0; i < size(); ++i)
+ {
+ if (votes[pMoveInfo[i].move] > bestVote)
+ {
+ bestVote = votes[pMoveInfo[i].move];
+ mi = pMoveInfo[i];
+ }
+ }
+ free(pMoveInfo);
+ }
+
+ // Send around the final result
+ MPI_Bcast(&mi, 1, MIDatatype, 0, MoveComm);
+
+ // Send PV line to root as needed
+ if (mi.rank != 0 && mi.rank == rank()) {
+ int size;
+ std::vector<char> vec;
+ vec.assign(PVLine.begin(), PVLine.end());
+ size = vec.size();
+ MPI_Send(&size, 1, MPI_INT, 0, 42, MoveComm);
+ MPI_Send(vec.data(), size, MPI_CHAR, 0, 42, MoveComm);
+ }
+ if (mi.rank != 0 && is_root()) {
+ int size;
+ std::vector<char> vec;
+ MPI_Recv(&size, 1, MPI_INT, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
+ vec.resize(size);
+ MPI_Recv(vec.data(), size, MPI_CHAR, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
+ PVLine.assign(vec.begin(), vec.end());
+ }
+
+}
+
+/// Return nodes searched (lazily updated cluster-wide in the signal loop)
+uint64_t nodes_searched() {
+
+ return nodesSearchedOthers + Threads.nodes_searched();
+}
+
+/// Return table base hits (lazily updated cluster-wide in the signal loop)
+uint64_t tb_hits() {
+
+ return tbHitsOthers + Threads.tb_hits();
+}
+
+/// Return the number of saves to the TT buffers (lazily updated cluster-wide in the signal loop)
+uint64_t TT_saves() {
+
+ return TTsavesOthers + Threads.TT_saves();
+}
+
+
+}
+
+#else
+
+#include "cluster.h"
+#include "thread.h"
+
+namespace Cluster {
+
+uint64_t nodes_searched() {
+
+ return Threads.nodes_searched();
+}
+
+uint64_t tb_hits() {
+
+ return Threads.tb_hits();
+}
+
+uint64_t TT_saves() {
+
+ return Threads.TT_saves();
+}
+
+}
+
+#endif // USE_MPI
--- /dev/null
+/*
+ Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+ Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+ Stockfish is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Stockfish is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef CLUSTER_H_INCLUDED
+#define CLUSTER_H_INCLUDED
+
+#include <algorithm>
+#include <array>
+#include <istream>
+#include <string>
+
+#include "tt.h"
+
+class Thread;
+
+/// The Cluster namespace contains functionality required to run on distributed
+/// memory architectures using MPI as the message passing interface. On a high level,
+/// a 'lazy SMP'-like scheme is implemented where TT saves of sufficient depth are
+/// collected on each rank and distributed to, and used by, all other ranks,
+/// which search essentially independently. The root (MPI rank 0) of the cluster
+/// is responsible for all I/O and time management, communicating this info to
+/// the other ranks as needed. UCI options such as Threads and Hash specify these
+/// quantities per MPI rank. It is recommended to have one rank (MPI process) per node.
+/// For the non-MPI case, wrappers that will be compiler-optimized away are provided.
+
+namespace Cluster {
+
+/// Basic info to find the cluster-wide bestMove
+struct MoveInfo {
+ int move;
+ int ponder;
+ int depth;
+ int score;
+ int rank;
+};
+
+#ifdef USE_MPI
+
+// Store the TTEntry with its full key, so it can be saved on the receiver side
+using KeyedTTEntry = std::pair<Key, TTEntry>;
+constexpr std::size_t TTCacheSize = 16;
+
+// Threads locally cache their high-depth TT entries until a batch can be sent via MPI
+template <std::size_t N> class TTCache : public std::array<KeyedTTEntry, N> {
+
+ struct Compare {
+ inline bool operator()(const KeyedTTEntry& lhs, const KeyedTTEntry& rhs) {
+ return lhs.second.depth() > rhs.second.depth();
+ }
+ };
+ Compare compare;
+
+public:
+
+ // Keep a heap of entries replacing low depth with high depth entries
+ bool replace(const KeyedTTEntry& value) {
+
+ if (compare(value, this->front()))
+ {
+ std::pop_heap(this->begin(), this->end(), compare);
+ this->back() = value;
+ std::push_heap(this->begin(), this->end(), compare);
+ return true;
+ }
+ return false;
+ }
+};
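+
+// Illustrative use of TTCache (a sketch; the engine accesses it via
+// Thread::ttCache under a mutex, see Cluster::save() in cluster.cpp):
+// replace() keeps the TTCacheSize deepest entries seen since the last
+// exchange by evicting the shallowest one.
+//
+//   TTCache<TTCacheSize> cache = {};
+//   bool kept = cache.replace(KeyedTTEntry(key, entry));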
+
+void init();
+void finalize();
+bool getline(std::istream& input, std::string& str);
+int size();
+int rank();
+inline bool is_root() { return rank() == 0; }
+void save(Thread* thread, TTEntry* tte, Key k, Value v, bool PvHit, Bound b, Depth d, Move m, Value ev);
+void pick_moves(MoveInfo& mi, std::string& PVLine);
+void ttSendRecvBuff_resize(size_t nThreads);
+uint64_t nodes_searched();
+uint64_t tb_hits();
+uint64_t TT_saves();
+void cluster_info(Depth depth);
+void signals_init();
+void signals_poll();
+void signals_sync();
+
+#else
+
+inline void init() { }
+inline void finalize() { }
+inline bool getline(std::istream& input, std::string& str) { return static_cast<bool>(std::getline(input, str)); }
+constexpr int size() { return 1; }
+constexpr int rank() { return 0; }
+constexpr bool is_root() { return true; }
+inline void save(Thread*, TTEntry* tte, Key k, Value v, bool PvHit, Bound b, Depth d, Move m, Value ev) { tte->save(k, v, PvHit, b, d, m, ev); }
+inline void pick_moves(MoveInfo&, std::string&) { }
+inline void ttSendRecvBuff_resize(size_t) { }
+uint64_t nodes_searched();
+uint64_t tb_hits();
+uint64_t TT_saves();
+inline void cluster_info(Depth) { }
+inline void signals_init() { }
+inline void signals_poll() { }
+inline void signals_sync() { }
+
+#endif /* USE_MPI */
+
+}
+
+#endif // #ifndef CLUSTER_H_INCLUDED
#include <vector>
#include "bitboard.h"
+#include "cluster.h"
#include "evaluate.h"
#include "material.h"
#include "misc.h"
exit(EXIT_FAILURE);
}
- if (useNNUE)
- sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
- else
- sync_cout << "info string classical evaluation enabled" << sync_endl;
+ if (Cluster::is_root())
+ {
+ if (useNNUE)
+ sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+ else
+ sync_cout << "info string classical evaluation enabled" << sync_endl;
+ }
}
}
int main(int argc, char* argv[]) {
- std::cout << engine_info() << std::endl;
+ Cluster::init();
+ if (Cluster::is_root())
+ std::cout << engine_info() << std::endl;
CommandLine::init(argc, argv);
UCI::init(Options);
UCI::loop(argc, argv);
Threads.set(0);
+ Cluster::finalize();
return 0;
}
#include <iostream>
#include <sstream>
+#include "cluster.h"
#include "evaluate.h"
#include "misc.h"
#include "movegen.h"
nodes += cnt;
pos.undo_move(m);
}
- if (Root)
+ if (Root && Cluster::is_root())
sync_cout << UCI::move(m, pos.is_chess960()) << ": " << cnt << sync_endl;
}
return nodes;
if (Limits.perft)
{
nodes = perft<true>(rootPos, Limits.perft);
- sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
+ if (Cluster::is_root())
+ sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
+
return;
}
if (rootMoves.empty())
{
rootMoves.emplace_back(MOVE_NONE);
- sync_cout << "info depth 0 score "
- << UCI::value(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW)
- << sync_endl;
+ if (Cluster::is_root())
+ sync_cout << "info depth 0 score "
+ << UCI::value(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW)
+ << sync_endl;
}
else
{
// until the GUI sends one of those commands.
while (!Threads.stop && (ponder || Limits.infinite))
- {} // Busy wait for a stop or a ponder reset
+ { Cluster::signals_poll(); } // Busy wait for a stop or a ponder reset
// Stop the threads if not already stopped (also raise the stop if
// "ponderhit" just reset Threads.ponder).
Threads.stop = true;
+ // Signal and synchronize all other ranks
+ Cluster::signals_sync();
+
// Wait until all threads have finished
Threads.wait_for_search_finished();
// When playing in 'nodes as time' mode, subtract the searched nodes from
// the available ones before exiting.
if (Limits.npmsec)
- Time.availableNodes += Limits.inc[us] - Threads.nodes_searched();
+ Time.availableNodes += Limits.inc[us] - Cluster::nodes_searched();
Thread* bestThread = this;
&& rootMoves[0].pv[0] != MOVE_NONE)
bestThread = Threads.get_best_thread();
- bestPreviousScore = bestThread->rootMoves[0].score;
+ // Prepare PVLine and ponder move
+ std::string PVLine = UCI::pv(bestThread->rootPos, bestThread->completedDepth, -VALUE_INFINITE, VALUE_INFINITE);
- // Send again PV info if we have a new best thread
- if (bestThread != this)
- sync_cout << UCI::pv(bestThread->rootPos, bestThread->completedDepth, -VALUE_INFINITE, VALUE_INFINITE) << sync_endl;
+ Move bestMove = bestThread->rootMoves[0].pv[0];
+ Move ponderMove = MOVE_NONE;
+ if (bestThread->rootMoves[0].pv.size() > 1 || bestThread->rootMoves[0].extract_ponder_from_tt(rootPos))
+ ponderMove = bestThread->rootMoves[0].pv[1];
- sync_cout << "bestmove " << UCI::move(bestThread->rootMoves[0].pv[0], rootPos.is_chess960());
+ // Exchange info as needed
+ Cluster::MoveInfo mi{bestMove,
+ ponderMove,
+ bestThread->completedDepth,
+ bestThread->rootMoves[0].score,
+ Cluster::rank()};
+ Cluster::pick_moves(mi, PVLine);
- if (bestThread->rootMoves[0].pv.size() > 1 || bestThread->rootMoves[0].extract_ponder_from_tt(rootPos))
- std::cout << " ponder " << UCI::move(bestThread->rootMoves[0].pv[1], rootPos.is_chess960());
+ bestPreviousScore = static_cast<Value>(mi.score);
+
+ if (Cluster::is_root())
+ {
+ // Send again PV info if we have a new best thread/rank
+ if (bestThread != this || mi.rank != 0)
+ sync_cout << PVLine << sync_endl;
+
+ bestMove = static_cast<Move>(mi.move);
+ ponderMove = static_cast<Move>(mi.ponder);
+
+ if (ponderMove != MOVE_NONE)
+ sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960())
+ << " ponder " << UCI::move(ponderMove, rootPos.is_chess960()) << sync_endl;
+ else
+ sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960()) << sync_endl;
+ }
- std::cout << sync_endl;
}
// Iterative deepening loop until requested to stop or the target depth is reached
while ( ++rootDepth < MAX_PLY
&& !Threads.stop
- && !(Limits.depth && mainThread && rootDepth > Limits.depth))
+ && !(Limits.depth && mainThread && Cluster::is_root() && rootDepth > Limits.depth))
{
// Age out PV variability metric
if (mainThread)
totBestMoveChanges /= 2;
+
// Save the last iteration's scores before first PV line is searched and
// all the move scores except the (new) PV are set to -VALUE_INFINITE.
for (RootMove& rm : rootMoves)
// When failing high/low give some update (without cluttering
// the UI) before a re-search.
- if ( mainThread
+ if ( Cluster::is_root()
+ && mainThread
&& multiPV == 1
&& (bestValue <= alpha || bestValue >= beta)
&& Time.elapsed() > 3000)
+ {
sync_cout << UCI::pv(rootPos, rootDepth, alpha, beta) << sync_endl;
+ Cluster::cluster_info(rootDepth);
+ }
// In case of failing low/high increase aspiration window and
// re-search, otherwise exit the loop.
// Sort the PV lines searched so far and update the GUI
std::stable_sort(rootMoves.begin() + pvFirst, rootMoves.begin() + pvIdx + 1);
- if ( mainThread
+ if ( Cluster::is_root() && mainThread
&& (Threads.stop || pvIdx + 1 == multiPV || Time.elapsed() > 3000))
+ {
sync_cout << UCI::pv(rootPos, rootDepth, alpha, beta) << sync_endl;
+ Cluster::cluster_info(rootDepth);
+ }
}
if (!Threads.stop)
if ( b == BOUND_EXACT
|| (b == BOUND_LOWER ? value >= beta : value <= alpha))
{
- tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, b,
- std::min(MAX_PLY - 1, depth + 6),
- MOVE_NONE, VALUE_NONE);
+ Cluster::save(thisThread, tte,
+ posKey, value_to_tt(value, ss->ply), ss->ttPv, b,
+ std::min(MAX_PLY - 1, depth + 6),
+ MOVE_NONE, VALUE_NONE);
return value;
}
else
ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo;
- tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE, eval);
+ Cluster::save(thisThread, tte,
+ posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE,
+ eval);
}
// Step 7. Razoring (~1 Elo)
ss->moveCount = ++moveCount;
- if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000)
+ if (rootNode && Cluster::is_root() && thisThread == Threads.main() && Time.elapsed() > 3000)
sync_cout << "info depth " << depth
<< " currmove " << UCI::move(move, pos.is_chess960())
<< " currmovenumber " << moveCount + thisThread->pvIdx << sync_endl;
ss->ttPv = ss->ttPv && (ss+1)->ttPv;
if (!excludedMove && !(rootNode && thisThread->pvIdx))
- tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv,
- bestValue >= beta ? BOUND_LOWER :
- PvNode && bestMove ? BOUND_EXACT : BOUND_UPPER,
- depth, bestMove, ss->staticEval);
+ Cluster::save(thisThread, tte,
+ posKey, value_to_tt(bestValue, ss->ply), ss->ttPv,
+ bestValue >= beta ? BOUND_LOWER :
+ PvNode && bestMove ? BOUND_EXACT : BOUND_UPPER,
+ depth, bestMove, ss->staticEval);
assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);
if (bestValue >= beta)
{
if (!ss->ttHit)
- tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER,
- DEPTH_NONE, MOVE_NONE, ss->staticEval);
+ Cluster::save(thisThread, tte,
+ posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER,
+ DEPTH_NONE, MOVE_NONE, ss->staticEval);
return bestValue;
}
if (ss->inCheck && bestValue == -VALUE_INFINITE)
return mated_in(ss->ply); // Plies to mate from the root
- tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit,
- bestValue >= beta ? BOUND_LOWER :
- PvNode && bestValue > oldAlpha ? BOUND_EXACT : BOUND_UPPER,
- ttDepth, bestMove, ss->staticEval);
+ Cluster::save(thisThread, tte,
+ posKey, value_to_tt(bestValue, ss->ply), pvHit,
+ bestValue >= beta ? BOUND_LOWER :
+ PvNode && bestValue > oldAlpha ? BOUND_EXACT : BOUND_UPPER,
+ ttDepth, bestMove, ss->staticEval);
assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);
dbg_print();
}
+  // Poll the MPI signals
+ Cluster::signals_poll();
+
// We should not stop pondering until told so by the GUI
if (ponder)
return;
if ( (Limits.use_time_management() && (elapsed > Time.maximum() - 10 || stopOnPonderhit))
|| (Limits.movetime && elapsed >= Limits.movetime)
- || (Limits.nodes && Threads.nodes_searched() >= (uint64_t)Limits.nodes))
+ || (Limits.nodes && Cluster::nodes_searched() >= (uint64_t)Limits.nodes))
Threads.stop = true;
}
const RootMoves& rootMoves = pos.this_thread()->rootMoves;
size_t pvIdx = pos.this_thread()->pvIdx;
size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
- uint64_t nodesSearched = Threads.nodes_searched();
- uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
+ uint64_t nodesSearched = Cluster::nodes_searched();
+ uint64_t tbHits = Cluster::tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
for (size_t i = 0; i < multiPV; ++i)
{
#include "misc.h"
#include "movepick.h"
#include "types.h"
+#include "cluster.h"
class Position;
}
bool use_time_management() const {
- return time[WHITE] || time[BLACK];
+ return Cluster::is_root() && (time[WHITE] || time[BLACK]);
}
std::vector<Move> searchmoves;
#include <mutex>
#include "../bitboard.h"
+#include "../cluster.h"
#include "../movegen.h"
#include "../position.h"
#include "../search.h"
}
}
- sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
+ if (Cluster::is_root())
+ sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
}
// Probe the WDL table for a particular position.
// Reallocate the hash with the new threadpool size
TT.resize(size_t(Options["Hash"]));
+ // Adjust cluster buffers
+ Cluster::ttSendRecvBuff_resize(requested);
+
// Init thread number dependent search params.
Search::init();
}
// since they are read-only.
for (Thread* th : *this)
{
- th->nodes = th->tbHits = th->nmpMinPly = th->bestMoveChanges = 0;
+ th->nodes = th->tbHits = th->TTsaves = th->nmpMinPly = th->bestMoveChanges = 0;
th->rootDepth = th->completedDepth = 0;
th->rootMoves = rootMoves;
th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
th->rootState = setupStates->back();
}
+ Cluster::signals_init();
+
main()->start_searching();
}
#include <thread>
#include <vector>
+#include "cluster.h"
#include "material.h"
#include "movepick.h"
#include "pawns.h"
uint64_t ttHitAverage;
int selDepth, nmpMinPly;
Color nmpColor;
- std::atomic<uint64_t> nodes, tbHits, bestMoveChanges;
+ std::atomic<uint64_t> nodes, tbHits, TTsaves, bestMoveChanges;
Position rootPos;
StateInfo rootState;
CapturePieceToHistory captureHistory;
ContinuationHistory continuationHistory[2][2];
Score contempt;
+
+#ifdef USE_MPI
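+  // Per-thread cache of deep TT entries awaiting distribution to the other
+  // ranks; drained under the mutex by the main thread in Cluster::save()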
+ struct {
+ std::mutex mutex;
+ Cluster::TTCache<Cluster::TTCacheSize> buffer = {};
+ } ttCache;
+#endif
};
MainThread* main() const { return static_cast<MainThread*>(front()); }
uint64_t nodes_searched() const { return accumulate(&Thread::nodes); }
uint64_t tb_hits() const { return accumulate(&Thread::tbHits); }
+ uint64_t TT_saves() const { return accumulate(&Thread::TTsaves); }
Thread* get_best_thread() const;
void start_searching();
void wait_for_search_finished() const;
#include "misc.h"
#include "search.h"
-#include "thread.h"
+#include "cluster.h"
/// The TimeManagement class computes the optimal time to think depending on
/// the maximum available time, the game move number and other parameters.
TimePoint optimum() const { return optimumTime; }
TimePoint maximum() const { return maximumTime; }
TimePoint elapsed() const { return Search::Limits.npmsec ?
- TimePoint(Threads.nodes_searched()) : now() - startTime; }
+ TimePoint(Cluster::nodes_searched()) : now() - startTime; }
int64_t availableNodes; // When in 'nodes as time' mode
#include "misc.h"
#include "types.h"
+namespace Cluster {
+ void init();
+}
+
/// TTEntry struct is the 10 bytes transposition table entry, defined as below:
///
/// key 16 bit
private:
friend class TranspositionTable;
+ friend void Cluster::init();
uint16_t key16;
uint8_t depth8;
class TranspositionTable {
+ friend void Cluster::init();
+
static constexpr int ClusterSize = 3;
struct Cluster {
#include <string>
#include "evaluate.h"
+#include "cluster.h"
#include "movegen.h"
#include "position.h"
#include "search.h"
if (Options.count(name))
Options[name] = value;
- else
+ else if (Cluster::is_root())
sync_cout << "No such option: " << name << sync_endl;
}
if (token == "go" || token == "eval")
{
- cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl;
+ if (Cluster::is_root())
+ cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl;
+
if (token == "go")
{
go(pos, is, states);
Threads.main()->wait_for_search_finished();
nodes += Threads.nodes_searched();
}
- else
+ else if (Cluster::is_root())
trace_eval(pos);
}
else if (token == "setoption") setoption(is);
dbg_print(); // Just before exiting
- cerr << "\n==========================="
- << "\nTotal time (ms) : " << elapsed
- << "\nNodes searched : " << nodes
- << "\nNodes/second : " << 1000 * nodes / elapsed << endl;
+ if (Cluster::is_root())
+ cerr << "\n==========================="
+ << "\nTotal time (ms) : " << elapsed
+ << "\nNodes searched : " << nodes
+ << "\nNodes/second : " << 1000 * nodes / elapsed << endl;
}
// The win rate model returns the probability (per mille) of winning given an eval
cmd += std::string(argv[i]) + " ";
do {
- if (argc == 1 && !getline(cin, cmd)) // Block here waiting for input or EOF
+ if (argc == 1 && !Cluster::getline(cin, cmd)) // Block here waiting for input or EOF
cmd = "quit";
istringstream is(cmd);
else if (token == "ponderhit")
Threads.main()->ponder = false; // Switch to normal search
- else if (token == "uci")
+ else if (token == "uci" && Cluster::is_root())
sync_cout << "id name " << engine_info(true)
<< "\n" << Options
<< "\nuciok" << sync_endl;
else if (token == "go") go(pos, is, states);
else if (token == "position") position(pos, is, states);
else if (token == "ucinewgame") Search::clear();
- else if (token == "isready") sync_cout << "readyok" << sync_endl;
+ else if (token == "isready" && Cluster::is_root())
+ sync_cout << "readyok" << sync_endl;
// Additional custom non-UCI commands, mainly for debugging.
// Do not use these commands during a search!
else if (token == "flip") pos.flip();
else if (token == "bench") bench(pos, is, states);
- else if (token == "d") sync_cout << pos << sync_endl;
- else if (token == "eval") trace_eval(pos);
- else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
- else
+ else if (token == "d" && Cluster::is_root())
+ sync_cout << pos << sync_endl;
+ else if (token == "eval" && Cluster::is_root())
+ trace_eval(pos);
+ else if (token == "compiler" && Cluster::is_root())
+ sync_cout << compiler_info() << sync_endl;
+ else if (Cluster::is_root())
sync_cout << "Unknown command: " << cmd << sync_endl;
} while (token != "quit" && argc == 1); // Command line args are one-shot