/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2008 Tord Romstad (Glaurung author)
  Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad
  Copyright (C) 2015-2019 Marco Costalba, Joona Kiiski, Gary Linscott, Tord Romstad

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
// Total number of ranks and rank within the communicator
static int world_rank = MPI_PROC_NULL;
static int world_size = 0;

// Signals between ranks exchange basic info and use a dedicated communicator
static MPI_Comm signalsComm = MPI_COMM_NULL;
static MPI_Request reqSignals = MPI_REQUEST_NULL;
static uint64_t signalsCallCounter = 0;

// Signals are the number of nodes searched, stop, table base hits, and transposition table saves
enum Signals : int { SIG_NODES = 0, SIG_STOP = 1, SIG_TB = 2, SIG_TTS = 3, SIG_NB = 4 };
static uint64_t signalsSend[SIG_NB] = {};
static uint64_t signalsRecv[SIG_NB] = {};
static uint64_t nodesSearchedOthers = 0;
static uint64_t tbHitsOthers = 0;
static uint64_t TTsavesOthers = 0;
static uint64_t stopSignalsPosted = 0;

// The UCI threads of each rank exchange input using a dedicated communicator
static MPI_Comm InputComm = MPI_COMM_NULL;

// bestMove selection requires a dedicated communicator and an MPI data type for MoveInfo
static MPI_Comm MoveComm = MPI_COMM_NULL;
static MPI_Datatype MIDatatype = MPI_DATATYPE_NULL;

// TT entries are communicated with a dedicated communicator.
// The receive buffer is used to gather information from all ranks.
// The TTCacheCounter tracks the number of local elements that are ready to be sent.
static MPI_Comm TTComm = MPI_COMM_NULL;
static std::array<std::vector<KeyedTTEntry>, 2> TTSendRecvBuffs;
static std::array<MPI_Request, 2> reqsTTSendRecv = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
static uint64_t sendRecvPosted = 0;
static std::atomic<uint64_t> TTCacheCounter = {};
/// Initialize MPI and associated data types. Note that the MPI library must be configured
/// to support MPI_THREAD_MULTIPLE, since multiple threads access MPI simultaneously.
void init() {

  int thread_support;
  MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &thread_support);
  if (thread_support < MPI_THREAD_MULTIPLE)
  {
      std::cerr << "Stockfish requires support for MPI_THREAD_MULTIPLE."
                << std::endl;
      std::exit(EXIT_FAILURE);
  }

  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
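
  // Typical launch (example only): with an MPI launcher such as mpirun, e.g.
  //
  //   mpirun -np 4 ./stockfish
  //
  // four ranks are started; each rank runs its own pool of search threads, and the two
  // values queried above identify this process within MPI_COMM_WORLD.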
  const std::array<MPI_Aint, 5> MIdisps = {offsetof(MoveInfo, move),
                                           offsetof(MoveInfo, ponder),
                                           offsetof(MoveInfo, depth),
                                           offsetof(MoveInfo, score),
                                           offsetof(MoveInfo, rank)};
  MPI_Type_create_hindexed_block(5, 1, MIdisps.data(), MPI_INT, &MIDatatype);
  MPI_Type_commit(&MIDatatype);
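
  // For reference, the MoveInfo payload described by MIDatatype is assumed (see cluster.h,
  // not shown here) to be a plain struct of five ints, along the lines of
  //
  //   struct MoveInfo { int move; int ponder; int depth; int score; int rank; };
  //
  // A hindexed block of five single MPI_INT elements at the offsetof() displacements above
  // therefore matches the in-memory layout even if the compiler were to insert padding.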
  MPI_Comm_dup(MPI_COMM_WORLD, &InputComm);
  MPI_Comm_dup(MPI_COMM_WORLD, &TTComm);
  MPI_Comm_dup(MPI_COMM_WORLD, &MoveComm);
  MPI_Comm_dup(MPI_COMM_WORLD, &signalsComm);
/// Finalize MPI and free the associated data types.
void finalize() {

  MPI_Type_free(&MIDatatype);

  MPI_Comm_free(&InputComm);
  MPI_Comm_free(&TTComm);
  MPI_Comm_free(&MoveComm);
  MPI_Comm_free(&signalsComm);
/// Return the total number of ranks
int size() {

  return world_size;
}

/// Return the rank (index) of the process
int rank() {

  return world_rank;
}
/// The receive buffer depends on the number of MPI ranks and threads; resize as needed
void ttSendRecvBuff_resize(size_t nThreads) {

  for (size_t i : {0, 1})
  {
      TTSendRecvBuffs[i].resize(TTCacheSize * world_size * nThreads);
      std::fill(TTSendRecvBuffs[i].begin(), TTSendRecvBuffs[i].end(), KeyedTTEntry());
  }
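
  // Sizing example with hypothetical numbers: with 3 ranks, 4 threads per rank and
  // TTCacheSize entries in each thread's cache, each of the two buffers holds
  // 3 * 4 * TTCacheSize entries, i.e. one slice of 4 * TTCacheSize entries per rank
  // (see save() below, which fills our own slice and reads the slices of the other ranks).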
/// As input is only received by the root (rank 0) of the cluster, this input must be relayed
/// to the UCI threads of all ranks, in order to set up the position, etc. We do this with a
/// dedicated getline implementation, where the root broadcasts the received input to all
/// other ranks.
bool getline(std::istream& input, std::string& str) {
  std::vector<char> vec;

  if (is_root())
  {
      state = static_cast<bool>(std::getline(input, str));
      vec.assign(str.begin(), str.end());
  }
  // Some MPI implementations use busy-wait polling, while we need yielding as otherwise
  // the UCI thread on the non-root ranks would consume resources.
  static MPI_Request reqInput = MPI_REQUEST_NULL;
  MPI_Ibcast(&size, 1, MPI_INT, 0, InputComm, &reqInput);
  MPI_Wait(&reqInput, MPI_STATUS_IGNORE);                          // root: block until done
      MPI_Test(&reqInput, &flag, MPI_STATUS_IGNORE);               // non-root: poll the broadcast
      std::this_thread::sleep_for(std::chrono::milliseconds(10));  // and yield between polls
  // Broadcast received string
  vec.resize(size);
  MPI_Bcast(vec.data(), size, MPI_CHAR, 0, InputComm);
  str.assign(vec.begin(), vec.end());
  MPI_Bcast(&state, 1, MPI_CXX_BOOL, 0, InputComm);
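
  // Worked example: when the root reads the line "go depth 20", all ranks first agree on
  // size = 11 via the broadcast of the length above, then receive the 11 characters, and
  // finally the stream state, so the UCI thread of every rank returns the same line and
  // the same success flag from this getline().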
/// Sending part of the signal communication loop
void signals_send() {

  signalsSend[SIG_NODES] = Threads.nodes_searched();
  signalsSend[SIG_TB] = Threads.tb_hits();
  signalsSend[SIG_TTS] = Threads.TT_saves();
  signalsSend[SIG_STOP] = Threads.stop;
  MPI_Iallreduce(signalsSend, signalsRecv, SIG_NB, MPI_UINT64_T,
                 MPI_SUM, signalsComm, &reqSignals);
  ++signalsCallCounter;
/// Processing part of the signal communication loop.
/// For some counters (e.g. nodes searched) we only keep the sum over the other ranks,
/// which allows adding the local counters at any time for more fine-grained progress
/// reporting. This is useful to indicate progress during early iterations, and gives
/// node counts that exactly match the non-MPI code in the single-rank case.
/// This call also propagates the stop signal between ranks.
void signals_process() {

  nodesSearchedOthers = signalsRecv[SIG_NODES] - signalsSend[SIG_NODES];
  tbHitsOthers = signalsRecv[SIG_TB] - signalsSend[SIG_TB];
  TTsavesOthers = signalsRecv[SIG_TTS] - signalsSend[SIG_TTS];
  stopSignalsPosted = signalsRecv[SIG_STOP];
  if (signalsRecv[SIG_STOP] > 0)
      Threads.stop = true;
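
  // Example with hypothetical counts: if three ranks report 100, 200 and 300 searched nodes,
  // the allreduce delivers 600 to every rank; the rank that contributed 100 stores
  // nodesSearchedOthers = 500, and nodes_searched() below adds its own, always current,
  // local counter on top of that.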
void sendrecv_post() {

  MPI_Irecv(TTSendRecvBuffs[sendRecvPosted % 2].data(),
            TTSendRecvBuffs[sendRecvPosted % 2].size() * sizeof(KeyedTTEntry), MPI_BYTE,
            (rank() + size() - 1) % size(), 42, TTComm, &reqsTTSendRecv[0]);
  MPI_Isend(TTSendRecvBuffs[(sendRecvPosted + 1) % 2].data(),
            TTSendRecvBuffs[(sendRecvPosted + 1) % 2].size() * sizeof(KeyedTTEntry), MPI_BYTE,
            (rank() + 1) % size(), 42, TTComm, &reqsTTSendRecv[1]);
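
  // Exchange pattern: the ranks form a ring in which each rank receives from its
  // predecessor ((rank() + size() - 1) % size()) and sends to its successor
  // ((rank() + 1) % size()). The parity of sendRecvPosted selects which of the two
  // TTSendRecvBuffs is filled by the receive while the other one is sent on, so over
  // successive rounds the collected TT entries propagate around the ring to all ranks.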
/// During search, most message passing is asynchronous, but at the end of
/// search it makes sense to bring them to a common, finalized state.
void signals_sync() {

  while (stopSignalsPosted < uint64_t(size()))
      signals_poll();

  // Finalize outstanding messages of the signal loops.
  // We might have issued one call less than needed on some ranks.
  uint64_t globalCounter;
  MPI_Allreduce(&signalsCallCounter, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
  if (signalsCallCounter < globalCounter)
  {
      MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
      signals_send();
  }
  assert(signalsCallCounter == globalCounter);
  MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
  // Finalize outstanding messages in the sendRecv loop
  MPI_Allreduce(&sendRecvPosted, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
  while (sendRecvPosted < globalCounter)
  {
      MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
      sendrecv_post();
  }
  assert(sendRecvPosted == globalCounter);
  MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
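
  // Catch-up example with hypothetical counts: if this rank has posted 41 send/recv rounds
  // while the most advanced rank has posted 42, the MPI_MAX allreduce yields 42 and this
  // rank posts one extra round, so every outstanding MPI_Isend finds a matching MPI_Irecv
  // before the final MPI_Waitall returns.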
/// Initialize signal counters to zero.
void signals_init() {

  stopSignalsPosted = tbHitsOthers = TTsavesOthers = nodesSearchedOthers = 0;

  signalsSend[SIG_NODES] = signalsRecv[SIG_NODES] = 0;
  signalsSend[SIG_TB] = signalsRecv[SIG_TB] = 0;
  signalsSend[SIG_TTS] = signalsRecv[SIG_TTS] = 0;
  signalsSend[SIG_STOP] = signalsRecv[SIG_STOP] = 0;
/// Poll the signal loop, and start the next round as needed.
void signals_poll() {

  int flag;
  MPI_Test(&reqSignals, &flag, MPI_STATUS_IGNORE);
/// Provide basic info related to cluster performance; in particular, the number of signals
/// sent and signals per second (sps), the number of send/receive rounds posted (sendRecvs)
/// and TT entries exchanged per second (srpps), and the number of TT saves and TT saves per
/// second (TTSavesps). If srpps is approximately equal to TTSavesps, the exchange loop has
/// enough bandwidth.
void cluster_info(Depth depth) {

  TimePoint elapsed = Time.elapsed() + 1;
  uint64_t TTSaves = TT_saves();

  sync_cout << "info depth " << depth << " cluster "
            << " signals " << signalsCallCounter << " sps " << signalsCallCounter * 1000 / elapsed
            << " sendRecvs " << sendRecvPosted << " srpps " << TTSendRecvBuffs[0].size() * sendRecvPosted * 1000 / elapsed
            << " TTSaves " << TTSaves << " TTSavesps " << TTSaves * 1000 / elapsed
            << sync_endl;
/// When a TT entry is saved, additional steps are taken if the entry is of sufficient depth.
/// If sufficient entries have been collected, a communication round is initiated.
/// If a communication round has completed, the received results are saved to the TT.
void save(Thread* thread, TTEntry* tte,
          Key k, Value v, bool PvHit, Bound b, Depth d, Move m, Value ev) {

  // Standard save to the TT
  tte->save(k, v, PvHit, b, d, m, ev);

  // If the entry is of sufficient depth to be worth communicating, take action.

      // Count the TT saves for information: this number should be relatively similar
      // to the number of entries we can send/recv.
      thread->TTsaves.fetch_add(1, std::memory_order_relaxed);

      // Add to the thread's send buffer; the locking avoids races when the main thread
      // prepares the send buffer.
      std::lock_guard<std::mutex> lk(thread->ttCache.mutex);
      thread->ttCache.buffer.replace(KeyedTTEntry(k, *tte));
      size_t recvBuffPerRankSize = Threads.size() * TTCacheSize;

      // Communicate on the main search thread, as soon as the threads combined have collected
      // sufficient data to fill the send buffers.
      if (thread == Threads.main() && TTCacheCounter > recvBuffPerRankSize)
          // Test communication status
          int flag;
          MPI_Testall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), &flag, MPI_STATUSES_IGNORE);

          // The current communication round is complete

              // Save all received entries to the TT, and copy our TTCaches into the send
              // buffer, ready for the next round of communication
              for (size_t irank = 0; irank < size_t(size()); ++irank)
                  if (irank == size_t(rank()))
                  {
                      // This is our own part: fill our slice of the buffer for sending.
                      // Copy from the thread caches to the right spot in the buffer
                      size_t i = irank * recvBuffPerRankSize;
                      for (auto&& th : Threads)
                      {
                          std::lock_guard<std::mutex> lk(th->ttCache.mutex);

                          for (auto&& e : th->ttCache.buffer)
                              TTSendRecvBuffs[sendRecvPosted % 2][i++] = e;

                          // Reset the thread's send buffer
                          th->ttCache.buffer = {};
                      }
                  }
                  else // Process data received from the corresponding rank
                      for (size_t i = irank * recvBuffPerRankSize; i < (irank + 1) * recvBuffPerRankSize; ++i)
                      {
                          auto&& e = TTSendRecvBuffs[sendRecvPosted % 2][i];

                          TTEntry* replace_tte = TT.probe(e.first, found);
                          replace_tte->save(e.first, e.second.value(), e.second.is_pv(), e.second.bound(), e.second.depth(),
                                            e.second.move(), e.second.eval());
                      }
              // Start the next round of communication
              sendrecv_post();

              // Force a check of time on the next occasion; the above actions might have taken some time.
              static_cast<MainThread*>(thread)->callsCnt = 0;
/// Picks the bestMove across ranks, and sends the associated info and PV to the root of the cluster.
/// Note that this bestMove and PV must be output by the root, to guarantee proper ordering of output.
/// TODO: update to the scheme in master. Can this use aggregation of votes?
void pick_moves(MoveInfo& mi, std::string& PVLine) {

  MoveInfo* pMoveInfo = NULL;
  if (is_root())
      pMoveInfo = (MoveInfo*)malloc(sizeof(MoveInfo) * size());
  MPI_Gather(&mi, 1, MIDatatype, pMoveInfo, 1, MIDatatype, 0, MoveComm);
      std::map<int, int> votes;
      int minScore = pMoveInfo[0].score;
      for (int i = 0; i < size(); ++i)
      {
          minScore = std::min(minScore, pMoveInfo[i].score);
          votes[pMoveInfo[i].move] = 0;
      }
      for (int i = 0; i < size(); ++i)
          votes[pMoveInfo[i].move] += pMoveInfo[i].score - minScore + pMoveInfo[i].depth;
      int bestVote = votes[pMoveInfo[0].move];
      for (int i = 0; i < size(); ++i)
      {
          if (votes[pMoveInfo[i].move] > bestVote)
          {
              bestVote = votes[pMoveInfo[i].move];
              mi = pMoveInfo[i];
          }
      }
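
      // Worked example with hypothetical values: three ranks report (move, score, depth) of
      // (e4, 100, 20), (e4, 90, 21) and (d4, 120, 19). With minScore = 90 the votes become
      // e4: (100 - 90 + 20) + (90 - 90 + 21) = 51 and d4: (120 - 90 + 19) = 49, so e4 is
      // picked even though d4 has the single best score.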
  // Send around the final result
  MPI_Bcast(&mi, 1, MIDatatype, 0, MoveComm);

  // Send PV line to root as needed
  if (mi.rank != 0 && mi.rank == rank()) {

      std::vector<char> vec;
      vec.assign(PVLine.begin(), PVLine.end());
      int size = vec.size();
      MPI_Send(&size, 1, MPI_INT, 0, 42, MoveComm);
      MPI_Send(vec.data(), size, MPI_CHAR, 0, 42, MoveComm);
  if (mi.rank != 0 && is_root()) {

      int size;
      std::vector<char> vec;
      MPI_Recv(&size, 1, MPI_INT, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
      vec.resize(size);
      MPI_Recv(vec.data(), size, MPI_CHAR, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
      PVLine.assign(vec.begin(), vec.end());
/// Return nodes searched (lazily updated cluster-wide in the signal loop)
uint64_t nodes_searched() {

  return nodesSearchedOthers + Threads.nodes_searched();
/// Return table base hits (lazily updated cluster-wide in the signal loop)
uint64_t tb_hits() {

  return tbHitsOthers + Threads.tb_hits();
/// Return the number of saves to the TT buffers (lazily updated cluster-wide in the signal loop)
uint64_t TT_saves() {

  return TTsavesOthers + Threads.TT_saves();
uint64_t nodes_searched() {

  return Threads.nodes_searched();

uint64_t tb_hits() {

  return Threads.tb_hits();

uint64_t TT_saves() {

  return Threads.TT_saves();