Merge branch 'master' into clusterMergeMaster11

author Joost VandeVondele <Joost.VandeVondele@gmail.com>

Thu, 17 Sep 2020 17:17:37 +0000 (19:17 +0200)

committer Joost VandeVondele <Joost.VandeVondele@gmail.com>

Thu, 17 Sep 2020 17:17:37 +0000 (19:17 +0200)
author Joost VandeVondele <Joost.VandeVondele@gmail.com>
Thu, 17 Sep 2020 17:17:37 +0000 (19:17 +0200)
committer Joost VandeVondele <Joost.VandeVondele@gmail.com>
Thu, 17 Sep 2020 17:17:37 +0000 (19:17 +0200)
diff --git a/README.md b/README.md

index 96a495ae54b3dc3b1b76fe6a465f8eb9256d816c..348d77f89420cc585d0dedf33a1d04f40978df9d 100644 (file)
--- a/README.md
+++ b/README.md
@@ -194,6 +194,35 @@ more compact than Nalimov tablebases, while still storing all information
  needed for optimal play and in addition being able to take into account
  the 50-move rule.
  
+## Stockfish on distributed memory systems
+
+The cluster branch allows for running Stockfish on a cluster of servers (nodes)
+that are connected with a high-speed and low-latency network, using the message
+passing interface (MPI). In this case, one MPI process should be run per node,
+and UCI options can be used to set the number of threads/hash per node as usual.
+Typically, the engine will be invoked as
+```
+mpirun -np N /path/to/stockfish
+```
+where ```N``` stands for the number of MPI processes used (alternatives to ```mpirun```,
+include ```mpiexec```, ```srun```). Use 1 mpi rank per node, and employ threading
+according to the cores per node. To build the cluster
+branch, it is sufficient to specify ```COMPILER=mpicxx``` on the make command line,
+and do a clean build:
+```
+make -j ARCH=x86-64-modern clean build COMPILER=mpicxx
+```
+If the name of the compiler wrapper (typically mpicxx, but sometimes e.g. CC) does
+not match ```mpi``` an edit to the Makefile is required. Make sure that the MPI
+installation is configured to support ```MPI_THREAD_MULTIPLE```, this might require
+adding system specific compiler options to the Makefile. Stockfish employs
+non-blocking (asynchronous) communication, and benefits from an MPI
+implementation that efficiently supports this. Some MPI implentations might benefit
+from leaving 1 core/thread free for these asynchronous communications, and might require
+setting additional environment variables. ```mpirun``` should forward stdin/stdout
+to ```rank 0``` only (e.g. ```srun --input=0 --output=0```).
+ Refer to your MPI documentation for more info.
+
  ## Large Pages
  
  Stockfish supports large pages on Linux and Windows. Large pages make
diff --git a/src/Makefile b/src/Makefile

index 54868b39b4682e8c0788cb33ebae04bb9da46756..552cbcb4e1651999a7f1e2b189fb792ad77870c2 100644 (file)
--- a/src/Makefile
+++ b/src/Makefile
@@ -36,7 +36,7 @@ BINDIR = $(PREFIX)/bin
  PGOBENCH = ./$(EXE) bench
  
  ### Source and object files
-SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
+SRCS = benchmark.cpp bitbase.cpp bitboard.cpp cluster.cpp endgame.cpp evaluate.cpp main.cpp \
         material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
         search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
         nnue/evaluate_nnue.cpp nnue/features/half_kp.cpp
@@ -78,6 +78,7 @@ endif
  # vnni256 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 256
  # vnni512 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
  # neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
+# mpi = yes/no        --- -DUSE_MPI        --- Use Message Passing Interface
  #
  # Note that Makefile is space sensitive, so when adding new architectures
  # or modifying existing flags, you have to make sure there are no extra spaces
@@ -118,6 +119,7 @@ avx512 = no
  vnni256 = no
  vnni512 = no
  neon = no
+mpi = no
  STRIP = strip
  
  ### 2.2 Architecture specific
@@ -633,6 +635,13 @@ ifeq ($(OS), Android)
         LDFLAGS += -fPIE -pie
  endif
  
+### 3.10 MPI
+ifneq (,$(findstring mpi, $(CXX)))
+       mpi = yes
+       CXXFLAGS += -DUSE_MPI -Wno-cast-qual -fexceptions
+        DEPENDFLAGS += -DUSE_MPI
+endif
+
  ### ==========================================================================
  ### Section 4. Public Targets
  ### ==========================================================================
@@ -807,6 +816,7 @@ config-sanity:
         @echo "vnni256: '$(vnni256)'"
         @echo "vnni512: '$(vnni512)'"
         @echo "neon: '$(neon)'"
+       @echo "mpi: '$(mpi)'"
         @echo ""
         @echo "Flags:"
         @echo "CXX: $(CXX)"
diff --git a/src/cluster.cpp b/src/cluster.cpp

new file mode 100644 (file)

index 0000000..a4de882
--- /dev/null
+++ b/src/cluster.cpp
@@ -0,0 +1,479 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifdef USE_MPI
+
+#include <array>
+#include <map>
+#include <cstddef>
+#include <cstdlib>
+#include <iostream>
+#include <istream>
+#include <mpi.h>
+#include <string>
+#include <vector>
+
+#include "cluster.h"
+#include "thread.h"
+#include "tt.h"
+#include "timeman.h"
+
+namespace Cluster {
+
+// Total number of ranks and rank within the communicator
+static int world_rank = MPI_PROC_NULL;
+static int world_size = 0;
+
+// Signals between ranks exchange basic info using a dedicated communicator
+static MPI_Comm signalsComm = MPI_COMM_NULL;
+static MPI_Request reqSignals = MPI_REQUEST_NULL;
+static uint64_t signalsCallCounter = 0;
+
+// Signals are the number of nodes searched, stop, table base hits, transposition table saves
+enum Signals : int { SIG_NODES = 0, SIG_STOP = 1, SIG_TB = 2, SIG_TTS = 3, SIG_NB = 4};
+static uint64_t signalsSend[SIG_NB] = {};
+static uint64_t signalsRecv[SIG_NB] = {};
+static uint64_t nodesSearchedOthers = 0;
+static uint64_t tbHitsOthers = 0;
+static uint64_t TTsavesOthers = 0;
+static uint64_t stopSignalsPosted = 0;
+
+// The UCI threads of each rank exchange use a dedicated communicator
+static MPI_Comm InputComm = MPI_COMM_NULL;
+
+// bestMove requires MoveInfo communicators and data types
+static MPI_Comm MoveComm = MPI_COMM_NULL;
+static MPI_Datatype MIDatatype = MPI_DATATYPE_NULL;
+
+// TT entries are communicated with a dedicated communicator.
+// The receive buffer is used to gather information from all ranks.
+// THe TTCacheCounter tracks the number of local elements that are ready to be sent.
+static MPI_Comm TTComm = MPI_COMM_NULL;
+static std::array<std::vector<KeyedTTEntry>, 2> TTSendRecvBuffs;
+static std::array<MPI_Request, 2> reqsTTSendRecv = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
+static uint64_t sendRecvPosted = 0;
+static std::atomic<uint64_t> TTCacheCounter = {};
+
+/// Initialize MPI and associated data types. Note that the MPI library must be configured
+/// to support MPI_THREAD_MULTIPLE, since multiple threads access MPI simultaneously.
+void init() {
+
+  int thread_support;
+  MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &thread_support);
+  if (thread_support < MPI_THREAD_MULTIPLE)
+  {
+      std::cerr << "Stockfish requires support for MPI_THREAD_MULTIPLE."
+                << std::endl;
+      std::exit(EXIT_FAILURE);
+  }
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+  const std::array<MPI_Aint, 5> MIdisps = {offsetof(MoveInfo, move),
+                                           offsetof(MoveInfo, ponder),
+                                           offsetof(MoveInfo, depth),
+                                           offsetof(MoveInfo, score),
+                                           offsetof(MoveInfo, rank)};
+  MPI_Type_create_hindexed_block(5, 1, MIdisps.data(), MPI_INT, &MIDatatype);
+  MPI_Type_commit(&MIDatatype);
+
+  MPI_Comm_dup(MPI_COMM_WORLD, &InputComm);
+  MPI_Comm_dup(MPI_COMM_WORLD, &TTComm);
+  MPI_Comm_dup(MPI_COMM_WORLD, &MoveComm);
+  MPI_Comm_dup(MPI_COMM_WORLD, &signalsComm);
+}
+
+/// Finalize MPI and free the associated data types.
+void finalize() {
+
+  MPI_Type_free(&MIDatatype);
+
+  MPI_Comm_free(&InputComm);
+  MPI_Comm_free(&TTComm);
+  MPI_Comm_free(&MoveComm);
+  MPI_Comm_free(&signalsComm);
+
+  MPI_Finalize();
+}
+
+/// Return the total number of ranks
+int size() {
+
+  return world_size;
+}
+
+/// Return the rank (index) of the process
+int rank() {
+
+  return world_rank;
+}
+
+/// The receive buffer depends on the number of MPI ranks and threads, resize as needed
+void ttSendRecvBuff_resize(size_t nThreads) {
+
+  for (int i : {0, 1})
+  {
+     TTSendRecvBuffs[i].resize(TTCacheSize * world_size * nThreads);
+     std::fill(TTSendRecvBuffs[i].begin(), TTSendRecvBuffs[i].end(), KeyedTTEntry());
+  }
+}
+
+/// As input is only received by the root (rank 0) of the cluster, this input must be relayed
+/// to the UCI threads of all ranks, in order to setup the position, etc. We do this with a
+/// dedicated getline implementation, where the root broadcasts to all other ranks the received
+/// information.
+bool getline(std::istream& input, std::string& str) {
+
+  int size;
+  std::vector<char> vec;
+  bool state;
+
+  if (is_root())
+  {
+      state = static_cast<bool>(std::getline(input, str));
+      vec.assign(str.begin(), str.end());
+      size = vec.size();
+  }
+
+  // Some MPI implementations use busy-wait polling, while we need yielding as otherwise
+  // the UCI thread on the non-root ranks would be consuming resources.
+  static MPI_Request reqInput = MPI_REQUEST_NULL;
+  MPI_Ibcast(&size, 1, MPI_INT, 0, InputComm, &reqInput);
+  if (is_root())
+      MPI_Wait(&reqInput, MPI_STATUS_IGNORE);
+  else
+  {
+      while (true)
+      {
+          int flag;
+          MPI_Test(&reqInput, &flag, MPI_STATUS_IGNORE);
+          if (flag)
+              break;
+          else
+              std::this_thread::sleep_for(std::chrono::milliseconds(10));
+      }
+  }
+
+  // Broadcast received string
+  if (!is_root())
+      vec.resize(size);
+  MPI_Bcast(vec.data(), size, MPI_CHAR, 0, InputComm);
+  if (!is_root())
+      str.assign(vec.begin(), vec.end());
+  MPI_Bcast(&state, 1, MPI_CXX_BOOL, 0, InputComm);
+
+  return state;
+}
+
+/// Sending part of the signal communication loop
+void signals_send() {
+
+  signalsSend[SIG_NODES] = Threads.nodes_searched();
+  signalsSend[SIG_TB] = Threads.tb_hits();
+  signalsSend[SIG_TTS] = Threads.TT_saves();
+  signalsSend[SIG_STOP] = Threads.stop;
+  MPI_Iallreduce(signalsSend, signalsRecv, SIG_NB, MPI_UINT64_T,
+                 MPI_SUM, signalsComm, &reqSignals);
+  ++signalsCallCounter;
+}
+
+/// Processing part of the signal communication loop.
+/// For some counters (e.g. nodes) we only keep their sum on the other nodes
+/// allowing to add local counters at any time for more fine grained process,
+/// which is useful to indicate progress during early iterations, and to have
+/// node counts that exactly match the non-MPI code in the single rank case.
+/// This call also propagates the stop signal between ranks.
+void signals_process() {
+
+  nodesSearchedOthers = signalsRecv[SIG_NODES] - signalsSend[SIG_NODES];
+  tbHitsOthers = signalsRecv[SIG_TB] - signalsSend[SIG_TB];
+  TTsavesOthers = signalsRecv[SIG_TTS] - signalsSend[SIG_TTS];
+  stopSignalsPosted = signalsRecv[SIG_STOP];
+  if (signalsRecv[SIG_STOP] > 0)
+      Threads.stop = true;
+}
+
+void sendrecv_post() {
+
+   ++sendRecvPosted;
+   MPI_Irecv(TTSendRecvBuffs[sendRecvPosted       % 2].data(),
+             TTSendRecvBuffs[sendRecvPosted       % 2].size() * sizeof(KeyedTTEntry), MPI_BYTE,
+             (rank() + size() - 1) % size(), 42, TTComm, &reqsTTSendRecv[0]);
+   MPI_Isend(TTSendRecvBuffs[(sendRecvPosted + 1) % 2].data(),
+             TTSendRecvBuffs[(sendRecvPosted + 1) % 2].size() * sizeof(KeyedTTEntry), MPI_BYTE,
+             (rank() + 1         ) % size(), 42, TTComm, &reqsTTSendRecv[1]);
+}
+
+/// During search, most message passing is asynchronous, but at the end of
+/// search it makes sense to bring them to a common, finalized state.
+void signals_sync() {
+
+  while(stopSignalsPosted < uint64_t(size()))
+      signals_poll();
+
+  // Finalize outstanding messages of the signal loops.
+  // We might have issued one call less than needed on some ranks.
+  uint64_t globalCounter;
+  MPI_Allreduce(&signalsCallCounter, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
+  if (signalsCallCounter < globalCounter)
+  {
+      MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
+      signals_send();
+  }
+  assert(signalsCallCounter == globalCounter);
+  MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
+  signals_process();
+
+  // Finalize outstanding messages in the sendRecv loop
+  MPI_Allreduce(&sendRecvPosted, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
+  while (sendRecvPosted < globalCounter)
+  {
+      MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
+      sendrecv_post();
+  }
+  assert(sendRecvPosted == globalCounter);
+  MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
+
+}
+
+/// Initialize signal counters to zero.
+void signals_init() {
+
+  stopSignalsPosted = tbHitsOthers = TTsavesOthers = nodesSearchedOthers = 0;
+
+  signalsSend[SIG_NODES] = signalsRecv[SIG_NODES] = 0;
+  signalsSend[SIG_TB] = signalsRecv[SIG_TB] = 0;
+  signalsSend[SIG_TTS] = signalsRecv[SIG_TTS] = 0;
+  signalsSend[SIG_STOP] = signalsRecv[SIG_STOP] = 0;
+
+}
+
+/// Poll the signal loop, and start next round as needed.
+void signals_poll() {
+
+  int flag;
+  MPI_Test(&reqSignals, &flag, MPI_STATUS_IGNORE);
+  if (flag)
+  {
+     signals_process();
+     signals_send();
+  }
+}
+
+/// Provide basic info related the cluster performance, in particular, the number of signals send,
+/// signals per sounds (sps), the number of gathers, the number of positions gathered (per node and per second, gpps)
+/// The number of TT saves and TT saves per second. If gpps equals approximately TTSavesps the gather loop has enough bandwidth.
+void cluster_info(Depth depth) {
+
+  TimePoint elapsed = Time.elapsed() + 1;
+  uint64_t TTSaves = TT_saves();
+
+  sync_cout << "info depth " << depth << " cluster "
+            << " signals " << signalsCallCounter << " sps " << signalsCallCounter * 1000 / elapsed
+            << " sendRecvs " << sendRecvPosted << " srpps " <<  TTSendRecvBuffs[0].size() * sendRecvPosted * 1000 / elapsed
+            << " TTSaves " << TTSaves << " TTSavesps " << TTSaves * 1000 / elapsed
+            << sync_endl;
+}
+
+/// When a TT entry is saved, additional steps are taken if the entry is of sufficient depth.
+/// If sufficient entries has been collected, a communication is initiated.
+/// If a communication has been completed, the received results are saved to the TT.
+void save(Thread* thread, TTEntry* tte,
+          Key k, Value v, bool PvHit, Bound b, Depth d, Move m, Value ev) {
+
+  // Standard save to the TT
+  tte->save(k, v, PvHit, b, d, m, ev);
+
+  // If the entry is of sufficient depth to be worth communicating, take action.
+  if (d > 3)
+  {
+     // count the TTsaves to information: this should be relatively similar
+     // to the number of entries we can send/recv.
+     thread->TTsaves.fetch_add(1, std::memory_order_relaxed);
+
+     // Add to thread's send buffer, the locking here avoids races when the master thread
+     // prepares the send buffer.
+     {
+         std::lock_guard<std::mutex> lk(thread->ttCache.mutex);
+         thread->ttCache.buffer.replace(KeyedTTEntry(k,*tte));
+        ++TTCacheCounter;
+     }
+
+     size_t recvBuffPerRankSize = Threads.size() * TTCacheSize;
+
+     // Communicate on main search thread, as soon the threads combined have collected
+     // sufficient data to fill the send buffers.
+     if (thread == Threads.main() && TTCacheCounter > recvBuffPerRankSize)
+     {
+         // Test communication status
+         int flag;
+         MPI_Testall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), &flag, MPI_STATUSES_IGNORE);
+
+         // Current communication is complete
+         if (flag)
+         {
+             // Save all received entries to TT, and store our TTCaches, ready for the next round of communication
+             for (size_t irank = 0; irank < size_t(size()) ; ++irank)
+             {
+                 if (irank == size_t(rank())) // this is our part, fill the part of the buffer for sending
+                 {
+                    // Copy from the thread caches to the right spot in the buffer
+                    size_t i = irank * recvBuffPerRankSize;
+                    for (auto&& th : Threads)
+                    {
+                        std::lock_guard<std::mutex> lk(th->ttCache.mutex);
+
+                        for (auto&& e : th->ttCache.buffer)
+                            TTSendRecvBuffs[sendRecvPosted % 2][i++] = e;
+
+                        // Reset thread's send buffer
+                        th->ttCache.buffer = {};
+                    }
+
+                   TTCacheCounter = 0;
+                 }
+                 else // process data received from the corresponding rank.
+                    for (size_t i = irank * recvBuffPerRankSize; i < (irank + 1) * recvBuffPerRankSize; ++i)
+                    {
+                        auto&& e = TTSendRecvBuffs[sendRecvPosted % 2][i];
+                        bool found;
+                        TTEntry* replace_tte;
+                        replace_tte = TT.probe(e.first, found);
+                        replace_tte->save(e.first, e.second.value(), e.second.is_pv(), e.second.bound(), e.second.depth(),
+                                          e.second.move(), e.second.eval());
+                    }
+             }
+
+             // Start next communication
+             sendrecv_post();
+
+            // Force check of time on the next occasion, the above actions might have taken some time.
+             static_cast<MainThread*>(thread)->callsCnt = 0;
+
+         }
+     }
+  }
+}
+
+/// Picks the bestMove across ranks, and send the associated info and PV to the root of the cluster.
+/// Note that this bestMove and PV must be output by the root, the guarantee proper ordering of output.
+/// TODO update to the scheme in master.. can this use aggregation of votes?
+void pick_moves(MoveInfo& mi, std::string& PVLine) {
+
+  MoveInfo* pMoveInfo = NULL;
+  if (is_root())
+  {
+      pMoveInfo = (MoveInfo*)malloc(sizeof(MoveInfo) * size());
+  }
+  MPI_Gather(&mi, 1, MIDatatype, pMoveInfo, 1, MIDatatype, 0, MoveComm);
+
+  if (is_root())
+  {
+      std::map<int, int> votes;
+      int minScore = pMoveInfo[0].score;
+      for (int i = 0; i < size(); ++i)
+      {
+          minScore = std::min(minScore, pMoveInfo[i].score);
+          votes[pMoveInfo[i].move] = 0;
+      }
+      for (int i = 0; i < size(); ++i)
+      {
+          votes[pMoveInfo[i].move] += pMoveInfo[i].score - minScore + pMoveInfo[i].depth;
+      }
+      int bestVote = votes[pMoveInfo[0].move];
+      for (int i = 0; i < size(); ++i)
+      {
+          if (votes[pMoveInfo[i].move] > bestVote)
+          {
+              bestVote = votes[pMoveInfo[i].move];
+              mi = pMoveInfo[i];
+          }
+      }
+      free(pMoveInfo);
+  }
+
+  // Send around the final result
+  MPI_Bcast(&mi, 1, MIDatatype, 0, MoveComm);
+
+  // Send PV line to root as needed
+  if (mi.rank != 0 && mi.rank == rank()) {
+      int size;
+      std::vector<char> vec;
+      vec.assign(PVLine.begin(), PVLine.end());
+      size = vec.size();
+      MPI_Send(&size, 1, MPI_INT, 0, 42, MoveComm);
+      MPI_Send(vec.data(), size, MPI_CHAR, 0, 42, MoveComm);
+  }
+  if (mi.rank != 0 && is_root()) {
+      int size;
+      std::vector<char> vec;
+      MPI_Recv(&size, 1, MPI_INT, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
+      vec.resize(size);
+      MPI_Recv(vec.data(), size, MPI_CHAR, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
+      PVLine.assign(vec.begin(), vec.end());
+  }
+
+}
+
+/// Return nodes searched (lazily updated cluster wide in the signal loop)
+uint64_t nodes_searched() {
+
+  return nodesSearchedOthers + Threads.nodes_searched();
+}
+
+/// Return table base hits (lazily updated cluster wide in the signal loop)
+uint64_t tb_hits() {
+
+  return tbHitsOthers + Threads.tb_hits();
+}
+
+/// Return the number of saves to the TT buffers, (lazily updated cluster wide in the signal loop)
+uint64_t TT_saves() {
+
+  return TTsavesOthers + Threads.TT_saves();
+}
+
+
+}
+
+#else
+
+#include "cluster.h"
+#include "thread.h"
+
+namespace Cluster {
+
+uint64_t nodes_searched() {
+
+  return Threads.nodes_searched();
+}
+
+uint64_t tb_hits() {
+
+  return Threads.tb_hits();
+}
+
+uint64_t TT_saves() {
+
+  return Threads.TT_saves();
+}
+
+}
+
+#endif // USE_MPI
diff --git a/src/cluster.h b/src/cluster.h

new file mode 100644 (file)

index 0000000..8e224d6
--- /dev/null
+++ b/src/cluster.h
@@ -0,0 +1,124 @@
+/*
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+
+  Stockfish is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Stockfish is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef CLUSTER_H_INCLUDED
+#define CLUSTER_H_INCLUDED
+
+#include <algorithm>
+#include <array>
+#include <istream>
+#include <string>
+
+#include "tt.h"
+
+class Thread;
+
+/// The Cluster namespace contains functionality required to run on distributed
+/// memory architectures using MPI as the message passing interface. On a high level,
+/// a 'lazy SMP'-like scheme is implemented where TT saves of sufficient depth are
+/// collected on each rank and distributed to, and used by, all other ranks,
+/// which search essentially independently.  The root (MPI rank 0) of the cluster
+/// is responsible for all I/O and time management, communicating this info to
+/// the other ranks as needed. UCI options such as Threads and Hash specify these
+/// quantities per MPI rank.  It is recommended to have one rank (MPI process) per node.
+/// For the non-MPI case, wrappers that will be compiler-optimized away are provided.
+
+namespace Cluster {
+
+/// Basic info to find the cluster-wide bestMove
+struct MoveInfo {
+  int move;
+  int ponder;
+  int depth;
+  int score;
+  int rank;
+};
+
+#ifdef USE_MPI
+
+// store the TTEntry with its full key, so it can be saved on the receiver side
+using KeyedTTEntry = std::pair<Key, TTEntry>;
+constexpr std::size_t TTCacheSize = 16;
+
+// Threads locally cache their high-depth TT entries till a batch can be send by MPI
+template <std::size_t N> class TTCache : public std::array<KeyedTTEntry, N> {
+
+  struct Compare {
+      inline bool operator()(const KeyedTTEntry& lhs, const KeyedTTEntry& rhs) {
+          return lhs.second.depth() > rhs.second.depth();
+      }
+  };
+  Compare compare;
+
+public:
+
+  // Keep a heap of entries replacing low depth with high depth entries
+  bool replace(const KeyedTTEntry& value) {
+
+      if (compare(value, this->front()))
+      {
+          std::pop_heap(this->begin(), this->end(), compare);
+          this->back() = value;
+          std::push_heap(this->begin(), this->end(), compare);
+          return true;
+      }
+      return false;
+  }
+};
+
+void init();
+void finalize();
+bool getline(std::istream& input, std::string& str);
+int size();
+int rank();
+inline bool is_root() { return rank() == 0; }
+void save(Thread* thread, TTEntry* tte, Key k, Value v, bool PvHit, Bound b, Depth d, Move m, Value ev);
+void pick_moves(MoveInfo& mi, std::string& PVLine);
+void ttSendRecvBuff_resize(size_t nThreads);
+uint64_t nodes_searched();
+uint64_t tb_hits();
+uint64_t TT_saves();
+void cluster_info(Depth depth);
+void signals_init();
+void signals_poll();
+void signals_sync();
+
+#else
+
+inline void init() { }
+inline void finalize() { }
+inline bool getline(std::istream& input, std::string& str) { return static_cast<bool>(std::getline(input, str)); }
+constexpr int size() { return 1; }
+constexpr int rank() { return 0; }
+constexpr bool is_root() { return true; }
+inline void save(Thread*, TTEntry* tte, Key k, Value v, bool PvHit, Bound b, Depth d, Move m, Value ev) { tte->save(k, v, PvHit, b, d, m, ev); }
+inline void pick_moves(MoveInfo&, std::string&) { }
+inline void ttSendRecvBuff_resize(size_t) { }
+uint64_t nodes_searched();
+uint64_t tb_hits();
+uint64_t TT_saves();
+inline void cluster_info(Depth) { }
+inline void signals_init() { }
+inline void signals_poll() { }
+inline void signals_sync() { }
+
+#endif /* USE_MPI */
+
+}
+
+#endif // #ifndef CLUSTER_H_INCLUDED
diff --git a/src/evaluate.cpp b/src/evaluate.cpp

index faf71d2701dd83ad152e2d82f260750a904e5809..694a15eaaa8c7f71ae4fdd1a15234f3993958206 100644 (file)
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -28,6 +28,7 @@
  #include <vector>
  
  #include "bitboard.h"
+#include "cluster.h"
  #include "evaluate.h"
  #include "material.h"
  #include "misc.h"
@@ -136,10 +137,13 @@ namespace Eval {
          exit(EXIT_FAILURE);
      }
  
-    if (useNNUE)
-        sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
-    else
-        sync_cout << "info string classical evaluation enabled" << sync_endl;
+    if (Cluster::is_root())
+    {
+        if (useNNUE)
+            sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+        else
+            sync_cout << "info string classical evaluation enabled" << sync_endl;
+    }
    }
  }
  
diff --git a/src/main.cpp b/src/main.cpp

index f95db1c2f09379c5c940cca74f81c8a3723c4ff6..94c6a2a7ae85c0ea9611a3782840949692dfa7c1 100644 (file)
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -33,7 +33,9 @@ namespace PSQT {
  
  int main(int argc, char* argv[]) {
  
-  std::cout << engine_info() << std::endl;
+  Cluster::init();
+  if (Cluster::is_root())
+      std::cout << engine_info() << std::endl;
  
    CommandLine::init(argc, argv);
    UCI::init(Options);
@@ -50,5 +52,6 @@ int main(int argc, char* argv[]) {
    UCI::loop(argc, argv);
  
    Threads.set(0);
+  Cluster::finalize();
    return 0;
  }
diff --git a/src/search.cpp b/src/search.cpp

index 9c5fb58bd8bbe283cadd3bf7f39ddb8671088ccd..6fc30be2099d4f728b8f478b597655a0ce4199ed 100644 (file)
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -23,6 +23,7 @@
  #include <iostream>
  #include <sstream>
  
+#include "cluster.h"
  #include "evaluate.h"
  #include "misc.h"
  #include "movegen.h"
@@ -178,7 +179,7 @@ namespace {
              nodes += cnt;
              pos.undo_move(m);
          }
-        if (Root)
+        if (Root && Cluster::is_root())
              sync_cout << UCI::move(m, pos.is_chess960()) << ": " << cnt << sync_endl;
      }
      return nodes;
@@ -217,7 +218,9 @@ void MainThread::search() {
    if (Limits.perft)
    {
        nodes = perft<true>(rootPos, Limits.perft);
-      sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
+      if (Cluster::is_root())
+          sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
+
        return;
    }
  
@@ -230,9 +233,10 @@ void MainThread::search() {
    if (rootMoves.empty())
    {
        rootMoves.emplace_back(MOVE_NONE);
-      sync_cout << "info depth 0 score "
-                << UCI::value(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW)
-                << sync_endl;
+      if (Cluster::is_root())
+          sync_cout << "info depth 0 score "
+                    << UCI::value(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW)
+                    << sync_endl;
    }
    else
    {
@@ -247,19 +251,22 @@ void MainThread::search() {
    // until the GUI sends one of those commands.
  
    while (!Threads.stop && (ponder || Limits.infinite))
-  {} // Busy wait for a stop or a ponder reset
+  { Cluster::signals_poll(); } // Busy wait for a stop or a ponder reset
  
    // Stop the threads if not already stopped (also raise the stop if
    // "ponderhit" just reset Threads.ponder).
    Threads.stop = true;
  
+  // Signal and synchronize all other ranks
+  Cluster::signals_sync();
+
    // Wait until all threads have finished
    Threads.wait_for_search_finished();
  
    // When playing in 'nodes as time' mode, subtract the searched nodes from
    // the available ones before exiting.
    if (Limits.npmsec)
-      Time.availableNodes += Limits.inc[us] - Threads.nodes_searched();
+      Time.availableNodes += Limits.inc[us] - Cluster::nodes_searched();
  
    Thread* bestThread = this;
  
@@ -269,18 +276,40 @@ void MainThread::search() {
        && rootMoves[0].pv[0] != MOVE_NONE)
        bestThread = Threads.get_best_thread();
  
-  bestPreviousScore = bestThread->rootMoves[0].score;
+  // Prepare PVLine and ponder move
+  std::string PVLine = UCI::pv(bestThread->rootPos, bestThread->completedDepth, -VALUE_INFINITE, VALUE_INFINITE);
  
-  // Send again PV info if we have a new best thread
-  if (bestThread != this)
-      sync_cout << UCI::pv(bestThread->rootPos, bestThread->completedDepth, -VALUE_INFINITE, VALUE_INFINITE) << sync_endl;
+  Move bestMove   = bestThread->rootMoves[0].pv[0];
+  Move ponderMove = MOVE_NONE;
+  if (bestThread->rootMoves[0].pv.size() > 1 || bestThread->rootMoves[0].extract_ponder_from_tt(rootPos))
+      ponderMove = bestThread->rootMoves[0].pv[1];
  
-  sync_cout << "bestmove " << UCI::move(bestThread->rootMoves[0].pv[0], rootPos.is_chess960());
+  // Exchange info as needed
+  Cluster::MoveInfo mi{bestMove,
+                       ponderMove,
+                       bestThread->completedDepth,
+                       bestThread->rootMoves[0].score,
+                       Cluster::rank()};
+  Cluster::pick_moves(mi, PVLine);
  
-  if (bestThread->rootMoves[0].pv.size() > 1 || bestThread->rootMoves[0].extract_ponder_from_tt(rootPos))
-      std::cout << " ponder " << UCI::move(bestThread->rootMoves[0].pv[1], rootPos.is_chess960());
+  bestPreviousScore = static_cast<Value>(mi.score);
+
+  if (Cluster::is_root())
+  {
+      // Send again PV info if we have a new best thread/rank
+      if (bestThread != this || mi.rank != 0)
+          sync_cout << PVLine << sync_endl;
+
+      bestMove = static_cast<Move>(mi.move);
+      ponderMove = static_cast<Move>(mi.ponder);
+
+      if (ponderMove != MOVE_NONE)
+          sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960())
+                    << " ponder "  << UCI::move(ponderMove, rootPos.is_chess960()) << sync_endl;
+      else
+          sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960()) << sync_endl;
+  }
  
-  std::cout << sync_endl;
  }
  
  
@@ -368,12 +397,13 @@ void Thread::search() {
    // Iterative deepening loop until requested to stop or the target depth is reached
    while (   ++rootDepth < MAX_PLY
           && !Threads.stop
-         && !(Limits.depth && mainThread && rootDepth > Limits.depth))
+         && !(Limits.depth && mainThread && Cluster::is_root() && rootDepth > Limits.depth))
    {
        // Age out PV variability metric
        if (mainThread)
            totBestMoveChanges /= 2;
  
+
        // Save the last iteration's scores before first PV line is searched and
        // all the move scores except the (new) PV are set to -VALUE_INFINITE.
        for (RootMove& rm : rootMoves)
@@ -439,11 +469,15 @@ void Thread::search() {
  
                // When failing high/low give some update (without cluttering
                // the UI) before a re-search.
-              if (   mainThread
+              if (   Cluster::is_root()
+                  && mainThread
                    && multiPV == 1
                    && (bestValue <= alpha || bestValue >= beta)
                    && Time.elapsed() > 3000)
+              {
                    sync_cout << UCI::pv(rootPos, rootDepth, alpha, beta) << sync_endl;
+                  Cluster::cluster_info(rootDepth);
+              }
  
                // In case of failing low/high increase aspiration window and
                // re-search, otherwise exit the loop.
@@ -475,9 +509,12 @@ void Thread::search() {
            // Sort the PV lines searched so far and update the GUI
            std::stable_sort(rootMoves.begin() + pvFirst, rootMoves.begin() + pvIdx + 1);
  
-          if (    mainThread
+          if (    Cluster::is_root() && mainThread
                && (Threads.stop || pvIdx + 1 == multiPV || Time.elapsed() > 3000))
+          {
                sync_cout << UCI::pv(rootPos, rootDepth, alpha, beta) << sync_endl;
+              Cluster::cluster_info(rootDepth);
+          }
        }
  
        if (!Threads.stop)
@@ -748,9 +785,10 @@ namespace {
                  if (    b == BOUND_EXACT
                      || (b == BOUND_LOWER ? value >= beta : value <= alpha))
                  {
-                    tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, b,
-                              std::min(MAX_PLY - 1, depth + 6),
-                              MOVE_NONE, VALUE_NONE);
+                    Cluster::save(thisThread, tte,
+                                  posKey, value_to_tt(value, ss->ply), ss->ttPv, b,
+                                  std::min(MAX_PLY - 1, depth + 6),
+                                  MOVE_NONE, VALUE_NONE);
  
                      return value;
                  }
@@ -798,7 +836,9 @@ namespace {
          else
              ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo;
  
-        tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE, eval);
+        Cluster::save(thisThread, tte,
+                      posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE,
+                      eval);
      }
  
      // Step 7. Razoring (~1 Elo)
@@ -995,7 +1035,7 @@ moves_loop: // When in check, search starts from here
  
        ss->moveCount = ++moveCount;
  
-      if (rootNode && thisThread == Threads.main() && Time.elapsed() > 3000)
+      if (rootNode && Cluster::is_root() && thisThread == Threads.main() && Time.elapsed() > 3000)
            sync_cout << "info depth " << depth
                      << " currmove " << UCI::move(move, pos.is_chess960())
                      << " currmovenumber " << moveCount + thisThread->pvIdx << sync_endl;
@@ -1386,10 +1426,11 @@ moves_loop: // When in check, search starts from here
          ss->ttPv = ss->ttPv && (ss+1)->ttPv;
  
      if (!excludedMove && !(rootNode && thisThread->pvIdx))
-        tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv,
-                  bestValue >= beta ? BOUND_LOWER :
-                  PvNode && bestMove ? BOUND_EXACT : BOUND_UPPER,
-                  depth, bestMove, ss->staticEval);
+        Cluster::save(thisThread, tte,
+                      posKey, value_to_tt(bestValue, ss->ply), ss->ttPv,
+                      bestValue >= beta ? BOUND_LOWER :
+                      PvNode && bestMove ? BOUND_EXACT : BOUND_UPPER,
+                      depth, bestMove, ss->staticEval);
  
      assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);
  
@@ -1486,8 +1527,9 @@ moves_loop: // When in check, search starts from here
          if (bestValue >= beta)
          {
              if (!ss->ttHit)
-                tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER,
-                          DEPTH_NONE, MOVE_NONE, ss->staticEval);
+                Cluster::save(thisThread, tte,
+                              posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER,
+                              DEPTH_NONE, MOVE_NONE, ss->staticEval);
  
              return bestValue;
          }
@@ -1608,10 +1650,11 @@ moves_loop: // When in check, search starts from here
      if (ss->inCheck && bestValue == -VALUE_INFINITE)
          return mated_in(ss->ply); // Plies to mate from the root
  
-    tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit,
-              bestValue >= beta ? BOUND_LOWER :
-              PvNode && bestValue > oldAlpha  ? BOUND_EXACT : BOUND_UPPER,
-              ttDepth, bestMove, ss->staticEval);
+    Cluster::save(thisThread, tte,
+                  posKey, value_to_tt(bestValue, ss->ply), pvHit,
+                  bestValue >= beta ? BOUND_LOWER :
+                  PvNode && bestValue > oldAlpha  ? BOUND_EXACT : BOUND_UPPER,
+                  ttDepth, bestMove, ss->staticEval);
  
      assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);
  
@@ -1819,13 +1862,16 @@ void MainThread::check_time() {
        dbg_print();
    }
  
+  // poll on MPI signals
+  Cluster::signals_poll();
+
    // We should not stop pondering until told so by the GUI
    if (ponder)
        return;
  
    if (   (Limits.use_time_management() && (elapsed > Time.maximum() - 10 || stopOnPonderhit))
        || (Limits.movetime && elapsed >= Limits.movetime)
-      || (Limits.nodes && Threads.nodes_searched() >= (uint64_t)Limits.nodes))
+      || (Limits.nodes && Cluster::nodes_searched() >= (uint64_t)Limits.nodes))
        Threads.stop = true;
  }
  
@@ -1840,8 +1886,8 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
    const RootMoves& rootMoves = pos.this_thread()->rootMoves;
    size_t pvIdx = pos.this_thread()->pvIdx;
    size_t multiPV = std::min((size_t)Options["MultiPV"], rootMoves.size());
-  uint64_t nodesSearched = Threads.nodes_searched();
-  uint64_t tbHits = Threads.tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
+  uint64_t nodesSearched = Cluster::nodes_searched();
+  uint64_t tbHits = Cluster::tb_hits() + (TB::RootInTB ? rootMoves.size() : 0);
  
    for (size_t i = 0; i < multiPV; ++i)
    {
diff --git a/src/search.h b/src/search.h

index f60da4a514d1814192bc7b67d02679971f4e94f3..fb42d5e11d607b9a845a16185bf0f53743f8a522 100644 (file)
--- a/src/search.h
+++ b/src/search.h
@@ -24,6 +24,7 @@
  #include "misc.h"
  #include "movepick.h"
  #include "types.h"
+#include "cluster.h"
  
  class Position;
  
@@ -91,7 +92,7 @@ struct LimitsType {
    }
  
    bool use_time_management() const {
-    return time[WHITE] || time[BLACK];
+    return Cluster::is_root() && (time[WHITE] || time[BLACK]);
    }
  
    std::vector<Move> searchmoves;
diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp

index 4d682f1a90bc2a38f0e0c444a8e45b833afcb781..39535c68acdc23e051d3190cddb8063bfda52ff2 100644 (file)
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -29,6 +29,7 @@
  #include <mutex>
  
  #include "../bitboard.h"
+#include "../cluster.h"
  #include "../movegen.h"
  #include "../position.h"
  #include "../search.h"
@@ -1401,7 +1402,8 @@ void Tablebases::init(const std::string& paths) {
          }
      }
  
-    sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
+    if (Cluster::is_root())
+        sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
  }
  
  // Probe the WDL table for a particular position.
diff --git a/src/thread.cpp b/src/thread.cpp

index b46fce5e873933494031eade0330582b48003e11..c96c1b6024de1a1ee71d560e57841d870a76f896 100644 (file)
--- a/src/thread.cpp
+++ b/src/thread.cpp
@@ -143,6 +143,9 @@ void ThreadPool::set(size_t requested) {
        // Reallocate the hash with the new threadpool size
        TT.resize(size_t(Options["Hash"]));
  
+      // Adjust cluster buffers
+      Cluster::ttSendRecvBuff_resize(requested);
+
        // Init thread number dependent search params.
        Search::init();
    }
@@ -198,13 +201,15 @@ void ThreadPool::start_thinking(Position& pos, StateListPtr& states,
    // since they are read-only.
    for (Thread* th : *this)
    {
-      th->nodes = th->tbHits = th->nmpMinPly = th->bestMoveChanges = 0;
+      th->nodes = th->tbHits = th->TTsaves = th->nmpMinPly = th->bestMoveChanges = 0;
        th->rootDepth = th->completedDepth = 0;
        th->rootMoves = rootMoves;
        th->rootPos.set(pos.fen(), pos.is_chess960(), &th->rootState, th);
        th->rootState = setupStates->back();
    }
  
+  Cluster::signals_init();
+
    main()->start_searching();
  }
  
diff --git a/src/thread.h b/src/thread.h

index 34b99015bacec71f2cd6e188d84b658d07c5a3ec..62b51a1bca7104d1849482dd10f730819adfc77c 100644 (file)
--- a/src/thread.h
+++ b/src/thread.h
@@ -25,6 +25,7 @@
  #include <thread>
  #include <vector>
  
+#include "cluster.h"
  #include "material.h"
  #include "movepick.h"
  #include "pawns.h"
@@ -61,7 +62,7 @@ public:
    uint64_t ttHitAverage;
    int selDepth, nmpMinPly;
    Color nmpColor;
-  std::atomic<uint64_t> nodes, tbHits, bestMoveChanges;
+  std::atomic<uint64_t> nodes, tbHits, TTsaves, bestMoveChanges;
  
    Position rootPos;
    StateInfo rootState;
@@ -73,6 +74,13 @@ public:
    CapturePieceToHistory captureHistory;
    ContinuationHistory continuationHistory[2][2];
    Score contempt;
+
+#ifdef USE_MPI
+  struct {
+      std::mutex mutex;
+      Cluster::TTCache<Cluster::TTCacheSize> buffer = {};
+  } ttCache;
+#endif
  };
  
  
@@ -107,6 +115,7 @@ struct ThreadPool : public std::vector<Thread*> {
    MainThread* main()        const { return static_cast<MainThread*>(front()); }
    uint64_t nodes_searched() const { return accumulate(&Thread::nodes); }
    uint64_t tb_hits()        const { return accumulate(&Thread::tbHits); }
+  uint64_t TT_saves()       const { return accumulate(&Thread::TTsaves); }
    Thread* get_best_thread() const;
    void start_searching();
    void wait_for_search_finished() const;
diff --git a/src/timeman.h b/src/timeman.h

index 5ad72b32ff83d6b26e91b732d37181798387c6c7..96eecce1a9463cacc4aac5b9384d8ed2432aa674 100644 (file)
--- a/src/timeman.h
+++ b/src/timeman.h
@@ -21,7 +21,7 @@
  
  #include "misc.h"
  #include "search.h"
-#include "thread.h"
+#include "cluster.h"
  
  /// The TimeManagement class computes the optimal time to think depending on
  /// the maximum available time, the game move number and other parameters.
@@ -32,7 +32,7 @@ public:
    TimePoint optimum() const { return optimumTime; }
    TimePoint maximum() const { return maximumTime; }
    TimePoint elapsed() const { return Search::Limits.npmsec ?
-                                     TimePoint(Threads.nodes_searched()) : now() - startTime; }
+                                     TimePoint(Cluster::nodes_searched()) : now() - startTime; }
  
    int64_t availableNodes; // When in 'nodes as time' mode
  
diff --git a/src/tt.h b/src/tt.h

index fdfd67694e92f6a109c72a0ab25f354689076baa..ce7234e2947d3f75534e4d6f104d3705a081423d 100644 (file)
--- a/src/tt.h
+++ b/src/tt.h
@@ -22,6 +22,11 @@
  #include "misc.h"
  #include "types.h"
  
+namespace Cluster {
+  void init();
+}
+//void Cluster::init();
+
  /// TTEntry struct is the 10 bytes transposition table entry, defined as below:
  ///
  /// key        16 bit
@@ -45,6 +50,7 @@ struct TTEntry {
  
  private:
    friend class TranspositionTable;
+  friend void Cluster::init();
  
    uint16_t key16;
    uint8_t  depth8;
@@ -63,6 +69,8 @@ private:
  
  class TranspositionTable {
  
+  friend void Cluster::init();
+
    static constexpr int ClusterSize = 3;
  
    struct Cluster {
diff --git a/src/uci.cpp b/src/uci.cpp

index 3f3cc45874f47169e05e2ed6becbd6c7243e5455..3b4240b4592cb7cf3fced6ff64e18816403e3771 100644 (file)
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -23,6 +23,7 @@
  #include <string>
  
  #include "evaluate.h"
+#include "cluster.h"
  #include "movegen.h"
  #include "position.h"
  #include "search.h"
@@ -110,7 +111,7 @@ namespace {
  
      if (Options.count(name))
          Options[name] = value;
-    else
+    else if (Cluster::is_root())
          sync_cout << "No such option: " << name << sync_endl;
    }
  
@@ -170,14 +171,16 @@ namespace {
  
          if (token == "go" || token == "eval")
          {
-            cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl;
+            if (Cluster::is_root())
+                cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")" << endl;
+
              if (token == "go")
              {
                 go(pos, is, states);
                 Threads.main()->wait_for_search_finished();
                 nodes += Threads.nodes_searched();
              }
-            else
+            else if (Cluster::is_root())
                 trace_eval(pos);
          }
          else if (token == "setoption")  setoption(is);
@@ -189,10 +192,11 @@ namespace {
  
      dbg_print(); // Just before exiting
  
-    cerr << "\n==========================="
-         << "\nTotal time (ms) : " << elapsed
-         << "\nNodes searched  : " << nodes
-         << "\nNodes/second    : " << 1000 * nodes / elapsed << endl;
+    if (Cluster::is_root())
+        cerr << "\n==========================="
+             << "\nTotal time (ms) : " << elapsed
+             << "\nNodes searched  : " << nodes
+             << "\nNodes/second    : " << 1000 * nodes / elapsed << endl;
    }
  
    // The win rate model returns the probability (per mille) of winning given an eval
@@ -238,7 +242,7 @@ void UCI::loop(int argc, char* argv[]) {
        cmd += std::string(argv[i]) + " ";
  
    do {
-      if (argc == 1 && !getline(cin, cmd)) // Block here waiting for input or EOF
+      if (argc == 1 && !Cluster::getline(cin, cmd)) // Block here waiting for input or EOF
            cmd = "quit";
  
        istringstream is(cmd);
@@ -257,7 +261,7 @@ void UCI::loop(int argc, char* argv[]) {
        else if (token == "ponderhit")
            Threads.main()->ponder = false; // Switch to normal search
  
-      else if (token == "uci")
+      else if (token == "uci" && Cluster::is_root())
            sync_cout << "id name " << engine_info(true)
                      << "\n"       << Options
                      << "\nuciok"  << sync_endl;
@@ -266,16 +270,20 @@ void UCI::loop(int argc, char* argv[]) {
        else if (token == "go")         go(pos, is, states);
        else if (token == "position")   position(pos, is, states);
        else if (token == "ucinewgame") Search::clear();
-      else if (token == "isready")    sync_cout << "readyok" << sync_endl;
+      else if (token == "isready" && Cluster::is_root())
+          sync_cout << "readyok" << sync_endl;
  
        // Additional custom non-UCI commands, mainly for debugging.
        // Do not use these commands during a search!
        else if (token == "flip")     pos.flip();
        else if (token == "bench")    bench(pos, is, states);
-      else if (token == "d")        sync_cout << pos << sync_endl;
-      else if (token == "eval")     trace_eval(pos);
-      else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
-      else
+      else if (token == "d" && Cluster::is_root())
+          sync_cout << pos << sync_endl;
+      else if (token == "eval" && Cluster::is_root())
+          trace_eval(pos);
+      else if (token == "compiler" && Cluster::is_root())
+          sync_cout << compiler_info() << sync_endl;
+      else if (Cluster::is_root())
            sync_cout << "Unknown command: " << cmd << sync_endl;
  
    } while (token != "quit" && argc == 1); // Command line args are one-shot
author	Joost VandeVondele <Joost.VandeVondele@gmail.com>
	Thu, 17 Sep 2020 17:17:37 +0000 (19:17 +0200)
committer	Joost VandeVondele <Joost.VandeVondele@gmail.com>
	Thu, 17 Sep 2020 17:17:37 +0000 (19:17 +0200)
README.md		patch \| blob \| history
src/Makefile		patch \| blob \| history
src/cluster.cpp	[new file with mode: 0644]	patch \| blob
src/cluster.h	[new file with mode: 0644]	patch \| blob
src/evaluate.cpp		patch \| blob \| history
src/main.cpp		patch \| blob \| history
src/search.cpp		patch \| blob \| history
src/search.h		patch \| blob \| history
src/syzygy/tbprobe.cpp		patch \| blob \| history
src/thread.cpp		patch \| blob \| history
src/thread.h		patch \| blob \| history
src/timeman.h		patch \| blob \| history
src/tt.h		patch \| blob \| history
src/uci.cpp		patch \| blob \| history