From 8e560c4fd347514a699bde1931912834047cc835 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Thu, 25 Jul 2024 14:37:08 +0200 Subject: [PATCH] Replicate network weights only to used NUMA nodes On a system with multiple NUMA nodes, this patch avoids unneeded replication (e.g. 8x for a single threaded run), reducing memory use in that case. Lazy initialization forced before search. Passed STC: https://tests.stockfishchess.org/tests/view/66a28c524ff211be9d4ecdd4 LLR: 2.96 (-2.94,2.94) <-1.75,0.25> Total: 691776 W: 179429 L: 179927 D: 332420 Ptnml(0-2): 2573, 79370, 182547, 78778, 2620 closes https://github.com/official-stockfish/Stockfish/pull/5515 No functional change --- src/engine.cpp | 5 +++ src/engine.h | 8 ++-- src/numa.h | 112 +++++++++++++++++++++++++++++++++++++++++++++++++ src/search.cpp | 6 +++ src/search.h | 26 ++++++------ src/thread.cpp | 7 ++++ src/thread.h | 4 ++ 7 files changed, 152 insertions(+), 16 deletions(-) diff --git a/src/engine.cpp b/src/engine.cpp index 498b7c3e..81bb260b 100644 --- a/src/engine.cpp +++ b/src/engine.cpp @@ -204,6 +204,7 @@ void Engine::set_numa_config_from_option(const std::string& o) { // Force reallocation of threads in case affinities need to change. 
resize_threads(); + threads.ensure_network_replicated(); } void Engine::resize_threads() { @@ -212,6 +213,7 @@ void Engine::resize_threads() { // Reallocate the hash with the new threadpool size set_tt_size(options["Hash"]); + threads.ensure_network_replicated(); } void Engine::set_tt_size(size_t mb) { @@ -234,18 +236,21 @@ void Engine::load_networks() { networks_.small.load(binaryDirectory, options["EvalFileSmall"]); }); threads.clear(); + threads.ensure_network_replicated(); } void Engine::load_big_network(const std::string& file) { networks.modify_and_replicate( [this, &file](NN::Networks& networks_) { networks_.big.load(binaryDirectory, file); }); threads.clear(); + threads.ensure_network_replicated(); } void Engine::load_small_network(const std::string& file) { networks.modify_and_replicate( [this, &file](NN::Networks& networks_) { networks_.small.load(binaryDirectory, file); }); threads.clear(); + threads.ensure_network_replicated(); } void Engine::save_network(const std::pair, std::string> files[2]) { diff --git a/src/engine.h b/src/engine.h index 127f7d7c..f3c78398 100644 --- a/src/engine.h +++ b/src/engine.h @@ -114,10 +114,10 @@ class Engine { StateListPtr states; Square capSq; - OptionsMap options; - ThreadPool threads; - TranspositionTable tt; - NumaReplicated networks; + OptionsMap options; + ThreadPool threads; + TranspositionTable tt; + LazyNumaReplicated networks; Search::SearchManager::UpdateContext updateContext; }; diff --git a/src/numa.h b/src/numa.h index 3de8281d..20d352c9 100644 --- a/src/numa.h +++ b/src/numa.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1136,6 +1137,117 @@ class NumaReplicated: public NumaReplicatedBase { } }; +// We force boxing with a unique_ptr. If this becomes an issue due to added +// indirection we may need to add an option for a custom boxing type. 
+template +class LazyNumaReplicated: public NumaReplicatedBase { + public: + using ReplicatorFuncType = std::function; + + LazyNumaReplicated(NumaReplicationContext& ctx) : + NumaReplicatedBase(ctx) { + prepare_replicate_from(T{}); + } + + LazyNumaReplicated(NumaReplicationContext& ctx, T&& source) : + NumaReplicatedBase(ctx) { + prepare_replicate_from(std::move(source)); + } + + LazyNumaReplicated(const LazyNumaReplicated&) = delete; + LazyNumaReplicated(LazyNumaReplicated&& other) noexcept : + NumaReplicatedBase(std::move(other)), + instances(std::exchange(other.instances, {})) {} + + LazyNumaReplicated& operator=(const LazyNumaReplicated&) = delete; + LazyNumaReplicated& operator=(LazyNumaReplicated&& other) noexcept { + NumaReplicatedBase::operator=(*this, std::move(other)); + instances = std::exchange(other.instances, {}); + + return *this; + } + + LazyNumaReplicated& operator=(T&& source) { + prepare_replicate_from(std::move(source)); + + return *this; + } + + ~LazyNumaReplicated() override = default; + + const T& operator[](NumaReplicatedAccessToken token) const { + assert(token.get_numa_index() < instances.size()); + ensure_present(token.get_numa_index()); + return *(instances[token.get_numa_index()]); + } + + const T& operator*() const { return *(instances[0]); } + + const T* operator->() const { return instances[0].get(); } + + template + void modify_and_replicate(FuncT&& f) { + auto source = std::move(instances[0]); + std::forward(f)(*source); + prepare_replicate_from(std::move(*source)); + } + + void on_numa_config_changed() override { + // Use the first one as the source. It doesn't matter which one we use, + // because they all must be identical, but the first one is guaranteed to exist. 
+ auto source = std::move(instances[0]); + prepare_replicate_from(std::move(*source)); + } + + private: + mutable std::vector> instances; + mutable std::mutex mutex; + + void ensure_present(NumaIndex idx) const { + assert(idx < instances.size()); + + if (instances[idx] != nullptr) + return; + + assert(idx != 0); + + std::unique_lock lock(mutex); + // Check again for races. + if (instances[idx] != nullptr) + return; + + const NumaConfig& cfg = get_numa_config(); + cfg.execute_on_numa_node( + idx, [this, idx]() { instances[idx] = std::make_unique(*instances[0]); }); + } + + void prepare_replicate_from(T&& source) { + instances.clear(); + + const NumaConfig& cfg = get_numa_config(); + if (cfg.requires_memory_replication()) + { + assert(cfg.num_numa_nodes() > 0); + + // We just need to make sure the first instance is there. + // Note that we cannot move here as we need to reallocate the data + // on the correct NUMA node. + cfg.execute_on_numa_node( + 0, [this, &source]() { instances.emplace_back(std::make_unique(source)); }); + + // Prepare others for lazy init. + instances.resize(cfg.num_numa_nodes()); + } + else + { + assert(cfg.num_numa_nodes() == 1); + // We take advantage of the fact that replication is not required + // and reuse the source value, avoiding one copy operation. + instances.emplace_back(std::make_unique(std::move(source))); + } + } +}; + class NumaReplicationContext { public: NumaReplicationContext(NumaConfig&& cfg) : diff --git a/src/search.cpp b/src/search.cpp index f20bd4c9..beafd87d 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -127,6 +127,12 @@ Search::Worker::Worker(SharedState& sharedState, clear(); } +void Search::Worker::ensure_network_replicated() { + // Access once to force lazy initialization. + // We do this because we want to avoid initialization during search. 
+ (void) (networks[numaAccessToken]); +} + void Search::Worker::start_searching() { // Non-main threads go directly to iterative_deepening() diff --git a/src/search.h b/src/search.h index bdb63ffd..0f635186 100644 --- a/src/search.h +++ b/src/search.h @@ -131,19 +131,19 @@ struct LimitsType { // The UCI stores the uci options, thread pool, and transposition table. // This struct is used to easily forward data to the Search::Worker class. struct SharedState { - SharedState(const OptionsMap& optionsMap, - ThreadPool& threadPool, - TranspositionTable& transpositionTable, - const NumaReplicated& nets) : + SharedState(const OptionsMap& optionsMap, + ThreadPool& threadPool, + TranspositionTable& transpositionTable, + const LazyNumaReplicated& nets) : options(optionsMap), threads(threadPool), tt(transpositionTable), networks(nets) {} - const OptionsMap& options; - ThreadPool& threads; - TranspositionTable& tt; - const NumaReplicated& networks; + const OptionsMap& options; + ThreadPool& threads; + TranspositionTable& tt; + const LazyNumaReplicated& networks; }; class Worker; @@ -274,6 +274,8 @@ class Worker { bool is_mainthread() const { return threadIdx == 0; } + void ensure_network_replicated(); + // Public because they need to be updatable by the stats ButterflyHistory mainHistory; CapturePieceToHistory captureHistory; @@ -328,10 +330,10 @@ class Worker { Tablebases::Config tbConfig; - const OptionsMap& options; - ThreadPool& threads; - TranspositionTable& tt; - const NumaReplicated& networks; + const OptionsMap& options; + ThreadPool& threads; + TranspositionTable& tt; + const LazyNumaReplicated& networks; // Used by NNUE Eval::NNUE::AccumulatorCaches refreshTable; diff --git a/src/thread.cpp b/src/thread.cpp index f17fc4a5..b5d51594 100644 --- a/src/thread.cpp +++ b/src/thread.cpp @@ -102,6 +102,8 @@ void Thread::run_custom_job(std::function f) { cv.notify_one(); } +void Thread::ensure_network_replicated() { worker->ensure_network_replicated(); } + // Thread gets 
parked here, blocked on the condition variable // when the thread has no work to do. @@ -400,4 +402,9 @@ std::vector ThreadPool::get_bound_thread_count_by_numa_node() const { return counts; } +void ThreadPool::ensure_network_replicated() { + for (auto&& th : threads) + th->ensure_network_replicated(); +} + } // namespace Stockfish diff --git a/src/thread.h b/src/thread.h index 81ca39bb..43e2e142 100644 --- a/src/thread.h +++ b/src/thread.h @@ -83,6 +83,8 @@ class Thread { void clear_worker(); void run_custom_job(std::function f); + void ensure_network_replicated(); + // Thread has been slightly altered to allow running custom jobs, so // this name is no longer correct. However, this class (and ThreadPool) // require further work to make them properly generic while maintaining @@ -146,6 +148,8 @@ class ThreadPool { std::vector get_bound_thread_count_by_numa_node() const; + void ensure_network_replicated(); + std::atomic_bool stop, abortedSearch, increaseDepth; auto cbegin() const noexcept { return threads.cbegin(); } -- 2.39.5