From 9048ac00db12a9ac48bff9b9eb145b30ff88d984 Mon Sep 17 00:00:00 2001 From: noobpwnftw Date: Sat, 13 Nov 2021 06:38:52 +0800 Subject: [PATCH] Fix processor group binding under Windows. Starting with Windows Build 20348, the behavior of the NUMA API has changed: https://docs.microsoft.com/en-us/windows/win32/procthread/numa-support The old code only worked because there was probably a limit on how many cores/threads could reside within one NUMA node, and the OS created extra NUMA nodes when necessary. However, the actual mechanism of core binding is "Processor Groups" (https://docs.microsoft.com/en-us/windows/win32/procthread/processor-groups). With a newer OS, one NUMA node can contain many such "Processor Groups", so we should consistently use the number of groups to bind the threads instead of deriving the topology from the number of NUMA nodes. This change is required to spread threads across all cores on Windows 11 with a 3990X CPU. That system has only 1 NUMA node with 2 groups of 64 threads each. closes https://github.com/official-stockfish/Stockfish/pull/3787 No functional change. 
--- src/misc.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/misc.cpp b/src/misc.cpp index f9c12337..0af20e10 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -502,7 +502,7 @@ void bindThisThread(size_t) {} int best_group(size_t idx) { int threads = 0; - int nodes = 0; + int groups = 0; int cores = 0; DWORD returnLength = 0; DWORD byteOffset = 0; @@ -530,8 +530,8 @@ int best_group(size_t idx) { while (byteOffset < returnLength) { - if (ptr->Relationship == RelationNumaNode) - nodes++; + if (ptr->Relationship == RelationGroup) + groups += ptr->Group.MaximumGroupCount; else if (ptr->Relationship == RelationProcessorCore) { @@ -546,23 +546,23 @@ int best_group(size_t idx) { free(buffer); - std::vector groups; + std::vector core_groups; - // Run as many threads as possible on the same node until core limit is - // reached, then move on filling the next node. - for (int n = 0; n < nodes; n++) - for (int i = 0; i < cores / nodes; i++) - groups.push_back(n); + // Run as many threads as possible on the same group until core limit is + // reached, then move on filling the next group. + for (int n = 0; n < groups; n++) + for (int i = 0; i < cores / groups; i++) + core_groups.push_back(n); // In case a core has more than one logical processor (we assume 2) and we // have still threads to allocate, then spread them evenly across available - // nodes. + // groups. for (int t = 0; t < threads - cores; t++) - groups.push_back(t % nodes); + core_groups.push_back(t % groups); // If we still have more threads than the total number of logical processors // then return -1 and let the OS to decide what to do. - return idx < groups.size() ? groups[idx] : -1; + return idx < core_groups.size() ? core_groups[idx] : -1; } -- 2.39.2