From 7218ec4df9fef1146a451b71f0ed3bfd8123c9f9 Mon Sep 17 00:00:00 2001
From: noobpwnftw <guo.bojun@gmail.com>
Date: Sat, 20 Nov 2021 17:57:08 +0800
Subject: [PATCH] Revert and fix earlier windows NUMA patch

revert https://github.com/official-stockfish/Stockfish/commit/9048ac00db12a9ac48bff9b9eb145b30ff88d984 due to core spread problem and fix new OS compatibility with another method.

This code assumes that if one NUMA node has more than one processor groups, they are created equal(having equal amount of cores assigned to each of the groups), and also the total number of available cores contained in such groups are equal to the number of available cores within one NUMA node because of how best_node function works.

closes https://github.com/official-stockfish/Stockfish/pull/3798
fixes https://github.com/official-stockfish/Stockfish/pull/3787

No functional change.
---
 src/misc.cpp | 54 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 20 deletions(-)
diff --git a/src/misc.cpp b/src/misc.cpp
index 0af20e10..4dfa9f0c 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -36,6 +36,7 @@ typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP,
                       PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
 typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY);
 typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
+typedef bool(*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
 }
 #endif
 
@@ -495,14 +496,14 @@ void bindThisThread(size_t) {}
 
 #else
 
-/// best_group() retrieves logical processor information using Windows specific
-/// API and returns the best group id for the thread with index idx. Original
+/// best_node() retrieves logical processor information using Windows specific
+/// API and returns the best node id for the thread with index idx. Original
 /// code from Texel by Peter Ãsterlund.
 
-int best_group(size_t idx) {
+int best_node(size_t idx) {
 
   int threads = 0;
-  int groups = 0;
+  int nodes = 0;
   int cores = 0;
   DWORD returnLength = 0;
   DWORD byteOffset = 0;
@@ -530,8 +531,8 @@ int best_group(size_t idx) {
 
   while (byteOffset < returnLength)
   {
-      if (ptr->Relationship == RelationGroup)
-          groups += ptr->Group.MaximumGroupCount;
+      if (ptr->Relationship == RelationNumaNode)
+          nodes++;
 
       else if (ptr->Relationship == RelationProcessorCore)
       {
@@ -546,23 +547,23 @@ int best_group(size_t idx) {
 
   free(buffer);
 
-  std::vector<int> core_groups;
+  std::vector<int> groups;
 
-  // Run as many threads as possible on the same group until core limit is
-  // reached, then move on filling the next group.
-  for (int n = 0; n < groups; n++)
-      for (int i = 0; i < cores / groups; i++)
-          core_groups.push_back(n);
+  // Run as many threads as possible on the same node until core limit is
+  // reached, then move on filling the next node.
+  for (int n = 0; n < nodes; n++)
+      for (int i = 0; i < cores / nodes; i++)
+          groups.push_back(n);
 
   // In case a core has more than one logical processor (we assume 2) and we
   // have still threads to allocate, then spread them evenly across available
-  // groups.
+  // nodes.
   for (int t = 0; t < threads - cores; t++)
-      core_groups.push_back(t % groups);
+      groups.push_back(t % nodes);
 
   // If we still have more threads than the total number of logical processors
   // then return -1 and let the OS to decide what to do.
-  return idx < core_groups.size() ? core_groups[idx] : -1;
+  return idx < groups.size() ? groups[idx] : -1;
 }
 
 
@@ -571,22 +572,35 @@ int best_group(size_t idx) {
 void bindThisThread(size_t idx) {
 
   // Use only local variables to be thread-safe
-  int group = best_group(idx);
+  int node = best_node(idx);
 
-  if (group == -1)
+  if (node == -1)
       return;
 
   // Early exit if the needed API are not available at runtime
   HMODULE k32 = GetModuleHandle("Kernel32.dll");
   auto fun2 = (fun2_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMaskEx");
   auto fun3 = (fun3_t)(void(*)())GetProcAddress(k32, "SetThreadGroupAffinity");
+  auto fun4 = (fun4_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2");
 
   if (!fun2 || !fun3)
       return;
 
-  GROUP_AFFINITY affinity;
-  if (fun2(group, &affinity))
-      fun3(GetCurrentThread(), &affinity, nullptr);
+  if (!fun4) {
+      GROUP_AFFINITY affinity;
+      if (fun2(node, &affinity))
+          fun3(GetCurrentThread(), &affinity, nullptr);
+  } else {
+      // If a numa node has more than one processor group, we assume they are
+      // sized equal and we spread threads evenly across the groups.
+      USHORT elements, returnedElements;
+      elements = GetMaximumProcessorGroupCount();
+      GROUP_AFFINITY *affinity = (GROUP_AFFINITY*)malloc(
+          elements * sizeof(GROUP_AFFINITY));
+      if (fun4(node, affinity, elements, &returnedElements))
+          fun3(GetCurrentThread(), &affinity[idx % returnedElements], nullptr);
+      free(affinity);
+  }
 }
 
 #endif
-- 
2.39.2