X-Git-Url: https://git.sesse.net/?p=stockfish;a=blobdiff_plain;f=src%2Fmisc.cpp;fp=src%2Fmisc.cpp;h=4dfa9f0ce2f4bf6572f825e0741db602da4d3d55;hp=0af20e10d58f2b014f050ddcec8c23bd29cfc763;hb=7218ec4df9fef1146a451b71f0ed3bfd8123c9f9;hpb=a943b1d28d673814ee1f3de4a2ae4b8e091f1e4c

diff --git a/src/misc.cpp b/src/misc.cpp
index 0af20e10..4dfa9f0c 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -36,6 +36,7 @@ typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP,
                       PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
 typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY);
 typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
+typedef bool(*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
 }
 #endif
 
@@ -495,14 +496,14 @@ void bindThisThread(size_t) {}
 
 #else
 
-/// best_group() retrieves logical processor information using Windows specific
-/// API and returns the best group id for the thread with index idx. Original
+/// best_node() retrieves logical processor information using Windows specific
+/// API and returns the best node id for the thread with index idx. Original
 /// code from Texel by Peter Österlund.
 
-int best_group(size_t idx) {
+int best_node(size_t idx) {
 
   int threads = 0;
-  int groups = 0;
+  int nodes = 0;
   int cores = 0;
   DWORD returnLength = 0;
   DWORD byteOffset = 0;
@@ -530,8 +531,8 @@ int best_group(size_t idx) {
 
   while (byteOffset < returnLength)
   {
-      if (ptr->Relationship == RelationGroup)
-          groups += ptr->Group.MaximumGroupCount;
+      if (ptr->Relationship == RelationNumaNode)
+          nodes++;
 
       else if (ptr->Relationship == RelationProcessorCore)
       {
@@ -546,23 +547,23 @@ int best_group(size_t idx) {
 
   free(buffer);
 
-  std::vector<int> core_groups;
+  std::vector<int> groups;
 
-  // Run as many threads as possible on the same group until core limit is
-  // reached, then move on filling the next group.
-  for (int n = 0; n < groups; n++)
-      for (int i = 0; i < cores / groups; i++)
-          core_groups.push_back(n);
+  // Run as many threads as possible on the same node until core limit is
+  // reached, then move on filling the next node.
+  for (int n = 0; n < nodes; n++)
+      for (int i = 0; i < cores / nodes; i++)
+          groups.push_back(n);
 
   // In case a core has more than one logical processor (we assume 2) and we
   // have still threads to allocate, then spread them evenly across available
-  // groups.
+  // nodes.
   for (int t = 0; t < threads - cores; t++)
-      core_groups.push_back(t % groups);
+      groups.push_back(t % nodes);
 
   // If we still have more threads than the total number of logical processors
   // then return -1 and let the OS to decide what to do.
-  return idx < core_groups.size() ? core_groups[idx] : -1;
+  return idx < groups.size() ? groups[idx] : -1;
 }
 
 
@@ -571,22 +572,44 @@ int best_group(size_t idx) {
 void bindThisThread(size_t idx) {
 
   // Use only local variables to be thread-safe
-  int group = best_group(idx);
+  int node = best_node(idx);
 
-  if (group == -1)
+  if (node == -1)
       return;
 
   // Early exit if the needed API are not available at runtime
   HMODULE k32 = GetModuleHandle("Kernel32.dll");
   auto fun2 = (fun2_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMaskEx");
   auto fun3 = (fun3_t)(void(*)())GetProcAddress(k32, "SetThreadGroupAffinity");
+  auto fun4 = (fun4_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2");
 
   if (!fun2 || !fun3)
       return;
 
-  GROUP_AFFINITY affinity;
-  if (fun2(group, &affinity))
-      fun3(GetCurrentThread(), &affinity, nullptr);
+  if (!fun4) {
+      // GetNumaNodeProcessorMask2 is not exported by this Kernel32: fall back
+      // to the single group mask reported by GetNumaNodeProcessorMaskEx.
+      GROUP_AFFINITY affinity;
+      if (fun2(node, &affinity))
+          fun3(GetCurrentThread(), &affinity, nullptr);
+  } else {
+      // If a numa node has more than one processor group, we assume they are
+      // sized equal and we spread threads evenly across the groups.
+      USHORT elements = GetMaximumProcessorGroupCount();
+      USHORT returnedElements = 0;
+      GROUP_AFFINITY *affinity = (GROUP_AFFINITY*)malloc(
+          elements * sizeof(GROUP_AFFINITY));
+
+      // Leave the thread unbound if the buffer could not be allocated
+      if (!affinity)
+          return;
+
+      // Guard against an empty result: idx % 0 would be a division by zero
+      if (   fun4(node, affinity, elements, &returnedElements)
+          && returnedElements > 0)
+          fun3(GetCurrentThread(), &affinity[idx % returnedElements], nullptr);
+      free(affinity);
+  }
 }
 
 #endif