Revert and fix earlier Windows NUMA patch

[stockfish] / src / misc.cpp
diff --git a/src/misc.cpp b/src/misc.cpp

index bb3a641bca9ff0b858568245dd33dae9bf53a853..4dfa9f0ce2f4bf6572f825e0741db602da4d3d55 100644 (file)
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -36,6 +36,7 @@ typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP,
                        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
  typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY);
  typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
+typedef bool(*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
  }
  #endif
  
@@ -67,7 +68,7 @@ namespace {
  
  /// Version number. If Version is left empty, then compile date in the format
  /// DD-MM-YY and show in engine_info.
-const string Version = "14.1";
+const string Version = "";
  
  /// Our fancy logging facility. The trick here is to replace cin.rdbuf() and
  /// cout.rdbuf() with two Tie objects that tie cin and cout to a file stream. We
@@ -495,11 +496,11 @@ void bindThisThread(size_t) {}
  
  #else
  
-/// best_group() retrieves logical processor information using Windows specific
-/// API and returns the best group id for the thread with index idx. Original
+/// best_node() retrieves logical processor information using Windows specific
+/// API and returns the best node id for the thread with index idx. Original
  /// code from Texel by Peter Österlund.
  
-int best_group(size_t idx) {
+int best_node(size_t idx) {
  
    int threads = 0;
    int nodes = 0;
@@ -571,22 +572,35 @@ int best_group(size_t idx) {
  void bindThisThread(size_t idx) {
  
    // Use only local variables to be thread-safe
-  int group = best_group(idx);
+  int node = best_node(idx);
  
-  if (group == -1)
+  if (node == -1)
        return;
  
    // Early exit if the needed API are not available at runtime
    HMODULE k32 = GetModuleHandle("Kernel32.dll");
    auto fun2 = (fun2_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMaskEx");
    auto fun3 = (fun3_t)(void(*)())GetProcAddress(k32, "SetThreadGroupAffinity");
+  auto fun4 = (fun4_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2");
  
    if (!fun2 || !fun3)
        return;
  
-  GROUP_AFFINITY affinity;
-  if (fun2(group, &affinity))
-      fun3(GetCurrentThread(), &affinity, nullptr);
+  if (!fun4) {
+      GROUP_AFFINITY affinity;
+      if (fun2(node, &affinity))
+          fun3(GetCurrentThread(), &affinity, nullptr);
+  } else {
+      // If a NUMA node spans more than one processor group, we assume the
+      // groups are of equal size and spread threads evenly across them.
+      USHORT elements, returnedElements;
+      elements = GetMaximumProcessorGroupCount();
+      GROUP_AFFINITY *affinity = (GROUP_AFFINITY*)malloc(
+          elements * sizeof(GROUP_AFFINITY));
+      if (fun4(node, affinity, elements, &returnedElements))
+          fun3(GetCurrentThread(), &affinity[idx % returnedElements], nullptr);
+      free(affinity);
+  }
  }
  
  #endif