The Windows API functions needed for processor groups may be missing from
older Windows versions, so instead of calling them directly (forcing the
linker to resolve the calls at link time), try to load them at runtime. To do
this we first need to define the corresponding function pointer types.
Also, don't interfere with running fishtest on NUMA hardware under Windows:
avoid having all single-threaded Stockfish processes run on the same node.
No functional change.
#ifdef _WIN32
#if _WIN32_WINNT < 0x0601
#undef _WIN32_WINNT
#ifdef _WIN32
#if _WIN32_WINNT < 0x0601
#undef _WIN32_WINNT
-#define _WIN32_WINNT 0x0601 // Force to include newest API (Win 7 or later)
+#define _WIN32_WINNT 0x0601 // Force to include needed API prototypes
-#include <windows.h> // For processor groups
+#include <windows.h>
+// The Windows API functions needed for processor groups may be missing from
+// older Windows versions, so instead of calling them directly (forcing the
+// linker to resolve the calls at link time), try to load them at runtime. To
+// do this we first need to define the corresponding function pointer types.
+extern "C" {
+typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP,
+ PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
+typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY);
+typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
+}
#endif
#include <fstream>
#endif
#include <fstream>
DWORD returnLength = 0;
DWORD byteOffset = 0;
DWORD returnLength = 0;
DWORD byteOffset = 0;
- // Early exit if the needed API are not available at runtime
+ // Early exit if the needed API is not available at runtime
HMODULE k32 = GetModuleHandle("Kernel32.dll");
HMODULE k32 = GetModuleHandle("Kernel32.dll");
- if ( !GetProcAddress(k32, "GetLogicalProcessorInformationEx")
- || !GetProcAddress(k32, "GetNumaNodeProcessorMaskEx")
- || !GetProcAddress(k32, "SetThreadGroupAffinity"))
+ auto fun1 = (fun1_t)GetProcAddress(k32, "GetLogicalProcessorInformationEx");
+ if (!fun1)
return -1;
// First call to get returnLength. We expect it to fail due to null buffer
return -1;
// First call to get returnLength. We expect it to fail due to null buffer
- if (GetLogicalProcessorInformationEx(RelationAll, nullptr, &returnLength))
+ if (fun1(RelationAll, nullptr, &returnLength))
return -1;
// Once we know returnLength, allocate the buffer
return -1;
// Once we know returnLength, allocate the buffer
ptr = buffer = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)malloc(returnLength);
// Second call, now we expect to succeed
ptr = buffer = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)malloc(returnLength);
// Second call, now we expect to succeed
- if (!GetLogicalProcessorInformationEx(RelationAll, buffer, &returnLength))
+ if (!fun1(RelationAll, buffer, &returnLength))
{
free(buffer);
return -1;
{
free(buffer);
return -1;
void bindThisThread(size_t idx) {
void bindThisThread(size_t idx) {
- // Use a local variable instead of a static: slower but thread-safe
+ // If the OS already scheduled us on a group other than 0, don't overwrite
+ // that choice: we may be one of many single-threaded processes running on
+ // some Windows NUMA hardware, for instance in fishtest. To keep it simple,
+ // just check whether the number of running threads is below a threshold; in
+ // that case all this NUMA machinery is not needed.
+ if (Threads.size() < 8)
+ return;
+
+ // Use only local variables to be thread-safe
int group = get_group(idx);
if (group == -1)
return;
int group = get_group(idx);
if (group == -1)
return;
- GROUP_AFFINITY mask;
- if (GetNumaNodeProcessorMaskEx(group, &mask))
- SetThreadGroupAffinity(GetCurrentThread(), &mask, nullptr);
+ // Early exit if the needed API is not available at runtime
+ HMODULE k32 = GetModuleHandle("Kernel32.dll");
+ auto fun2 = (fun2_t)GetProcAddress(k32, "GetNumaNodeProcessorMaskEx");
+ auto fun3 = (fun3_t)GetProcAddress(k32, "SetThreadGroupAffinity");
+
+ if (!fun2 || !fun3)
+ return;
+
+ GROUP_AFFINITY affinity;
+ if (fun2(group, &affinity))
+ fun3(GetCurrentThread(), &affinity, nullptr);