X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Fmisc.cpp;h=b46786dff08e0e98badfbbe1373a08d3979c8432;hb=b82d93ece484f833c994b40d9eddd959ba20ef92;hp=0af20e10d58f2b014f050ddcec8c23bd29cfc763;hpb=9048ac00db12a9ac48bff9b9eb145b30ff88d984;p=stockfish diff --git a/src/misc.cpp b/src/misc.cpp index 0af20e10..b46786df 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -36,6 +36,8 @@ typedef bool(*fun1_t)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD); typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY); typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY); +typedef bool(*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT); +typedef WORD(*fun5_t)(); } #endif @@ -495,14 +497,14 @@ void bindThisThread(size_t) {} #else -/// best_group() retrieves logical processor information using Windows specific -/// API and returns the best group id for the thread with index idx. Original +/// best_node() retrieves logical processor information using Windows specific +/// API and returns the best node id for the thread with index idx. Original /// code from Texel by Peter Österlund. -int best_group(size_t idx) { +int best_node(size_t idx) { int threads = 0; - int groups = 0; + int nodes = 0; int cores = 0; DWORD returnLength = 0; DWORD byteOffset = 0; @@ -513,7 +515,8 @@ int best_group(size_t idx) { if (!fun1) return -1; - // First call to get returnLength. We expect it to fail due to null buffer + // First call to GetLogicalProcessorInformationEx() to get returnLength. + // We expect the call to fail due to null buffer. if (fun1(RelationAll, nullptr, &returnLength)) return -1; @@ -521,7 +524,7 @@ int best_group(size_t idx) { SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *buffer, *ptr; ptr = buffer = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)malloc(returnLength); - // Second call, now we expect to succeed + // Second call to GetLogicalProcessorInformationEx(), now we expect to succeed if (!fun1(RelationAll, buffer, &returnLength)) { free(buffer); @@ -530,8 +533,8 @@ int best_group(size_t idx) { while (byteOffset < returnLength) { - if (ptr->Relationship == RelationGroup) - groups += ptr->Group.MaximumGroupCount; + if (ptr->Relationship == RelationNumaNode) + nodes++; else if (ptr->Relationship == RelationProcessorCore) { @@ -546,23 +549,23 @@ int best_group(size_t idx) { free(buffer); - std::vector core_groups; + std::vector groups; - // Run as many threads as possible on the same group until core limit is - // reached, then move on filling the next group. - for (int n = 0; n < groups; n++) - for (int i = 0; i < cores / groups; i++) - core_groups.push_back(n); + // Run as many threads as possible on the same node until core limit is + // reached, then move on filling the next node. + for (int n = 0; n < nodes; n++) + for (int i = 0; i < cores / nodes; i++) + groups.push_back(n); // In case a core has more than one logical processor (we assume 2) and we // have still threads to allocate, then spread them evenly across available - // groups. + // nodes. for (int t = 0; t < threads - cores; t++) - core_groups.push_back(t % groups); + groups.push_back(t % nodes); // If we still have more threads than the total number of logical processors // then return -1 and let the OS to decide what to do. - return idx < core_groups.size() ? core_groups[idx] : -1; + return idx < groups.size() ? groups[idx] : -1; } @@ -571,22 +574,38 @@ int best_group(size_t idx) { void bindThisThread(size_t idx) { // Use only local variables to be thread-safe - int group = best_group(idx); + int node = best_node(idx); - if (group == -1) + if (node == -1) return; // Early exit if the needed API are not available at runtime HMODULE k32 = GetModuleHandle("Kernel32.dll"); auto fun2 = (fun2_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMaskEx"); auto fun3 = (fun3_t)(void(*)())GetProcAddress(k32, "SetThreadGroupAffinity"); + auto fun4 = (fun4_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2"); + auto fun5 = (fun5_t)(void(*)())GetProcAddress(k32, "GetMaximumProcessorGroupCount"); if (!fun2 || !fun3) return; - GROUP_AFFINITY affinity; - if (fun2(group, &affinity)) - fun3(GetCurrentThread(), &affinity, nullptr); + if (!fun4 || !fun5) + { + GROUP_AFFINITY affinity; + if (fun2(node, &affinity)) // GetNumaNodeProcessorMaskEx + fun3(GetCurrentThread(), &affinity, nullptr); // SetThreadGroupAffinity + } + else + { + // If a numa node has more than one processor group, we assume they are + // sized equal and we spread threads evenly across the groups. + USHORT elements, returnedElements; + elements = fun5(); // GetMaximumProcessorGroupCount + GROUP_AFFINITY *affinity = (GROUP_AFFINITY*)malloc(elements * sizeof(GROUP_AFFINITY)); + if (fun4(node, affinity, elements, &returnedElements)) // GetNumaNodeProcessorMask2 + fun3(GetCurrentThread(), &affinity[idx % returnedElements], nullptr); // SetThreadGroupAffinity + free(affinity); + } } #endif