PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
typedef bool(*fun2_t)(USHORT, PGROUP_AFFINITY);
typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
+typedef bool(*fun4_t)(USHORT, PGROUP_AFFINITY, USHORT, PUSHORT);
}
#endif
#else
-/// best_group() retrieves logical processor information using Windows specific
-/// API and returns the best group id for the thread with index idx. Original
+/// best_node() retrieves logical processor information using Windows specific
+/// API and returns the best node id for the thread with index idx. Original
/// code from Texel by Peter Österlund.
-int best_group(size_t idx) {
+int best_node(size_t idx) {
int threads = 0;
- int groups = 0;
+ int nodes = 0;
int cores = 0;
DWORD returnLength = 0;
DWORD byteOffset = 0;
while (byteOffset < returnLength)
{
- if (ptr->Relationship == RelationGroup)
- groups += ptr->Group.MaximumGroupCount;
+ if (ptr->Relationship == RelationNumaNode)
+ nodes++;
else if (ptr->Relationship == RelationProcessorCore)
{
free(buffer);
- std::vector<int> core_groups;
+ std::vector<int> groups;
- // Run as many threads as possible on the same group until core limit is
- // reached, then move on filling the next group.
- for (int n = 0; n < groups; n++)
- for (int i = 0; i < cores / groups; i++)
- core_groups.push_back(n);
+ // Run as many threads as possible on the same node until core limit is
+ // reached, then move on filling the next node.
+ for (int n = 0; n < nodes; n++)
+ for (int i = 0; i < cores / nodes; i++)
+ groups.push_back(n);
// In case a core has more than one logical processor (we assume 2) and we
// have still threads to allocate, then spread them evenly across available
- // groups.
+ // nodes.
for (int t = 0; t < threads - cores; t++)
- core_groups.push_back(t % groups);
+ groups.push_back(t % nodes);
// If we still have more threads than the total number of logical processors
// then return -1 and let the OS to decide what to do.
- return idx < core_groups.size() ? core_groups[idx] : -1;
+ return idx < groups.size() ? groups[idx] : -1;
}
void bindThisThread(size_t idx) {
// Use only local variables to be thread-safe
- int group = best_group(idx);
+ int node = best_node(idx);
- if (group == -1)
+ if (node == -1)
return;
// Early exit if the needed API are not available at runtime
HMODULE k32 = GetModuleHandle("Kernel32.dll");
auto fun2 = (fun2_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMaskEx");
auto fun3 = (fun3_t)(void(*)())GetProcAddress(k32, "SetThreadGroupAffinity");
+ auto fun4 = (fun4_t)(void(*)())GetProcAddress(k32, "GetNumaNodeProcessorMask2");
if (!fun2 || !fun3)
return;
- GROUP_AFFINITY affinity;
- if (fun2(group, &affinity))
- fun3(GetCurrentThread(), &affinity, nullptr);
+ if (!fun4) {
+ GROUP_AFFINITY affinity;
+ if (fun2(node, &affinity))
+ fun3(GetCurrentThread(), &affinity, nullptr);
+ } else {
+        // If a numa node has more than one processor group, we assume they are
+        // equally sized, and we spread threads evenly across the groups.
+ USHORT elements, returnedElements;
+ elements = GetMaximumProcessorGroupCount();
+ GROUP_AFFINITY *affinity = (GROUP_AFFINITY*)malloc(
+ elements * sizeof(GROUP_AFFINITY));
+ if (fun4(node, affinity, elements, &returnedElements))
+ fun3(GetCurrentThread(), &affinity[idx % returnedElements], nullptr);
+ free(affinity);
+ }
}
#endif