From: Sami Kiminki Date: Mon, 4 May 2020 17:49:27 +0000 (+0300) Subject: Add support for Windows large pages X-Git-Url: https://git.sesse.net/?p=stockfish;a=commitdiff_plain;h=d4763424d2728fe2dfd0a6fe747666feb6a2fdbb;hp=86ee4eb84d54dac3f9de5b455ba41909c7722173 Add support for Windows large pages for users that set the needed privilege "Lock Pages in Memory", large pages will be automatically enabled (see Readme.md). This expert setting might improve speed, 5% - 30%, depending on the hardware, the number of threads and hash size. More for large hashes, large number of threads and NUMA. If the operating system cannot allocate large pages (easier after a reboot), default allocation is used automatically. The engine log provides details. closes https://github.com/official-stockfish/Stockfish/pull/2656 fixes https://github.com/official-stockfish/Stockfish/issues/2619 No functional change --- diff --git a/Readme.md b/Readme.md index a759eff6..35ff095d 100644 --- a/Readme.md +++ b/Readme.md @@ -42,7 +42,7 @@ Currently, Stockfish has the following UCI options: this equal to the number of CPU cores available. * #### Hash - The size of the hash table in MB. + The size of the hash table in MB. It is recommended to set Hash after setting Threads. * #### Clear Hash Clear the hash table. @@ -138,6 +138,30 @@ more compact than Nalimov tablebases, while still storing all information needed for optimal play and in addition being able to take into account the 50-move rule. +## Large Pages + +Stockfish supports large pages on Linux and Windows. Large pages make +the hash access more efficient, improving the engine speed, especially +on large hash sizes. Typical increases are 5..10% in terms of nps, but +speed increases up to 30% have been measured. The support is +automatic. Stockfish attempts to use large pages when available and +will fall back to regular memory allocation when this is not the case. 
+ +### Support on Linux + +Large page support on Linux is obtained by the Linux kernel +transparent huge pages functionality. Typically, transparent huge pages +are already enabled and no configuration is needed. + +### Support on Windows + +The use of large pages requires "Lock Pages in Memory" privilege. See +[Enable the Lock Pages in Memory Option (Windows)](https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows) +on how to enable this privilege. Logout/login may be needed +afterwards. Due to memory fragmentation, it may not always be +possible to allocate large pages even when enabled. A reboot +might alleviate this problem. To determine whether large pages +are in use, see the engine log. ## Compiling Stockfish yourself from the sources diff --git a/src/main.cpp b/src/main.cpp index 6eeda66d..c7cf2c6f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -49,6 +49,7 @@ int main(int argc, char* argv[]) { UCI::loop(argc, argv); + TT.resize(0); Threads.set(0); return 0; } diff --git a/src/misc.cpp b/src/misc.cpp index 94681008..b1c0feeb 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -309,6 +309,69 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) { return mem; } +#elif defined(_WIN64) + +static void* aligned_ttmem_alloc_large_pages(size_t allocSize) { + + HANDLE hProcessToken { }; + LUID luid { }; + void* mem = nullptr; + + const size_t largePageSize = GetLargePageMinimum(); + if (!largePageSize) + return nullptr; + + // We need SeLockMemoryPrivilege, so try to enable it for the process + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hProcessToken)) + return nullptr; + + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &luid)) + { + TOKEN_PRIVILEGES tp { }; + TOKEN_PRIVILEGES prevTp { }; + DWORD prevTpLen = 0; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Luid = luid; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + // Try to enable 
SeLockMemoryPrivilege. Note that even if AdjustTokenPrivileges() succeeds, + // we still need to query GetLastError() to ensure that the privileges were actually obtained... + if (AdjustTokenPrivileges( + hProcessToken, FALSE, &tp, sizeof(TOKEN_PRIVILEGES), &prevTp, &prevTpLen) && + GetLastError() == ERROR_SUCCESS) + { + // round up size to full pages and allocate + allocSize = (allocSize + largePageSize - 1) & ~size_t(largePageSize - 1); + mem = VirtualAlloc( + NULL, allocSize, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE); + + // privilege no longer needed, restore previous state + AdjustTokenPrivileges(hProcessToken, FALSE, &prevTp, 0, NULL, NULL); + } + } + + CloseHandle(hProcessToken); + + return mem; +} + +void* aligned_ttmem_alloc(size_t allocSize, void*& mem) { + + // try to allocate large pages + mem = aligned_ttmem_alloc_large_pages(allocSize); + if (mem) + sync_cout << "info string Hash table allocation: Windows large pages used." << sync_endl; + else + sync_cout << "info string Hash table allocation: Windows large pages not used." << sync_endl; + + // fall back to regular, page aligned, allocation if necessary + if (!mem) + mem = VirtualAlloc(NULL, allocSize, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + + return mem; +} + #else void* aligned_ttmem_alloc(size_t allocSize, void*& mem) { @@ -322,6 +385,28 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) { #endif +/// aligned_ttmem_free will free the previously allocated ttmem +#if defined(_WIN64) + +void aligned_ttmem_free(void* mem) { + + if (!VirtualFree(mem, 0, MEM_RELEASE)) + { + DWORD err = GetLastError(); + std::cerr << "Failed to free transposition table. 
Error code: 0x" << + std::hex << err << std::dec << std::endl; + exit(EXIT_FAILURE); + } +} + +#else + +void aligned_ttmem_free(void *mem) { + free(mem); +} + +#endif + namespace WinProcGroup { diff --git a/src/misc.h b/src/misc.h index e0e0e98b..9d53c2da 100644 --- a/src/misc.h +++ b/src/misc.h @@ -34,6 +34,7 @@ const std::string compiler_info(); void prefetch(void* addr); void start_logger(const std::string& fname); void* aligned_ttmem_alloc(size_t size, void*& mem); +void aligned_ttmem_free(void* mem); void dbg_hit_on(bool b); void dbg_hit_on(bool c, bool b); diff --git a/src/tt.cpp b/src/tt.cpp index 7e95a2a4..6ee63138 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -63,7 +63,14 @@ void TranspositionTable::resize(size_t mbSize) { Threads.main()->wait_for_search_finished(); - free(mem); + if (mem) + aligned_ttmem_free(mem); + + if (!mbSize) + { + mem = nullptr; + return; + } clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster); table = static_cast(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));