Align the TT allocation by 2M to make it huge page friendly and advise the
kernel to use huge pages.
Benchmarks on my i7-8700K (6C/12T) box, 3 runs per bench per configuration
(numbers are nodes per second):
vanilla (nps) | huge pages (nps) | avg. speed-up
==================================================================================
bench |
3012490 3024364 3036331 3071052 3067544 3071052 +1.5%
bench 16 12 20 |
19237932 19050166 19085315 19266346 19207025 19548758 +1.1%
bench 16384 12 20 |
18182313 18371581 18336838 19381275 19738012 19620225 +7.0%
On my box, huge pages have a significant performance impact when using a big
hash size. They also speed up TT initialization considerably:
vanilla (s) huge pages (s) speed-up
=======================================================================
time stockfish bench 16384 1 1 | 5.37 1.48 3.6x
In practice, transparent huge pages with automatic defragmentation may already
be enabled system-wide, in which case this patch has no effect. Whether they
are depends on the values in /sys/kernel/mm/transparent_hugepage/enabled
and /sys/kernel/mm/transparent_hugepage/defrag.
closes https://github.com/official-stockfish/Stockfish/pull/2463
No functional change
#include <sstream>
#include <vector>
#include <sstream>
#include <vector>
+#ifdef __linux__
+#include <stdlib.h>
+#include <sys/mman.h>
+#endif
+
#include "misc.h"
#include "thread.h"
#include "misc.h"
#include "thread.h"
compiler += "(unknown version)";
#endif
compiler += "(unknown version)";
#endif
compiler += " on Apple";
#elif defined(__CYGWIN__)
compiler += " on Cygwin";
compiler += " on Apple";
#elif defined(__CYGWIN__)
compiler += " on Cygwin";
+
+/// aligned_ttmem_alloc() returns suitably aligned memory, and if possible uses large pages.
+/// The returned pointer is the aligned one, while the mem argument is the one that needs
+/// to be passed to free(). With C++17 some of this functionality could be simplified.
+#ifdef __linux__
+
/// aligned_ttmem_alloc() allocates memory for the transposition table, aligned
/// to 2MB so the region is huge-page friendly, and advises the kernel to back
/// it with huge pages (MADV_HUGEPAGE). On return, *mem holds the pointer that
/// must later be passed to free(); the returned pointer is the aligned block
/// (here identical to *mem, since aligned_alloc already aligns it).
/// Returns nullptr (and sets *mem to nullptr) if the allocation fails.
void* aligned_ttmem_alloc(size_t allocSize, void** mem) {

  constexpr size_t alignment = 2 * 1024 * 1024;                       // assumed 2MB page size
  size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // round up to a multiple of alignment
  *mem = aligned_alloc(alignment, size);
  if (*mem)
      madvise(*mem, size, MADV_HUGEPAGE); // advise huge pages for the whole allocated region
  return *mem;
}
+
+#else
+
/// aligned_ttmem_alloc() allocates memory for the transposition table on
/// platforms without huge-page support. It over-allocates by one cache line
/// so a 64-byte-aligned pointer can always be carved out of the block.
/// On return, *mem holds the raw pointer that must later be passed to free();
/// the returned pointer is the aligned address inside that block.
void* aligned_ttmem_alloc(size_t allocSize, void** mem) {

  constexpr size_t alignment = 64;                   // assumed cache line size
  *mem = malloc(allocSize + alignment - 1);          // extra space guarantees room to align
  uintptr_t raw = reinterpret_cast<uintptr_t>(*mem);
  uintptr_t aligned = (raw + alignment - 1) & ~static_cast<uintptr_t>(alignment - 1);
  return reinterpret_cast<void*>(aligned);
}
+
+#endif
+
+
namespace WinProcGroup {
#ifndef _WIN32
namespace WinProcGroup {
#ifndef _WIN32
const std::string compiler_info();
void prefetch(void* addr);
void start_logger(const std::string& fname);
const std::string compiler_info();
void prefetch(void* addr);
void start_logger(const std::string& fname);
+void* aligned_ttmem_alloc(size_t size, void** mem);
void dbg_hit_on(bool b);
void dbg_hit_on(bool c, bool b);
void dbg_hit_on(bool b);
void dbg_hit_on(bool c, bool b);
Threads.main()->wait_for_search_finished();
Threads.main()->wait_for_search_finished();
- clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
-
- mem = malloc(clusterCount * sizeof(Cluster) + CacheLineSize - 1);
+ clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
+ table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), &mem));
if (!mem)
{
std::cerr << "Failed to allocate " << mbSize
if (!mem)
{
std::cerr << "Failed to allocate " << mbSize
- table = (Cluster*)((uintptr_t(mem) + CacheLineSize - 1) & ~(CacheLineSize - 1));
-/// A TranspositionTable consists of a power of 2 number of clusters and each
-/// cluster consists of ClusterSize number of TTEntry. Each non-empty entry
-/// contains information of exactly one position. The size of a cluster should
-/// divide the size of a cache line size, to ensure that clusters never cross
-/// cache lines. This ensures best cache performance, as the cacheline is
-/// prefetched, as soon as possible.
+/// A TranspositionTable is an array of Cluster, of size clusterCount. Each
+/// cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry
+/// contains information on exactly one position. The size of a Cluster should
+/// divide the size of a cache line for best performance,
+/// as the cacheline is prefetched when possible.
class TranspositionTable {
class TranspositionTable {
- static constexpr int CacheLineSize = 64;
static constexpr int ClusterSize = 3;
struct Cluster {
TTEntry entry[ClusterSize];
static constexpr int ClusterSize = 3;
struct Cluster {
TTEntry entry[ClusterSize];
- char padding[2]; // Align to a divisor of the cache line size
+ char padding[2]; // Pad to 32 bytes
- static_assert(CacheLineSize % sizeof(Cluster) == 0, "Cluster size incorrect");
+ static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
public:
~TranspositionTable() { free(mem); }
public:
~TranspositionTable() { free(mem); }