From 39437f4e55aaa26ef9f0d5a1c762e560e9ffde32 Mon Sep 17 00:00:00 2001
From: Sami Kiminki <skiminki@users.noreply.github.com>
Date: Sat, 21 Dec 2019 21:41:42 +0200
Subject: [PATCH] Advise the kernel to use huge pages (Linux)

Align the TT allocation to a 2 MB boundary to make it huge-page friendly, and
advise the kernel to use huge pages.

Benchmarks on my i7-8700K (6C/12T) box: (3 runs per bench per config)

                    vanilla (nps)               hugepages (nps)              avg
==================================================================================
bench             | 3012490  3024364  3036331   3071052  3067544  3071052    +1.5%
bench 16 12 20    | 19237932 19050166 19085315  19266346 19207025 19548758   +1.1%
bench 16384 12 20 | 18182313 18371581 18336838  19381275 19738012 19620225   +7.0%

On my box, huge pages have a significant perf impact when using a big
hash size. They also speed up TT initialization big time:

                                  vanilla (s)  huge pages (s)  speed-up
=======================================================================
time stockfish bench 16384 1 1  | 5.37         1.48            3.6x

In practice, transparent huge pages with auto-defrag may already be enabled
system-wide, in which case this patch has no effect. Whether that is the
case depends on the values in /sys/kernel/mm/transparent_hugepage/enabled
and /sys/kernel/mm/transparent_hugepage/defrag.

closes https://github.com/official-stockfish/Stockfish/pull/2463

No functional change
---
 src/misc.cpp | 36 +++++++++++++++++++++++++++++++++++-
 src/misc.h   |  1 +
 src/tt.cpp   |  6 ++----
 src/tt.h     | 16 +++++++---------
 4 files changed, 45 insertions(+), 14 deletions(-)
diff --git a/src/misc.cpp b/src/misc.cpp
index 484d0b21..0bae9f1e 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -47,6 +47,11 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
 #include <sstream>
 #include <vector>
 
+#ifdef __linux__
+#include <stdlib.h>
+#include <sys/mman.h>
+#endif
+
 #include "misc.h"
 #include "thread.h"
 
@@ -190,7 +195,7 @@ const std::string compiler_info() {
      compiler += "(unknown version)";
   #endif
 
-  #if defined(__APPLE__) 
+  #if defined(__APPLE__)
      compiler += " on Apple";
   #elif defined(__CYGWIN__)
      compiler += " on Cygwin";
@@ -288,6 +293,35 @@ void prefetch(void* addr) {
 
 #endif
 
+
+/// aligned_ttmem_alloc() returns suitably aligned memory and, where possible, asks for large pages.
+/// The returned pointer is the aligned one, while the pointer stored in *mem is the one that must be passed to free().
+/// With C++17 some of this functionality can be simplified.
+#ifdef __linux__
+
+void* aligned_ttmem_alloc(size_t allocSize, void** mem) {
+  // Linux variant: *mem and the returned pointer are the same 2MB-aligned address (both null if allocation fails).
+  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
+  size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // round up to a multiple of alignment
+  *mem = aligned_alloc(alignment, size);
+  madvise(*mem, allocSize, MADV_HUGEPAGE); // hint only: result is ignored, and no null check precedes the call
+  return *mem;
+}
+
+#else
+
+void* aligned_ttmem_alloc(size_t allocSize, void** mem) {
+  // Portable variant: over-allocate with malloc and return the first 64-byte-aligned address inside that buffer.
+  constexpr size_t alignment = 64; // assumed cache line size
+  size_t size = allocSize + alignment - 1; // spare bytes so an aligned start of allocSize bytes always fits
+  *mem = malloc(size);
+  void* ret = reinterpret_cast<void*>((uintptr_t(*mem) + alignment - 1) & ~uintptr_t(alignment - 1)); // round *mem up to a multiple of alignment (a null *mem stays null)
+  return ret;
+}
+
+#endif
+
+
 namespace WinProcGroup {
 
 #ifndef _WIN32
diff --git a/src/misc.h b/src/misc.h
index b11c5aa8..45d9951a 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -33,6 +33,7 @@ const std::string engine_info(bool to_uci = false);
 const std::string compiler_info();
 void prefetch(void* addr);
 void start_logger(const std::string& fname);
+void* aligned_ttmem_alloc(size_t size, void** mem);
 
 void dbg_hit_on(bool b);
 void dbg_hit_on(bool c, bool b);
diff --git a/src/tt.cpp b/src/tt.cpp
index 0b4a59de..080d3a6b 100644
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -63,11 +63,10 @@ void TranspositionTable::resize(size_t mbSize) {
 
   Threads.main()->wait_for_search_finished();
 
-  clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
-
   free(mem);
-  mem = malloc(clusterCount * sizeof(Cluster) + CacheLineSize - 1);
 
+  clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
+  table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), &mem));
   if (!mem)
   {
       std::cerr << "Failed to allocate " << mbSize
@@ -75,7 +74,6 @@ void TranspositionTable::resize(size_t mbSize) {
       exit(EXIT_FAILURE);
   }
 
-  table = (Cluster*)((uintptr_t(mem) + CacheLineSize - 1) & ~(CacheLineSize - 1));
   clear();
 }
 
diff --git a/src/tt.h b/src/tt.h
index 98b054d3..142afd90 100644
--- a/src/tt.h
+++ b/src/tt.h
@@ -57,24 +57,22 @@ private:
 };
 
 
-/// A TranspositionTable consists of a power of 2 number of clusters and each
-/// cluster consists of ClusterSize number of TTEntry. Each non-empty entry
-/// contains information of exactly one position. The size of a cluster should
-/// divide the size of a cache line size, to ensure that clusters never cross
-/// cache lines. This ensures best cache performance, as the cacheline is
-/// prefetched, as soon as possible.
+/// A TranspositionTable is an array of Cluster, of size clusterCount. Each
+/// cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry
+/// contains information on exactly one position. The size of a Cluster should
+/// divide the size of a cache line for best performance,
+/// as the cacheline is prefetched when possible.
 
 class TranspositionTable {
 
-  static constexpr int CacheLineSize = 64;
   static constexpr int ClusterSize = 3;
 
   struct Cluster {
     TTEntry entry[ClusterSize];
-    char padding[2]; // Align to a divisor of the cache line size
+    char padding[2]; // Pad to 32 bytes
   };
 
-  static_assert(CacheLineSize % sizeof(Cluster) == 0, "Cluster size incorrect");
+  static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
 
 public:
  ~TranspositionTable() { free(mem); }
-- 
2.39.2