Advise the kernel to use huge pages (Linux)

author Sami Kiminki <skiminki@users.noreply.github.com>

Sat, 21 Dec 2019 19:41:42 +0000 (21:41 +0200)

committer Joost VandeVondele <Joost.VandeVondele@gmail.com>

Mon, 27 Jan 2020 10:16:10 +0000 (11:16 +0100)
author Sami Kiminki <skiminki@users.noreply.github.com>
Sat, 21 Dec 2019 19:41:42 +0000 (21:41 +0200)
committer Joost VandeVondele <Joost.VandeVondele@gmail.com>
Mon, 27 Jan 2020 10:16:10 +0000 (11:16 +0100)
diff --git a/src/misc.cpp b/src/misc.cpp

index 484d0b210cb6ddeeed9ac315f6bc930b063b31f1..0bae9f1e0e9b6cf0a24364c46a1985172f481dc6 100644 (file)
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -47,6 +47,11 @@ typedef bool(*fun3_t)(HANDLE, CONST GROUP_AFFINITY*, PGROUP_AFFINITY);
  #include <sstream>
  #include <vector>
  
+#ifdef __linux__
+#include <stdlib.h>
+#include <sys/mman.h>
+#endif
+
  #include "misc.h"
  #include "thread.h"
  
@@ -190,7 +195,7 @@ const std::string compiler_info() {
       compiler += "(unknown version)";
    #endif
  
-  #if defined(__APPLE__) 
+  #if defined(__APPLE__)
       compiler += " on Apple";
    #elif defined(__CYGWIN__)
       compiler += " on Cygwin";
@@ -288,6 +293,35 @@ void prefetch(void* addr) {
  
  #endif
  
+
+/// aligned_ttmem_alloc() returns suitably aligned memory and, if possible, uses large pages.
+/// The returned pointer is the aligned one, while the pointer stored in mem is the one that must be passed to free().
+/// With C++17 some of this functionality can be simplified.
+#ifdef __linux__
+
+void* aligned_ttmem_alloc(size_t allocSize, void** mem) {
+
+  constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page sizes
+  size_t size = ((allocSize + alignment - 1) / alignment) * alignment; // multiple of alignment
+  *mem = aligned_alloc(alignment, size);
+  madvise(*mem, allocSize, MADV_HUGEPAGE);
+  return *mem;
+}
+
+#else
+
+void* aligned_ttmem_alloc(size_t allocSize, void** mem) {
+
+  constexpr size_t alignment = 64; // assumed cache line size
+  size_t size = allocSize + alignment - 1; // allocate some extra space
+  *mem = malloc(size);
+  void* ret = reinterpret_cast<void*>((uintptr_t(*mem) + alignment - 1) & ~uintptr_t(alignment - 1));
+  return ret;
+}
+
+#endif
+
+
  namespace WinProcGroup {
  
  #ifndef _WIN32
diff --git a/src/misc.h b/src/misc.h

index b11c5aa843c5519a68699f115860419ca85f9eb7..45d9951a72aa4edfb0997a5102dd2bfbf841648f 100644 (file)
--- a/src/misc.h
+++ b/src/misc.h
@@ -33,6 +33,7 @@ const std::string engine_info(bool to_uci = false);
  const std::string compiler_info();
  void prefetch(void* addr);
  void start_logger(const std::string& fname);
+void* aligned_ttmem_alloc(size_t size, void** mem);
  
  void dbg_hit_on(bool b);
  void dbg_hit_on(bool c, bool b);
diff --git a/src/tt.cpp b/src/tt.cpp

index 0b4a59de55915962e7ed9409ae462be2556366f3..080d3a6bd480006e4c821661d2ec9800a4b7523b 100644 (file)
--- a/src/tt.cpp
+++ b/src/tt.cpp
@@ -63,11 +63,10 @@ void TranspositionTable::resize(size_t mbSize) {
  
    Threads.main()->wait_for_search_finished();
  
-  clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
-
    free(mem);
-  mem = malloc(clusterCount * sizeof(Cluster) + CacheLineSize - 1);
  
+  clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
+  table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), &mem));
    if (!mem)
    {
        std::cerr << "Failed to allocate " << mbSize
@@ -75,7 +74,6 @@ void TranspositionTable::resize(size_t mbSize) {
        exit(EXIT_FAILURE);
    }
  
-  table = (Cluster*)((uintptr_t(mem) + CacheLineSize - 1) & ~(CacheLineSize - 1));
    clear();
  }
  
diff --git a/src/tt.h b/src/tt.h

index 98b054d397207525a45978e6bae6a15055e29181..142afd90e56ca63e1b79691f27b6c51a5a19f966 100644 (file)
--- a/src/tt.h
+++ b/src/tt.h
@@ -57,24 +57,22 @@ private:
  };
  
  
-/// A TranspositionTable consists of a power of 2 number of clusters and each
-/// cluster consists of ClusterSize number of TTEntry. Each non-empty entry
-/// contains information of exactly one position. The size of a cluster should
-/// divide the size of a cache line size, to ensure that clusters never cross
-/// cache lines. This ensures best cache performance, as the cacheline is
-/// prefetched, as soon as possible.
+/// A TranspositionTable is an array of Cluster, of size clusterCount. Each
+/// cluster consists of ClusterSize number of TTEntry. Each non-empty TTEntry
+/// contains information on exactly one position. The size of a Cluster should
+/// divide the size of a cache line for best performance,
+/// as the cache line is prefetched when possible.
  
  class TranspositionTable {
  
-  static constexpr int CacheLineSize = 64;
    static constexpr int ClusterSize = 3;
  
    struct Cluster {
      TTEntry entry[ClusterSize];
-    char padding[2]; // Align to a divisor of the cache line size
+    char padding[2]; // Pad to 32 bytes
    };
  
-  static_assert(CacheLineSize % sizeof(Cluster) == 0, "Cluster size incorrect");
+  static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
  
  public:
   ~TranspositionTable() { free(mem); }
author	Sami Kiminki <skiminki@users.noreply.github.com>
	Sat, 21 Dec 2019 19:41:42 +0000 (21:41 +0200)
committer	Joost VandeVondele <Joost.VandeVondele@gmail.com>
	Mon, 27 Jan 2020 10:16:10 +0000 (11:16 +0100)
src/misc.cpp		patch | blob | history
src/misc.h		patch | blob | history
src/tt.cpp		patch | blob | history
src/tt.h		patch | blob | history