Manually align arrays on the stack

author Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>

Tue, 3 Nov 2020 10:23:35 +0000 (11:23 +0100)

committer Joost VandeVondele <Joost.VandeVondele@gmail.com>

Wed, 4 Nov 2020 18:52:42 +0000 (19:52 +0100)
author Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Tue, 3 Nov 2020 10:23:35 +0000 (11:23 +0100)
committer Joost VandeVondele <Joost.VandeVondele@gmail.com>
Wed, 4 Nov 2020 18:52:42 +0000 (19:52 +0100)
diff --git a/src/misc.h b/src/misc.h

index bc48f303a8832cf867991f973976ba9423ed003e..682ef81641c041869fb255e6732a53b6b651da9c 100644 (file)
--- a/src/misc.h
+++ b/src/misc.h
@@ -24,6 +24,7 @@
  #include <ostream>
  #include <string>
  #include <vector>
+#include <cstdint>
  
  #include "types.h"
  
@@ -63,6 +64,17 @@ std::ostream& operator<<(std::ostream&, SyncCout);
  #define sync_cout std::cout << IO_LOCK
  #define sync_endl std::endl << IO_UNLOCK
  
+// `ptr` must point to an array of size at least
+// `sizeof(T) * N + alignment` bytes, where `N` is the
+// number of elements in the array.
+template <uintptr_t Alignment, typename T>
+T* align_ptr_up(T* ptr)
+{
+  static_assert(alignof(T) < Alignment);
+
+  const uintptr_t ptrint = reinterpret_cast<uintptr_t>(reinterpret_cast<char*>(ptr));
+  return reinterpret_cast<T*>(reinterpret_cast<char*>((ptrint + (Alignment - 1)) / Alignment * Alignment));
+}
  
  /// xorshift64star Pseudo-Random Number Generator
  /// This class is based on original code written and dedicated
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp

index b5dcd992fa3a70ad557f2ce2cfdd32f364e69700..b0ed7d2f5a4f78e0092603edf4756402c69de2ae 100644 (file)
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -25,6 +25,7 @@
  #include "../position.h"
  #include "../misc.h"
  #include "../uci.h"
+#include "../types.h"
  
  #include "evaluate_nnue.h"
  
@@ -126,10 +127,28 @@ namespace Eval::NNUE {
    // Evaluation function. Perform differential calculation.
    Value evaluate(const Position& pos) {
  
-    alignas(kCacheLineSize) TransformedFeatureType
-        transformed_features[FeatureTransformer::kBufferSize];
+    // We manually align the arrays on the stack because with gcc < 9.3
+    // overaligning stack variables with alignas() doesn't work correctly.
+
+    constexpr uint64_t alignment = kCacheLineSize;
+
+#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
+    TransformedFeatureType transformed_features_unaligned[
+      FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
+    char buffer_unaligned[Network::kBufferSize + alignment];
+
+    auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
+    auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
+#else
+    alignas(alignment)
+      TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
+    alignas(alignment) char buffer[Network::kBufferSize];
+#endif
+
+    ASSERT_ALIGNED(transformed_features, alignment);
+    ASSERT_ALIGNED(buffer, alignment);
+
      feature_transformer->Transform(pos, transformed_features);
-    alignas(kCacheLineSize) char buffer[Network::kBufferSize];
      const auto output = network->Propagate(transformed_features, buffer);
  
      return static_cast<Value>(output[0] / FV_SCALE);
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h

index 44d8a7def4c92ff55f7fcf1150b9553b37a90458..7f6d67bfe3b9cf5a104c64e8bdddcbdce154c2be 100644 (file)
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -74,12 +74,12 @@ namespace Eval::NNUE::Layers {
        const auto out = reinterpret_cast<__m256i*>(output);
        for (IndexType i = 0; i < kNumChunks; ++i) {
          const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 0]),
-            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+            _mm256_load_si256(&in[i * 4 + 0]),
+            _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
          const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 2]),
-            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+            _mm256_load_si256(&in[i * 4 + 2]),
+            _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+        _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
              _mm256_packs_epi16(words0, words1), kZero), kOffsets));
        }
        constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h

index 8afea1866abfb2fc7f120d5fceeecbc6750e30e4..a9664262a47f0232643d17f1f59b77c3971c6a03 100644 (file)
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -43,29 +43,6 @@
  #include <arm_neon.h>
  #endif
  
-// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
-//       compiled with older g++ crashes because the output memory is not aligned
-//       even though alignas is specified.
-#if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm256_loadA_si256  _mm256_loadu_si256
-#define _mm256_storeA_si256 _mm256_storeu_si256
-#else
-#define _mm256_loadA_si256  _mm256_load_si256
-#define _mm256_storeA_si256 _mm256_store_si256
-#endif
-#endif
-
-#if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm512_loadA_si512   _mm512_loadu_si512
-#define _mm512_storeA_si512  _mm512_storeu_si512
-#else
-#define _mm512_loadA_si512   _mm512_load_si512
-#define _mm512_storeA_si512  _mm512_store_si512
-#endif
-#endif
-
  namespace Eval::NNUE {
  
    // Version of the evaluation file
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h

index f145c848f96fb82349be3d78530f76b97918ab04..c3f012e412ec2415b726a469d080378b8049526d 100644 (file)
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -36,16 +36,16 @@ namespace Eval::NNUE {
  
    #ifdef USE_AVX512
    typedef __m512i vec_t;
-  #define vec_load(a) _mm512_loadA_si512(a)
-  #define vec_store(a,b) _mm512_storeA_si512(a,b)
+  #define vec_load(a) _mm512_load_si512(a)
+  #define vec_store(a,b) _mm512_store_si512(a,b)
    #define vec_add_16(a,b) _mm512_add_epi16(a,b)
    #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
    static constexpr IndexType kNumRegs = 8; // only 8 are needed
  
    #elif USE_AVX2
    typedef __m256i vec_t;
-  #define vec_load(a) _mm256_loadA_si256(a)
-  #define vec_store(a,b) _mm256_storeA_si256(a,b)
+  #define vec_load(a) _mm256_load_si256(a)
+  #define vec_store(a,b) _mm256_store_si256(a,b)
    #define vec_add_16(a,b) _mm256_add_epi16(a,b)
    #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
    static constexpr IndexType kNumRegs = 16;
@@ -157,11 +157,11 @@ namespace Eval::NNUE {
    #if defined(USE_AVX2)
          auto out = reinterpret_cast<__m256i*>(&output[offset]);
          for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m256i sum0 = _mm256_loadA_si256(
+          __m256i sum0 = _mm256_load_si256(
                &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
-          __m256i sum1 = _mm256_loadA_si256(
+          __m256i sum1 = _mm256_load_si256(
              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
                _mm256_packs_epi16(sum0, sum1), kZero), kControl));
          }
  
diff --git a/src/position.cpp b/src/position.cpp

index b707293dd730655d060e717d35703d3acf9f2322..5ce7da22894568a2e60c68cd2606fdc01e27845d 100644 (file)
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -77,6 +77,8 @@ std::ostream& operator<<(std::ostream& os, const Position& pos) {
        && !pos.can_castle(ANY_CASTLING))
    {
        StateInfo st;
+      ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
        Position p;
        p.set(pos.fen(), pos.is_chess960(), &st, pos.this_thread());
        Tablebases::ProbeState s1, s2;
@@ -1318,6 +1320,8 @@ bool Position::pos_is_ok() const {
                assert(0 && "pos_is_ok: Bitboards");
  
    StateInfo si = *st;
+  ASSERT_ALIGNED(&si, Eval::NNUE::kCacheLineSize);
+
    set_state(&si);
    if (std::memcmp(&si, st, sizeof(StateInfo)))
        assert(0 && "pos_is_ok: State");
diff --git a/src/search.cpp b/src/search.cpp

index 743449fa39c5b7e13cd81dc68cd44f90d53fdf41..12c32194eb95436726f0c484f7cdd2988a65b20e 100644 (file)
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -164,6 +164,8 @@ namespace {
    uint64_t perft(Position& pos, Depth depth) {
  
      StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
      uint64_t cnt, nodes = 0;
      const bool leaf = (depth == 2);
  
@@ -590,6 +592,8 @@ namespace {
  
      Move pv[MAX_PLY+1], capturesSearched[32], quietsSearched[64];
      StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
      TTEntry* tte;
      Key posKey;
      Move ttMove, move, excludedMove, bestMove;
@@ -1403,6 +1407,8 @@ moves_loop: // When in check, search starts from here
  
      Move pv[MAX_PLY+1];
      StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
      TTEntry* tte;
      Key posKey;
      Move ttMove, move, bestMove;
@@ -1898,6 +1904,8 @@ string UCI::pv(const Position& pos, Depth depth, Value alpha, Value beta) {
  bool RootMove::extract_ponder_from_tt(Position& pos) {
  
      StateInfo st;
+    ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
      bool ttHit;
  
      assert(pv.size() == 1);
diff --git a/src/types.h b/src/types.h

index 5873c698f738d6bd5bc511fbc4ef5fb1af4f58ad..bf692f7e645f68faeb411e8d6479c25d3ede3071 100644 (file)
--- a/src/types.h
+++ b/src/types.h
@@ -57,6 +57,12 @@
  /// _WIN32             Building on Windows (any)
  /// _WIN64             Building on Windows 64 bit
  
+#if defined(__GNUC__ ) && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2)) && defined(_WIN32) && !defined(__clang__)
+#define ALIGNAS_ON_STACK_VARIABLES_BROKEN
+#endif
+
+#define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast<uintptr_t>(ptr) % alignment == 0)
+
  #if defined(_WIN64) && defined(_MSC_VER) // No Makefile used
  #  include <intrin.h> // Microsoft header for _BitScanForward64()
  #  define IS_64BIT
author	Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
	Tue, 3 Nov 2020 10:23:35 +0000 (11:23 +0100)
committer	Joost VandeVondele <Joost.VandeVondele@gmail.com>
	Wed, 4 Nov 2020 18:52:42 +0000 (19:52 +0100)
src/misc.h		patch \| blob \| history
src/nnue/evaluate_nnue.cpp		patch \| blob \| history
src/nnue/layers/clipped_relu.h		patch \| blob \| history
src/nnue/nnue_common.h		patch \| blob \| history
src/nnue/nnue_feature_transformer.h		patch \| blob \| history
src/position.cpp		patch \| blob \| history
src/search.cpp		patch \| blob \| history
src/types.h		patch \| blob \| history