This is a workaround for issues with overaligned alignas() on stack variables in gcc < 9.3 on Windows.
closes https://github.com/official-stockfish/Stockfish/pull/3217
fixes #3216
No functional change
#include <ostream>
#include <string>
#include <vector>
+#include <cstdint>
#include "types.h"
// Serialize writes to std::cout from multiple threads. IO_LOCK / IO_UNLOCK
// are defined elsewhere in this project — presumably stream manipulators that
// acquire/release a global I/O mutex (verify in misc.cpp).
// Usage: sync_cout << ... << sync_endl;
#define sync_cout std::cout << IO_LOCK
#define sync_endl std::endl << IO_UNLOCK
// Round `ptr` up to the next multiple of `Alignment` bytes.
//
// `ptr` must point to an array of size at least
// `sizeof(T) * N + Alignment` bytes, where `N` is the number of
// elements intended to be accessed through the returned pointer, so
// that rounding up still leaves room for all N elements.
template <uintptr_t Alignment, typename T>
T* align_ptr_up(T* ptr)
{
  // Alignment must be at least the type's natural alignment so element
  // access through the returned pointer remains properly aligned.
  // NOTE: relaxed from `<` to `<=` — alignof(T) == Alignment is a valid
  // (already-aligned, no-op) use and should not be rejected.
  static_assert(alignof(T) <= Alignment, "Alignment must be at least alignof(T)");

  // Round the integer representation up: (x + A - 1) / A * A is the
  // smallest multiple of A that is >= x, for any positive A.
  const uintptr_t ptrint = reinterpret_cast<uintptr_t>(reinterpret_cast<char*>(ptr));
  return reinterpret_cast<T*>(reinterpret_cast<char*>((ptrint + (Alignment - 1)) / Alignment * Alignment));
}
/// xorshift64star Pseudo-Random Number Generator
/// This class is based on original code written and dedicated
#include "../position.h"
#include "../misc.h"
#include "../uci.h"
+#include "../types.h"
#include "evaluate_nnue.h"
// Evaluation function. Perform differential calculation.
// NOTE(review): this span is a diff hunk ('-'/'+' markers preserved). The
// change replaces alignas() on the two stack buffers with manual alignment
// via align_ptr_up() on compilers where overaligned stack variables are
// miscompiled (see ALIGNAS_ON_STACK_VARIABLES_BROKEN in types.h).
Value evaluate(const Position& pos) {
- alignas(kCacheLineSize) TransformedFeatureType
- transformed_features[FeatureTransformer::kBufferSize];
+ // We manually align the arrays on the stack because with gcc < 9.3
+ // overaligning stack variables with alignas() doesn't work correctly.
+
+ constexpr uint64_t alignment = kCacheLineSize;
+
+#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
+ // Over-allocate by `alignment` bytes so align_ptr_up() can round the
+ // start of each buffer up to a cache-line boundary.
+ TransformedFeatureType transformed_features_unaligned[
+ FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
+ char buffer_unaligned[Network::kBufferSize + alignment];
+
+ auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
+ auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
+#else
+ alignas(alignment)
+ TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
+ alignas(alignment) char buffer[Network::kBufferSize];
+#endif
+
+ // Both paths must produce cache-line-aligned buffers: the SIMD code below
+ // uses aligned load/store intrinsics, which fault on misaligned addresses.
+ ASSERT_ALIGNED(transformed_features, alignment);
+ ASSERT_ALIGNED(buffer, alignment);
+
feature_transformer->Transform(pos, transformed_features);
- alignas(kCacheLineSize) char buffer[Network::kBufferSize];
const auto output = network->Propagate(transformed_features, buffer);
return static_cast<Value>(output[0] / FV_SCALE);
// NOTE(review): diff hunk ('-'/'+' markers preserved). The _mm256_loadA/storeA
// wrapper macros (which fell back to unaligned accesses on broken compilers)
// are replaced by the plain aligned intrinsics, since the buffers are now
// guaranteed aligned (see ASSERT_ALIGNED in evaluate()).
const auto out = reinterpret_cast<__m256i*>(output);
for (IndexType i = 0; i < kNumChunks; ++i) {
const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
- _mm256_loadA_si256(&in[i * 4 + 0]),
- _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
+ _mm256_load_si256(&in[i * 4 + 0]),
+ _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
- _mm256_loadA_si256(&in[i * 4 + 2]),
- _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
- _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+ _mm256_load_si256(&in[i * 4 + 2]),
+ _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
+ _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
_mm256_packs_epi16(words0, words1), kZero), kOffsets));
}
constexpr IndexType kStart = kNumChunks * kSimdWidth;
#include <arm_neon.h>
#endif
// NOTE(review): diff hunk — all lines below are deletions ('-' markers
// preserved). These wrapper macros papered over the gcc < 9 Windows
// alignas() bug by downgrading aligned loads/stores to unaligned ones.
// They become unnecessary once stack buffers are manually aligned, so the
// whole block is removed.
-// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
-// compiled with older g++ crashes because the output memory is not aligned
-// even though alignas is specified.
-#if defined(USE_AVX2)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm256_loadA_si256 _mm256_loadu_si256
-#define _mm256_storeA_si256 _mm256_storeu_si256
-#else
-#define _mm256_loadA_si256 _mm256_load_si256
-#define _mm256_storeA_si256 _mm256_store_si256
-#endif
-#endif
-
-#if defined(USE_AVX512)
-#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
-#define _mm512_loadA_si512 _mm512_loadu_si512
-#define _mm512_storeA_si512 _mm512_storeu_si512
-#else
-#define _mm512_loadA_si512 _mm512_load_si512
-#define _mm512_storeA_si512 _mm512_store_si512
-#endif
-#endif
-
namespace Eval::NNUE {
// Version of the evaluation file
// NOTE(review): diff hunk ('-'/'+' markers preserved). vec_load/vec_store now
// expand to the plain aligned intrinsics instead of the removed loadA/storeA
// wrappers; the surrounding typedefs and arithmetic macros are unchanged.
#ifdef USE_AVX512
typedef __m512i vec_t;
- #define vec_load(a) _mm512_loadA_si512(a)
- #define vec_store(a,b) _mm512_storeA_si512(a,b)
+ #define vec_load(a) _mm512_load_si512(a)
+ #define vec_store(a,b) _mm512_store_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
static constexpr IndexType kNumRegs = 8; // only 8 are needed
#elif USE_AVX2
typedef __m256i vec_t;
- #define vec_load(a) _mm256_loadA_si256(a)
- #define vec_store(a,b) _mm256_storeA_si256(a,b)
+ #define vec_load(a) _mm256_load_si256(a)
+ #define vec_store(a,b) _mm256_store_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
static constexpr IndexType kNumRegs = 16;
// NOTE(review): diff hunk ('-'/'+' markers preserved). Same substitution as
// above: loadA/storeA wrappers -> direct aligned AVX2 intrinsics, valid now
// that the accumulation/output buffers are guaranteed aligned.
#if defined(USE_AVX2)
auto out = reinterpret_cast<__m256i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
- __m256i sum0 = _mm256_loadA_si256(
+ __m256i sum0 = _mm256_load_si256(
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
- __m256i sum1 = _mm256_loadA_si256(
+ __m256i sum1 = _mm256_load_si256(
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
- _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+ _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
_mm256_packs_epi16(sum0, sum1), kZero), kControl));
}
// NOTE(review): diff hunk ('+' markers preserved), fragments of two
// position.cpp functions. Each stack-allocated StateInfo gains a debug
// alignment check — presumably StateInfo carries an alignas(kCacheLineSize)
// NNUE accumulator, so a misaligned instance would crash the SIMD code
// (verify against the StateInfo declaration).
&& !pos.can_castle(ANY_CASTLING))
{
StateInfo st;
+ ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
Position p;
p.set(pos.fen(), pos.is_chess960(), &st, pos.this_thread());
Tablebases::ProbeState s1, s2;
assert(0 && "pos_is_ok: Bitboards");
StateInfo si = *st;
+ ASSERT_ALIGNED(&si, Eval::NNUE::kCacheLineSize);
+
set_state(&si);
if (std::memcmp(&si, st, sizeof(StateInfo)))
assert(0 && "pos_is_ok: State");
// NOTE(review): diff hunk ('+' markers preserved), start of perft(); the
// stack StateInfo gets the same debug alignment check as the other call
// sites in this patch.
uint64_t perft(Position& pos, Depth depth) {
StateInfo st;
+ ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
uint64_t cnt, nodes = 0;
const bool leaf = (depth == 2);
// NOTE(review): diff hunk ('+' markers preserved), fragments of three
// search.cpp functions (search, qsearch, RootMove::extract_ponder_from_tt).
// Every stack StateInfo gains a debug check that it is cache-line aligned,
// matching the checks added in position.cpp by this same patch.
Move pv[MAX_PLY+1], capturesSearched[32], quietsSearched[64];
StateInfo st;
+ ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
TTEntry* tte;
Key posKey;
Move ttMove, move, excludedMove, bestMove;
Move pv[MAX_PLY+1];
StateInfo st;
+ ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
TTEntry* tte;
Key posKey;
Move ttMove, move, bestMove;
bool RootMove::extract_ponder_from_tt(Position& pos) {
StateInfo st;
+ ASSERT_ALIGNED(&st, Eval::NNUE::kCacheLineSize);
+
bool ttHit;
assert(pv.size() == 1);
/// _WIN32 Building on Windows (any)
/// _WIN64 Building on Windows 64 bit
// gcc < 9.3 targeting Windows miscompiles overaligned alignas() on stack
// variables; expose that configuration so callers can fall back to manual
// alignment via align_ptr_up() instead.
#if defined(__GNUC__) && (__GNUC__ < 9 || (__GNUC__ == 9 && __GNUC_MINOR__ <= 2)) && defined(_WIN32) && !defined(__clang__)
#define ALIGNAS_ON_STACK_VARIABLES_BROKEN
#endif

// Debug-build check that `ptr` is aligned to `alignment` bytes.
// FIX: the `alignment` argument is parenthesized — without the parentheses,
// ASSERT_ALIGNED(p, a + b) would expand to `x % a + b == 0`, which parses as
// `(x % a) + b == 0` and silently checks the wrong condition.
#define ASSERT_ALIGNED(ptr, alignment) assert(reinterpret_cast<uintptr_t>(ptr) % (alignment) == 0)
#if defined(_WIN64) && defined(_MSC_VER) // No Makefile used
# include <intrin.h> // Microsoft header for _BitScanForward64()
# define IS_64BIT