From: Joost VandeVondele
Date: Mon, 10 Aug 2020 05:18:15 +0000 (+0200)
Subject: Revert "Avoid special casing for MinGW"
X-Git-Url: https://git.sesse.net/?p=stockfish;a=commitdiff_plain;h=651ec3b31ee68db50f38ccd8fcdedbd6673cd9ed

Revert "Avoid special casing for MinGW"

This reverts commit a6e89293df5af35931b61d86b6de3872a981c100.

The offending setup has been identified as gcc/mingw 7.3 (on Ubuntu 18.04).

fixes https://github.com/official-stockfish/Stockfish/issues/2963
closes https://github.com/official-stockfish/Stockfish/issues/2968

No functional change.
---

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index ecc3008a..b585bc87 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -104,8 +104,13 @@ namespace Eval::NNUE::Layers {
       __m512i sum = _mm512_setzero_si512();
       const auto row = reinterpret_cast<const __m512i*>(&weights_[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
-        __m512i product = _mm512_maddubs_epi16(
-            _mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+
+        #if defined(__MINGW32__) || defined(__MINGW64__)
+        __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+        #else
+        __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j]));
+        #endif
+
         product = _mm512_madd_epi16(product, kOnes);
         sum = _mm512_add_epi32(sum, product);
       }
@@ -120,8 +125,12 @@ namespace Eval::NNUE::Layers {
         const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
         int j = kNumChunks * 2;

-        __m256i sum256 = _mm256_maddubs_epi16(
-            _mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+        #if defined(__MINGW32__) || defined(__MINGW64__)  // See HACK comment below in AVX2.
+        __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+        #else
+        __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
+        #endif
+
         sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
         sum256 = _mm256_hadd_epi32(sum256, sum256);
         sum256 = _mm256_hadd_epi32(sum256, sum256);
@@ -135,7 +144,17 @@ namespace Eval::NNUE::Layers {
       const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
         __m256i product = _mm256_maddubs_epi16(
-            _mm256_load_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+
+          #if defined(__MINGW32__) || defined(__MINGW64__)
+          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+          // compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+          // even though alignas is specified.
+          _mm256_loadu_si256
+          #else
+          _mm256_load_si256
+          #endif
+
+          (&input_vector[j]), _mm256_load_si256(&row[j]));
         product = _mm256_madd_epi16(product, kOnes);
         sum = _mm256_add_epi32(sum, product);
       }
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 7e5fcf4a..7ade598f 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -74,13 +74,50 @@ namespace Eval::NNUE::Layers {
       const auto out = reinterpret_cast<__m256i*>(output);
       for (IndexType i = 0; i < kNumChunks; ++i) {
         const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_load_si256(&in[i * 4 + 0]),
-            _mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
+
+          #if defined(__MINGW32__) || defined(__MINGW64__)
+          // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+          // compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+          // even though alignas is specified.
+          _mm256_loadu_si256
+          #else
+          _mm256_load_si256
+          #endif
+
+          (&in[i * 4 + 0]),
+
+          #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+          #else
+          _mm256_load_si256
+          #endif
+
+          (&in[i * 4 + 1])), kWeightScaleBits);
         const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_load_si256(&in[i * 4 + 2]),
-            _mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_store_si256(
-            &out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+
+          #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+          #else
+          _mm256_load_si256
+          #endif
+
+          (&in[i * 4 + 2]),
+
+          #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_loadu_si256
+          #else
+          _mm256_load_si256
+          #endif
+
+          (&in[i * 4 + 3])), kWeightScaleBits);
+
+        #if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_storeu_si256
+        #else
+        _mm256_store_si256
+        #endif
+
+        (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
             _mm256_packs_epi16(words0, words1), kZero), kOffsets));
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index f899d761..1cfebbe4 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -110,12 +110,36 @@ namespace Eval::NNUE {
         auto out = reinterpret_cast<__m256i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m256i sum0 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+
+            #if defined(__MINGW32__) || defined(__MINGW64__)
+            // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary
+            // compiled with g++ in MSYS2 crashes here because the output memory is not aligned
+            // even though alignas is specified.
+            _mm256_loadu_si256
+            #else
+            _mm256_load_si256
+            #endif
+
+            (&reinterpret_cast<const __m256i*>(
               accumulation[perspectives[p]][0])[j * 2 + 0]);
           __m256i sum1 =
-            _mm256_load_si256(&reinterpret_cast<const __m256i*>(
+
+            #if defined(__MINGW32__) || defined(__MINGW64__)
+            _mm256_loadu_si256
+            #else
+            _mm256_load_si256
+            #endif
+
+            (&reinterpret_cast<const __m256i*>(
               accumulation[perspectives[p]][0])[j * 2 + 1]);
-          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
+
+          #if defined(__MINGW32__) || defined(__MINGW64__)
+          _mm256_storeu_si256
+          #else
+          _mm256_store_si256
+          #endif
+
+          (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }

@@ -178,7 +202,11 @@ namespace Eval::NNUE {
       auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       for (IndexType j = 0; j < kNumChunks; ++j) {
+        #if defined(__MINGW32__) || defined(__MINGW64__)
+        _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j]));
+        #else
         accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
+        #endif
       }

 #elif defined(USE_SSE2)
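
Below is a small, self-contained sketch (not part of the commit) of the guard pattern this revert restores: under MinGW the unaligned AVX2 load/store intrinsics are selected, everywhere else the aligned ones are kept. The commit itself spells out both branches at every call site; here the choice is merely factored into two hypothetical macros (vec_load/vec_store), and the array sizes and variable names are invented for illustration. Build with AVX2 enabled, e.g. g++ -mavx2.

// Illustrative only: mirrors the accumulator update from the last hunk of
// src/nnue/nnue_feature_transformer.h, with the MinGW guard applied once.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

#if defined(__MINGW32__) || defined(__MINGW64__)
  // On the affected gcc/mingw setups the alignas() request on the destination
  // is not reliably honoured, so fall back to the unaligned intrinsics there.
  #define vec_load(p)     _mm256_loadu_si256(p)
  #define vec_store(p, v) _mm256_storeu_si256(p, v)
#else
  #define vec_load(p)     _mm256_load_si256(p)
  #define vec_store(p, v) _mm256_store_si256(p, v)
#endif

int main() {
  constexpr int kNumChunks = 2;                 // 2 x 16 int16 lanes = 32 values
  alignas(32) std::int16_t acc[32] = {};        // accumulator, nominally 32-byte aligned
  alignas(32) std::int16_t col[32];
  for (int i = 0; i < 32; ++i)
      col[i] = static_cast<std::int16_t>(i);

  auto accumulation = reinterpret_cast<__m256i*>(acc);
  auto column       = reinterpret_cast<const __m256i*>(col);
  for (int j = 0; j < kNumChunks; ++j)
      // acc[j] += col[j], 16 int16 lanes at a time, through the guarded load/store.
      vec_store(&accumulation[j], _mm256_add_epi16(vec_load(&accumulation[j]), column[j]));

  std::printf("acc[31] = %d\n", acc[31]);       // prints 31
  return 0;
}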