X-Git-Url: https://git.sesse.net/?p=stockfish;a=blobdiff_plain;f=src%2Fnnue%2Fnnue_feature_transformer.h;h=1cfebbe4cbe80425f65aa3e3012594494d615294;hp=f899d7617655cf086274aeee8ecb021434c06bca;hb=651ec3b31ee68db50f38ccd8fcdedbd6673cd9ed;hpb=27b593a94477a821f80a041320683f805114d4a3 diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index f899d761..1cfebbe4 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -110,12 +110,36 @@ namespace Eval::NNUE { auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m256i sum0 = - _mm256_load_si256(&reinterpret_cast( + + #if defined(__MINGW32__) || defined(__MINGW64__) + // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary + // compiled with g++ in MSYS2 crashes here because the output memory is not aligned + // even though alignas is specified. + _mm256_loadu_si256 + #else + _mm256_load_si256 + #endif + + (&reinterpret_cast( accumulation[perspectives[p]][0])[j * 2 + 0]); __m256i sum1 = - _mm256_load_si256(&reinterpret_cast( + + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_loadu_si256 + #else + _mm256_load_si256 + #endif + + (&reinterpret_cast( accumulation[perspectives[p]][0])[j * 2 + 1]); - _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( + + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_storeu_si256 + #else + _mm256_store_si256 + #endif + + (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } @@ -178,7 +202,11 @@ namespace Eval::NNUE { auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); for (IndexType j = 0; j < kNumChunks; ++j) { + #if defined(__MINGW32__) || defined(__MINGW64__) + _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j])); + #else accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]); + #endif } #elif defined(USE_SSE2)