X-Git-Url: https://git.sesse.net/?p=stockfish;a=blobdiff_plain;f=src%2Fnnue%2Fnnue_feature_transformer.h;h=f899d7617655cf086274aeee8ecb021434c06bca;hp=1cfebbe4cbe80425f65aa3e3012594494d615294;hb=a6e89293df5af35931b61d86b6de3872a981c100;hpb=2bfde5542919c2ed624b5b62883616e325ccb942 diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 1cfebbe4..f899d761 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -110,36 +110,12 @@ namespace Eval::NNUE { auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m256i sum0 = - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&reinterpret_cast( + _mm256_load_si256(&reinterpret_cast( accumulation[perspectives[p]][0])[j * 2 + 0]); __m256i sum1 = - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&reinterpret_cast( + _mm256_load_si256(&reinterpret_cast( accumulation[perspectives[p]][0])[j * 2 + 1]); - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256 - #else - _mm256_store_si256 - #endif - - (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( + _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } @@ -202,11 +178,7 @@ namespace Eval::NNUE { auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j])); - #else accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]); - #endif } #elif defined(USE_SSE2)