From: mstembera Date: Mon, 10 Aug 2020 06:50:59 +0000 (-0700) Subject: Workaround using unaligned loads for gcc < 9 X-Git-Url: https://git.sesse.net/?p=stockfish;a=commitdiff_plain;h=875183b310a8249922c2155e82cb4cecfae2097e Workaround using unaligned loads for gcc < 9 despite usage of alignas, the generated (avx2/avx512) code with older compilers needs to use unaligned loads with older gcc (e.g. confirmed crash with gcc 7.3/mingw on abrok). Better performance thus requires gcc >= 9 on hardware supporting avx2/avx512 closes https://github.com/official-stockfish/Stockfish/pull/2969 No functional change --- diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index b585bc87..20ec2f12 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -104,13 +104,7 @@ namespace Eval::NNUE::Layers { __m512i sum = _mm512_setzero_si512(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - - #if defined(__MINGW32__) || defined(__MINGW64__) - __m512i product = _mm512_maddubs_epi16(_mm512_loadu_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - #else - __m512i product = _mm512_maddubs_epi16(_mm512_load_si512(&input_vector[j]), _mm512_load_si512(&row[j])); - #endif - + __m512i product = _mm512_maddubs_epi16(_mm512_loadA_si512(&input_vector[j]), _mm512_load_si512(&row[j])); product = _mm512_madd_epi16(product, kOnes); sum = _mm512_add_epi32(sum, product); } @@ -124,13 +118,7 @@ namespace Eval::NNUE::Layers { const auto iv_256 = reinterpret_cast(input); const auto row_256 = reinterpret_cast(&weights_[offset]); int j = kNumChunks * 2; - - #if defined(__MINGW32__) || defined(__MINGW64__) // See HACK comment below in AVX2. - __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadu_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); - #else - __m256i sum256 = _mm256_maddubs_epi16(_mm256_load_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); - #endif - + __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv_256[j]), _mm256_load_si256(&row_256[j])); sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1)); sum256 = _mm256_hadd_epi32(sum256, sum256); sum256 = _mm256_hadd_epi32(sum256, sum256); @@ -143,18 +131,7 @@ namespace Eval::NNUE::Layers { __m256i sum = _mm256_setzero_si256(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m256i product = _mm256_maddubs_epi16( - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&input_vector[j]), _mm256_load_si256(&row[j])); + __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); product = _mm256_madd_epi16(product, kOnes); sum = _mm256_add_epi32(sum, product); } @@ -168,8 +145,7 @@ namespace Eval::NNUE::Layers { __m128i sum = _mm_cvtsi32_si128(biases_[i]); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m128i product = _mm_maddubs_epi16( - _mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); + __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j])); product = _mm_madd_epi16(product, kOnes); sum = _mm_add_epi32(sum, product); } diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 7ade598f..13196ec2 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -74,50 +74,12 @@ namespace Eval::NNUE::Layers { const auto out = reinterpret_cast<__m256i*>(output); for (IndexType i = 0; i < kNumChunks; ++i) { const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32( - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 0]), - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 1])), kWeightScaleBits); + _mm256_loadA_si256(&in[i * 4 + 0]), + _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits); const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32( - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 2]), - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&in[i * 4 + 3])), kWeightScaleBits); - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256 - #else - _mm256_store_si256 - #endif - - (&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( + _mm256_loadA_si256(&in[i * 4 + 2]), + _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits); + _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8( _mm256_packs_epi16(words0, words1), kZero), kOffsets)); } constexpr IndexType kStart = kNumChunks * kSimdWidth; diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index 972ef3e5..e7ce84f7 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -37,6 +37,27 @@ #include #endif +// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary +// compiled with older g++ crashes because the output memory is not aligned +// even though alignas is specified. +#if defined(USE_AVX2) +#if defined(__GNUC__ ) && (__GNUC__ < 9) +#define _mm256_loadA_si256 _mm256_loadu_si256 +#define _mm256_storeA_si256 _mm256_storeu_si256 +#else +#define _mm256_loadA_si256 _mm256_load_si256 +#define _mm256_storeA_si256 _mm256_store_si256 +#endif +#endif + +#if defined(USE_AVX512) +#if defined(__GNUC__ ) && (__GNUC__ < 9) +#define _mm512_loadA_si512 _mm512_loadu_si512 +#else +#define _mm512_loadA_si512 _mm512_load_si512 +#endif +#endif + namespace Eval::NNUE { // Version of the evaluation file diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 1cfebbe4..cbcc26f3 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -109,37 +109,11 @@ namespace Eval::NNUE { #if defined(USE_AVX2) auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { - __m256i sum0 = - - #if defined(__MINGW32__) || defined(__MINGW64__) - // HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Because the binary - // compiled with g++ in MSYS2 crashes here because the output memory is not aligned - // even though alignas is specified. - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 0]); - __m256i sum1 = - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_loadu_si256 - #else - _mm256_load_si256 - #endif - - (&reinterpret_cast( - accumulation[perspectives[p]][0])[j * 2 + 1]); - - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256 - #else - _mm256_store_si256 - #endif - - (&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( + __m256i sum0 = _mm256_loadA_si256( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); + __m256i sum1 = _mm256_loadA_si256( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } @@ -202,11 +176,7 @@ namespace Eval::NNUE { auto column = reinterpret_cast(&weights_[offset]); constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2); for (IndexType j = 0; j < kNumChunks; ++j) { - #if defined(__MINGW32__) || defined(__MINGW64__) - _mm256_storeu_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadu_si256(&accumulation[j]), column[j])); - #else - accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]); - #endif + _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j])); } #elif defined(USE_SSE2)