From 1461d861c8240e29df690f1e34dc50eee37ae1b5 Mon Sep 17 00:00:00 2001 From: Tomasz Sobczyk Date: Mon, 4 Sep 2023 13:53:30 +0200 Subject: [PATCH] Prevent usage of AVX-512 for the last layer. Add more static checks regarding the SIMD width match. STC: https://tests.stockfishchess.org/tests/view/64f5c568a9bc5a78c669e70e LLR: 2.95 (-2.94,2.94) <-1.75,0.25> Total: 125216 W: 31756 L: 31636 D: 61824 Ptnml(0-2): 327, 13993, 33848, 14113, 327 Fixes a bug introduced in 2f2f45f, where with AVX-512 the weights and input to the last layer were being read out of bounds. Now AVX-512 is only used for the layers it can be used for. Additional static assertions have been added to prevent more errors like this in the future. closes https://github.com/official-stockfish/Stockfish/pull/4773 No functional change --- src/nnue/layers/affine_transform.h | 50 ++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index e9d0beac..61cdb781 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -210,6 +210,11 @@ namespace Stockfish::Eval::NNUE::Layers { void propagate( const InputType* input, OutputType* output) const { +#if defined (USE_SSSE3) + + if constexpr (OutputDimensions > 1) + { + #if defined (USE_AVX512) using vec_t = __m512i; #define vec_setzero _mm512_setzero_si512 @@ -233,15 +238,10 @@ namespace Stockfish::Eval::NNUE::Layers { #define vec_hadd Simd::m128_hadd #endif -#if defined (USE_SSSE3) - const auto inputVector = reinterpret_cast(input); + static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType); - static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType); + static_assert(OutputDimensions % OutputSimdWidth == 0); - static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1); - - if constexpr (OutputDimensions % OutputSimdWidth == 0) - { constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; @@ -264,10 +264,41 @@ namespace Stockfish::Eval::NNUE::Layers { vec_t* outptr = reinterpret_cast(output); for (IndexType k = 0; k < NumRegs; ++k) outptr[k] = acc[k]; + +# undef vec_setzero +# undef vec_set_32 +# undef vec_add_dpbusd_32 +# undef vec_add_dpbusd_32x2 +# undef vec_hadd + } else if constexpr (OutputDimensions == 1) { - constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; + +// We cannot use AVX512 for the last layer because there's only 32 inputs and the buffer is not padded to 64 elements. +#if defined (USE_AVX2) + using vec_t = __m256i; + #define vec_setzero _mm256_setzero_si256 + #define vec_set_32 _mm256_set1_epi32 + #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 + #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2 + #define vec_hadd Simd::m256_hadd +#elif defined (USE_SSSE3) + using vec_t = __m128i; + #define vec_setzero _mm_setzero_si128 + #define vec_set_32 _mm_set1_epi32 + #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 + #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2 + #define vec_hadd Simd::m128_hadd +#endif + + const auto inputVector = reinterpret_cast(input); + + static constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(InputType); + + static_assert(PaddedInputDimensions % InputSimdWidth == 0); + + constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth; vec_t sum0 = vec_setzero(); const auto row0 = reinterpret_cast(&weights[0]); @@ -277,13 +308,14 @@ namespace Stockfish::Eval::NNUE::Layers { vec_add_dpbusd_32(sum0, in, row0[j]); } output[0] = vec_hadd(sum0, biases[0]); - } # undef vec_setzero # undef vec_set_32 # undef vec_add_dpbusd_32 # undef vec_add_dpbusd_32x2 # undef vec_hadd + + } #else // Use old implementation for the other architectures. affine_transform_non_ssse3< -- 2.39.2