X-Git-Url: https://git.sesse.net/?p=stockfish;a=blobdiff_plain;f=src%2Fnnue%2Flayers%2Faffine_transform.h;fp=src%2Fnnue%2Flayers%2Faffine_transform.h;h=61cdb7818661286cde3c2be0c55f87035746fcad;hp=e9d0beace17925cdad847425cfd2a5bd45504f09;hb=1461d861c8240e29df690f1e34dc50eee37ae1b5;hpb=a8b4fd16716e74a9819e798fc09e5926e003013e diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index e9d0beac..61cdb781 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -210,6 +210,11 @@ namespace Stockfish::Eval::NNUE::Layers { void propagate( const InputType* input, OutputType* output) const { +#if defined (USE_SSSE3) + + if constexpr (OutputDimensions > 1) + { + #if defined (USE_AVX512) using vec_t = __m512i; #define vec_setzero _mm512_setzero_si512 @@ -233,15 +238,10 @@ namespace Stockfish::Eval::NNUE::Layers { #define vec_hadd Simd::m128_hadd #endif -#if defined (USE_SSSE3) - const auto inputVector = reinterpret_cast(input); + static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType); - static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType); + static_assert(OutputDimensions % OutputSimdWidth == 0); - static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1); - - if constexpr (OutputDimensions % OutputSimdWidth == 0) - { constexpr IndexType NumChunks = ceil_to_multiple(InputDimensions, 8) / 4; constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; @@ -264,10 +264,41 @@ namespace Stockfish::Eval::NNUE::Layers { vec_t* outptr = reinterpret_cast(output); for (IndexType k = 0; k < NumRegs; ++k) outptr[k] = acc[k]; + +# undef vec_setzero +# undef vec_set_32 +# undef vec_add_dpbusd_32 +# undef vec_add_dpbusd_32x2 +# undef vec_hadd + } else if constexpr (OutputDimensions == 1) { - constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; + +// We cannot use AVX512 for the last layer because there's only 32 inputs and the buffer is not padded to 64 elements. +#if defined (USE_AVX2) + using vec_t = __m256i; + #define vec_setzero _mm256_setzero_si256 + #define vec_set_32 _mm256_set1_epi32 + #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 + #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2 + #define vec_hadd Simd::m256_hadd +#elif defined (USE_SSSE3) + using vec_t = __m128i; + #define vec_setzero _mm_setzero_si128 + #define vec_set_32 _mm_set1_epi32 + #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 + #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2 + #define vec_hadd Simd::m128_hadd +#endif + + const auto inputVector = reinterpret_cast(input); + + static constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(InputType); + + static_assert(PaddedInputDimensions % InputSimdWidth == 0); + + constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth; vec_t sum0 = vec_setzero(); const auto row0 = reinterpret_cast(&weights[0]); @@ -277,13 +308,14 @@ namespace Stockfish::Eval::NNUE::Layers { vec_add_dpbusd_32(sum0, in, row0[j]); } output[0] = vec_hadd(sum0, biases[0]); - } # undef vec_setzero # undef vec_set_32 # undef vec_add_dpbusd_32 # undef vec_add_dpbusd_32x2 # undef vec_hadd + + } #else // Use old implementation for the other architectures. affine_transform_non_ssse3<