X-Git-Url: https://git.sesse.net/?p=stockfish;a=blobdiff_plain;f=src%2Fnnue%2Flayers%2Faffine_transform_sparse_input.h;fp=src%2Fnnue%2Flayers%2Faffine_transform_sparse_input.h;h=6cb4d1a9347369e3b85c0276e66499fe6dcf7435;hp=3c7defcc42cbc25425d30e98708d13eaa54a8ac2;hb=bfee35f930bac95b646b1821339f342c70aac2f6;hpb=487c21b1aa64dcc09dd95b845a66f39ae3c3754e

diff --git a/src/nnue/layers/affine_transform_sparse_input.h b/src/nnue/layers/affine_transform_sparse_input.h
index 3c7defcc..6cb4d1a9 100644
--- a/src/nnue/layers/affine_transform_sparse_input.h
+++ b/src/nnue/layers/affine_transform_sparse_input.h
@@ -21,10 +21,12 @@
 #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
 #define NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
 
-#include <iostream>
 #include <algorithm>
 #include <array>
-#include <type_traits>
+#include <cstdint>
+#include <iostream>
+
+#include "../../bitboard.h"
 #include "../nnue_common.h"
 #include "affine_transform.h"
 #include "simd.h"
@@ -34,115 +36,119 @@
 */
 namespace Stockfish::Eval::NNUE::Layers {
 
-#if defined(__GNUC__)  // GCC, Clang, ICC
-
-  static inline IndexType lsb_(std::uint32_t b) {
-    assert(b);
-    return IndexType(__builtin_ctzl(b));
-  }
-
-#elif defined(_MSC_VER)  // MSVC
-
-  static inline IndexType lsb_(std::uint32_t b) {
-    assert(b);
-    unsigned long idx;
-    _BitScanForward(&idx, b);
-    return (IndexType) idx;
-  }
-
-#else  // Compiler is neither GCC nor MSVC compatible
-
-#error "Compiler not supported."
-#endif
-
-
-#if defined(USE_SSSE3)
-  alignas(CacheLineSize) static inline const std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = [](){
-    std::array<std::array<std::uint16_t, 8>, 256> v{};
-    for (int i = 0; i < 256; ++i)
-    {
-      int j = i;
-      int k = 0;
-      while(j)
+#if (USE_SSSE3 | (USE_NEON >= 8))
+alignas(CacheLineSize) static inline const
+  std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = []() {
+      std::array<std::array<std::uint16_t, 8>, 256> v{};
+      for (unsigned i = 0; i < 256; ++i)
       {
-        const IndexType lsbIndex = lsb_(std::uint32_t(j));
-        j &= j - 1;
-        v[i][k] = lsbIndex;
-        ++k;
+          std::uint64_t j = i, k = 0;
+          while (j)
+              v[i][k++] = pop_lsb(j);
       }
-    }
-    return v;
+      return v;
   }();
 
-  // Find indices of nonzero numbers in an int32_t array
-  template <IndexType InputDimensions>
-  void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) {
-#if defined (USE_AVX512)
+// Find indices of nonzero numbers in an int32_t array
+template<IndexType InputDimensions>
+void find_nnz(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) {
+    #if defined(USE_SSSE3)
+        #if defined(USE_AVX512)
     using vec_t = __m512i;
-    #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
-#elif defined (USE_AVX2)
+            #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
+        #elif defined(USE_AVX2)
     using vec_t = __m256i;
-    #define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
-#elif defined (USE_SSSE3)
+            #if defined(USE_VNNI) && !defined(USE_AVXVNNI)
+                #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
+            #else
+                #define vec_nnz(a) \
+                    _mm256_movemask_ps( \
+                      _mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
+            #endif
+        #elif defined(USE_SSSE3)
     using vec_t = __m128i;
-    #define vec_nnz(a) _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
-#endif
+            #define vec_nnz(a) \
+                _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
+        #endif
+    using vec128_t = __m128i;
+        #define vec128_zero _mm_setzero_si128()
+        #define vec128_set_16(a) _mm_set1_epi16(a)
+        #define vec128_load(a) _mm_load_si128(a)
+        #define vec128_storeu(a, b) _mm_storeu_si128(a, b)
+        #define vec128_add(a, b) _mm_add_epi16(a, b)
+    #elif defined(USE_NEON)
+    using vec_t = uint32x4_t;
+    static const std::uint32_t Mask[4] = {1, 2, 4, 8};
+        #define vec_nnz(a) vaddvq_u32(vandq_u32(vtstq_u32(a, a), vld1q_u32(Mask)))
+    using vec128_t = uint16x8_t;
+        #define vec128_zero vdupq_n_u16(0)
+        #define vec128_set_16(a) vdupq_n_u16(a)
+        #define vec128_load(a) vld1q_u16(reinterpret_cast<const std::uint16_t*>(a))
+        #define vec128_storeu(a, b) vst1q_u16(reinterpret_cast<std::uint16_t*>(a), b)
+        #define vec128_add(a, b) vaddq_u16(a, b)
+    #endif
     constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t);
     // Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time so we process in chunks of max(InputSimdWidth, 8)
-    constexpr IndexType ChunkSize = std::max<IndexType>(InputSimdWidth, 8);
-    constexpr IndexType NumChunks = InputDimensions / ChunkSize;
-    constexpr IndexType InputsPerChunk = ChunkSize / InputSimdWidth;
+    constexpr IndexType ChunkSize       = std::max<IndexType>(InputSimdWidth, 8);
+    constexpr IndexType NumChunks       = InputDimensions / ChunkSize;
+    constexpr IndexType InputsPerChunk  = ChunkSize / InputSimdWidth;
     constexpr IndexType OutputsPerChunk = ChunkSize / 8;
 
-    const auto inputVector = reinterpret_cast<const vec_t*>(input);
-    IndexType count = 0;
-    __m128i base = _mm_set1_epi16(0);
-    __m128i increment = _mm_set1_epi16(8);
+    const auto     inputVector = reinterpret_cast<const vec_t*>(input);
+    IndexType      count       = 0;
+    vec128_t       base        = vec128_zero;
+    const vec128_t increment   = vec128_set_16(8);
     for (IndexType i = 0; i < NumChunks; ++i)
     {
-      // bitmask of nonzero values in this chunk
-      unsigned nnz = 0;
-      for (IndexType j = 0; j < InputsPerChunk; ++j)
-      {
-        const vec_t inputChunk = inputVector[i * InputsPerChunk + j];
-        nnz |= (unsigned)vec_nnz(inputChunk) << (j * InputSimdWidth);
-      }
-      for (IndexType j = 0; j < OutputsPerChunk; ++j)
-      {
-        const auto lookup = (nnz >> (j * 8)) & 0xFF;
-        const auto offsets = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&lookup_indices[lookup]));
-        _mm_storeu_si128(reinterpret_cast<__m128i*>(out + count), _mm_add_epi16(base, offsets));
-        count += popcount(lookup);
-        base = _mm_add_epi16(base, increment);
-      }
+        // bitmask of nonzero values in this chunk
+        unsigned nnz = 0;
+        for (IndexType j = 0; j < InputsPerChunk; ++j)
+        {
+            const vec_t inputChunk = inputVector[i * InputsPerChunk + j];
+            nnz |= unsigned(vec_nnz(inputChunk)) << (j * InputSimdWidth);
+        }
+        for (IndexType j = 0; j < OutputsPerChunk; ++j)
+        {
+            const auto lookup = (nnz >> (j * 8)) & 0xFF;
+            const auto offsets =
+              vec128_load(reinterpret_cast<const vec128_t*>(&lookup_indices[lookup]));
+            vec128_storeu(reinterpret_cast<vec128_t*>(out + count), vec128_add(base, offsets));
+            count += popcount(lookup);
+            base = vec128_add(base, increment);
+        }
     }
     count_out = count;
-  }
-# undef vec_nnz
+}
+    #undef vec_nnz
+    #undef vec128_zero
+    #undef vec128_set_16
+    #undef vec128_load
+    #undef vec128_storeu
+    #undef vec128_add
 #endif
 
-  // Sparse input implementation
-  template <IndexType InDims, IndexType OutDims>
-  class AffineTransformSparseInput {
+// Sparse input implementation
+template<IndexType InDims, IndexType OutDims>
+class AffineTransformSparseInput {
    public:
     // Input/output type
-    // Input/output type
-    using InputType = std::uint8_t;
+    using InputType  = std::uint8_t;
     using OutputType = std::int32_t;
 
     // Number of input/output dimensions
-    static constexpr IndexType InputDimensions = InDims;
+    static constexpr IndexType InputDimensions  = InDims;
     static constexpr IndexType OutputDimensions = OutDims;
 
-    static_assert(OutputDimensions % 16 == 0, "Only implemented for OutputDimensions divisible by 16.");
+    static_assert(OutputDimensions % 16 == 0,
+                  "Only implemented for OutputDimensions divisible by 16.");
 
     static constexpr IndexType PaddedInputDimensions =
       ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
     static constexpr IndexType PaddedOutputDimensions =
       ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);
 
-#if defined (USE_SSSE3)
+#if (USE_SSSE3 | (USE_NEON >= 8))
     static constexpr IndexType ChunkSize = 4;
 #else
     static constexpr IndexType ChunkSize = 1;
 #endif
@@ -152,120 +158,121 @@ namespace Stockfish::Eval::NNUE::Layers {
 
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
-      std::uint32_t hashValue = 0xCC03DAE4u;
-      hashValue += OutputDimensions;
-      hashValue ^= prevHash >> 1;
-      hashValue ^= prevHash << 31;
-      return hashValue;
+        std::uint32_t hashValue = 0xCC03DAE4u;
+        hashValue += OutputDimensions;
+        hashValue ^= prevHash >> 1;
+        hashValue ^= prevHash << 31;
+        return hashValue;
     }
 
-    static IndexType get_weight_index_scrambled(IndexType i)
-    {
-      return
-        (i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize +
-        i / PaddedInputDimensions * ChunkSize +
-        i % ChunkSize;
+    static constexpr IndexType get_weight_index_scrambled(IndexType i) {
+        return (i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize
+             + i / PaddedInputDimensions * ChunkSize + i % ChunkSize;
     }
 
-    static IndexType get_weight_index(IndexType i)
-    {
-#if defined (USE_SSSE3)
-      return get_weight_index_scrambled(i);
+    static constexpr IndexType get_weight_index(IndexType i) {
+#if (USE_SSSE3 | (USE_NEON >= 8))
+        return get_weight_index_scrambled(i);
 #else
-      return i;
+        return i;
 #endif
     }
 
     // Read network parameters
     bool read_parameters(std::istream& stream) {
-      read_little_endian<BiasType>(stream, biases, OutputDimensions);
-      for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
-        weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);
+        read_little_endian<BiasType>(stream, biases, OutputDimensions);
+        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+            weights[get_weight_index(i)] = read_little_endian<WeightType>(stream);
 
-      return !stream.fail();
+        return !stream.fail();
     }
 
     // Write network parameters
     bool write_parameters(std::ostream& stream) const {
-      write_little_endian<BiasType>(stream, biases, OutputDimensions);
+        write_little_endian<BiasType>(stream, biases, OutputDimensions);
 
-      for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
-        write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);
+        for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+            write_little_endian<WeightType>(stream, weights[get_weight_index(i)]);
 
-      return !stream.fail();
+        return !stream.fail();
     }
 
     // Forward propagation
-    const OutputType* propagate(
-        const InputType* input, OutputType* output) const {
-
-#if defined (USE_SSSE3)
-#if defined (USE_AVX512)
-      using vec_t = __m512i;
-      #define vec_setzero _mm512_setzero_si512
-      #define vec_set_32 _mm512_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
-#elif defined (USE_AVX2)
-      using vec_t = __m256i;
-      #define vec_setzero _mm256_setzero_si256
-      #define vec_set_32 _mm256_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
-#elif defined (USE_SSSE3)
-      using vec_t = __m128i;
-      #define vec_setzero _mm_setzero_si128
-      #define vec_set_32 _mm_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
-#endif
-      static constexpr IndexType OutputSimdWidth = sizeof(vec_t) / sizeof(OutputType);
-
-      constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / ChunkSize;
-      constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
-      std::uint16_t nnz[NumChunks];
-      IndexType count;
-
-      const auto input32 = reinterpret_cast<const std::int32_t*>(input);
-
-      // Find indices of nonzero 32bit blocks
-      find_nnz<NumChunks>(input32, nnz, count);
-
-      const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
-      vec_t acc[NumRegs];
-      for (IndexType k = 0; k < NumRegs; ++k)
-        acc[k] = biasvec[k];
-
-      for (IndexType j = 0; j < count; ++j)
-      {
-        const auto i = nnz[j];
-        const vec_t in = vec_set_32(input32[i]);
-        const auto col = reinterpret_cast<const vec_t*>(&weights[i * OutputDimensions * ChunkSize]);
+    void propagate(const InputType* input, OutputType* output) const {
+
+#if (USE_SSSE3 | (USE_NEON >= 8))
+    #if defined(USE_AVX512)
+        using invec_t  = __m512i;
+        using outvec_t = __m512i;
+        #define vec_set_32 _mm512_set1_epi32
+        #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
+    #elif defined(USE_AVX2)
+        using invec_t  = __m256i;
+        using outvec_t = __m256i;
+        #define vec_set_32 _mm256_set1_epi32
+        #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
+    #elif defined(USE_SSSE3)
+        using invec_t  = __m128i;
+        using outvec_t = __m128i;
+        #define vec_set_32 _mm_set1_epi32
+        #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
+    #elif defined(USE_NEON_DOTPROD)
+        using invec_t  = int8x16_t;
+        using outvec_t = int32x4_t;
+        #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
+        #define vec_add_dpbusd_32 Simd::dotprod_m128_add_dpbusd_epi32
+    #elif defined(USE_NEON)
+        using invec_t  = int8x16_t;
+        using outvec_t = int32x4_t;
+        #define vec_set_32(a) vreinterpretq_s8_u32(vdupq_n_u32(a))
+        #define vec_add_dpbusd_32 Simd::neon_m128_add_dpbusd_epi32
+    #endif
+        static constexpr IndexType OutputSimdWidth = sizeof(outvec_t) / sizeof(OutputType);
+
+        constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / ChunkSize;
+        constexpr IndexType NumRegs   = OutputDimensions / OutputSimdWidth;
+        std::uint16_t       nnz[NumChunks];
+        IndexType           count;
+
+        const auto input32 = reinterpret_cast<const std::int32_t*>(input);
+
+        // Find indices of nonzero 32bit blocks
+        find_nnz<NumChunks>(input32, nnz, count);
+
+        const outvec_t* biasvec = reinterpret_cast<const outvec_t*>(biases);
+        outvec_t        acc[NumRegs];
         for (IndexType k = 0; k < NumRegs; ++k)
-          vec_add_dpbusd_32(acc[k], in, col[k]);
-      }
-
-      vec_t* outptr = reinterpret_cast<vec_t*>(output);
-      for (IndexType k = 0; k < NumRegs; ++k)
-        outptr[k] = acc[k];
-# undef vec_setzero
-# undef vec_set_32
-# undef vec_add_dpbusd_32
+            acc[k] = biasvec[k];
+
+        for (IndexType j = 0; j < count; ++j)
+        {
+            const auto    i  = nnz[j];
+            const invec_t in = vec_set_32(input32[i]);
+            const auto    col =
+              reinterpret_cast<const invec_t*>(&weights[i * OutputDimensions * ChunkSize]);
+            for (IndexType k = 0; k < NumRegs; ++k)
+                vec_add_dpbusd_32(acc[k], in, col[k]);
+        }
+
+        outvec_t* outptr = reinterpret_cast<outvec_t*>(output);
+        for (IndexType k = 0; k < NumRegs; ++k)
+            outptr[k] = acc[k];
+    #undef vec_set_32
+    #undef vec_add_dpbusd_32
 #else
-      // Use dense implementation for the other architectures.
-      affine_transform_non_ssse3<
-          InputDimensions,
-          PaddedInputDimensions,
-          OutputDimensions>(output, weights, biases, input);
+        // Use dense implementation for the other architectures.
+        affine_transform_non_ssse3<InputDimensions, PaddedInputDimensions, OutputDimensions>(
+          output, weights, biases, input);
 #endif
-
-      return output;
     }
 
    private:
-    using BiasType = OutputType;
+    using BiasType   = OutputType;
     using WeightType = std::int8_t;
 
     alignas(CacheLineSize) BiasType biases[OutputDimensions];
     alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
-  };
+};
 
 }  // namespace Stockfish::Eval::NNUE::Layers
 
-#endif // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
+#endif  // #ifndef NNUE_LAYERS_AFFINE_TRANSFORM_SPARSE_INPUT_H_INCLUDED
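
Note (not part of the patch): find_nnz collects the indices of the nonzero
32-bit blocks of the input vector. The SIMD code above computes an 8-bit mask
per group of eight blocks with vec_nnz, then turns each mask byte into up to
eight output indices with a single 128-bit load from lookup_indices (entry m
of that table lists the set-bit positions of m), advancing the write cursor by
popcount(m). The x86 vec_nnz tests "greater than zero" rather than "nonzero",
which is equivalent here because the layer's inputs are non-negative clipped
activations. A minimal scalar sketch of the same contract; find_nnz_scalar is
a hypothetical name, not part of Stockfish:

#include <cstdint>

using IndexType = std::uint32_t;  // stand-in for the IndexType from nnue_common.h

// Reference behaviour of find_nnz<InputDimensions>: write the index of every
// nonzero std::int32_t to 'out' in increasing order and report the count.
template<IndexType InputDimensions>
void find_nnz_scalar(const std::int32_t* input, std::uint16_t* out, IndexType& count_out) {
    IndexType count = 0;
    for (IndexType i = 0; i < InputDimensions; ++i)
        if (input[i] != 0)
            out[count++] = std::uint16_t(i);
    count_out = count;
}

The vectorised version produces exactly this output, eight candidate indices
at a time.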
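
Note (not part of the patch): get_weight_index_scrambled permutes the
row-major weight matrix so that, for each ChunkSize-wide (4-byte) input chunk,
the weights of all output neurons sit in one contiguous strip; this is what
lets propagate address a whole strip as &weights[i * OutputDimensions *
ChunkSize] from a single nonzero-chunk index i. A toy self-check under assumed
dimensions (the constants below are illustrative, not taken from the patch):

#include <cassert>
#include <cstdint>

using IndexType = std::uint32_t;

// Hypothetical toy layer: PaddedInputDimensions = 8, OutputDimensions = 16.
constexpr IndexType ChunkSize = 4, OutDims = 16, PadInDims = 8;

// Same formula as get_weight_index_scrambled in the patch.
constexpr IndexType scrambled(IndexType i) {
    return (i / ChunkSize) % (PadInDims / ChunkSize) * OutDims * ChunkSize
         + i / PadInDims * ChunkSize + i % ChunkSize;
}

int main() {
    // i enumerates the row-major layout: i = row * PadInDims + col.
    // Each weight must land in the strip of its input chunk (col / ChunkSize),
    // at offset row * ChunkSize + (col % ChunkSize) within that strip.
    for (IndexType row = 0; row < OutDims; ++row)
        for (IndexType col = 0; col < PadInDims; ++col)
            assert(scrambled(row * PadInDims + col)
                   == (col / ChunkSize) * OutDims * ChunkSize
                        + row * ChunkSize + col % ChunkSize);
    return 0;
}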