X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Fnnue%2Fnnue_feature_transformer.h;h=10b226b31130d802155ef34a273807b28470bd34;hb=8f081c86f7f8827ea35fc687e6f6591950cc8f90;hp=2c0a0c6d3134b61270a9a16a3a1ae199176d9d07;hpb=e8d64af1230fdac65bb0da246df3e7abe82e0838;p=stockfish

diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 2c0a0c6d..10b226b3 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -84,18 +84,18 @@ namespace Stockfish::Eval::NNUE {
 
   #elif USE_MMX
   typedef __m64 vec_t;
-  typedef std::int32_t psqt_vec_t;
+  typedef __m64 psqt_vec_t;
   #define vec_load(a) (*(a))
   #define vec_store(a,b) *(a)=(b)
   #define vec_add_16(a,b) _mm_add_pi16(a,b)
   #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
   #define vec_load_psqt(a) (*(a))
   #define vec_store_psqt(a,b) *(a)=(b)
-  #define vec_add_psqt_32(a,b) a+b
-  #define vec_sub_psqt_32(a,b) a-b
-  #define vec_zero_psqt() 0
+  #define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
+  #define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
+  #define vec_zero_psqt() _mm_setzero_si64()
   static constexpr IndexType NumRegs = 8;
-  static constexpr IndexType NumPsqtRegs = 8;
+  static constexpr IndexType NumPsqtRegs = 4;
 
   #elif USE_NEON
   typedef int16x8_t vec_t;
@@ -124,8 +124,6 @@ namespace Stockfish::Eval::NNUE {
     // Number of output dimensions for one side
     static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;
 
-    static constexpr int LazyThreshold = 1400;
-
     #ifdef VECTOR
     static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2;
     static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
@@ -167,11 +165,13 @@ namespace Stockfish::Eval::NNUE {
         write_little_endian<BiasType>(stream, biases[i]);
       for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i)
         write_little_endian<WeightType>(stream, weights[i]);
+      for (std::size_t i = 0; i < PSQTBuckets * InputDimensions; ++i)
+        write_little_endian<PSQTWeightType>(stream, psqtWeights[i]);
       return !stream.fail();
     }
 
     // Convert input features
-    std::pair<std::int32_t, bool> transform(const Position& pos, OutputType* output, int bucket) const {
+    std::int32_t transform(const Position& pos, OutputType* output, int bucket) const {
       update_accumulator(pos, WHITE);
       update_accumulator(pos, BLACK);
 
@@ -180,121 +180,144 @@ namespace Stockfish::Eval::NNUE {
       const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation;
 
       const auto psqt = (
-            psqtAccumulation[static_cast<int>(perspectives[0])][bucket]
-          - psqtAccumulation[static_cast<int>(perspectives[1])][bucket]
+            psqtAccumulation[perspectives[0]][bucket]
+          - psqtAccumulation[perspectives[1]][bucket]
         ) / 2;
 
-      if (abs(psqt) > LazyThreshold * OutputScale)
-        return { psqt, true };
 
   #if defined(USE_AVX512)
+
       constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
       static_assert(HalfDimensions % (SimdWidth * 2) == 0);
       const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
       const __m512i Zero = _mm512_setzero_si512();
 
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          auto out = reinterpret_cast<__m512i*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              __m512i sum0 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
+                                              (accumulation[perspectives[p]])[j * 2 + 0]);
+              __m512i sum1 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
+                                              (accumulation[perspectives[p]])[j * 2 + 1]);
+
+              _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
+                                 _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
+          }
+      }
+      return psqt;
+
   #elif defined(USE_AVX2)
+
       constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
       constexpr int Control = 0b11011000;
       const __m256i Zero = _mm256_setzero_si256();
 
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          auto out = reinterpret_cast<__m256i*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              __m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
+                                              (accumulation[perspectives[p]])[j * 2 + 0]);
+              __m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
+                                              (accumulation[perspectives[p]])[j * 2 + 1]);
+
+              _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(
+                                 _mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control));
+          }
+      }
+      return psqt;
+
   #elif defined(USE_SSE2)
-      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
 
-  #ifdef USE_SSE41
+      #ifdef USE_SSE41
+      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
       const __m128i Zero = _mm_setzero_si128();
-  #else
+      #else
+      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
       const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
+      #endif
+
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          auto out = reinterpret_cast<__m128i*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>
+                                           (accumulation[perspectives[p]])[j * 2 + 0]);
+              __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>
+                                           (accumulation[perspectives[p]])[j * 2 + 1]);
+              const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
+
+              #ifdef USE_SSE41
+              _mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero));
+              #else
+              _mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s));
+              #endif
+          }
+      }
+      return psqt;
 
   #elif defined(USE_MMX)
+
       constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
       const __m64 k0x80s = _mm_set1_pi8(-128);
 
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          auto out = reinterpret_cast<__m64*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              __m64 sum0 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 0]);
+              __m64 sum1 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 1]);
+              const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+              out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+          }
+      }
+      _mm_empty();
+      return psqt;
+
   #elif defined(USE_NEON)
+
       constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2);
       const int8x8_t Zero = {0};
-  #endif
-
-      for (IndexType p = 0; p < 2; ++p) {
-          const IndexType offset = HalfDimensions * p;
-
-  #if defined(USE_AVX512)
-          auto out = reinterpret_cast<__m512i*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j) {
-              __m512i sum0 = _mm512_load_si512(
-                  &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]])[j * 2 + 0]);
-              __m512i sum1 = _mm512_load_si512(
-                  &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]])[j * 2 + 1]);
-              _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
-                  _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
-          }
 
-  #elif defined(USE_AVX2)
-          auto out = reinterpret_cast<__m256i*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j) {
-              __m256i sum0 = _mm256_load_si256(
-                  &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]])[j * 2 + 0]);
-              __m256i sum1 = _mm256_load_si256(
-                  &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]])[j * 2 + 1]);
-              _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
-                  _mm256_packs_epi16(sum0, sum1), Zero), Control));
-          }
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              int16x8_t sum = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j];
+              out[j] = vmax_s8(vqmovn_s16(sum), Zero);
+          }
+      }
+      return psqt;
 
-  #elif defined(USE_SSE2)
-          auto out = reinterpret_cast<__m128i*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j) {
-              __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-                  accumulation[perspectives[p]])[j * 2 + 0]);
-              __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-                  accumulation[perspectives[p]])[j * 2 + 1]);
-              const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
-
-              _mm_store_si128(&out[j],
-
-  #ifdef USE_SSE41
-                  _mm_max_epi8(packedbytes, Zero)
-  #else
-                  _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
-              );
-          }
-
-  #elif defined(USE_MMX)
-          auto out = reinterpret_cast<__m64*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j) {
-              __m64 sum0 = *(&reinterpret_cast<const __m64*>(
-                  accumulation[perspectives[p]])[j * 2 + 0]);
-              __m64 sum1 = *(&reinterpret_cast<const __m64*>(
-                  accumulation[perspectives[p]])[j * 2 + 1]);
-              const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
-              out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
-          }
-
-  #elif defined(USE_NEON)
-          const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j) {
-              int16x8_t sum = reinterpret_cast<const int16x8_t*>(
-                  accumulation[perspectives[p]])[j];
-              out[j] = vmax_s8(vqmovn_s16(sum), Zero);
-          }
+  #else
+
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          for (IndexType j = 0; j < HalfDimensions; ++j)
+          {
+              BiasType sum = accumulation[perspectives[p]][j];
+              output[offset + j] = static_cast<OutputType>(std::max<int>(0, std::min<int>(127, sum)));
+          }
+      }
+      return psqt;
 
-  #else
-          for (IndexType j = 0; j < HalfDimensions; ++j) {
-              BiasType sum = accumulation[static_cast<int>(perspectives[p])][j];
-              output[offset + j] = static_cast<OutputType>(
-                  std::max<int>(0, std::min<int>(127, sum)));
-          }
   #endif
-      }
 
-  #if defined(USE_MMX)
-      _mm_empty();
-  #endif
+   } // end of function transform()
+
 
-      return { psqt, false };
-    }
 
    private:
     void update_accumulator(const Position& pos, const Color perspective) const {