X-Git-Url: https://git.sesse.net/?p=stockfish;a=blobdiff_plain;f=src%2Fnnue%2Fnnue_feature_transformer.h;fp=src%2Fnnue%2Fnnue_feature_transformer.h;h=10b226b31130d802155ef34a273807b28470bd34;hp=c249d3e70184edd46be1fef51fa5fe874d16fcd0;hb=8f081c86f7f8827ea35fc687e6f6591950cc8f90;hpb=4445965f9714402050119f9e6a76c6a8fc4f8d9a diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index c249d3e7..10b226b3 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -180,118 +180,144 @@ namespace Stockfish::Eval::NNUE { const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation; const auto psqt = ( - psqtAccumulation[static_cast(perspectives[0])][bucket] - - psqtAccumulation[static_cast(perspectives[1])][bucket] + psqtAccumulation[perspectives[0]][bucket] + - psqtAccumulation[perspectives[1]][bucket] ) / 2; + #if defined(USE_AVX512) + constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2); static_assert(HalfDimensions % (SimdWidth * 2) == 0); const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); const __m512i Zero = _mm512_setzero_si512(); + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + auto out = reinterpret_cast<__m512i*>(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + __m512i sum0 = _mm512_load_si512(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 0]); + __m512i sum1 = _mm512_load_si512(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 1]); + + _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control, + _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero))); + } + } + return psqt; + #elif defined(USE_AVX2) + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; constexpr int Control = 0b11011000; const __m256i Zero = _mm256_setzero_si256(); + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + auto out = reinterpret_cast<__m256i*>(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + __m256i sum0 = _mm256_load_si256(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 0]); + __m256i sum1 = _mm256_load_si256(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 1]); + + _mm256_store_si256(&out[j], _mm256_permute4x64_epi64( + _mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control)); + } + } + return psqt; + #elif defined(USE_SSE2) - constexpr IndexType NumChunks = HalfDimensions / SimdWidth; - #ifdef USE_SSE41 + #ifdef USE_SSE41 + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; const __m128i Zero = _mm_setzero_si128(); - #else + #else + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; const __m128i k0x80s = _mm_set1_epi8(-128); - #endif + #endif + + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + auto out = reinterpret_cast<__m128i*>(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + __m128i sum0 = _mm_load_si128(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 0]); + __m128i sum1 = _mm_load_si128(&reinterpret_cast + (accumulation[perspectives[p]])[j * 2 + 1]); + const __m128i packedbytes = _mm_packs_epi16(sum0, sum1); + + #ifdef USE_SSE41 + _mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero)); + #else + _mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)); + #endif + } + } + return psqt; #elif defined(USE_MMX) + constexpr IndexType NumChunks = HalfDimensions / SimdWidth; const __m64 k0x80s = _mm_set1_pi8(-128); + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + auto out = reinterpret_cast<__m64*>(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + __m64 sum0 = *(&reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 0]); + __m64 sum1 = *(&reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 1]); + const __m64 packedbytes = _mm_packs_pi16(sum0, sum1); + out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); + } + } + _mm_empty(); + return psqt; + #elif defined(USE_NEON) + constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2); const int8x8_t Zero = {0}; - #endif - - for (IndexType p = 0; p < 2; ++p) { - const IndexType offset = HalfDimensions * p; - - #if defined(USE_AVX512) - auto out = reinterpret_cast<__m512i*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m512i sum0 = _mm512_load_si512( - &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 0]); - __m512i sum1 = _mm512_load_si512( - &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 1]); - _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control, - _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero))); - } - #elif defined(USE_AVX2) - auto out = reinterpret_cast<__m256i*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m256i sum0 = _mm256_load_si256( - &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 0]); - __m256i sum1 = _mm256_load_si256( - &reinterpret_cast(accumulation[perspectives[p]])[j * 2 + 1]); - _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( - _mm256_packs_epi16(sum0, sum1), Zero), Control)); - } + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + const auto out = reinterpret_cast(&output[offset]); + for (IndexType j = 0; j < NumChunks; ++j) + { + int16x8_t sum = reinterpret_cast(accumulation[perspectives[p]])[j]; + out[j] = vmax_s8(vqmovn_s16(sum), Zero); + } + } + return psqt; - #elif defined(USE_SSE2) - auto out = reinterpret_cast<__m128i*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m128i sum0 = _mm_load_si128(&reinterpret_cast( - accumulation[perspectives[p]])[j * 2 + 0]); - __m128i sum1 = _mm_load_si128(&reinterpret_cast( - accumulation[perspectives[p]])[j * 2 + 1]); - const __m128i packedbytes = _mm_packs_epi16(sum0, sum1); - - _mm_store_si128(&out[j], - - #ifdef USE_SSE41 - _mm_max_epi8(packedbytes, Zero) #else - _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) - #endif - - ); - } - - #elif defined(USE_MMX) - auto out = reinterpret_cast<__m64*>(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m64 sum0 = *(&reinterpret_cast( - accumulation[perspectives[p]])[j * 2 + 0]); - __m64 sum1 = *(&reinterpret_cast( - accumulation[perspectives[p]])[j * 2 + 1]); - const __m64 packedbytes = _mm_packs_pi16(sum0, sum1); - out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); - } - #elif defined(USE_NEON) - const auto out = reinterpret_cast(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - int16x8_t sum = reinterpret_cast( - accumulation[perspectives[p]])[j]; - out[j] = vmax_s8(vqmovn_s16(sum), Zero); - } + for (IndexType p = 0; p < 2; ++p) + { + const IndexType offset = HalfDimensions * p; + for (IndexType j = 0; j < HalfDimensions; ++j) + { + BiasType sum = accumulation[perspectives[p]][j]; + output[offset + j] = static_cast(std::max(0, std::min(127, sum))); + } + } + return psqt; - #else - for (IndexType j = 0; j < HalfDimensions; ++j) { - BiasType sum = accumulation[static_cast(perspectives[p])][j]; - output[offset + j] = static_cast( - std::max(0, std::min(127, sum))); - } #endif - } - #if defined(USE_MMX) - _mm_empty(); - #endif + } // end of function transform() + - return psqt; - } private: void update_accumulator(const Position& pos, const Color perspective) const {