From: mstembera Date: Wed, 12 Jul 2023 05:19:48 +0000 (-0700) Subject: More simplifications and cleanup in affine_transform_sparse_input.h X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;ds=sidebyside;h=529d3be8e245c06b62a45525cb9325ed6e50b636;p=stockfish More simplifications and cleanup in affine_transform_sparse_input.h closes https://github.com/official-stockfish/Stockfish/pull/4677 No functional change --- diff --git a/src/nnue/layers/affine_transform_sparse_input.h b/src/nnue/layers/affine_transform_sparse_input.h index 3c7defcc..a5bea08e 100644 --- a/src/nnue/layers/affine_transform_sparse_input.h +++ b/src/nnue/layers/affine_transform_sparse_input.h @@ -34,43 +34,15 @@ */ namespace Stockfish::Eval::NNUE::Layers { -#if defined(__GNUC__) // GCC, Clang, ICC - - static inline IndexType lsb_(std::uint32_t b) { - assert(b); - return IndexType(__builtin_ctzl(b)); - } - -#elif defined(_MSC_VER) // MSVC - - static inline IndexType lsb_(std::uint32_t b) { - assert(b); - unsigned long idx; - _BitScanForward(&idx, b); - return (IndexType) idx; - } - -#else // Compiler is neither GCC nor MSVC compatible - -#error "Compiler not supported." 
- -#endif - #if defined(USE_SSSE3) alignas(CacheLineSize) static inline const std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = [](){ std::array<std::array<std::uint16_t, 8>, 256> v{}; - for (int i = 0; i < 256; ++i) + for (unsigned i = 0; i < 256; ++i) { - int j = i; - int k = 0; + std::uint64_t j = i, k = 0; while(j) - { - const IndexType lsbIndex = lsb_(std::uint32_t(j)); - j &= j - 1; - v[i][k] = lsbIndex; - ++k; - } + v[i][k++] = pop_lsb(j); } return v; }(); @@ -83,7 +55,11 @@ namespace Stockfish::Eval::NNUE::Layers { #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512()) #elif defined (USE_AVX2) using vec_t = __m256i; - #define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256()))) + #if defined(USE_VNNI) && !defined(USE_AVXVNNI) + #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256()) + #else + #define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256()))) + #endif #elif defined (USE_SSSE3) using vec_t = __m128i; #define vec_nnz(a) _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128()))) @@ -97,8 +73,8 @@ namespace Stockfish::Eval::NNUE::Layers { const auto inputVector = reinterpret_cast<const vec_t*>(input); IndexType count = 0; - __m128i base = _mm_set1_epi16(0); - __m128i increment = _mm_set1_epi16(8); + __m128i base = _mm_setzero_si128(); + const __m128i increment = _mm_set1_epi16(8); for (IndexType i = 0; i < NumChunks; ++i) { // bitmask of nonzero values in this chunk