#include <iostream>
#include <algorithm>
#include <array>
-#include <bitset>
#include <type_traits>
#include "../nnue_common.h"
#include "affine_transform.h"
*/
namespace Stockfish::Eval::NNUE::Layers {
-#if defined(__GNUC__) // GCC, Clang, ICC
-
// lsb_ for compilers that define __GNUC__: asserts that b is non-zero, then
// returns the number of trailing zero bits of b (via __builtin_ctzl), i.e. the
// index of its lowest set bit, converted to IndexType.
// NOTE(review): every line of this definition carries a '-' diff marker, so it
// appears to be the pre-change side of a patch — confirm before relying on it.
- static inline IndexType lsb_(std::uint32_t b) {
- assert(b);
- return IndexType(__builtin_ctzl(b));
- }
-
-#elif defined(_MSC_VER) // MSVC
-
// lsb_ for MSVC (_MSC_VER): asserts that b is non-zero, lets _BitScanForward
// store the position of the lowest set bit of b in idx, and returns that
// position converted to IndexType.
// NOTE(review): every line of this definition carries a '-' diff marker, so it
// appears to be the pre-change side of a patch — confirm before relying on it.
- static inline IndexType lsb_(std::uint32_t b) {
- assert(b);
// Output slot for _BitScanForward; written before it is read below.
- unsigned long idx;
- _BitScanForward(&idx, b);
- return (IndexType) idx;
- }
-
-#else // Compiler is neither GCC nor MSVC compatible
-
-#error "Compiler not supported."
-
-#endif
-
#if defined(USE_SSSE3)
// lookup_indices[m] holds, for each 8-bit mask m in 0..255, the positions of
// the set bits of m, written into consecutive slots starting at slot 0. The
// table is produced once by an immediately-invoked lambda.
// Both sides of the patch are shown: the '-' lines extract bits with lsb_ and
// clear them manually; the '+' lines delegate to pop_lsb.
alignas(CacheLineSize) static inline const std::array<std::array<std::uint16_t, 8>, 256> lookup_indices = [](){
// Value-initialized, so slots beyond a mask's set-bit count remain 0.
std::array<std::array<std::uint16_t, 8>, 256> v{};
- for (int i = 0; i < 256; ++i)
+ for (unsigned i = 0; i < 256; ++i)
{
- int j = i;
- int k = 0;
// j holds the bits of i not yet recorded; k is the next free slot in v[i].
+ std::uint64_t j = i, k = 0;
while(j)
- {
- const IndexType lsbIndex = lsb_(std::uint32_t(j));
// Clear the lowest set bit of j so the next iteration sees the next one.
- j &= j - 1;
- v[i][k] = lsbIndex;
- ++k;
- }
// NOTE(review): pop_lsb is defined outside this excerpt; presumably it returns
// the index of the lowest set bit of j and clears that bit in j — confirm.
+ v[i][k++] = pop_lsb(j);
}
return v;
}();
// lookup_count[m] is the number of set bits in the 8-bit value m (0..255),
// computed with std::bitset<8>::count inside an immediately-invoked lambda.
// NOTE(review): every line of this definition carries a '-' diff marker, and
// the '+' side of the patch uses popcount(lookup) where this table was read —
// it appears to be removed by the change; confirm.
- alignas(CacheLineSize) static inline const std::array<unsigned, 256> lookup_count = [](){
// Not value-initialized; the loop below assigns all 256 elements before return.
- std::array<unsigned, 256> v;
- for (int i = 0; i < 256; ++i)
- v[i] = unsigned(std::bitset<8>(i).count());
- return v;
- }();
// Find indices of nonzero numbers in an int32_t array
template<const IndexType InputDimensions>
#define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
#elif defined (USE_AVX2)
using vec_t = __m256i;
- #define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
+ #if defined(USE_VNNI) && !defined(USE_AVXVNNI)
+ #define vec_nnz(a) _mm256_cmpgt_epi32_mask(a, _mm256_setzero_si256())
+ #else
+ #define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
+ #endif
#elif defined (USE_SSSE3)
using vec_t = __m128i;
#define vec_nnz(a) _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
const auto inputVector = reinterpret_cast<const vec_t*>(input);
IndexType count = 0;
- __m128i base = _mm_set1_epi16(0);
- __m128i increment = _mm_set1_epi16(8);
+ __m128i base = _mm_setzero_si128();
+ const __m128i increment = _mm_set1_epi16(8);
for (IndexType i = 0; i < NumChunks; ++i)
{
// bitmask of nonzero values in this chunk
const auto lookup = (nnz >> (j * 8)) & 0xFF;
const auto offsets = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&lookup_indices[lookup]));
_mm_storeu_si128(reinterpret_cast<__m128i*>(out + count), _mm_add_epi16(base, offsets));
- count += lookup_count[lookup];
+ count += popcount(lookup);
base = _mm_add_epi16(base, increment);
}
}
template <IndexType InDims, IndexType OutDims>
class AffineTransformSparseInput {
public:
- // Input/output type
// Input/output type
using InputType = std::uint8_t;
using OutputType = std::int32_t;
return hashValue;
}
// Maps a weight index i to its position in the scrambled weight layout, using
// ChunkSize, PaddedInputDimensions and OutputDimensions, which are declared
// outside this excerpt. The '+' signature differs from the '-' signature only
// by adding constexpr.
// NOTE(review): when ChunkSize divides PaddedInputDimensions, none of the terms
// visible below distinguishes i from i + PaddedInputDimensions, so part of the
// return expression may be elided from this excerpt — verify against the full
// file before treating the visible formula as complete.
- static IndexType get_weight_index_scrambled(IndexType i)
+ static constexpr IndexType get_weight_index_scrambled(IndexType i)
{
return
// Chunk number of i within its row of PaddedInputDimensions, scaled by the
// size of one chunk across all outputs.
(i / ChunkSize) % (PaddedInputDimensions / ChunkSize) * OutputDimensions * ChunkSize +
// Offset of i inside its chunk.
i % ChunkSize;
}
- static IndexType get_weight_index(IndexType i)
+ static constexpr IndexType get_weight_index(IndexType i)
{
#if defined (USE_SSSE3)
return get_weight_index_scrambled(i);
return !stream.fail();
}
// Forward propagation
- const OutputType* propagate(
+ void propagate(
const InputType* input, OutputType* output) const {
#if defined (USE_SSSE3)
PaddedInputDimensions,
OutputDimensions>(output, weights, biases, input);
#endif
-
- return output;
}
private: