#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
#define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
-#include "nnue_common.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <utility>
+
+#include "../position.h"
+#include "../types.h"
+#include "nnue_accumulator.h"
#include "nnue_architecture.h"
-
-#include <cstring> // std::memset()
-#include <utility> // std::pair
+#include "nnue_common.h"
namespace Stockfish::Eval::NNUE {
"Per feature PSQT values cannot be processed at granularity lower than 8 at a time.");
#ifdef USE_AVX512
- typedef __m512i vec_t;
- typedef __m256i psqt_vec_t;
+ using vec_t = __m512i;
+ using psqt_vec_t = __m256i;
#define vec_load(a) _mm512_load_si512(a)
#define vec_store(a,b) _mm512_store_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
#define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
#define vec_zero_psqt() _mm256_setzero_si256()
- #define NumRegistersSIMD 32
+ #define NumRegistersSIMD 16
#define MaxChunkSize 64
#elif USE_AVX2
- typedef __m256i vec_t;
- typedef __m256i psqt_vec_t;
+ using vec_t = __m256i;
+ using psqt_vec_t = __m256i;
#define vec_load(a) _mm256_load_si256(a)
#define vec_store(a,b) _mm256_store_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define MaxChunkSize 32
#elif USE_SSE2
- typedef __m128i vec_t;
- typedef __m128i psqt_vec_t;
+ using vec_t = __m128i;
+ using psqt_vec_t = __m128i;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define MaxChunkSize 16
#elif USE_MMX
- typedef __m64 vec_t;
- typedef __m64 psqt_vec_t;
+ using vec_t = __m64;
+ using psqt_vec_t = __m64;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define MaxChunkSize 8
#elif USE_NEON
- typedef int16x8_t vec_t;
- typedef int32x4_t psqt_vec_t;
+ using vec_t = int16x8_t;
+ using psqt_vec_t = int32x4_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) vaddq_s16(a,b)
// Read network parameters.
// Loads three arrays from `stream`, in this order:
//   biases      - HalfDimensions elements of BiasType
//   weights     - HalfDimensions * InputDimensions elements of WeightType
//   psqtWeights - PSQTBuckets * InputDimensions elements of PSQTWeightType
// This change swaps the read_little_endian helper for read_leb_128
// (presumably a LEB128 variable-length encoding - confirm against the helper's
// definition); the element counts and the read order are unchanged.
// Returns true when the stream is not in a failed state after all three reads.
bool read_parameters(std::istream& stream) {
- read_little_endian<BiasType >(stream, biases , HalfDimensions );
- read_little_endian<WeightType >(stream, weights , HalfDimensions * InputDimensions);
- read_little_endian<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
+ read_leb_128<BiasType >(stream, biases , HalfDimensions );
+ read_leb_128<WeightType >(stream, weights , HalfDimensions * InputDimensions);
+ read_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
// Errors from the individual reads are not checked one by one; only the
// final stream state decides the result.
return !stream.fail();
}
// Write network parameters.
// Stores three arrays to `stream`, in the same order read_parameters loads them:
//   biases      - HalfDimensions elements of BiasType
//   weights     - HalfDimensions * InputDimensions elements of WeightType
//   psqtWeights - PSQTBuckets * InputDimensions elements of PSQTWeightType
// This change swaps the write_little_endian helper for write_leb_128
// (presumably a LEB128 variable-length encoding - confirm against the helper's
// definition); the element counts and the write order are unchanged.
// Returns true when the stream is not in a failed state after all three writes.
bool write_parameters(std::ostream& stream) const {
- write_little_endian<BiasType >(stream, biases , HalfDimensions );
- write_little_endian<WeightType >(stream, weights , HalfDimensions * InputDimensions);
- write_little_endian<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
+ write_leb_128<BiasType >(stream, biases , HalfDimensions );
+ write_leb_128<WeightType >(stream, weights , HalfDimensions * InputDimensions);
+ write_leb_128<PSQTWeightType>(stream, psqtWeights, PSQTBuckets * InputDimensions);
// Errors from the individual writes are not checked one by one; only the
// final stream state decides the result.
return !stream.fail();
}
// Scalar path: combine the two halves of the accumulator for perspective p.
// For each j in [0, HalfDimensions / 2), element j is paired with element
// j + HalfDimensions / 2; both are clamped to [0, 127], multiplied, and the
// product divided by 128 is stored at output[offset + j].
for (IndexType j = 0; j < HalfDimensions / 2; ++j) {
BiasType sum0 = accumulation[static_cast<int>(perspectives[p])][j + 0];
BiasType sum1 = accumulation[static_cast<int>(perspectives[p])][j + HalfDimensions / 2];
// The removed max/min pair and the added std::clamp calls both bound the
// value to the closed range [0, 127].
- sum0 = std::max<int>(0, std::min<int>(127, sum0));
- sum1 = std::max<int>(0, std::min<int>(127, sum1));
- output[offset + j] = static_cast<OutputType>(sum0 * sum1 / 128);
+ sum0 = std::clamp<BiasType>(sum0, 0, 127);
+ sum1 = std::clamp<BiasType>(sum1, 0, 127);
// After clamping the product lies in [0, 127 * 127], so converting it to
// unsigned leaves the value unchanged and the quotient equals the signed one.
// NOTE(review): presumably the cast is there so the division by 128 can be
// done as an unsigned operation - confirm the intent.
+ output[offset + j] = static_cast<OutputType>(unsigned(sum0 * sum1) / 128);
}
#endif
// NOTE: The parameter states_to_update is an array of position states, ending with nullptr.
// All states must be sequential, that is states_to_update[i] must either be reachable
// by repeatedly applying ->previous from states_to_update[i+1] or states_to_update[i] == nullptr.
- // computed_st must be reachable by repeatadly applying ->previous on states_to_update[0], if not nullptr.
+ // computed_st must be reachable by repeatedly applying ->previous on states_to_update[0], if not nullptr.
template<Color Perspective, size_t N>
void update_accumulator_incremental(const Position& pos, StateInfo* computed_st, StateInfo* states_to_update[N]) const {
static_assert(N > 0);