X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Fnnue%2Fnnue_feature_transformer.h;h=4f6a174a486667ea23acab3e3d99c4f1fe13d438;hb=4766dfc3956f78d853c5e0c4636d6f90fd93df9a;hp=300ce3671c256e67c8ba5aec7a3582ae542aa7be;hpb=b84fa04db6ea5fc6d7d714539c11537bde64538b;p=stockfish diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 300ce367..4f6a174a 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -28,12 +28,17 @@ namespace Stockfish::Eval::NNUE { + using BiasType = std::int16_t; + using WeightType = std::int16_t; + using PSQTWeightType = std::int32_t; + // If vector instructions are enabled, we update and refresh the // accumulator tile by tile such that each tile fits in the CPU's // vector registers. #define VECTOR - static_assert(PSQTBuckets == 8, "Assumed by the current choice of constants."); + static_assert(PSQTBuckets % 8 == 0, + "Per feature PSQT values cannot be processed at granularity lower than 8 at a time."); #ifdef USE_AVX512 typedef __m512i vec_t; @@ -47,8 +52,7 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b) #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b) #define vec_zero_psqt() _mm256_setzero_si256() - static constexpr IndexType NumRegs = 8; // only 8 are needed - static constexpr IndexType NumPsqtRegs = 1; + #define NumRegistersSIMD 32 #elif USE_AVX2 typedef __m256i vec_t; @@ -62,8 +66,7 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b) #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b) #define vec_zero_psqt() _mm256_setzero_si256() - static constexpr IndexType NumRegs = 16; - static constexpr IndexType NumPsqtRegs = 1; + #define NumRegistersSIMD 16 #elif USE_SSE2 typedef __m128i vec_t; @@ -77,8 +80,7 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b) #define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b) #define vec_zero_psqt() _mm_setzero_si128() - static constexpr IndexType NumRegs = Is64Bit ? 16 : 8; - static constexpr IndexType NumPsqtRegs = 2; + #define NumRegistersSIMD (Is64Bit ? 16 : 8) #elif USE_MMX typedef __m64 vec_t; @@ -92,8 +94,7 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) _mm_add_pi32(a,b) #define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b) #define vec_zero_psqt() _mm_setzero_si64() - static constexpr IndexType NumRegs = 8; - static constexpr IndexType NumPsqtRegs = 4; + #define NumRegistersSIMD 8 #elif USE_NEON typedef int16x8_t vec_t; @@ -107,14 +108,61 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) vaddq_s32(a,b) #define vec_sub_psqt_32(a,b) vsubq_s32(a,b) #define vec_zero_psqt() psqt_vec_t{0} - static constexpr IndexType NumRegs = 16; - static constexpr IndexType NumPsqtRegs = 2; + #define NumRegistersSIMD 16 #else #undef VECTOR #endif + + #ifdef VECTOR + + // Compute optimal SIMD register count for feature transformer accumulation. + + // We use __m* types as template arguments, which causes GCC to emit warnings + // about losing some attribute information. This is irrelevant to us as we + // only take their size, so the following pragma are harmless. + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wignored-attributes" + + template + static constexpr int BestRegisterCount() + { + #define RegisterSize sizeof(SIMDRegisterType) + #define LaneSize sizeof(LaneType) + + static_assert(RegisterSize >= LaneSize); + static_assert(MaxRegisters <= NumRegistersSIMD); + static_assert(MaxRegisters > 0); + static_assert(NumRegistersSIMD > 0); + static_assert(RegisterSize % LaneSize == 0); + static_assert((NumLanes * LaneSize) % RegisterSize == 0); + + const int ideal = (NumLanes * LaneSize) / RegisterSize; + if (ideal <= MaxRegisters) + return ideal; + + // Look for the largest divisor of the ideal register count that is smaller than MaxRegisters + for (int divisor = MaxRegisters; divisor > 1; --divisor) + if (ideal % divisor == 0) + return divisor; + + return 1; + } + + static constexpr int NumRegs = BestRegisterCount(); + static constexpr int NumPsqtRegs = BestRegisterCount(); + + #pragma GCC diagnostic pop + + #endif + + + // Input feature converter class FeatureTransformer { @@ -288,10 +336,17 @@ namespace Stockfish::Eval::NNUE { { const IndexType offset = HalfDimensions * p; const auto out = reinterpret_cast(&output[offset]); - for (IndexType j = 0; j < NumChunks; ++j) + + constexpr IndexType UnrollFactor = 16; + static_assert(UnrollFactor % UnrollFactor == 0); + for (IndexType j = 0; j < NumChunks; j += UnrollFactor) { - int16x8_t sum = reinterpret_cast(accumulation[perspectives[p]])[j]; - out[j] = vmax_s8(vqmovn_s16(sum), Zero); + int16x8_t sums[UnrollFactor]; + for (IndexType i = 0; i < UnrollFactor; ++i) + sums[i] = reinterpret_cast(accumulation[perspectives[p]])[j+i]; + + for (IndexType i = 0; i < UnrollFactor; ++i) + out[j+i] = vmax_s8(vqmovn_s16(sums[i]), Zero); } } return psqt; @@ -322,7 +377,6 @@ namespace Stockfish::Eval::NNUE { // That might depend on the feature set and generally relies on the // feature set's update cost calculation to be correct and never // allow updates with more added/removed features than MaxActiveDimensions. - using IndexList = ValueList; #ifdef VECTOR // Gcc-10.2 unnecessarily spills AVX2 registers if this array @@ -335,7 +389,7 @@ namespace Stockfish::Eval::NNUE { // of the estimated gain in terms of features to be added/subtracted. StateInfo *st = pos.state(), *next = nullptr; int gain = FeatureSet::refresh_cost(pos); - while (st->accumulator.state[perspective] == EMPTY) + while (st->previous && !st->accumulator.computed[perspective]) { // This governs when a full feature refresh is needed and how many // updates are better than just one full refresh. @@ -346,7 +400,7 @@ namespace Stockfish::Eval::NNUE { st = st->previous; } - if (st->accumulator.state[perspective] == COMPUTED) + if (st->accumulator.computed[perspective]) { if (next == nullptr) return; @@ -356,16 +410,16 @@ namespace Stockfish::Eval::NNUE { // Gather all features to be updated. const Square ksq = pos.square(perspective); - IndexList removed[2], added[2]; + FeatureSet::IndexList removed[2], added[2]; FeatureSet::append_changed_indices( - ksq, next, perspective, removed[0], added[0]); + ksq, next->dirtyPiece, perspective, removed[0], added[0]); for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous) FeatureSet::append_changed_indices( - ksq, st2, perspective, removed[1], added[1]); + ksq, st2->dirtyPiece, perspective, removed[1], added[1]); // Mark the accumulators as computed. - next->accumulator.state[perspective] = COMPUTED; - pos.state()->accumulator.state[perspective] = COMPUTED; + next->accumulator.computed[perspective] = true; + pos.state()->accumulator.computed[perspective] = true; // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. StateInfo *states_to_update[3] = @@ -485,8 +539,8 @@ namespace Stockfish::Eval::NNUE { { // Refresh the accumulator auto& accumulator = pos.state()->accumulator; - accumulator.state[perspective] = COMPUTED; - IndexList active; + accumulator.computed[perspective] = true; + FeatureSet::IndexList active; FeatureSet::append_active_indices(pos, perspective, active); #ifdef VECTOR @@ -557,10 +611,6 @@ namespace Stockfish::Eval::NNUE { #endif } - using BiasType = std::int16_t; - using WeightType = std::int16_t; - using PSQTWeightType = std::int32_t; - alignas(CacheLineSize) BiasType biases[HalfDimensions]; alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions]; alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];