X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Fnnue%2Fnnue_feature_transformer.h;h=13f1604fe1365d1b07a7bc2ef6a4cd111dade85f;hb=08385527dd470ece814ac85013802995a0e7f6ca;hp=62f1615d5fefc93efb197427b4a556a7015a2973;hpb=77dfcbedce2861b2c6c5056d49e7a8731fea4256;p=stockfish diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 62f1615d..13f1604f 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -25,6 +25,7 @@ #include "nnue_architecture.h" #include // std::memset() +#include // std::pair namespace Stockfish::Eval::NNUE { @@ -332,27 +333,16 @@ namespace Stockfish::Eval::NNUE { #endif return psqt; + } // end of function transform() - } // end of function transform() - - + void hint_common_access(const Position& pos) const { + hint_common_access_for_perspective(pos); + hint_common_access_for_perspective(pos); + } private: template - void update_accumulator(const Position& pos) const { - - // The size must be enough to contain the largest possible update. - // That might depend on the feature set and generally relies on the - // feature set's update cost calculation to be correct and never - // allow updates with more added/removed features than MaxActiveDimensions. - - #ifdef VECTOR - // Gcc-10.2 unnecessarily spills AVX2 registers if this array - // is defined in the VECTOR code below, once in each branch - vec_t acc[NumRegs]; - psqt_vec_t psqt[NumPsqtRegs]; - #endif - + [[nodiscard]] std::pair try_find_computed_accumulator(const Position& pos) const { // Look for a usable accumulator of an earlier position. We keep track // of the estimated gain in terms of features to be added/subtracted. StateInfo *st = pos.state(), *next = nullptr; @@ -367,218 +357,313 @@ namespace Stockfish::Eval::NNUE { next = st; st = st->previous; } + return { st, next }; + } - if (st->accumulator.computed[Perspective]) - { - if (next == nullptr) - return; + // NOTE: The parameter states_to_update is an array of position states, ending with nullptr. + // All states must be sequential, that is states_to_update[i] must either be reachable + // by repeatedly applying ->previous from states_to_update[i+1] or states_to_update[i] == nullptr. + // computed_st must be reachable by repeatadly applying ->previous on states_to_update[0], if not nullptr. + template + void update_accumulator_incremetal(const Position& pos, StateInfo* computed_st, StateInfo* states_to_update[N]) const { + static_assert(N > 0); + assert(states_to_update[N-1] == nullptr); - // Update incrementally in two steps. First, we update the "next" - // accumulator. Then, we update the current accumulator (pos.state()). + #ifdef VECTOR + // Gcc-10.2 unnecessarily spills AVX2 registers if this array + // is defined in the VECTOR code below, once in each branch + vec_t acc[NumRegs]; + psqt_vec_t psqt[NumPsqtRegs]; + #endif - // Gather all features to be updated. - const Square ksq = pos.square(Perspective); - FeatureSet::IndexList removed[2], added[2]; - FeatureSet::append_changed_indices( - ksq, next->dirtyPiece, removed[0], added[0]); - for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous) - FeatureSet::append_changed_indices( - ksq, st2->dirtyPiece, removed[1], added[1]); + if (states_to_update[0] == nullptr) + return; - // Mark the accumulators as computed. - next->accumulator.computed[Perspective] = true; - pos.state()->accumulator.computed[Perspective] = true; + // Update incrementally going back through states_to_update. - // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. - StateInfo *states_to_update[3] = - { next, next == pos.state() ? nullptr : pos.state(), nullptr }; - #ifdef VECTOR - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) - { - // Load accumulator - auto accTile = reinterpret_cast( - &st->accumulator.accumulation[Perspective][j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = vec_load(&accTile[k]); + // Gather all features to be updated. + const Square ksq = pos.square(Perspective); - for (IndexType i = 0; states_to_update[i]; ++i) - { - // Difference calculation for the deactivated features - for (const auto index : removed[i]) - { - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = vec_sub_16(acc[k], column[k]); - } - - // Difference calculation for the activated features - for (const auto index : added[i]) - { - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); - } - - // Store accumulator - accTile = reinterpret_cast( - &states_to_update[i]->accumulator.accumulation[Perspective][j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - vec_store(&accTile[k], acc[k]); - } - } + // The size must be enough to contain the largest possible update. + // That might depend on the feature set and generally relies on the + // feature set's update cost calculation to be correct and never + // allow updates with more added/removed features than MaxActiveDimensions. + FeatureSet::IndexList removed[N-1], added[N-1]; - for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) - { - // Load accumulator - auto accTilePsqt = reinterpret_cast( - &st->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_load_psqt(&accTilePsqt[k]); + { + int i = N-2; // last potential state to update. Skip last element because it must be nullptr. + while (states_to_update[i] == nullptr) + --i; - for (IndexType i = 0; states_to_update[i]; ++i) - { - // Difference calculation for the deactivated features - for (const auto index : removed[i]) - { - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); - } - - // Difference calculation for the activated features - for (const auto index : added[i]) - { - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); - } - - // Store accumulator - accTilePsqt = reinterpret_cast( - &states_to_update[i]->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - vec_store_psqt(&accTilePsqt[k], psqt[k]); - } - } + StateInfo *st2 = states_to_update[i]; - #else - for (IndexType i = 0; states_to_update[i]; ++i) + for (; i >= 0; --i) { - std::memcpy(states_to_update[i]->accumulator.accumulation[Perspective], - st->accumulator.accumulation[Perspective], - HalfDimensions * sizeof(BiasType)); + states_to_update[i]->accumulator.computed[Perspective] = true; - for (std::size_t k = 0; k < PSQTBuckets; ++k) - states_to_update[i]->accumulator.psqtAccumulation[Perspective][k] = st->accumulator.psqtAccumulation[Perspective][k]; + StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1]; + + for (; st2 != end_state; st2 = st2->previous) + FeatureSet::append_changed_indices( + ksq, st2->dirtyPiece, removed[i], added[i]); + } + } + + StateInfo* st = computed_st; - st = states_to_update[i]; + // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. +#ifdef VECTOR + for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) + { + // Load accumulator + auto accTile = reinterpret_cast( + &st->accumulator.accumulation[Perspective][j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = vec_load(&accTile[k]); + for (IndexType i = 0; states_to_update[i]; ++i) + { // Difference calculation for the deactivated features for (const auto index : removed[i]) { - const IndexType offset = HalfDimensions * index; - - for (IndexType j = 0; j < HalfDimensions; ++j) - st->accumulator.accumulation[Perspective][j] -= weights[offset + j]; - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - st->accumulator.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = vec_sub_16(acc[k], column[k]); } // Difference calculation for the activated features for (const auto index : added[i]) - { - const IndexType offset = HalfDimensions * index; - - for (IndexType j = 0; j < HalfDimensions; ++j) - st->accumulator.accumulation[Perspective][j] += weights[offset + j]; - - for (std::size_t k = 0; k < PSQTBuckets; ++k) - st->accumulator.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; - } - } - #endif - } - else - { - // Refresh the accumulator - auto& accumulator = pos.state()->accumulator; - accumulator.computed[Perspective] = true; - FeatureSet::IndexList active; - FeatureSet::append_active_indices(pos, active); - - #ifdef VECTOR - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) - { - auto biasesTile = reinterpret_cast( - &biases[j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = biasesTile[k]; - - for (const auto index : active) { const IndexType offset = HalfDimensions * index + j * TileHeight; auto column = reinterpret_cast(&weights[offset]); - - for (unsigned k = 0; k < NumRegs; ++k) + for (IndexType k = 0; k < NumRegs; ++k) acc[k] = vec_add_16(acc[k], column[k]); } - auto accTile = reinterpret_cast( - &accumulator.accumulation[Perspective][j * TileHeight]); - for (unsigned k = 0; k < NumRegs; k++) + // Store accumulator + accTile = reinterpret_cast( + &states_to_update[i]->accumulator.accumulation[Perspective][j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) vec_store(&accTile[k], acc[k]); } + } - for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) - { - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_zero_psqt(); + for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) + { + // Load accumulator + auto accTilePsqt = reinterpret_cast( + &st->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = vec_load_psqt(&accTilePsqt[k]); - for (const auto index : active) + for (IndexType i = 0; states_to_update[i]; ++i) + { + // Difference calculation for the deactivated features + for (const auto index : removed[i]) { const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); + } + // Difference calculation for the activated features + for (const auto index : added[i]) + { + const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); } - auto accTilePsqt = reinterpret_cast( - &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); + // Store accumulator + accTilePsqt = reinterpret_cast( + &states_to_update[i]->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) vec_store_psqt(&accTilePsqt[k], psqt[k]); } + } - #else - std::memcpy(accumulator.accumulation[Perspective], biases, +#else + for (IndexType i = 0; states_to_update[i]; ++i) + { + std::memcpy(states_to_update[i]->accumulator.accumulation[Perspective], + st->accumulator.accumulation[Perspective], HalfDimensions * sizeof(BiasType)); for (std::size_t k = 0; k < PSQTBuckets; ++k) - accumulator.psqtAccumulation[Perspective][k] = 0; + states_to_update[i]->accumulator.psqtAccumulation[Perspective][k] = st->accumulator.psqtAccumulation[Perspective][k]; - for (const auto index : active) + st = states_to_update[i]; + + // Difference calculation for the deactivated features + for (const auto index : removed[i]) { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - accumulator.accumulation[Perspective][j] += weights[offset + j]; + st->accumulator.accumulation[Perspective][j] -= weights[offset + j]; for (std::size_t k = 0; k < PSQTBuckets; ++k) - accumulator.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; + st->accumulator.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; } + + // Difference calculation for the activated features + for (const auto index : added[i]) + { + const IndexType offset = HalfDimensions * index; + + for (IndexType j = 0; j < HalfDimensions; ++j) + st->accumulator.accumulation[Perspective][j] += weights[offset + j]; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + st->accumulator.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; + } + } +#endif + + #if defined(USE_MMX) + _mm_empty(); + #endif + } + + template + void update_accumulator_refresh(const Position& pos) const { + #ifdef VECTOR + // Gcc-10.2 unnecessarily spills AVX2 registers if this array + // is defined in the VECTOR code below, once in each branch + vec_t acc[NumRegs]; + psqt_vec_t psqt[NumPsqtRegs]; #endif + + // Refresh the accumulator + // Could be extracted to a separate function because it's done in 2 places, + // but it's unclear if compilers would correctly handle register allocation. + auto& accumulator = pos.state()->accumulator; + accumulator.computed[Perspective] = true; + FeatureSet::IndexList active; + FeatureSet::append_active_indices(pos, active); + +#ifdef VECTOR + for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) + { + auto biasesTile = reinterpret_cast( + &biases[j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = biasesTile[k]; + + for (const auto index : active) + { + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + + for (unsigned k = 0; k < NumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + + auto accTile = reinterpret_cast( + &accumulator.accumulation[Perspective][j * TileHeight]); + for (unsigned k = 0; k < NumRegs; k++) + vec_store(&accTile[k], acc[k]); + } + + for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) + { + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = vec_zero_psqt(); + + for (const auto index : active) + { + const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); + } + + auto accTilePsqt = reinterpret_cast( + &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + vec_store_psqt(&accTilePsqt[k], psqt[k]); } +#else + std::memcpy(accumulator.accumulation[Perspective], biases, + HalfDimensions * sizeof(BiasType)); + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + accumulator.psqtAccumulation[Perspective][k] = 0; + + for (const auto index : active) + { + const IndexType offset = HalfDimensions * index; + + for (IndexType j = 0; j < HalfDimensions; ++j) + accumulator.accumulation[Perspective][j] += weights[offset + j]; + + for (std::size_t k = 0; k < PSQTBuckets; ++k) + accumulator.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; + } +#endif + #if defined(USE_MMX) _mm_empty(); #endif } + template + void hint_common_access_for_perspective(const Position& pos) const { + + // Works like update_accumulator, but performs less work. + // Updates ONLY the accumulator for pos. + + // Look for a usable accumulator of an earlier position. We keep track + // of the estimated gain in terms of features to be added/subtracted. + // Fast early exit. + if (pos.state()->accumulator.computed[Perspective]) + return; + + auto [oldest_st, _] = try_find_computed_accumulator(pos); + + if (oldest_st->accumulator.computed[Perspective]) + { + // Only update current position accumulator to minimize work. + StateInfo* states_to_update[2] = { pos.state(), nullptr }; + update_accumulator_incremetal(pos, oldest_st, states_to_update); + } + else + { + update_accumulator_refresh(pos); + } + } + + template + void update_accumulator(const Position& pos) const { + + auto [oldest_st, next] = try_find_computed_accumulator(pos); + + if (oldest_st->accumulator.computed[Perspective]) + { + if (next == nullptr) + return; + + // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. + // Currently we update 2 accumulators. + // 1. for the current position + // 2. the next accumulator after the computed one + // The heuristic may change in the future. + StateInfo *states_to_update[3] = + { next, next == pos.state() ? nullptr : pos.state(), nullptr }; + + update_accumulator_incremetal(pos, oldest_st, states_to_update); + } + else + { + update_accumulator_refresh(pos); + } + } + alignas(CacheLineSize) BiasType biases[HalfDimensions]; alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions]; alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];