From: mstembera Date: Fri, 14 Oct 2022 21:41:08 +0000 (-0700) Subject: Optimize make_index() using templates and lookup tables. X-Git-Url: https://git.sesse.net/?p=stockfish;a=commitdiff_plain;h=93f71ecfe1d26e5ccc813318f420b8363cd26003 Optimize make_index() using templates and lookup tables. https://tests.stockfishchess.org/tests/view/634517e54bc7650f07542f99 LLR: 2.94 (-2.94,2.94) <0.00,2.00> Total: 642672 W: 171819 L: 170658 D: 300195 Ptnml(0-2): 2278, 68077, 179416, 69336, 2229 this also introduces `-flto-partition=one` as suggested by MinetaS (Syine Mineta) to avoid linking errors due to LTO on 32 bit mingw. This change was tested in isolation as well https://tests.stockfishchess.org/tests/view/634aacf84bc7650f0755188b LLR: 2.94 (-2.94,2.94) <-1.75,0.25> Total: 119352 W: 31986 L: 31862 D: 55504 Ptnml(0-2): 439, 12624, 33400, 12800, 413 closes https://github.com/official-stockfish/Stockfish/pull/4199 No functional change --- diff --git a/src/Makefile b/src/Makefile index 8315f33d..880710fe 100644 --- a/src/Makefile +++ b/src/Makefile @@ -698,11 +698,9 @@ ifeq ($(debug), no) # To use LTO and static linking on Windows, # the tool chain requires gcc version 10.1 or later. 
else ifeq ($(comp),mingw) - ifneq ($(arch),i386) - CXXFLAGS += -flto + CXXFLAGS += -flto -flto-partition=one LDFLAGS += $(CXXFLAGS) -save-temps endif - endif endif endif diff --git a/src/nnue/features/half_ka_v2_hm.cpp b/src/nnue/features/half_ka_v2_hm.cpp index 07a1d7a1..11e05c94 100644 --- a/src/nnue/features/half_ka_v2_hm.cpp +++ b/src/nnue/features/half_ka_v2_hm.cpp @@ -24,50 +24,51 @@ namespace Stockfish::Eval::NNUE::Features { - // Orient a square according to perspective (rotates by 180 for black) - inline Square HalfKAv2_hm::orient(Color perspective, Square s, Square ksq) { - return Square(int(s) ^ (bool(perspective) * SQ_A8) ^ ((file_of(ksq) < FILE_E) * SQ_H1)); - } - // Index of a feature for a given king position and another piece on some square - inline IndexType HalfKAv2_hm::make_index(Color perspective, Square s, Piece pc, Square ksq) { - Square o_ksq = orient(perspective, ksq, ksq); - return IndexType(orient(perspective, s, ksq) + PieceSquareIndex[perspective][pc] + PS_NB * KingBuckets[o_ksq]); + template<Color Perspective> + inline IndexType HalfKAv2_hm::make_index(Square s, Piece pc, Square ksq) { + return IndexType((int(s) ^ OrientTBL[Perspective][ksq]) + PieceSquareIndex[Perspective][pc] + KingBuckets[Perspective][ksq]); } // Get a list of indices for active features + template<Color Perspective> void HalfKAv2_hm::append_active_indices( const Position& pos, - Color perspective, IndexList& active ) { - Square ksq = pos.square<KING>(perspective); + Square ksq = pos.square<KING>(Perspective); Bitboard bb = pos.pieces(); while (bb) { Square s = pop_lsb(bb); - active.push_back(make_index(perspective, s, pos.piece_on(s), ksq)); + active.push_back(make_index<Perspective>(s, pos.piece_on(s), ksq)); } } - + // Explicit template instantiations + template void HalfKAv2_hm::append_active_indices<WHITE>(const Position& pos, IndexList& active); + template void HalfKAv2_hm::append_active_indices<BLACK>(const Position& pos, IndexList& active); + // append_changed_indices() : get a list of indices for recently changed features - +
template<Color Perspective> void HalfKAv2_hm::append_changed_indices( Square ksq, const DirtyPiece& dp, - Color perspective, IndexList& removed, IndexList& added ) { for (int i = 0; i < dp.dirty_num; ++i) { if (dp.from[i] != SQ_NONE) - removed.push_back(make_index(perspective, dp.from[i], dp.piece[i], ksq)); + removed.push_back(make_index<Perspective>(dp.from[i], dp.piece[i], ksq)); if (dp.to[i] != SQ_NONE) - added.push_back(make_index(perspective, dp.to[i], dp.piece[i], ksq)); + added.push_back(make_index<Perspective>(dp.to[i], dp.piece[i], ksq)); } } + // Explicit template instantiations + template void HalfKAv2_hm::append_changed_indices<WHITE>(Square ksq, const DirtyPiece& dp, IndexList& removed, IndexList& added); + template void HalfKAv2_hm::append_changed_indices<BLACK>(Square ksq, const DirtyPiece& dp, IndexList& removed, IndexList& added); + int HalfKAv2_hm::update_cost(const StateInfo* st) { return st->dirtyPiece.dirty_num; } diff --git a/src/nnue/features/half_ka_v2_hm.h b/src/nnue/features/half_ka_v2_hm.h index 1e6da0bf..a95d4328 100644 --- a/src/nnue/features/half_ka_v2_hm.h +++ b/src/nnue/features/half_ka_v2_hm.h @@ -49,8 +49,8 @@ namespace Stockfish::Eval::NNUE::Features { PS_B_ROOK = 7 * SQUARE_NB, PS_W_QUEEN = 8 * SQUARE_NB, PS_B_QUEEN = 9 * SQUARE_NB, - PS_KING = 10 * SQUARE_NB, - PS_NB = 11 * SQUARE_NB + PS_KING = 10 * SQUARE_NB, + PS_NB = 11 * SQUARE_NB }; static constexpr IndexType PieceSquareIndex[COLOR_NB][PIECE_NB] = { @@ -62,11 +62,9 @@ namespace Stockfish::Eval::NNUE::Features { PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_KING, PS_NONE } }; - // Orient a square according to perspective (rotates by 180 for black) - static Square orient(Color perspective, Square s, Square ksq); - // Index of a feature for a given king position and another piece on some square - static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq); + template<Color Perspective> + static IndexType make_index(Square s, Piece pc, Square ksq); public: // Feature name @@ -79,15 +77,45 @@ namespace
Stockfish::Eval::NNUE::Features { static constexpr IndexType Dimensions = static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_NB) / 2; - static constexpr int KingBuckets[64] = { - -1, -1, -1, -1, 31, 30, 29, 28, - -1, -1, -1, -1, 27, 26, 25, 24, - -1, -1, -1, -1, 23, 22, 21, 20, - -1, -1, -1, -1, 19, 18, 17, 16, - -1, -1, -1, -1, 15, 14, 13, 12, - -1, -1, -1, -1, 11, 10, 9, 8, - -1, -1, -1, -1, 7, 6, 5, 4, - -1, -1, -1, -1, 3, 2, 1, 0 +#define B(v) (v * PS_NB) + static constexpr int KingBuckets[COLOR_NB][SQUARE_NB] = { + { B(28), B(29), B(30), B(31), B(31), B(30), B(29), B(28), + B(24), B(25), B(26), B(27), B(27), B(26), B(25), B(24), + B(20), B(21), B(22), B(23), B(23), B(22), B(21), B(20), + B(16), B(17), B(18), B(19), B(19), B(18), B(17), B(16), + B(12), B(13), B(14), B(15), B(15), B(14), B(13), B(12), + B( 8), B( 9), B(10), B(11), B(11), B(10), B( 9), B( 8), + B( 4), B( 5), B( 6), B( 7), B( 7), B( 6), B( 5), B( 4), + B( 0), B( 1), B( 2), B( 3), B( 3), B( 2), B( 1), B( 0) }, + { B( 0), B( 1), B( 2), B( 3), B( 3), B( 2), B( 1), B( 0), + B( 4), B( 5), B( 6), B( 7), B( 7), B( 6), B( 5), B( 4), + B( 8), B( 9), B(10), B(11), B(11), B(10), B( 9), B( 8), + B(12), B(13), B(14), B(15), B(15), B(14), B(13), B(12), + B(16), B(17), B(18), B(19), B(19), B(18), B(17), B(16), + B(20), B(21), B(22), B(23), B(23), B(22), B(21), B(20), + B(24), B(25), B(26), B(27), B(27), B(26), B(25), B(24), + B(28), B(29), B(30), B(31), B(31), B(30), B(29), B(28) } + }; +#undef B + + // Orient a square according to perspective (rotates by 180 for black) + static constexpr int OrientTBL[COLOR_NB][SQUARE_NB] = { + { SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1,
SQ_A1, + SQ_H1, SQ_H1, SQ_H1, SQ_H1, SQ_A1, SQ_A1, SQ_A1, SQ_A1 }, + { SQ_H8, SQ_H8, SQ_H8, SQ_H8, SQ_A8, SQ_A8, SQ_A8, SQ_A8, + SQ_H8, SQ_H8, SQ_H8, SQ_H8, SQ_A8, SQ_A8, SQ_A8, SQ_A8, + SQ_H8, SQ_H8, SQ_H8, SQ_H8, SQ_A8, SQ_A8, SQ_A8, SQ_A8, + SQ_H8, SQ_H8, SQ_H8, SQ_H8, SQ_A8, SQ_A8, SQ_A8, SQ_A8, + SQ_H8, SQ_H8, SQ_H8, SQ_H8, SQ_A8, SQ_A8, SQ_A8, SQ_A8, + SQ_H8, SQ_H8, SQ_H8, SQ_H8, SQ_A8, SQ_A8, SQ_A8, SQ_A8, + SQ_H8, SQ_H8, SQ_H8, SQ_H8, SQ_A8, SQ_A8, SQ_A8, SQ_A8, + SQ_H8, SQ_H8, SQ_H8, SQ_H8, SQ_A8, SQ_A8, SQ_A8, SQ_A8 } }; // Maximum number of simultaneously active features. @@ -95,16 +123,16 @@ namespace Stockfish::Eval::NNUE::Features { using IndexList = ValueList<IndexType, MaxActiveDimensions>; // Get a list of indices for active features + template<Color Perspective> static void append_active_indices( const Position& pos, - Color perspective, IndexList& active); // Get a list of indices for recently changed features + template<Color Perspective> static void append_changed_indices( Square ksq, const DirtyPiece& dp, - Color perspective, IndexList& removed, IndexList& added ); diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 34d7292c..b6dd54d3 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -271,8 +271,8 @@ namespace Stockfish::Eval::NNUE { // Convert input features std::int32_t transform(const Position& pos, OutputType* output, int bucket) const { - update_accumulator(pos, WHITE); - update_accumulator(pos, BLACK); + update_accumulator<WHITE>(pos); + update_accumulator<BLACK>(pos); const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; const auto& accumulation = pos.state()->accumulator.accumulation; @@ -338,7 +338,8 @@ namespace Stockfish::Eval::NNUE { private: - void update_accumulator(const Position& pos, const Color perspective) const { + template<Color Perspective> + void update_accumulator(const Position& pos) const { // The size must be enough to contain the largest possible update.
// That might depend on the feature set and generally relies on the @@ -356,18 +357,18 @@ namespace Stockfish::Eval::NNUE { // of the estimated gain in terms of features to be added/subtracted. StateInfo *st = pos.state(), *next = nullptr; int gain = FeatureSet::refresh_cost(pos); - while (st->previous && !st->accumulator.computed[perspective]) + while (st->previous && !st->accumulator.computed[Perspective]) { // This governs when a full feature refresh is needed and how many // updates are better than just one full refresh. - if ( FeatureSet::requires_refresh(st, perspective) + if ( FeatureSet::requires_refresh(st, Perspective) || (gain -= FeatureSet::update_cost(st) + 1) < 0) break; next = st; st = st->previous; } - if (st->accumulator.computed[perspective]) + if (st->accumulator.computed[Perspective]) { if (next == nullptr) return; @@ -376,17 +377,17 @@ namespace Stockfish::Eval::NNUE { // accumulator. Then, we update the current accumulator (pos.state()). // Gather all features to be updated. - const Square ksq = pos.square<KING>(perspective); + const Square ksq = pos.square<KING>(Perspective); FeatureSet::IndexList removed[2], added[2]; - FeatureSet::append_changed_indices( - ksq, next->dirtyPiece, perspective, removed[0], added[0]); + FeatureSet::append_changed_indices<Perspective>( + ksq, next->dirtyPiece, removed[0], added[0]); for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous) - FeatureSet::append_changed_indices( - ksq, st2->dirtyPiece, perspective, removed[1], added[1]); + FeatureSet::append_changed_indices<Perspective>( + ksq, st2->dirtyPiece, removed[1], added[1]); // Mark the accumulators as computed. - next->accumulator.computed[perspective] = true; - pos.state()->accumulator.computed[perspective] = true; + next->accumulator.computed[Perspective] = true; + pos.state()->accumulator.computed[Perspective] = true; // Now update the accumulators listed in states_to_update[], where the last element is a sentinel.
StateInfo *states_to_update[3] = @@ -396,7 +397,7 @@ namespace Stockfish::Eval::NNUE { { // Load accumulator auto accTile = reinterpret_cast<vec_t*>( - &st->accumulator.accumulation[perspective][j * TileHeight]); + &st->accumulator.accumulation[Perspective][j * TileHeight]); for (IndexType k = 0; k < NumRegs; ++k) acc[k] = vec_load(&accTile[k]); @@ -422,7 +423,7 @@ namespace Stockfish::Eval::NNUE { // Store accumulator accTile = reinterpret_cast<vec_t*>( - &states_to_update[i]->accumulator.accumulation[perspective][j * TileHeight]); + &states_to_update[i]->accumulator.accumulation[Perspective][j * TileHeight]); for (IndexType k = 0; k < NumRegs; ++k) vec_store(&accTile[k], acc[k]); } @@ -432,7 +433,7 @@ namespace Stockfish::Eval::NNUE { { // Load accumulator auto accTilePsqt = reinterpret_cast<psqt_vec_t*>( - &st->accumulator.psqtAccumulation[perspective][j * PsqtTileHeight]); + &st->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) psqt[k] = vec_load_psqt(&accTilePsqt[k]); @@ -458,7 +459,7 @@ namespace Stockfish::Eval::NNUE { // Store accumulator accTilePsqt = reinterpret_cast<psqt_vec_t*>( - &states_to_update[i]->accumulator.psqtAccumulation[perspective][j * PsqtTileHeight]); + &states_to_update[i]->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) vec_store_psqt(&accTilePsqt[k], psqt[k]); } @@ -467,12 +468,12 @@ namespace Stockfish::Eval::NNUE { #else for (IndexType i = 0; states_to_update[i]; ++i) { - std::memcpy(states_to_update[i]->accumulator.accumulation[perspective], - st->accumulator.accumulation[perspective], + std::memcpy(states_to_update[i]->accumulator.accumulation[Perspective], + st->accumulator.accumulation[Perspective], HalfDimensions * sizeof(BiasType)); for (std::size_t k = 0; k < PSQTBuckets; ++k) - states_to_update[i]->accumulator.psqtAccumulation[perspective][k] = st->accumulator.psqtAccumulation[perspective][k]; +
states_to_update[i]->accumulator.psqtAccumulation[Perspective][k] = st->accumulator.psqtAccumulation[Perspective][k]; st = states_to_update[i]; @@ -482,10 +483,10 @@ namespace Stockfish::Eval::NNUE { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - st->accumulator.accumulation[perspective][j] -= weights[offset + j]; + st->accumulator.accumulation[Perspective][j] -= weights[offset + j]; for (std::size_t k = 0; k < PSQTBuckets; ++k) - st->accumulator.psqtAccumulation[perspective][k] -= psqtWeights[index * PSQTBuckets + k]; + st->accumulator.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k]; } // Difference calculation for the activated features @@ -494,10 +495,10 @@ namespace Stockfish::Eval::NNUE { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - st->accumulator.accumulation[perspective][j] += weights[offset + j]; + st->accumulator.accumulation[Perspective][j] += weights[offset + j]; for (std::size_t k = 0; k < PSQTBuckets; ++k) - st->accumulator.psqtAccumulation[perspective][k] += psqtWeights[index * PSQTBuckets + k]; + st->accumulator.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; } } #endif @@ -506,9 +507,9 @@ namespace Stockfish::Eval::NNUE { { // Refresh the accumulator auto& accumulator = pos.state()->accumulator; - accumulator.computed[perspective] = true; + accumulator.computed[Perspective] = true; FeatureSet::IndexList active; - FeatureSet::append_active_indices(pos, perspective, active); + FeatureSet::append_active_indices<Perspective>(pos, active); #ifdef VECTOR for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) @@ -528,7 +529,7 @@ namespace Stockfish::Eval::NNUE { } auto accTile = reinterpret_cast<vec_t*>( - &accumulator.accumulation[perspective][j * TileHeight]); + &accumulator.accumulation[Perspective][j * TileHeight]); for (unsigned k = 0; k < NumRegs; k++) vec_store(&accTile[k], acc[k]); } @@ -548,27 +549,27 @@
namespace Stockfish::Eval::NNUE { } auto accTilePsqt = reinterpret_cast<psqt_vec_t*>( - &accumulator.psqtAccumulation[perspective][j * PsqtTileHeight]); + &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) vec_store_psqt(&accTilePsqt[k], psqt[k]); } #else - std::memcpy(accumulator.accumulation[perspective], biases, + std::memcpy(accumulator.accumulation[Perspective], biases, HalfDimensions * sizeof(BiasType)); for (std::size_t k = 0; k < PSQTBuckets; ++k) - accumulator.psqtAccumulation[perspective][k] = 0; + accumulator.psqtAccumulation[Perspective][k] = 0; for (const auto index : active) { const IndexType offset = HalfDimensions * index; for (IndexType j = 0; j < HalfDimensions; ++j) - accumulator.accumulation[perspective][j] += weights[offset + j]; + accumulator.accumulation[Perspective][j] += weights[offset + j]; for (std::size_t k = 0; k < PSQTBuckets; ++k) - accumulator.psqtAccumulation[perspective][k] += psqtWeights[index * PSQTBuckets + k]; + accumulator.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k]; } #endif }