From: mstembera Date: Sun, 1 Oct 2023 06:12:02 +0000 (-0700) Subject: Optimize the most common update accumalator cases w/o tiling X-Git-Url: https://git.sesse.net/?p=stockfish;a=commitdiff_plain;h=c17a657b045d4dc720c8c36558fe649a1c3f4a05 Optimize the most common update accumalator cases w/o tiling In the most common case where we only update a single state it's faster to not use temporary accumulation registers and tiling. (Also includes a couple of small cleanups.) passed STC https://tests.stockfishchess.org/tests/view/651918e3cff46e538ee0023b LLR: 2.95 (-2.94,2.94) <0.00,2.00> Total: 34944 W: 8989 L: 8687 D: 17268 Ptnml(0-2): 88, 3743, 9512, 4037, 92 A simpler version https://tests.stockfishchess.org/tests/view/65190dfacff46e538ee00155 also passed but this version is stronger still https://tests.stockfishchess.org/tests/view/6519b95fcff46e538ee00fa2 closes https://github.com/official-stockfish/Stockfish/pull/4816 No functional change --- diff --git a/src/misc.h b/src/misc.h index c0387f7c..52595fb9 100644 --- a/src/misc.h +++ b/src/misc.h @@ -87,6 +87,7 @@ public: void push_back(const T& value) { values_[size_++] = value; } const T* begin() const { return values_; } const T* end() const { return values_ + size_; } + const T& operator[](int index) const { return values_[index]; } private: T values_[MaxSize]; diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 77a175f5..25f686da 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -370,13 +370,13 @@ namespace Stockfish::Eval::NNUE { while (states_to_update[i] == nullptr) --i; - StateInfo *st2 = states_to_update[i]; + StateInfo* st2 = states_to_update[i]; for (; i >= 0; --i) { states_to_update[i]->accumulator.computed[Perspective] = true; - StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1]; + const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1]; for (; st2 != end_state; st2 = st2->previous) FeatureSet::append_changed_indices( @@ -388,78 +388,140 @@ namespace Stockfish::Eval::NNUE { // Now update the accumulators listed in states_to_update[], where the last element is a sentinel. #ifdef VECTOR - for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) + + if ( states_to_update[1] == nullptr + && (removed[0].size() == 1 || removed[0].size() == 2) + && added[0].size() == 1) { - // Load accumulator - auto accTile = reinterpret_cast( - &st->accumulator.accumulation[Perspective][j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = vec_load(&accTile[k]); + assert(states_to_update[0]); - for (IndexType i = 0; states_to_update[i]; ++i) - { - // Difference calculation for the deactivated features - for (const auto index : removed[i]) + auto accTileIn = reinterpret_cast( + &st->accumulator.accumulation[Perspective][0]); + auto accTileOut = reinterpret_cast( + &states_to_update[0]->accumulator.accumulation[Perspective][0]); + + const IndexType offsetR0 = HalfDimensions * removed[0][0]; + auto columnR0 = reinterpret_cast(&weights[offsetR0]); + const IndexType offsetA = HalfDimensions * added[0][0]; + auto columnA = reinterpret_cast(&weights[offsetA]); + + if (removed[0].size() == 1) { - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = vec_sub_16(acc[k], column[k]); + for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t); ++k) + accTileOut[k] = vec_add_16(vec_sub_16(accTileIn[k], columnR0[k]), columnA[k]); } + else + { + const IndexType offsetR1 = HalfDimensions * removed[0][1]; + auto columnR1 = reinterpret_cast(&weights[offsetR1]); + + for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t); ++k) + accTileOut[k] = vec_sub_16( + vec_add_16(accTileIn[k], columnA[k]), + vec_add_16(columnR0[k], columnR1[k])); + } + + auto accTilePsqtIn = reinterpret_cast( + &st->accumulator.psqtAccumulation[Perspective][0]); + auto accTilePsqtOut = reinterpret_cast( + &states_to_update[0]->accumulator.psqtAccumulation[Perspective][0]); - // Difference calculation for the activated features - for (const auto index : added[i]) + const IndexType offsetPsqtR0 = PSQTBuckets * removed[0][0]; + auto columnPsqtR0 = reinterpret_cast(&psqtWeights[offsetPsqtR0]); + const IndexType offsetPsqtA = PSQTBuckets * added[0][0]; + auto columnPsqtA = reinterpret_cast(&psqtWeights[offsetPsqtA]); + + if (removed[0].size() == 1) { - const IndexType offset = HalfDimensions * index + j * TileHeight; - auto column = reinterpret_cast(&weights[offset]); - for (IndexType k = 0; k < NumRegs; ++k) - acc[k] = vec_add_16(acc[k], column[k]); + for (std::size_t k = 0; k < PSQTBuckets * sizeof(std::int32_t) / sizeof(psqt_vec_t); ++k) + accTilePsqtOut[k] = vec_add_psqt_32(vec_sub_psqt_32( + accTilePsqtIn[k], columnPsqtR0[k]), columnPsqtA[k]); } + else + { + const IndexType offsetPsqtR1 = PSQTBuckets * removed[0][1]; + auto columnPsqtR1 = reinterpret_cast(&psqtWeights[offsetPsqtR1]); - // Store accumulator - accTile = reinterpret_cast( - &states_to_update[i]->accumulator.accumulation[Perspective][j * TileHeight]); - for (IndexType k = 0; k < NumRegs; ++k) - vec_store(&accTile[k], acc[k]); - } + for (std::size_t k = 0; k < PSQTBuckets * sizeof(std::int32_t) / sizeof(psqt_vec_t); ++k) + accTilePsqtOut[k] = vec_sub_psqt_32( + vec_add_psqt_32(accTilePsqtIn[k], columnPsqtA[k]), + vec_add_psqt_32(columnPsqtR0[k], columnPsqtR1[k])); + } } - - for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) + else { - // Load accumulator - auto accTilePsqt = reinterpret_cast( - &st->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_load_psqt(&accTilePsqt[k]); - - for (IndexType i = 0; states_to_update[i]; ++i) - { - // Difference calculation for the deactivated features - for (const auto index : removed[i]) + for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) { - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); + // Load accumulator + auto accTileIn = reinterpret_cast( + &st->accumulator.accumulation[Perspective][j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = vec_load(&accTileIn[k]); + + for (IndexType i = 0; states_to_update[i]; ++i) + { + // Difference calculation for the deactivated features + for (const auto index : removed[i]) + { + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = vec_sub_16(acc[k], column[k]); + } + + // Difference calculation for the activated features + for (const auto index : added[i]) + { + const IndexType offset = HalfDimensions * index + j * TileHeight; + auto column = reinterpret_cast(&weights[offset]); + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = vec_add_16(acc[k], column[k]); + } + + // Store accumulator + auto accTileOut = reinterpret_cast( + &states_to_update[i]->accumulator.accumulation[Perspective][j * TileHeight]); + for (IndexType k = 0; k < NumRegs; ++k) + vec_store(&accTileOut[k], acc[k]); + } } - // Difference calculation for the activated features - for (const auto index : added[i]) + for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) { - const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; - auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + // Load accumulator + auto accTilePsqtIn = reinterpret_cast( + &st->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); for (std::size_t k = 0; k < NumPsqtRegs; ++k) - psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); + psqt[k] = vec_load_psqt(&accTilePsqtIn[k]); + + for (IndexType i = 0; states_to_update[i]; ++i) + { + // Difference calculation for the deactivated features + for (const auto index : removed[i]) + { + const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); + } + + // Difference calculation for the activated features + for (const auto index : added[i]) + { + const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; + auto columnPsqt = reinterpret_cast(&psqtWeights[offset]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); + } + + // Store accumulator + auto accTilePsqtOut = reinterpret_cast( + &states_to_update[i]->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); + for (std::size_t k = 0; k < NumPsqtRegs; ++k) + vec_store_psqt(&accTilePsqtOut[k], psqt[k]); + } } - - // Store accumulator - accTilePsqt = reinterpret_cast( - &states_to_update[i]->accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); - for (std::size_t k = 0; k < NumPsqtRegs; ++k) - vec_store_psqt(&accTilePsqt[k], psqt[k]); - } } - #else for (IndexType i = 0; states_to_update[i]; ++i) {