git.sesse.net Git - stockfish/commitdiff
Cache small net w/ psqtOnly support
author: mstembera <m_stembera@yahoo.com>
Fri, 26 Apr 2024 01:20:08 +0000 (18:20 -0700)
committer: Disservin <disservin.social@gmail.com>
Sun, 28 Apr 2024 19:30:19 +0000 (21:30 +0200)
Caching the small net in the same way as the big net allows them to
share the same code path and completely removes
update_accumulator_refresh().

STC:
https://tests.stockfishchess.org/tests/view/662bfb5ed46f72253dcfed85
LLR: 2.94 (-2.94,2.94) <-1.75,0.25>
Total: 151712 W: 39252 L: 39158 D: 73302
Ptnml(0-2): 565, 17474, 39683, 17570, 564

closes https://github.com/official-stockfish/Stockfish/pull/5194

Bench: 1836777

src/evaluate.cpp
src/nnue/network.cpp
src/nnue/network.h
src/nnue/nnue_accumulator.h
src/nnue/nnue_feature_transformer.h
src/nnue/nnue_misc.cpp

index 6e101e7830aa4d2a72256713cb725baf4ce059ad..345925f6b2a92d6b7df9ab3df3a94497a93dc15e 100644 (file)
@@ -60,7 +60,7 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
     int  nnueComplexity;
     int  v;
 
-    Value nnue = smallNet ? networks.small.evaluate(pos, nullptr, true, &nnueComplexity, psqtOnly)
+    Value nnue = smallNet ? networks.small.evaluate(pos, &caches.small, true, &nnueComplexity, psqtOnly)
                           : networks.big.evaluate(pos, &caches.big, true, &nnueComplexity, false);
 
     const auto adjustEval = [&](int optDiv, int nnueDiv, int npmDiv, int pawnCountConstant,
index 42320bae1ab56ef3fdaaed72e2c0352c63636657..2eca18bd15ded7d7475901044e2f48ed67d304ec 100644 (file)
@@ -263,8 +263,8 @@ void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::hint_common_access(const Position&                         pos,
                                                     AccumulatorCaches::Cache<FTDimensions>* cache,
-                                                    bool psqtOnl) const {
-    featureTransformer->hint_common_access(pos, cache, psqtOnl);
+                                                    bool psqtOnly) const {
+    featureTransformer->hint_common_access(pos, cache, psqtOnly);
 }
 
 template<typename Arch, typename Transformer>
index df59732d955fa52239706a84f1edabbaaf295a19..053b7d19c8250d15c2512a4ec540b65fc41a2cf7 100644 (file)
@@ -62,7 +62,7 @@ class Network {
 
     void hint_common_access(const Position&                         pos,
                             AccumulatorCaches::Cache<FTDimensions>* cache,
-                            bool                                    psqtOnl) const;
+                            bool                                    psqtOnly) const;
 
     void          verify(std::string evalfilePath) const;
     NnueEvalTrace trace_evaluate(const Position&                         pos,
index f65385688ded5fab049604cc01a939a3fdd35293..dd313958fe63e96831469684e79ca08828969cea 100644 (file)
@@ -63,6 +63,7 @@ struct AccumulatorCaches {
             PSQTWeightType psqtAccumulation[COLOR_NB][PSQTBuckets];
             Bitboard       byColorBB[COLOR_NB][COLOR_NB];
             Bitboard       byTypeBB[COLOR_NB][PIECE_TYPE_NB];
+            bool           psqtOnly;
 
             // To initialize a refresh entry, we set all its bitboards empty,
             // so we put the biases in the accumulation, without any weights on top
@@ -70,6 +71,7 @@ struct AccumulatorCaches {
 
                 std::memset(byColorBB, 0, sizeof(byColorBB));
                 std::memset(byTypeBB, 0, sizeof(byTypeBB));
+                psqtOnly = false;
 
                 std::memcpy(accumulation[WHITE], biases, Size * sizeof(BiasType));
                 std::memcpy(accumulation[BLACK], biases, Size * sizeof(BiasType));
@@ -97,11 +99,11 @@ struct AccumulatorCaches {
     template<typename Networks>
     void clear(const Networks& networks) {
         big.clear(networks.big);
+        small.clear(networks.small);
     }
 
-    // When adding a new cache for a network, i.e. the smallnet
-    // the appropriate condition must be added to FeatureTransformer::update_accumulator_refresh.
     Cache<TransformedFeatureDimensionsBig> big;
+    Cache<TransformedFeatureDimensionsSmall> small;
 };
 
 }  // namespace Stockfish::Eval::NNUE
index 88f0e4031a4d0fead4890131d93db3b8b804f401..60957ebeb77ab09ebfa843d03a2a4a9c67fd6966 100644 (file)
@@ -656,75 +656,84 @@ class FeatureTransformer {
 
     template<Color Perspective>
     void update_accumulator_refresh_cache(const Position&                           pos,
-                                          AccumulatorCaches::Cache<HalfDimensions>* cache) const {
+                                          AccumulatorCaches::Cache<HalfDimensions>* cache,
+                                          bool psqtOnly) const {
         assert(cache != nullptr);
 
         Square ksq = pos.square<KING>(Perspective);
-
         auto& entry = (*cache)[ksq];
-
-        auto& accumulator                     = pos.state()->*accPtr;
-        accumulator.computed[Perspective]     = true;
-        accumulator.computedPSQT[Perspective] = true;
-
         FeatureSet::IndexList removed, added;
-        for (Color c : {WHITE, BLACK})
+
+        if (entry.psqtOnly && !psqtOnly)
         {
-            for (PieceType pt = PAWN; pt <= KING; ++pt)
+            entry.clear(biases);
+            FeatureSet::append_active_indices<Perspective>(pos, added);
+        }
+        else
+        {
+            for (Color c : {WHITE, BLACK})
             {
-                const Piece    piece = make_piece(c, pt);
-                const Bitboard oldBB =
-                  entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
-                const Bitboard newBB    = pos.pieces(c, pt);
-                Bitboard       toRemove = oldBB & ~newBB;
-                Bitboard       toAdd    = newBB & ~oldBB;
-
-                while (toRemove)
-                {
-                    Square sq = pop_lsb(toRemove);
-                    removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
-                }
-                while (toAdd)
+                for (PieceType pt = PAWN; pt <= KING; ++pt)
                 {
-                    Square sq = pop_lsb(toAdd);
-                    added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    const Piece    piece = make_piece(c, pt);
+                    const Bitboard oldBB =
+                      entry.byColorBB[Perspective][c] & entry.byTypeBB[Perspective][pt];
+                    const Bitboard newBB    = pos.pieces(c, pt);
+                    Bitboard       toRemove = oldBB & ~newBB;
+                    Bitboard       toAdd    = newBB & ~oldBB;
+
+                    while (toRemove)
+                    {
+                        Square sq = pop_lsb(toRemove);
+                        removed.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    }
+                    while (toAdd)
+                    {
+                        Square sq = pop_lsb(toAdd);
+                        added.push_back(FeatureSet::make_index<Perspective>(sq, piece, ksq));
+                    }
                 }
             }
         }
 
+        auto& accumulator                     = pos.state()->*accPtr;
+        accumulator.computed[Perspective]     = !psqtOnly;
+        accumulator.computedPSQT[Perspective] = true;
+
 #ifdef VECTOR
         vec_t      acc[NumRegs];
         psqt_vec_t psqt[NumPsqtRegs];
 
-        for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
-        {
-            auto entryTile =
-              reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
-            for (IndexType k = 0; k < NumRegs; ++k)
-                acc[k] = entryTile[k];
-
-            for (int i = 0; i < int(added.size()); ++i)
+        if (!psqtOnly)
+            for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
             {
-                IndexType       index  = added[i];
-                const IndexType offset = HalfDimensions * index + j * TileHeight;
-                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+                auto entryTile =
+                  reinterpret_cast<vec_t*>(&entry.accumulation[Perspective][j * TileHeight]);
+                for (IndexType k = 0; k < NumRegs; ++k)
+                    acc[k] = entryTile[k];
 
-                for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_add_16(acc[k], column[k]);
-            }
-            for (int i = 0; i < int(removed.size()); ++i)
-            {
-                IndexType       index  = removed[i];
-                const IndexType offset = HalfDimensions * index + j * TileHeight;
-                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+                for (int i = 0; i < int(added.size()); ++i)
+                {
+                    IndexType       index  = added[i];
+                    const IndexType offset = HalfDimensions * index + j * TileHeight;
+                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
 
-                for (unsigned k = 0; k < NumRegs; ++k)
-                    acc[k] = vec_sub_16(acc[k], column[k]);
-            }
+                    for (unsigned k = 0; k < NumRegs; ++k)
+                        acc[k] = vec_add_16(acc[k], column[k]);
+                }
+                for (int i = 0; i < int(removed.size()); ++i)
+                {
+                    IndexType       index  = removed[i];
+                    const IndexType offset = HalfDimensions * index + j * TileHeight;
+                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
 
-            for (IndexType k = 0; k < NumRegs; k++)
-                vec_store(&entryTile[k], acc[k]);
-        }
+                    for (unsigned k = 0; k < NumRegs; ++k)
+                        acc[k] = vec_sub_16(acc[k], column[k]);
+                }
+
+                for (IndexType k = 0; k < NumRegs; k++)
+                    vec_store(&entryTile[k], acc[k]);
+            }
 
         for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
         {
@@ -760,18 +769,24 @@ class FeatureTransformer {
 
         for (const auto index : added)
         {
-            const IndexType offset = HalfDimensions * index;
-            for (IndexType j = 0; j < HalfDimensions; ++j)
-                entry.accumulation[Perspective][j] += weights[offset + j];
+            if (!psqtOnly)
+            {
+                const IndexType offset = HalfDimensions * index;
+                for (IndexType j = 0; j < HalfDimensions; ++j)
+                    entry.accumulation[Perspective][j] += weights[offset + j];
+            }
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
                 entry.psqtAccumulation[Perspective][k] += psqtWeights[index * PSQTBuckets + k];
         }
         for (const auto index : removed)
         {
-            const IndexType offset = HalfDimensions * index;
-            for (IndexType j = 0; j < HalfDimensions; ++j)
-                entry.accumulation[Perspective][j] -= weights[offset + j];
+            if (!psqtOnly)
+            {
+                const IndexType offset = HalfDimensions * index;
+                for (IndexType j = 0; j < HalfDimensions; ++j)
+                    entry.accumulation[Perspective][j] -= weights[offset + j];
+            }
 
             for (std::size_t k = 0; k < PSQTBuckets; ++k)
                 entry.psqtAccumulation[Perspective][k] -= psqtWeights[index * PSQTBuckets + k];
@@ -782,144 +797,20 @@ class FeatureTransformer {
         // The accumulator of the refresh entry has been updated.
         // Now copy its content to the actual accumulator we were refreshing
 
+        if (!psqtOnly)
+            std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
+                        sizeof(BiasType) * HalfDimensions);
+
         std::memcpy(accumulator.psqtAccumulation[Perspective], entry.psqtAccumulation[Perspective],
                     sizeof(int32_t) * PSQTBuckets);
 
-        std::memcpy(accumulator.accumulation[Perspective], entry.accumulation[Perspective],
-                    sizeof(BiasType) * HalfDimensions);
-
         for (Color c : {WHITE, BLACK})
             entry.byColorBB[Perspective][c] = pos.pieces(c);
 
         for (PieceType pt = PAWN; pt <= KING; ++pt)
             entry.byTypeBB[Perspective][pt] = pos.pieces(pt);
-    }
-
-    template<Color Perspective>
-    void
-    update_accumulator_refresh(const Position&                                            pos,
-                               [[maybe_unused]] AccumulatorCaches::Cache<HalfDimensions>* cache,
-                               bool psqtOnly) const {
-
-        // When we are refreshing the accumulator of the big net,
-        // redirect to the version of refresh that uses the refresh table.
-        // Using the cache for the small net is not beneficial.
-        if constexpr (HalfDimensions == Eval::NNUE::TransformedFeatureDimensionsBig)
-        {
-            update_accumulator_refresh_cache<Perspective>(pos, cache);
-            return;
-        }
 
-#ifdef VECTOR
-        // Gcc-10.2 unnecessarily spills AVX2 registers if this array
-        // is defined in the VECTOR code below, once in each branch
-        vec_t      acc[NumRegs];
-        psqt_vec_t psqt[NumPsqtRegs];
-#endif
-
-        // Refresh the accumulator
-        // Could be extracted to a separate function because it's done in 2 places,
-        // but it's unclear if compilers would correctly handle register allocation.
-        auto& accumulator                     = pos.state()->*accPtr;
-        accumulator.computed[Perspective]     = !psqtOnly;
-        accumulator.computedPSQT[Perspective] = true;
-        FeatureSet::IndexList active;
-        FeatureSet::append_active_indices<Perspective>(pos, active);
-
-#ifdef VECTOR
-        if (!psqtOnly)
-            for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
-            {
-                auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
-                for (IndexType k = 0; k < NumRegs; ++k)
-                    acc[k] = biasesTile[k];
-
-                int i = 0;
-                for (; i < int(active.size()) - 1; i += 2)
-                {
-                    IndexType       index0  = active[i];
-                    IndexType       index1  = active[i + 1];
-                    const IndexType offset0 = HalfDimensions * index0 + j * TileHeight;
-                    const IndexType offset1 = HalfDimensions * index1 + j * TileHeight;
-                    auto            column0 = reinterpret_cast<const vec_t*>(&weights[offset0]);
-                    auto            column1 = reinterpret_cast<const vec_t*>(&weights[offset1]);
-
-                    for (unsigned k = 0; k < NumRegs; ++k)
-                        acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k]));
-                }
-                for (; i < int(active.size()); ++i)
-                {
-                    IndexType       index  = active[i];
-                    const IndexType offset = HalfDimensions * index + j * TileHeight;
-                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
-
-                    for (unsigned k = 0; k < NumRegs; ++k)
-                        acc[k] = vec_add_16(acc[k], column[k]);
-                }
-
-                auto accTile =
-                  reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
-                for (unsigned k = 0; k < NumRegs; k++)
-                    vec_store(&accTile[k], acc[k]);
-            }
-
-        for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
-        {
-            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                psqt[k] = vec_zero_psqt();
-
-            int i = 0;
-            for (; i < int(active.size()) - 1; i += 2)
-            {
-                IndexType       index0  = active[i];
-                IndexType       index1  = active[i + 1];
-                const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight;
-                const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight;
-                auto columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]);
-                auto columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]);
-
-                for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] =
-                      vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k]));
-            }
-            for (; i < int(active.size()); ++i)
-            {
-                IndexType       index  = active[i];
-                const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
-                auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
-
-                for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                    psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
-            }
-
-            auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
-              &accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]);
-            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
-                vec_store_psqt(&accTilePsqt[k], psqt[k]);
-        }
-
-#else
-        if (!psqtOnly)
-            std::memcpy(accumulator.accumulation[Perspective], biases,
-                        HalfDimensions * sizeof(BiasType));
-
-        for (std::size_t k = 0; k < PSQTBuckets; ++k)
-            accumulator.psqtAccumulation[Perspective][k] = 0;
-
-        for (const auto index : active)
-        {
-            if (!psqtOnly)
-            {
-                const IndexType offset = HalfDimensions * index;
-                for (IndexType j = 0; j < HalfDimensions; ++j)
-                    accumulator.accumulation[Perspective][j] += weights[offset + j];
-            }
-
-            for (std::size_t k = 0; k < PSQTBuckets; ++k)
-                accumulator.psqtAccumulation[Perspective][k] +=
-                  psqtWeights[index * PSQTBuckets + k];
-        }
-#endif
+        entry.psqtOnly = psqtOnly;
     }
 
     template<Color Perspective>
@@ -948,7 +839,7 @@ class FeatureTransformer {
                                                            psqtOnly);
         }
         else
-            update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
+            update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
     }
 
     template<Color Perspective>
@@ -976,7 +867,7 @@ class FeatureTransformer {
                                                            psqtOnly);
         }
         else
-            update_accumulator_refresh<Perspective>(pos, cache, psqtOnly);
+            update_accumulator_refresh_cache<Perspective>(pos, cache, psqtOnly);
     }
 
     template<IndexType Size>
index 51838fefa4418fc919ff7a9689ddd624eae96255..e92dcc7108622c2e471fc6bb31b3defc183ff351 100644 (file)
@@ -48,7 +48,7 @@ void hint_common_parent_position(const Position&    pos,
 
     int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move()));
     if (simpleEvalAbs > Eval::SmallNetThreshold)
-        networks.small.hint_common_access(pos, nullptr, simpleEvalAbs > Eval::PsqtOnlyThreshold);
+        networks.small.hint_common_access(pos, &caches.small, simpleEvalAbs > Eval::PsqtOnlyThreshold);
     else
         networks.big.hint_common_access(pos, &caches.big, false);
 }