Update architecture to "SFNNv4". Update network to nn-6877cd24400e.nnue.

author Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>

Sat, 27 Nov 2021 14:17:02 +0000 (15:17 +0100)

committer Joost VandeVondele <Joost.VandeVondele@gmail.com>

Thu, 10 Feb 2022 18:54:31 +0000 (19:54 +0100)
author Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Sat, 27 Nov 2021 14:17:02 +0000 (15:17 +0100)
committer Joost VandeVondele <Joost.VandeVondele@gmail.com>
Thu, 10 Feb 2022 18:54:31 +0000 (19:54 +0100)
diff --git a/src/evaluate.h b/src/evaluate.h

index 57a7687d776599196d2b5d584f4d4cbc5f9f2417..1934c9bddf07173731d9b0203228f1cf6da256e0 100644 (file)
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -39,7 +39,7 @@ namespace Eval {
    // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
    // for the build process (profile-build and fishtest) to work. Do not change the
    // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-ac07bd334b62.nnue"
+  #define EvalFileDefaultName   "nn-6877cd24400e.nnue"
  
    namespace NNUE {
  
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp

index 862b200338888a908716d191e0f8152d89e23b4a..0fd58462b78b9dc2bafe273e9bb5ac2498626511 100644 (file)
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -148,22 +148,18 @@ namespace Stockfish::Eval::NNUE {
  #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
      TransformedFeatureType transformedFeaturesUnaligned[
        FeatureTransformer::BufferSize + alignment / sizeof(TransformedFeatureType)];
-    char bufferUnaligned[Network::BufferSize + alignment];
  
      auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
-    auto* buffer = align_ptr_up<alignment>(&bufferUnaligned[0]);
  #else
      alignas(alignment)
        TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize];
-    alignas(alignment) char buffer[Network::BufferSize];
  #endif
  
      ASSERT_ALIGNED(transformedFeatures, alignment);
-    ASSERT_ALIGNED(buffer, alignment);
  
      const std::size_t bucket = (pos.count<ALL_PIECES>() - 1) / 4;
      const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket);
-    const auto positional = network[bucket]->propagate(transformedFeatures, buffer)[0];
+    const auto positional = network[bucket]->propagate(transformedFeatures);
  
      // Give more value to positional evaluation when adjusted flag is set
      if (adjusted)
@@ -190,27 +186,20 @@ namespace Stockfish::Eval::NNUE {
  #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
      TransformedFeatureType transformedFeaturesUnaligned[
        FeatureTransformer::BufferSize + alignment / sizeof(TransformedFeatureType)];
-    char bufferUnaligned[Network::BufferSize + alignment];
  
      auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
-    auto* buffer = align_ptr_up<alignment>(&bufferUnaligned[0]);
  #else
      alignas(alignment)
        TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize];
-    alignas(alignment) char buffer[Network::BufferSize];
  #endif
  
      ASSERT_ALIGNED(transformedFeatures, alignment);
-    ASSERT_ALIGNED(buffer, alignment);
  
      NnueEvalTrace t{};
      t.correctBucket = (pos.count<ALL_PIECES>() - 1) / 4;
      for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket) {
-      const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket);
-      const auto output = network[bucket]->propagate(transformedFeatures, buffer);
-
-      int materialist = psqt;
-      int positional  = output[0];
+      const auto materialist = featureTransformer->transform(pos, transformedFeatures, bucket);
+      const auto positional = network[bucket]->propagate(transformedFeatures);
  
        t.psqt[bucket] = static_cast<Value>( materialist / OutputScale );
        t.positional[bucket] = static_cast<Value>( positional / OutputScale );
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h

index 4e85a5fe4b114412527a27c1c06e669870106424..22451915ba1eb2f547c823c52c95c703d5ebdba1 100644 (file)
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -63,19 +63,17 @@ namespace Stockfish::Eval::NNUE::Layers {
    {
  # if defined(USE_SSE2)
      // At least a multiple of 16, with SSE2.
-    static_assert(PaddedInputDimensions % 16 == 0);
-    constexpr IndexType NumChunks = PaddedInputDimensions / 16;
+    constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
      const __m128i Zeros = _mm_setzero_si128();
      const auto inputVector = reinterpret_cast<const __m128i*>(input);
  
  # elif defined(USE_MMX)
-    static_assert(InputDimensions % 8 == 0);
-    constexpr IndexType NumChunks = InputDimensions / 8;
+    constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / 8;
      const __m64 Zeros = _mm_setzero_si64();
      const auto inputVector = reinterpret_cast<const __m64*>(input);
  
  # elif defined(USE_NEON)
-    constexpr IndexType NumChunks = (InputDimensions + 15) / 16;
+    constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
      const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
  # endif
  
@@ -150,24 +148,27 @@ namespace Stockfish::Eval::NNUE::Layers {
    }
  #endif
  
-  template <typename PreviousLayer, IndexType OutDims, typename Enabled = void>
+  template <IndexType InDims, IndexType OutDims, typename Enabled = void>
    class AffineTransform;
  
    // A specialization for large inputs.
-  template <typename PreviousLayer, IndexType OutDims>
-  class AffineTransform<PreviousLayer, OutDims, std::enable_if_t<(PreviousLayer::OutputDimensions >= 2*64-1)>> {
+  template <IndexType InDims, IndexType OutDims>
+  class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) >= 2*64)>> {
     public:
      // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
+    using InputType = std::uint8_t;
      using OutputType = std::int32_t;
-    static_assert(std::is_same<InputType, std::uint8_t>::value, "");
  
      // Number of input/output dimensions
-    static constexpr IndexType InputDimensions = PreviousLayer::OutputDimensions;
+    static constexpr IndexType InputDimensions = InDims;
      static constexpr IndexType OutputDimensions = OutDims;
  
      static constexpr IndexType PaddedInputDimensions =
        ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
+    static constexpr IndexType PaddedOutputDimensions =
+      ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);
+
+    using OutputBuffer = OutputType[PaddedOutputDimensions];
  
      static_assert(PaddedInputDimensions >= 128, "Something went wrong. This specialization should not have been chosen.");
  
@@ -202,20 +203,12 @@ namespace Stockfish::Eval::NNUE::Layers {
  
      static_assert(OutputDimensions % NumOutputRegs == 0);
  
-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t SelfBufferSize =
-      ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize);
-
-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t BufferSize =
-      PreviousLayer::BufferSize + SelfBufferSize;
-
      // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t get_hash_value() {
+    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
        std::uint32_t hashValue = 0xCC03DAE4u;
        hashValue += OutputDimensions;
-      hashValue ^= PreviousLayer::get_hash_value() >> 1;
-      hashValue ^= PreviousLayer::get_hash_value() << 31;
+      hashValue ^= prevHash >> 1;
+      hashValue ^= prevHash << 31;
        return hashValue;
      }
  
@@ -242,7 +235,6 @@ namespace Stockfish::Eval::NNUE::Layers {
  
      // Read network parameters
      bool read_parameters(std::istream& stream) {
-      if (!previousLayer.read_parameters(stream)) return false;
        for (std::size_t i = 0; i < OutputDimensions; ++i)
          biases[i] = read_little_endian<BiasType>(stream);
  
@@ -254,7 +246,6 @@ namespace Stockfish::Eval::NNUE::Layers {
  
      // Write network parameters
      bool write_parameters(std::ostream& stream) const {
-      if (!previousLayer.write_parameters(stream)) return false;
        for (std::size_t i = 0; i < OutputDimensions; ++i)
            write_little_endian<BiasType>(stream, biases[i]);
  
@@ -266,10 +257,7 @@ namespace Stockfish::Eval::NNUE::Layers {
  
      // Forward propagation
      const OutputType* propagate(
-        const TransformedFeatureType* transformedFeatures, char* buffer) const {
-      const auto input = previousLayer.propagate(
-        transformedFeatures, buffer + SelfBufferSize);
-      OutputType* output = reinterpret_cast<OutputType*>(buffer);
+        const InputType* input, OutputType* output) const {
  
  #if defined (USE_AVX512)
        using acc_vec_t = __m512i;
@@ -312,7 +300,6 @@ namespace Stockfish::Eval::NNUE::Layers {
  #if defined (USE_SSSE3) || defined (USE_NEON)
        const in_vec_t* invec = reinterpret_cast<const in_vec_t*>(input);
  
-
        // Perform accumulation to registers for each big block
        for (IndexType bigBlock = 0; bigBlock < NumBigBlocks; ++bigBlock)
        {
@@ -377,26 +364,28 @@ namespace Stockfish::Eval::NNUE::Layers {
      using BiasType = OutputType;
      using WeightType = std::int8_t;
  
-    PreviousLayer previousLayer;
-
      alignas(CacheLineSize) BiasType biases[OutputDimensions];
      alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
    };
  
-  template <typename PreviousLayer, IndexType OutDims>
-  class AffineTransform<PreviousLayer, OutDims, std::enable_if_t<(PreviousLayer::OutputDimensions < 2*64-1)>> {
+  template <IndexType InDims, IndexType OutDims>
+  class AffineTransform<InDims, OutDims, std::enable_if_t<(ceil_to_multiple<IndexType>(InDims, MaxSimdWidth) < 2*64)>> {
     public:
      // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
+    // Input/output type
+    using InputType = std::uint8_t;
      using OutputType = std::int32_t;
-    static_assert(std::is_same<InputType, std::uint8_t>::value, "");
  
      // Number of input/output dimensions
-    static constexpr IndexType InputDimensions =
-        PreviousLayer::OutputDimensions;
+    static constexpr IndexType InputDimensions = InDims;
      static constexpr IndexType OutputDimensions = OutDims;
+
      static constexpr IndexType PaddedInputDimensions =
-        ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
+      ceil_to_multiple<IndexType>(InputDimensions, MaxSimdWidth);
+    static constexpr IndexType PaddedOutputDimensions =
+      ceil_to_multiple<IndexType>(OutputDimensions, MaxSimdWidth);
+
+    using OutputBuffer = OutputType[PaddedOutputDimensions];
  
      static_assert(PaddedInputDimensions < 128, "Something went wrong. This specialization should not have been chosen.");
  
@@ -405,20 +394,12 @@ namespace Stockfish::Eval::NNUE::Layers {
      static constexpr const IndexType InputSimdWidth = SimdWidth;
  #endif
  
-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t SelfBufferSize =
-      ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize);
-
-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t BufferSize =
-      PreviousLayer::BufferSize + SelfBufferSize;
-
      // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t get_hash_value() {
+    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
        std::uint32_t hashValue = 0xCC03DAE4u;
        hashValue += OutputDimensions;
-      hashValue ^= PreviousLayer::get_hash_value() >> 1;
-      hashValue ^= PreviousLayer::get_hash_value() << 31;
+      hashValue ^= prevHash >> 1;
+      hashValue ^= prevHash << 31;
        return hashValue;
      }
  
@@ -441,7 +422,6 @@ namespace Stockfish::Eval::NNUE::Layers {
  
      // Read network parameters
      bool read_parameters(std::istream& stream) {
-      if (!previousLayer.read_parameters(stream)) return false;
        for (std::size_t i = 0; i < OutputDimensions; ++i)
          biases[i] = read_little_endian<BiasType>(stream);
        for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
@@ -452,7 +432,6 @@ namespace Stockfish::Eval::NNUE::Layers {
  
      // Write network parameters
      bool write_parameters(std::ostream& stream) const {
-      if (!previousLayer.write_parameters(stream)) return false;
        for (std::size_t i = 0; i < OutputDimensions; ++i)
          write_little_endian<BiasType>(stream, biases[i]);
  
@@ -463,10 +442,7 @@ namespace Stockfish::Eval::NNUE::Layers {
      }
      // Forward propagation
      const OutputType* propagate(
-        const TransformedFeatureType* transformedFeatures, char* buffer) const {
-      const auto input = previousLayer.propagate(
-        transformedFeatures, buffer + SelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
+        const InputType* input, OutputType* output) const {
  
  #if defined (USE_AVX2)
        using vec_t = __m256i;
@@ -491,12 +467,11 @@ namespace Stockfish::Eval::NNUE::Layers {
  #if defined (USE_SSSE3)
        const auto inputVector = reinterpret_cast<const vec_t*>(input);
  
-      static_assert(InputDimensions % 8 == 0);
        static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1);
  
        if constexpr (OutputDimensions % OutputSimdWidth == 0)
        {
-        constexpr IndexType NumChunks = InputDimensions / 4;
+        constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 8) / 4;
          constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
  
          const auto input32 = reinterpret_cast<const std::int32_t*>(input);
@@ -555,8 +530,6 @@ namespace Stockfish::Eval::NNUE::Layers {
      using BiasType = OutputType;
      using WeightType = std::int8_t;
  
-    PreviousLayer previousLayer;
-
      alignas(CacheLineSize) BiasType biases[OutputDimensions];
      alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
    };
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h

index 0da5e8210119a5a38e474dfeb5e2bfa2cc3e00e4..ffd2e3b76a94523eec1beb01d6b67bec122a89d2 100644 (file)
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -26,51 +26,41 @@
  namespace Stockfish::Eval::NNUE::Layers {
  
    // Clipped ReLU
-  template <typename PreviousLayer>
+  template <IndexType InDims>
    class ClippedReLU {
     public:
      // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
+    using InputType = std::int32_t;
      using OutputType = std::uint8_t;
-    static_assert(std::is_same<InputType, std::int32_t>::value, "");
  
      // Number of input/output dimensions
-    static constexpr IndexType InputDimensions = PreviousLayer::OutputDimensions;
+    static constexpr IndexType InputDimensions = InDims;
      static constexpr IndexType OutputDimensions = InputDimensions;
      static constexpr IndexType PaddedOutputDimensions =
          ceil_to_multiple<IndexType>(OutputDimensions, 32);
  
-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t SelfBufferSize =
-        ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize);
-
-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t BufferSize =
-        PreviousLayer::BufferSize + SelfBufferSize;
+    using OutputBuffer = OutputType[PaddedOutputDimensions];
  
      // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t get_hash_value() {
+    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
        std::uint32_t hashValue = 0x538D24C7u;
-      hashValue += PreviousLayer::get_hash_value();
+      hashValue += prevHash;
        return hashValue;
      }
  
      // Read network parameters
-    bool read_parameters(std::istream& stream) {
-      return previousLayer.read_parameters(stream);
+    bool read_parameters(std::istream&) {
+      return true;
      }
  
      // Write network parameters
-    bool write_parameters(std::ostream& stream) const {
-      return previousLayer.write_parameters(stream);
+    bool write_parameters(std::ostream&) const {
+      return true;
      }
  
      // Forward propagation
      const OutputType* propagate(
-        const TransformedFeatureType* transformedFeatures, char* buffer) const {
-      const auto input = previousLayer.propagate(
-          transformedFeatures, buffer + SelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
+        const InputType* input, OutputType* output) const {
  
    #if defined(USE_AVX2)
        if constexpr (InputDimensions % SimdWidth == 0) {
@@ -191,9 +181,6 @@ namespace Stockfish::Eval::NNUE::Layers {
  
        return output;
      }
-
-   private:
-    PreviousLayer previousLayer;
    };
  
  }  // namespace Stockfish::Eval::NNUE::Layers
diff --git a/src/nnue/layers/input_slice.h b/src/nnue/layers/input_slice.h

deleted file mode 100644 (file)

index 8f526b7..0000000
--- a/src/nnue/layers/input_slice.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
-  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
-
-  Stockfish is free software: you can redistribute it and/or modify
-  it under the terms of the GNU General Public License as published by
-  the Free Software Foundation, either version 3 of the License, or
-  (at your option) any later version.
-
-  Stockfish is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  GNU General Public License for more details.
-
-  You should have received a copy of the GNU General Public License
-  along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
-// NNUE evaluation function layer InputSlice definition
-
-#ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
-#define NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
-
-#include "../nnue_common.h"
-
-namespace Stockfish::Eval::NNUE::Layers {
-
-// Input layer
-template <IndexType OutDims, IndexType Offset = 0>
-class InputSlice {
- public:
-  // Need to maintain alignment
-  static_assert(Offset % MaxSimdWidth == 0, "");
-
-  // Output type
-  using OutputType = TransformedFeatureType;
-
-  // Output dimensionality
-  static constexpr IndexType OutputDimensions = OutDims;
-
-  // Size of forward propagation buffer used from the input layer to this layer
-  static constexpr std::size_t BufferSize = 0;
-
-  // Hash value embedded in the evaluation file
-  static constexpr std::uint32_t get_hash_value() {
-    std::uint32_t hashValue = 0xEC42E90Du;
-    hashValue ^= OutputDimensions ^ (Offset << 10);
-    return hashValue;
-  }
-
-  // Read network parameters
-  bool read_parameters(std::istream& /*stream*/) {
-    return true;
-  }
-
-  // Write network parameters
-  bool write_parameters(std::ostream& /*stream*/) const {
-    return true;
-  }
-
-  // Forward propagation
-  const OutputType* propagate(
-      const TransformedFeatureType* transformedFeatures,
-      char* /*buffer*/) const {
-    return transformedFeatures + Offset;
-  }
-
- private:
-};
-
-}  // namespace Stockfish::Eval::NNUE::Layers
-
-#endif // #ifndef NNUE_LAYERS_INPUT_SLICE_H_INCLUDED
diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h

index 8867fac72fc2f39f12ecf8bbc89673515f63998d..725b40fb43d2e87d9ab484603528f9965a06f59d 100644 (file)
--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -25,35 +25,106 @@
  
  #include "features/half_ka_v2_hm.h"
  
-#include "layers/input_slice.h"
  #include "layers/affine_transform.h"
  #include "layers/clipped_relu.h"
  
-namespace Stockfish::Eval::NNUE {
-
-  // Input features used in evaluation function
-  using FeatureSet = Features::HalfKAv2_hm;
-
-  // Number of input feature dimensions after conversion
-  constexpr IndexType TransformedFeatureDimensions = 1024;
-  constexpr IndexType PSQTBuckets = 8;
-  constexpr IndexType LayerStacks = 8;
-
-  namespace Layers {
+#include "../misc.h"
  
-    // Define network structure
-    using InputLayer = InputSlice<TransformedFeatureDimensions * 2>;
-    using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 8>>;
-    using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
-    using OutputLayer = AffineTransform<HiddenLayer2, 1>;
-
-  }  // namespace Layers
-
-  using Network = Layers::OutputLayer;
+namespace Stockfish::Eval::NNUE {
  
-  static_assert(TransformedFeatureDimensions % MaxSimdWidth == 0, "");
-  static_assert(Network::OutputDimensions == 1, "");
-  static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
+// Input features used in evaluation function
+using FeatureSet = Features::HalfKAv2_hm;
+
+// Number of input feature dimensions after conversion
+constexpr IndexType TransformedFeatureDimensions = 1024;
+constexpr IndexType PSQTBuckets = 8;
+constexpr IndexType LayerStacks = 8;
+
+struct Network
+{
+  static constexpr int FC_0_OUTPUTS = 15;
+  static constexpr int FC_1_OUTPUTS = 32;
+
+  Layers::AffineTransform<TransformedFeatureDimensions, FC_0_OUTPUTS + 1> fc_0;
+  Layers::ClippedReLU<FC_0_OUTPUTS> ac_0;
+  Layers::AffineTransform<FC_0_OUTPUTS, FC_1_OUTPUTS> fc_1;
+  Layers::ClippedReLU<FC_1_OUTPUTS> ac_1;
+  Layers::AffineTransform<FC_1_OUTPUTS, 1> fc_2;
+
+  // Hash value embedded in the evaluation file
+  static constexpr std::uint32_t get_hash_value() {
+    // input slice hash
+    std::uint32_t hashValue = 0xEC42E90Du;
+    hashValue ^= TransformedFeatureDimensions * 2;
+
+    hashValue = decltype(fc_0)::get_hash_value(hashValue);
+    hashValue = decltype(ac_0)::get_hash_value(hashValue);
+    hashValue = decltype(fc_1)::get_hash_value(hashValue);
+    hashValue = decltype(ac_1)::get_hash_value(hashValue);
+    hashValue = decltype(fc_2)::get_hash_value(hashValue);
+
+    return hashValue;
+  }
+
+  // Read network parameters
+  bool read_parameters(std::istream& stream) {
+    if (!fc_0.read_parameters(stream)) return false;
+    if (!ac_0.read_parameters(stream)) return false;
+    if (!fc_1.read_parameters(stream)) return false;
+    if (!ac_1.read_parameters(stream)) return false;
+    if (!fc_2.read_parameters(stream)) return false;
+    return true;
+  }
+
+  // Read network parameters
+  bool write_parameters(std::ostream& stream) const {
+    if (!fc_0.write_parameters(stream)) return false;
+    if (!ac_0.write_parameters(stream)) return false;
+    if (!fc_1.write_parameters(stream)) return false;
+    if (!ac_1.write_parameters(stream)) return false;
+    if (!fc_2.write_parameters(stream)) return false;
+    return true;
+  }
+
+  std::int32_t propagate(const TransformedFeatureType* transformedFeatures)
+  {
+    constexpr uint64_t alignment = CacheLineSize;
+
+    struct Buffer
+    {
+      alignas(CacheLineSize) decltype(fc_0)::OutputBuffer fc_0_out;
+      alignas(CacheLineSize) decltype(ac_0)::OutputBuffer ac_0_out;
+      alignas(CacheLineSize) decltype(fc_1)::OutputBuffer fc_1_out;
+      alignas(CacheLineSize) decltype(ac_1)::OutputBuffer ac_1_out;
+      alignas(CacheLineSize) decltype(fc_2)::OutputBuffer fc_2_out;
+    };
+
+#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
+    char bufferRaw[sizeof(Buffer) + alignment];
+    char* bufferRawAligned = align_ptr_up<alignment>(&bufferRaw[0]);
+    Buffer& buffer = *(new (bufferRawAligned) Buffer);
+#else
+    alignas(alignment) Buffer buffer;
+#endif
+
+    fc_0.propagate(transformedFeatures, buffer.fc_0_out);
+    ac_0.propagate(buffer.fc_0_out, buffer.ac_0_out);
+    fc_1.propagate(buffer.ac_0_out, buffer.fc_1_out);
+    ac_1.propagate(buffer.fc_1_out, buffer.ac_1_out);
+    fc_2.propagate(buffer.ac_1_out, buffer.fc_2_out);
+
+    // buffer.fc_0_out[FC_0_OUTPUTS] is such that 1.0 is equal to 127*(1<<WeightScaleBits) in quantized form
+    // but we want 1.0 to be equal to 600*OutputScale
+    std::int32_t fwdOut = int(buffer.fc_0_out[FC_0_OUTPUTS]) * (600*OutputScale) / (127*(1<<WeightScaleBits));
+    std::int32_t outputValue = buffer.fc_2_out[0] + fwdOut;
+
+#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
+    buffer.~Buffer();
+#endif
+
+    return outputValue;
+  }
+};
  
  }  // namespace Stockfish::Eval::NNUE
  
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h

index f4024dce83b9d1e5f02ce87528a80d70a68874fc..fb867421f6c249b7513190ef5f2a48296726ac03 100644 (file)
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -183,7 +183,7 @@ namespace Stockfish::Eval::NNUE {
  
      // Number of input/output dimensions
      static constexpr IndexType InputDimensions = FeatureSet::Dimensions;
-    static constexpr IndexType OutputDimensions = HalfDimensions * 2;
+    static constexpr IndexType OutputDimensions = HalfDimensions;
  
      // Size of forward propagation buffer
      static constexpr std::size_t BufferSize =
@@ -191,7 +191,7 @@ namespace Stockfish::Eval::NNUE {
  
      // Hash value embedded in the evaluation file
      static constexpr std::uint32_t get_hash_value() {
-      return FeatureSet::HashValue ^ OutputDimensions;
+      return FeatureSet::HashValue ^ (OutputDimensions * 2);
      }
  
      // Read network parameters
@@ -229,142 +229,130 @@ namespace Stockfish::Eval::NNUE {
          ) / 2;
  
  
-  #if defined(USE_AVX512)
-
-      constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
-      static_assert(HalfDimensions % (SimdWidth * 2) == 0);
-      const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
-      const __m512i Zero = _mm512_setzero_si512();
-
        for (IndexType p = 0; p < 2; ++p)
        {
-          const IndexType offset = HalfDimensions * p;
-          auto out = reinterpret_cast<__m512i*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j)
-          {
-              __m512i sum0 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
-                                              (accumulation[perspectives[p]])[j * 2 + 0]);
-              __m512i sum1 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
-                                              (accumulation[perspectives[p]])[j * 2 + 1]);
+          const IndexType offset = (HalfDimensions / 2) * p;
  
-              _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
-                                 _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
-          }
-      }
-      return psqt;
+#if defined(USE_AVX512)
  
-  #elif defined(USE_AVX2)
+          constexpr IndexType OutputChunkSize = 512 / 8;
+          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
+          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
  
-      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
-      constexpr int Control = 0b11011000;
-      const __m256i Zero = _mm256_setzero_si256();
+          const __m512i Zero = _mm512_setzero_si512();
+          const __m512i One = _mm512_set1_epi16(127);
+          const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
  
-      for (IndexType p = 0; p < 2; ++p)
-      {
-          const IndexType offset = HalfDimensions * p;
-          auto out = reinterpret_cast<__m256i*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j)
+          const __m512i* in0 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][0]));
+          const __m512i* in1 = reinterpret_cast<const __m512i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+                __m512i* out = reinterpret_cast<      __m512i*>(output + offset);
+
+          for (IndexType j = 0; j < NumOutputChunks; j += 1)
            {
-              __m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
-                                              (accumulation[perspectives[p]])[j * 2 + 0]);
-              __m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
-                                              (accumulation[perspectives[p]])[j * 2 + 1]);
+              const __m512i sum0a = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 0], One), Zero);
+              const __m512i sum0b = _mm512_max_epi16(_mm512_min_epi16(in0[j * 2 + 1], One), Zero);
+              const __m512i sum1a = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 0], One), Zero);
+              const __m512i sum1b = _mm512_max_epi16(_mm512_min_epi16(in1[j * 2 + 1], One), Zero);
  
-              _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(
-                                 _mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control));
+              const __m512i pa = _mm512_srli_epi16(_mm512_mullo_epi16(sum0a, sum1a), 7);
+              const __m512i pb = _mm512_srli_epi16(_mm512_mullo_epi16(sum0b, sum1b), 7);
+
+              out[j] = _mm512_permutexvar_epi64(Control, _mm512_packs_epi16(pa, pb));
            }
-      }
-      return psqt;
  
-  #elif defined(USE_SSE2)
+#elif defined(USE_AVX2)
  
-      #ifdef USE_SSE41
-      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
-      const __m128i Zero = _mm_setzero_si128();
-      #else
-      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-      #endif
+          constexpr IndexType OutputChunkSize = 256 / 8;
+          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
+          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
  
-      for (IndexType p = 0; p < 2; ++p)
-      {
-          const IndexType offset = HalfDimensions * p;
-          auto out = reinterpret_cast<__m128i*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j)
+          const __m256i Zero = _mm256_setzero_si256();
+          const __m256i One = _mm256_set1_epi16(127);
+          constexpr int Control = 0b11011000;
+
+          const __m256i* in0 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][0]));
+          const __m256i* in1 = reinterpret_cast<const __m256i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+                __m256i* out = reinterpret_cast<      __m256i*>(output + offset);
+
+          for (IndexType j = 0; j < NumOutputChunks; j += 1)
            {
-              __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>
-                                           (accumulation[perspectives[p]])[j * 2 + 0]);
-              __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>
-                                           (accumulation[perspectives[p]])[j * 2 + 1]);
-              const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
-
-              #ifdef USE_SSE41
-              _mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero));
-              #else
-              _mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s));
-              #endif
+              const __m256i sum0a = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 0], One), Zero);
+              const __m256i sum0b = _mm256_max_epi16(_mm256_min_epi16(in0[j * 2 + 1], One), Zero);
+              const __m256i sum1a = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 0], One), Zero);
+              const __m256i sum1b = _mm256_max_epi16(_mm256_min_epi16(in1[j * 2 + 1], One), Zero);
+
+              const __m256i pa = _mm256_srli_epi16(_mm256_mullo_epi16(sum0a, sum1a), 7);
+              const __m256i pb = _mm256_srli_epi16(_mm256_mullo_epi16(sum0b, sum1b), 7);
+
+              out[j] = _mm256_permute4x64_epi64(_mm256_packs_epi16(pa, pb), Control);
            }
-      }
-      return psqt;
  
-  #elif defined(USE_MMX)
+#elif defined(USE_SSE2)
  
-      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
-      const __m64 k0x80s = _mm_set1_pi8(-128);
+          constexpr IndexType OutputChunkSize = 128 / 8;
+          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
+          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
  
-      for (IndexType p = 0; p < 2; ++p)
-      {
-          const IndexType offset = HalfDimensions * p;
-          auto out = reinterpret_cast<__m64*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j)
+          const __m128i Zero = _mm_setzero_si128();
+          const __m128i One = _mm_set1_epi16(127);
+
+          const __m128i* in0 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][0]));
+          const __m128i* in1 = reinterpret_cast<const __m128i*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+                __m128i* out = reinterpret_cast<      __m128i*>(output + offset);
+
+          for (IndexType j = 0; j < NumOutputChunks; j += 1)
            {
-              __m64 sum0 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 0]);
-              __m64 sum1 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 1]);
-              const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
-              out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+              const __m128i sum0a = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 0], One), Zero);
+              const __m128i sum0b = _mm_max_epi16(_mm_min_epi16(in0[j * 2 + 1], One), Zero);
+              const __m128i sum1a = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 0], One), Zero);
+              const __m128i sum1b = _mm_max_epi16(_mm_min_epi16(in1[j * 2 + 1], One), Zero);
+
+              const __m128i pa = _mm_srli_epi16(_mm_mullo_epi16(sum0a, sum1a), 7);
+              const __m128i pb = _mm_srli_epi16(_mm_mullo_epi16(sum0b, sum1b), 7);
+
+              out[j] = _mm_packs_epi16(pa, pb);
            }
-      }
-      _mm_empty();
-      return psqt;
  
-  #elif defined(USE_NEON)
+#elif defined(USE_NEON)
  
-      constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2);
-      const int8x8_t Zero = {0};
+          constexpr IndexType OutputChunkSize = 128 / 8;
+          static_assert((HalfDimensions / 2) % OutputChunkSize == 0);
+          constexpr IndexType NumOutputChunks = HalfDimensions / 2 / OutputChunkSize;
  
-      for (IndexType p = 0; p < 2; ++p)
-      {
-          const IndexType offset = HalfDimensions * p;
-          const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+          const int16x8_t Zero = vdupq_n_s16(0);
+          const int16x8_t One  = vdupq_n_s16(127);
  
-          constexpr IndexType UnrollFactor = 16;
-          static_assert(UnrollFactor % UnrollFactor == 0);
-          for (IndexType j = 0; j < NumChunks; j += UnrollFactor)
+          const int16x8_t* in0 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][0]));
+          const int16x8_t* in1 = reinterpret_cast<const int16x8_t*>(&(accumulation[perspectives[p]][HalfDimensions / 2]));
+                int8x16_t* out = reinterpret_cast<      int8x16_t*>(output + offset);
+
+          for (IndexType j = 0; j < NumOutputChunks; j += 1)
            {
-              int16x8_t sums[UnrollFactor];
-              for (IndexType i = 0; i < UnrollFactor; ++i)
-                sums[i] = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j+i];
+              const int16x8_t sum0a = vmaxq_s16(vminq_s16(in0[j * 2 + 0], One), Zero);
+              const int16x8_t sum0b = vmaxq_s16(vminq_s16(in0[j * 2 + 1], One), Zero);
+              const int16x8_t sum1a = vmaxq_s16(vminq_s16(in1[j * 2 + 0], One), Zero);
+              const int16x8_t sum1b = vmaxq_s16(vminq_s16(in1[j * 2 + 1], One), Zero);
+
+              const int8x8_t pa = vshrn_n_s16(vmulq_s16(sum0a, sum1a), 7);
+              const int8x8_t pb = vshrn_n_s16(vmulq_s16(sum0b, sum1b), 7);
  
-              for (IndexType i = 0; i < UnrollFactor; ++i)
-                out[j+i] = vmax_s8(vqmovn_s16(sums[i]), Zero);
+              out[j] = vcombine_s8(pa, pb);
            }
-      }
-      return psqt;
  
-  #else
+#else
  
-      for (IndexType p = 0; p < 2; ++p)
-      {
-          const IndexType offset = HalfDimensions * p;
-          for (IndexType j = 0; j < HalfDimensions; ++j)
-          {
-              BiasType sum = accumulation[perspectives[p]][j];
-              output[offset + j] = static_cast<OutputType>(std::max<int>(0, std::min<int>(127, sum)));
+          for (IndexType j = 0; j < HalfDimensions / 2; ++j) {
+              BiasType sum0 = accumulation[static_cast<int>(perspectives[p])][j + 0];
+              BiasType sum1 = accumulation[static_cast<int>(perspectives[p])][j + HalfDimensions / 2];
+              sum0 = std::max<int>(0, std::min<int>(127, sum0));
+              sum1 = std::max<int>(0, std::min<int>(127, sum1));
+              output[offset + j] = static_cast<OutputType>(sum0 * sum1 / 128);
            }
+
+#endif
        }
-      return psqt;
  
-  #endif
+      return psqt;
  
     } // end of function transform()
author	Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
	Sat, 27 Nov 2021 14:17:02 +0000 (15:17 +0100)
committer	Joost VandeVondele <Joost.VandeVondele@gmail.com>
	Thu, 10 Feb 2022 18:54:31 +0000 (19:54 +0100)
src/evaluate.h		patch \| blob \| history
src/nnue/evaluate_nnue.cpp		patch \| blob \| history
src/nnue/layers/affine_transform.h		patch \| blob \| history
src/nnue/layers/clipped_relu.h		patch \| blob \| history
src/nnue/layers/input_slice.h	[deleted file]	patch \| blob \| history
src/nnue/nnue_architecture.h		patch \| blob \| history
src/nnue/nnue_feature_transformer.h		patch \| blob \| history