Merge remote-tracking branch 'upstream/master'

[stockfish] / src / nnue / layers / clipped_relu.h
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h

index 0da5e8210119a5a38e474dfeb5e2bfa2cc3e00e4..2ee378ad881266c9a8fadcac6d04cfb482167a5d 100644 (file)
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -1,6 +1,6 @@
  /*
    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  /*
    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2022 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
  
    Stockfish is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
  
    Stockfish is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -21,181 +21,144 @@
  #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  
  #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  
+#include <algorithm>
+#include <cstdint>
+#include <iosfwd>
+
  #include "../nnue_common.h"
  
  namespace Stockfish::Eval::NNUE::Layers {
  
  #include "../nnue_common.h"
  
  namespace Stockfish::Eval::NNUE::Layers {
  
-  // Clipped ReLU
-  template <typename PreviousLayer>
-  class ClippedReLU {
+// Clipped ReLU
+template<IndexType InDims>
+class ClippedReLU {
     public:
      // Input/output type
     public:
      // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
+    using InputType  = std::int32_t;
      using OutputType = std::uint8_t;
      using OutputType = std::uint8_t;
-    static_assert(std::is_same<InputType, std::int32_t>::value, "");
  
      // Number of input/output dimensions
  
      // Number of input/output dimensions
-    static constexpr IndexType InputDimensions = PreviousLayer::OutputDimensions;
+    static constexpr IndexType InputDimensions  = InDims;
      static constexpr IndexType OutputDimensions = InputDimensions;
      static constexpr IndexType PaddedOutputDimensions =
      static constexpr IndexType OutputDimensions = InputDimensions;
      static constexpr IndexType PaddedOutputDimensions =
-        ceil_to_multiple<IndexType>(OutputDimensions, 32);
-
-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t SelfBufferSize =
-        ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize);
+      ceil_to_multiple<IndexType>(OutputDimensions, 32);
  
  
-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t BufferSize =
-        PreviousLayer::BufferSize + SelfBufferSize;
+    using OutputBuffer = OutputType[PaddedOutputDimensions];
  
      // Hash value embedded in the evaluation file
  
      // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t get_hash_value() {
-      std::uint32_t hashValue = 0x538D24C7u;
-      hashValue += PreviousLayer::get_hash_value();
-      return hashValue;
+    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
+        std::uint32_t hashValue = 0x538D24C7u;
+        hashValue += prevHash;
+        return hashValue;
      }
  
      // Read network parameters
      }
  
      // Read network parameters
-    bool read_parameters(std::istream& stream) {
-      return previousLayer.read_parameters(stream);
-    }
+    bool read_parameters(std::istream&) { return true; }
  
      // Write network parameters
  
      // Write network parameters
-    bool write_parameters(std::ostream& stream) const {
-      return previousLayer.write_parameters(stream);
-    }
+    bool write_parameters(std::ostream&) const { return true; }
  
      // Forward propagation
  
      // Forward propagation
-    const OutputType* propagate(
-        const TransformedFeatureType* transformedFeatures, char* buffer) const {
-      const auto input = previousLayer.propagate(
-          transformedFeatures, buffer + SelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
-
-  #if defined(USE_AVX2)
-      if constexpr (InputDimensions % SimdWidth == 0) {
+    void propagate(const InputType* input, OutputType* output) const {
+
+#if defined(USE_AVX2)
+        if constexpr (InputDimensions % SimdWidth == 0)
+        {
+            constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+            const __m256i       Offsets   = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+            const auto          in        = reinterpret_cast<const __m256i*>(input);
+            const auto          out       = reinterpret_cast<__m256i*>(output);
+            for (IndexType i = 0; i < NumChunks; ++i)
+            {
+                const __m256i words0 =
+                  _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 0]),
+                                                        _mm256_load_si256(&in[i * 4 + 1])),
+                                    WeightScaleBits);
+                const __m256i words1 =
+                  _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 2]),
+                                                        _mm256_load_si256(&in[i * 4 + 3])),
+                                    WeightScaleBits);
+                _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(
+                                              _mm256_packs_epi16(words0, words1), Offsets));
+            }
+        }
+        else
+        {
+            constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
+            const auto          in        = reinterpret_cast<const __m128i*>(input);
+            const auto          out       = reinterpret_cast<__m128i*>(output);
+            for (IndexType i = 0; i < NumChunks; ++i)
+            {
+                const __m128i words0 = _mm_srli_epi16(
+                  _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
+                  WeightScaleBits);
+                const __m128i words1 = _mm_srli_epi16(
+                  _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
+                  WeightScaleBits);
+                _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1));
+            }
+        }
+        constexpr IndexType Start = InputDimensions % SimdWidth == 0
+                                    ? InputDimensions / SimdWidth * SimdWidth
+                                    : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2);
+
+#elif defined(USE_SSE2)
          constexpr IndexType NumChunks = InputDimensions / SimdWidth;
          constexpr IndexType NumChunks = InputDimensions / SimdWidth;
-        const __m256i Zero = _mm256_setzero_si256();
-        const __m256i Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-        const auto in = reinterpret_cast<const __m256i*>(input);
-        const auto out = reinterpret_cast<__m256i*>(output);
-        for (IndexType i = 0; i < NumChunks; ++i) {
-          const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-              _mm256_load_si256(&in[i * 4 + 0]),
-              _mm256_load_si256(&in[i * 4 + 1])), WeightScaleBits);
-          const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-              _mm256_load_si256(&in[i * 4 + 2]),
-              _mm256_load_si256(&in[i * 4 + 3])), WeightScaleBits);
-          _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
-              _mm256_packs_epi16(words0, words1), Zero), Offsets));
+
+    #ifndef USE_SSE41
+        const __m128i k0x80s = _mm_set1_epi8(-128);
+    #endif
+
+        const auto in  = reinterpret_cast<const __m128i*>(input);
+        const auto out = reinterpret_cast<__m128i*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i)
+        {
+    #if defined(USE_SSE41)
+            const __m128i words0 = _mm_srli_epi16(
+              _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
+              WeightScaleBits);
+            const __m128i words1 = _mm_srli_epi16(
+              _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
+              WeightScaleBits);
+            _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1));
+    #else
+            const __m128i words0 = _mm_srai_epi16(
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
+              WeightScaleBits);
+            const __m128i words1 = _mm_srai_epi16(
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
+              WeightScaleBits);
+            const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+            _mm_store_si128(&out[i], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s));
+    #endif
          }
          }
-      } else {
+        constexpr IndexType Start = NumChunks * SimdWidth;
+
+#elif defined(USE_NEON)
          constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
          constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
-        const __m128i Zero = _mm_setzero_si128();
-        const auto in = reinterpret_cast<const __m128i*>(input);
-        const auto out = reinterpret_cast<__m128i*>(output);
-        for (IndexType i = 0; i < NumChunks; ++i) {
-          const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
-              _mm_load_si128(&in[i * 4 + 0]),
-              _mm_load_si128(&in[i * 4 + 1])), WeightScaleBits);
-          const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
-              _mm_load_si128(&in[i * 4 + 2]),
-              _mm_load_si128(&in[i * 4 + 3])), WeightScaleBits);
-          const __m128i packedbytes = _mm_packs_epi16(words0, words1);
-          _mm_store_si128(&out[i], _mm_max_epi8(packedbytes, Zero));
+        const int8x8_t      Zero      = {0};
+        const auto          in        = reinterpret_cast<const int32x4_t*>(input);
+        const auto          out       = reinterpret_cast<int8x8_t*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i)
+        {
+            int16x8_t  shifted;
+            const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
+            pack[0]         = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
+            pack[1]         = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
+            out[i]          = vmax_s8(vqmovn_s16(shifted), Zero);
+        }
+        constexpr IndexType Start = NumChunks * (SimdWidth / 2);
+#else
+        constexpr IndexType Start = 0;
+#endif
+
+        for (IndexType i = Start; i < InputDimensions; ++i)
+        {
+            output[i] = static_cast<OutputType>(std::clamp(input[i] >> WeightScaleBits, 0, 127));
          }
          }
-      }
-      constexpr IndexType Start =
-        InputDimensions % SimdWidth == 0
-        ? InputDimensions / SimdWidth * SimdWidth
-        : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2);
-
-  #elif defined(USE_SSE2)
-      constexpr IndexType NumChunks = InputDimensions / SimdWidth;
-
-  #ifdef USE_SSE41
-      const __m128i Zero = _mm_setzero_si128();
-  #else
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
-
-      const auto in = reinterpret_cast<const __m128i*>(input);
-      const auto out = reinterpret_cast<__m128i*>(output);
-      for (IndexType i = 0; i < NumChunks; ++i) {
-        const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 0]),
-            _mm_load_si128(&in[i * 4 + 1])), WeightScaleBits);
-        const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 2]),
-            _mm_load_si128(&in[i * 4 + 3])), WeightScaleBits);
-        const __m128i packedbytes = _mm_packs_epi16(words0, words1);
-        _mm_store_si128(&out[i],
-
-  #ifdef USE_SSE41
-          _mm_max_epi8(packedbytes, Zero)
-  #else
-          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
-
-        );
-      }
-      constexpr IndexType Start = NumChunks * SimdWidth;
-
-  #elif defined(USE_MMX)
-      constexpr IndexType NumChunks = InputDimensions / SimdWidth;
-      const __m64 k0x80s = _mm_set1_pi8(-128);
-      const auto in = reinterpret_cast<const __m64*>(input);
-      const auto out = reinterpret_cast<__m64*>(output);
-      for (IndexType i = 0; i < NumChunks; ++i) {
-        const __m64 words0 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
-            WeightScaleBits);
-        const __m64 words1 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
-            WeightScaleBits);
-        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
-        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
-      }
-      _mm_empty();
-      constexpr IndexType Start = NumChunks * SimdWidth;
-
-  #elif defined(USE_NEON)
-      constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
-      const int8x8_t Zero = {0};
-      const auto in = reinterpret_cast<const int32x4_t*>(input);
-      const auto out = reinterpret_cast<int8x8_t*>(output);
-      for (IndexType i = 0; i < NumChunks; ++i) {
-        int16x8_t shifted;
-        const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
-        pack[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
-        pack[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
-        out[i] = vmax_s8(vqmovn_s16(shifted), Zero);
-      }
-      constexpr IndexType Start = NumChunks * (SimdWidth / 2);
-  #else
-      constexpr IndexType Start = 0;
-  #endif
-
-      for (IndexType i = Start; i < InputDimensions; ++i) {
-        output[i] = static_cast<OutputType>(
-            std::max(0, std::min(127, input[i] >> WeightScaleBits)));
-      }
-
-      // Affine transform layers expect that there is at least
-      // ceil_to_multiple(OutputDimensions, 32) initialized values.
-      // We cannot do this in the affine transform because it requires
-      // preallocating space here.
-      for (IndexType i = OutputDimensions; i < PaddedOutputDimensions; ++i) {
-        output[i] = 0;
-      }
-
-      return output;
      }
      }
-
-   private:
-    PreviousLayer previousLayer;
-  };
+};
  
  }  // namespace Stockfish::Eval::NNUE::Layers
  
  
  }  // namespace Stockfish::Eval::NNUE::Layers
  
-#endif // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#endif  // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED