Merge remote-tracking branch 'upstream/master'

[stockfish] / src / nnue / layers / clipped_relu.h
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h

index 44d8a7def4c92ff55f7fcf1150b9553b37a90458..2ee378ad881266c9a8fadcac6d04cfb482167a5d 100644 (file)
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -1,6 +1,6 @@
  /*
    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  /*
    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
  
    Stockfish is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
  
    Stockfish is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -21,146 +21,144 @@
  #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  
  #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  
+#include <algorithm>
+#include <cstdint>
+#include <iosfwd>
+
  #include "../nnue_common.h"
  
  #include "../nnue_common.h"
  
-namespace Eval::NNUE::Layers {
+namespace Stockfish::Eval::NNUE::Layers {
  
  
-  // Clipped ReLU
-  template <typename PreviousLayer>
-  class ClippedReLU {
+// Clipped ReLU
+template<IndexType InDims>
+class ClippedReLU {
     public:
      // Input/output type
     public:
      // Input/output type
-    using InputType = typename PreviousLayer::OutputType;
+    using InputType  = std::int32_t;
      using OutputType = std::uint8_t;
      using OutputType = std::uint8_t;
-    static_assert(std::is_same<InputType, std::int32_t>::value, "");
  
      // Number of input/output dimensions
  
      // Number of input/output dimensions
-    static constexpr IndexType kInputDimensions =
-        PreviousLayer::kOutputDimensions;
-    static constexpr IndexType kOutputDimensions = kInputDimensions;
-
-    // Size of forward propagation buffer used in this layer
-    static constexpr std::size_t kSelfBufferSize =
-        CeilToMultiple(kOutputDimensions * sizeof(OutputType), kCacheLineSize);
+    static constexpr IndexType InputDimensions  = InDims;
+    static constexpr IndexType OutputDimensions = InputDimensions;
+    static constexpr IndexType PaddedOutputDimensions =
+      ceil_to_multiple<IndexType>(OutputDimensions, 32);
  
  
-    // Size of the forward propagation buffer used from the input layer to this layer
-    static constexpr std::size_t kBufferSize =
-        PreviousLayer::kBufferSize + kSelfBufferSize;
+    using OutputBuffer = OutputType[PaddedOutputDimensions];
  
      // Hash value embedded in the evaluation file
  
      // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t GetHashValue() {
-      std::uint32_t hash_value = 0x538D24C7u;
-      hash_value += PreviousLayer::GetHashValue();
-      return hash_value;
+    static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
+        std::uint32_t hashValue = 0x538D24C7u;
+        hashValue += prevHash;
+        return hashValue;
      }
  
      // Read network parameters
      }
  
      // Read network parameters
-    bool ReadParameters(std::istream& stream) {
-      return previous_layer_.ReadParameters(stream);
-    }
+    bool read_parameters(std::istream&) { return true; }
+
+    // Write network parameters
+    bool write_parameters(std::ostream&) const { return true; }
  
      // Forward propagation
  
      // Forward propagation
-    const OutputType* Propagate(
-        const TransformedFeatureType* transformed_features, char* buffer) const {
-      const auto input = previous_layer_.Propagate(
-          transformed_features, buffer + kSelfBufferSize);
-      const auto output = reinterpret_cast<OutputType*>(buffer);
-
-  #if defined(USE_AVX2)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
-      const __m256i kZero = _mm256_setzero_si256();
-      const __m256i kOffsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
-      const auto in = reinterpret_cast<const __m256i*>(input);
-      const auto out = reinterpret_cast<__m256i*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 0]),
-            _mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
-        const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
-            _mm256_loadA_si256(&in[i * 4 + 2]),
-            _mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
-        _mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
-            _mm256_packs_epi16(words0, words1), kZero), kOffsets));
-      }
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
-
-  #elif defined(USE_SSE2)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
-
-  #ifdef USE_SSE41
-      const __m128i kZero = _mm_setzero_si128();
-  #else
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
-
-      const auto in = reinterpret_cast<const __m128i*>(input);
-      const auto out = reinterpret_cast<__m128i*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 0]),
-            _mm_load_si128(&in[i * 4 + 1])), kWeightScaleBits);
-        const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 2]),
-            _mm_load_si128(&in[i * 4 + 3])), kWeightScaleBits);
-        const __m128i packedbytes = _mm_packs_epi16(words0, words1);
-        _mm_store_si128(&out[i],
-
-  #ifdef USE_SSE41
-          _mm_max_epi8(packedbytes, kZero)
-  #else
-          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
-
-        );
-      }
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
-
-  #elif defined(USE_MMX)
-      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
-      const __m64 k0x80s = _mm_set1_pi8(-128);
-      const auto in = reinterpret_cast<const __m64*>(input);
-      const auto out = reinterpret_cast<__m64*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        const __m64 words0 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
-            kWeightScaleBits);
-        const __m64 words1 = _mm_srai_pi16(
-            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
-            kWeightScaleBits);
-        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
-        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
-      }
-      _mm_empty();
-      constexpr IndexType kStart = kNumChunks * kSimdWidth;
-
-  #elif defined(USE_NEON)
-      constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
-      const int8x8_t kZero = {0};
-      const auto in = reinterpret_cast<const int32x4_t*>(input);
-      const auto out = reinterpret_cast<int8x8_t*>(output);
-      for (IndexType i = 0; i < kNumChunks; ++i) {
-        int16x8_t shifted;
-        const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
-        pack[0] = vqshrn_n_s32(in[i * 2 + 0], kWeightScaleBits);
-        pack[1] = vqshrn_n_s32(in[i * 2 + 1], kWeightScaleBits);
-        out[i] = vmax_s8(vqmovn_s16(shifted), kZero);
-      }
-      constexpr IndexType kStart = kNumChunks * (kSimdWidth / 2);
-  #else
-      constexpr IndexType kStart = 0;
-  #endif
-
-      for (IndexType i = kStart; i < kInputDimensions; ++i) {
-        output[i] = static_cast<OutputType>(
-            std::max(0, std::min(127, input[i] >> kWeightScaleBits)));
-      }
-      return output;
+    void propagate(const InputType* input, OutputType* output) const {
+
+#if defined(USE_AVX2)
+        if constexpr (InputDimensions % SimdWidth == 0)
+        {
+            constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+            const __m256i       Offsets   = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+            const auto          in        = reinterpret_cast<const __m256i*>(input);
+            const auto          out       = reinterpret_cast<__m256i*>(output);
+            for (IndexType i = 0; i < NumChunks; ++i)
+            {
+                const __m256i words0 =
+                  _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 0]),
+                                                        _mm256_load_si256(&in[i * 4 + 1])),
+                                    WeightScaleBits);
+                const __m256i words1 =
+                  _mm256_srli_epi16(_mm256_packus_epi32(_mm256_load_si256(&in[i * 4 + 2]),
+                                                        _mm256_load_si256(&in[i * 4 + 3])),
+                                    WeightScaleBits);
+                _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(
+                                              _mm256_packs_epi16(words0, words1), Offsets));
+            }
+        }
+        else
+        {
+            constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
+            const auto          in        = reinterpret_cast<const __m128i*>(input);
+            const auto          out       = reinterpret_cast<__m128i*>(output);
+            for (IndexType i = 0; i < NumChunks; ++i)
+            {
+                const __m128i words0 = _mm_srli_epi16(
+                  _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
+                  WeightScaleBits);
+                const __m128i words1 = _mm_srli_epi16(
+                  _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
+                  WeightScaleBits);
+                _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1));
+            }
+        }
+        constexpr IndexType Start = InputDimensions % SimdWidth == 0
+                                    ? InputDimensions / SimdWidth * SimdWidth
+                                    : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2);
+
+#elif defined(USE_SSE2)
+        constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+
+    #ifndef USE_SSE41
+        const __m128i k0x80s = _mm_set1_epi8(-128);
+    #endif
+
+        const auto in  = reinterpret_cast<const __m128i*>(input);
+        const auto out = reinterpret_cast<__m128i*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i)
+        {
+    #if defined(USE_SSE41)
+            const __m128i words0 = _mm_srli_epi16(
+              _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
+              WeightScaleBits);
+            const __m128i words1 = _mm_srli_epi16(
+              _mm_packus_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
+              WeightScaleBits);
+            _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1));
+    #else
+            const __m128i words0 = _mm_srai_epi16(
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1])),
+              WeightScaleBits);
+            const __m128i words1 = _mm_srai_epi16(
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3])),
+              WeightScaleBits);
+            const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+            _mm_store_si128(&out[i], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s));
+    #endif
+        }
+        constexpr IndexType Start = NumChunks * SimdWidth;
+
+#elif defined(USE_NEON)
+        constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
+        const int8x8_t      Zero      = {0};
+        const auto          in        = reinterpret_cast<const int32x4_t*>(input);
+        const auto          out       = reinterpret_cast<int8x8_t*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i)
+        {
+            int16x8_t  shifted;
+            const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
+            pack[0]         = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
+            pack[1]         = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
+            out[i]          = vmax_s8(vqmovn_s16(shifted), Zero);
+        }
+        constexpr IndexType Start = NumChunks * (SimdWidth / 2);
+#else
+        constexpr IndexType Start = 0;
+#endif
+
+        for (IndexType i = Start; i < InputDimensions; ++i)
+        {
+            output[i] = static_cast<OutputType>(std::clamp(input[i] >> WeightScaleBits, 0, 127));
+        }
      }
      }
+};
  
  
-   private:
-    PreviousLayer previous_layer_;
-  };
-
-}  // namespace Eval::NNUE::Layers
+}  // namespace Stockfish::Eval::NNUE::Layers
  
  
-#endif // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
+#endif  // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED