git.sesse.net Git - stockfish/blobdiff - src/nnue/layers/sqr_clipped_relu.h
Merge remote-tracking branch 'upstream/master'
[stockfish] / src / nnue / layers / sqr_clipped_relu.h
index 3fbb243cfd6c38371183e8ebec459f822fa01cb6..f8e2d497ac0b0b826bd955e2b5bfd986373fcaa8 100644 (file)
 #ifndef NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
 #define NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
 
+#include <algorithm>
+#include <cstdint>
+#include <iosfwd>
+
 #include "../nnue_common.h"
 
 namespace Stockfish::Eval::NNUE::Layers {
 
-  // Clipped ReLU
-  template <IndexType InDims>
-  class SqrClippedReLU {
+// Clipped ReLU
+template<IndexType InDims>
+class SqrClippedReLU {
    public:
     // Input/output type
-    using InputType = std::int32_t;
+    using InputType  = std::int32_t;
     using OutputType = std::uint8_t;
 
     // Number of input/output dimensions
-    static constexpr IndexType InputDimensions = InDims;
+    static constexpr IndexType InputDimensions  = InDims;
     static constexpr IndexType OutputDimensions = InputDimensions;
     static constexpr IndexType PaddedOutputDimensions =
-        ceil_to_multiple<IndexType>(OutputDimensions, 32);
+      ceil_to_multiple<IndexType>(OutputDimensions, 32);
 
     using OutputBuffer = OutputType[PaddedOutputDimensions];
 
     // Hash value embedded in the evaluation file
     static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
-      std::uint32_t hashValue = 0x538D24C7u;
-      hashValue += prevHash;
-      return hashValue;
+        std::uint32_t hashValue = 0x538D24C7u;
+        hashValue += prevHash;
+        return hashValue;
     }
 
     // Read network parameters
-    bool read_parameters(std::istream&) {
-      return true;
-    }
+    bool read_parameters(std::istream&) { return true; }
 
     // Write network parameters
-    bool write_parameters(std::ostream&) const {
-      return true;
-    }
+    bool write_parameters(std::ostream&) const { return true; }
 
     // Forward propagation
-    const OutputType* propagate(
-        const InputType* input, OutputType* output) const {
-
-  #if defined(USE_SSE2)
-      constexpr IndexType NumChunks = InputDimensions / 16;
-
-  #ifdef USE_SSE41
-      const __m128i Zero = _mm_setzero_si128();
-  #else
-      const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
-
-      static_assert(WeightScaleBits == 6);
-      const auto in = reinterpret_cast<const __m128i*>(input);
-      const auto out = reinterpret_cast<__m128i*>(output);
-      for (IndexType i = 0; i < NumChunks; ++i) {
-        __m128i words0 = _mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 0]),
-            _mm_load_si128(&in[i * 4 + 1]));
-        __m128i words1 = _mm_packs_epi32(
-            _mm_load_si128(&in[i * 4 + 2]),
-            _mm_load_si128(&in[i * 4 + 3]));
-
-        // Not sure if
-        words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3);
-        words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3);
-
-        const __m128i packedbytes = _mm_packs_epi16(words0, words1);
-
-        _mm_store_si128(&out[i],
-
-  #ifdef USE_SSE41
-          _mm_max_epi8(packedbytes, Zero)
-  #else
-          _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
-
-        );
-      }
-      constexpr IndexType Start = NumChunks * 16;
-
-  #else
-      constexpr IndexType Start = 0;
-  #endif
-
-      for (IndexType i = Start; i < InputDimensions; ++i) {
-        output[i] = static_cast<OutputType>(
-            // really should be /127 but we need to make it fast
-            // needs to be accounted for in the trainer
-            std::max(0ll, std::min(127ll, (((long long)input[i] * input[i]) >> (2 * WeightScaleBits)) / 128)));
-      }
-
-      return output;
+    void propagate(const InputType* input, OutputType* output) const {
+
+#if defined(USE_SSE2)
+        constexpr IndexType NumChunks = InputDimensions / 16;
+
+        static_assert(WeightScaleBits == 6);
+        const auto in  = reinterpret_cast<const __m128i*>(input);
+        const auto out = reinterpret_cast<__m128i*>(output);
+        for (IndexType i = 0; i < NumChunks; ++i)
+        {
+            __m128i words0 =
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 0]), _mm_load_si128(&in[i * 4 + 1]));
+            __m128i words1 =
+              _mm_packs_epi32(_mm_load_si128(&in[i * 4 + 2]), _mm_load_si128(&in[i * 4 + 3]));
+
+            // We shift by WeightScaleBits * 2 = 12 and divide by 128
+            // which is an additional shift-right of 7, meaning 19 in total.
+            // MulHi strips the lower 16 bits so we need to shift out 3 more to match.
+            words0 = _mm_srli_epi16(_mm_mulhi_epi16(words0, words0), 3);
+            words1 = _mm_srli_epi16(_mm_mulhi_epi16(words1, words1), 3);
+
+            _mm_store_si128(&out[i], _mm_packs_epi16(words0, words1));
+        }
+        constexpr IndexType Start = NumChunks * 16;
+
+#else
+        constexpr IndexType Start = 0;
+#endif
+
+        for (IndexType i = Start; i < InputDimensions; ++i)
+        {
+            output[i] = static_cast<OutputType>(
+              // Really should be /127 but we need to make it fast so we right shift
+              // by an extra 7 bits instead. Needs to be accounted for in the trainer.
+              std::min(127ll, ((long long) (input[i]) * input[i]) >> (2 * WeightScaleBits + 7)));
+        }
     }
-  };
+};
 
 }  // namespace Stockfish::Eval::NNUE::Layers
 
-#endif // NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED
+#endif  // NNUE_LAYERS_SQR_CLIPPED_RELU_H_INCLUDED