git.sesse.net Git - stockfish/blob - src/nnue/layers/clipped_relu.h

   1 /*
   2   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   3   Copyright (C) 2004-2023 The Stockfish developers (see AUTHORS file)
   4
   5   Stockfish is free software: you can redistribute it and/or modify
   6   it under the terms of the GNU General Public License as published by
   7   the Free Software Foundation, either version 3 of the License, or
   8   (at your option) any later version.
   9
  10   Stockfish is distributed in the hope that it will be useful,
  11   but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13   GNU General Public License for more details.
  14
  15   You should have received a copy of the GNU General Public License
  16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17 */
  18
  19 // Definition of layer ClippedReLU of NNUE evaluation function
  20
  21 #ifndef NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  22 #define NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED
  23
  24 #include <algorithm>
  25 #include <cstdint>
  26 #include <iosfwd>
  27
  28 #include "../nnue_common.h"
  29
  30 namespace Stockfish::Eval::NNUE::Layers {
  31
  32   // Clipped ReLU
  33   template <IndexType InDims>
  34   class ClippedReLU {
  35    public:
  36     // Input/output type
  37     using InputType = std::int32_t;
  38     using OutputType = std::uint8_t;
  39
  40     // Number of input/output dimensions
  41     static constexpr IndexType InputDimensions = InDims;
  42     static constexpr IndexType OutputDimensions = InputDimensions;
  43     static constexpr IndexType PaddedOutputDimensions =
  44         ceil_to_multiple<IndexType>(OutputDimensions, 32);
  45
  46     using OutputBuffer = OutputType[PaddedOutputDimensions];
  47
  48     // Hash value embedded in the evaluation file
  49     static constexpr std::uint32_t get_hash_value(std::uint32_t prevHash) {
  50       std::uint32_t hashValue = 0x538D24C7u;
  51       hashValue += prevHash;
  52       return hashValue;
  53     }
  54
  55     // Read network parameters
  56     bool read_parameters(std::istream&) {
  57       return true;
  58     }
  59
  60     // Write network parameters
  61     bool write_parameters(std::ostream&) const {
  62       return true;
  63     }
  64
  65     // Forward propagation
  66     void propagate(
  67         const InputType* input, OutputType* output) const {
  68
  69   #if defined(USE_AVX2)
  70       if constexpr (InputDimensions % SimdWidth == 0) {
  71         constexpr IndexType NumChunks = InputDimensions / SimdWidth;
  72         const __m256i Zero = _mm256_setzero_si256();
  73         const __m256i Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
  74         const auto in = reinterpret_cast<const __m256i*>(input);
  75         const auto out = reinterpret_cast<__m256i*>(output);
  76         for (IndexType i = 0; i < NumChunks; ++i) {
  77           const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
  78               _mm256_load_si256(&in[i * 4 + 0]),
  79               _mm256_load_si256(&in[i * 4 + 1])), WeightScaleBits);
  80           const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
  81               _mm256_load_si256(&in[i * 4 + 2]),
  82               _mm256_load_si256(&in[i * 4 + 3])), WeightScaleBits);
  83           _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
  84               _mm256_packs_epi16(words0, words1), Zero), Offsets));
  85         }
  86       } else {
  87         constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
  88         const __m128i Zero = _mm_setzero_si128();
  89         const auto in = reinterpret_cast<const __m128i*>(input);
  90         const auto out = reinterpret_cast<__m128i*>(output);
  91         for (IndexType i = 0; i < NumChunks; ++i) {
  92           const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
  93               _mm_load_si128(&in[i * 4 + 0]),
  94               _mm_load_si128(&in[i * 4 + 1])), WeightScaleBits);
  95           const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
  96               _mm_load_si128(&in[i * 4 + 2]),
  97               _mm_load_si128(&in[i * 4 + 3])), WeightScaleBits);
  98           const __m128i packedbytes = _mm_packs_epi16(words0, words1);
  99           _mm_store_si128(&out[i], _mm_max_epi8(packedbytes, Zero));
 100         }
 101       }
 102       constexpr IndexType Start =
 103         InputDimensions % SimdWidth == 0
 104         ? InputDimensions / SimdWidth * SimdWidth
 105         : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2);
 106
 107   #elif defined(USE_SSE2)
 108       constexpr IndexType NumChunks = InputDimensions / SimdWidth;
 109
 110   #ifdef USE_SSE41
 111       const __m128i Zero = _mm_setzero_si128();
 112   #else
 113       const __m128i k0x80s = _mm_set1_epi8(-128);
 114   #endif
 115
 116       const auto in = reinterpret_cast<const __m128i*>(input);
 117       const auto out = reinterpret_cast<__m128i*>(output);
 118       for (IndexType i = 0; i < NumChunks; ++i) {
 119         const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
 120             _mm_load_si128(&in[i * 4 + 0]),
 121             _mm_load_si128(&in[i * 4 + 1])), WeightScaleBits);
 122         const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
 123             _mm_load_si128(&in[i * 4 + 2]),
 124             _mm_load_si128(&in[i * 4 + 3])), WeightScaleBits);
 125         const __m128i packedbytes = _mm_packs_epi16(words0, words1);
 126         _mm_store_si128(&out[i],
 127
 128   #ifdef USE_SSE41
 129           _mm_max_epi8(packedbytes, Zero)
 130   #else
 131           _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
 132   #endif
 133
 134         );
 135       }
 136       constexpr IndexType Start = NumChunks * SimdWidth;
 137
 138   #elif defined(USE_NEON)
 139       constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
 140       const int8x8_t Zero = {0};
 141       const auto in = reinterpret_cast<const int32x4_t*>(input);
 142       const auto out = reinterpret_cast<int8x8_t*>(output);
 143       for (IndexType i = 0; i < NumChunks; ++i) {
 144         int16x8_t shifted;
 145         const auto pack = reinterpret_cast<int16x4_t*>(&shifted);
 146         pack[0] = vqshrn_n_s32(in[i * 2 + 0], WeightScaleBits);
 147         pack[1] = vqshrn_n_s32(in[i * 2 + 1], WeightScaleBits);
 148         out[i] = vmax_s8(vqmovn_s16(shifted), Zero);
 149       }
 150       constexpr IndexType Start = NumChunks * (SimdWidth / 2);
 151   #else
 152       constexpr IndexType Start = 0;
 153   #endif
 154
 155       for (IndexType i = Start; i < InputDimensions; ++i) {
 156         output[i] = static_cast<OutputType>(
 157             std::clamp(input[i] >> WeightScaleBits, 0, 127));
 158       }
 159     }
 160   };
 161
 162 }  // namespace Stockfish::Eval::NNUE::Layers
 163
 164 #endif // NNUE_LAYERS_CLIPPED_RELU_H_INCLUDED