git.sesse.net Git - stockfish/blob - src/nnue/nnue_feature_transformer.h

   1 /*
   2   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   3   Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
   4
   5   Stockfish is free software: you can redistribute it and/or modify
   6   it under the terms of the GNU General Public License as published by
   7   the Free Software Foundation, either version 3 of the License, or
   8   (at your option) any later version.
   9
  10   Stockfish is distributed in the hope that it will be useful,
  11   but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13   GNU General Public License for more details.
  14
  15   You should have received a copy of the GNU General Public License
  16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17 */
  18
  19 // A class that converts the input features of the NNUE evaluation function
  20
  21 #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
  22 #define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
  23
  24 #include "nnue_common.h"
  25 #include "nnue_architecture.h"
  26 #include "features/index_list.h"
  27
  28 #include <cstring> // std::memcpy()
  29
  30 namespace Eval::NNUE {
  31
  // If vector instructions are enabled, we update and refresh the
  // accumulator tile by tile such that each tile fits in the CPU's
  // vector registers. Every supported instruction set provides the same
  // small interface for that code:
  //   vec_t            vector register type
  //   vec_load(a)      load one vector through pointer a
  //   vec_store(a,b)   store vector b through pointer a
  //   vec_add_16(a,b)  lane-wise addition of 16-bit integers
  //   vec_sub_16(a,b)  lane-wise subtraction of 16-bit integers
  //   kNumRegs         number of vector registers used for one tile
  // The instruction set is selected with defined(), matching the checks
  // used in the rest of this file, so a macro defined without a value
  // selects its branch instead of breaking the #elif expression.
  #define TILING

  #if defined(USE_AVX512)
  using vec_t = __m512i;
  #define vec_load(a) _mm512_loadA_si512(a)
  #define vec_store(a,b) _mm512_storeA_si512(a,b)
  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
  static constexpr IndexType kNumRegs = 8; // only 8 are needed

  #elif defined(USE_AVX2)
  using vec_t = __m256i;
  #define vec_load(a) _mm256_loadA_si256(a)
  #define vec_store(a,b) _mm256_storeA_si256(a,b)
  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
  static constexpr IndexType kNumRegs = 16;

  #elif defined(USE_SSE2)
  using vec_t = __m128i;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) _mm_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;

  #elif defined(USE_MMX)
  using vec_t = __m64;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) _mm_add_pi16(a,b)
  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
  static constexpr IndexType kNumRegs = 8;

  #elif defined(USE_NEON)
  using vec_t = int16x8_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) vaddq_s16(a,b)
  #define vec_sub_16(a,b) vsubq_s16(a,b)
  static constexpr IndexType kNumRegs = 16;

  #else
  // No supported vector instruction set: fall back to the scalar code.
  #undef TILING

  #endif
  81
  82   // Input feature converter
  83   class FeatureTransformer {
  84
  85    private:
  86     // Number of output dimensions for one side
  87     static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
  88
  89     #ifdef TILING
  90     static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
  91     static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
  92     #endif
  93
  94    public:
  95     // Output type
  96     using OutputType = TransformedFeatureType;
  97
  98     // Number of input/output dimensions
  99     static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
 100     static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
 101
 102     // Size of forward propagation buffer
 103     static constexpr std::size_t kBufferSize =
 104         kOutputDimensions * sizeof(OutputType);
 105
 106     // Hash value embedded in the evaluation file
 107     static constexpr std::uint32_t GetHashValue() {
 108
 109       return RawFeatures::kHashValue ^ kOutputDimensions;
 110     }
 111
 112     // Read network parameters
 113     bool ReadParameters(std::istream& stream) {
 114
 115       for (std::size_t i = 0; i < kHalfDimensions; ++i)
 116         biases_[i] = read_little_endian<BiasType>(stream);
 117       for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
 118         weights_[i] = read_little_endian<WeightType>(stream);
 119       return !stream.fail();
 120     }
 121
 122     // Proceed with the difference calculation if possible
 123     bool UpdateAccumulatorIfPossible(const Position& pos) const {
 124
 125       const auto now = pos.state();
 126       if (now->accumulator.computed_accumulation)
 127         return true;
 128
 129       const auto prev = now->previous;
 130       if (prev && prev->accumulator.computed_accumulation) {
 131         UpdateAccumulator(pos);
 132         return true;
 133       }
 134
 135       return false;
 136     }
 137
 138     // Convert input features
 139     void Transform(const Position& pos, OutputType* output) const {
 140
 141       if (!UpdateAccumulatorIfPossible(pos))
 142         RefreshAccumulator(pos);
 143
 144       const auto& accumulation = pos.state()->accumulator.accumulation;
 145
 146   #if defined(USE_AVX2)
 147       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 148       constexpr int kControl = 0b11011000;
 149       const __m256i kZero = _mm256_setzero_si256();
 150
 151   #elif defined(USE_SSE2)
 152       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 153
 154   #ifdef USE_SSE41
 155       const __m128i kZero = _mm_setzero_si128();
 156   #else
 157       const __m128i k0x80s = _mm_set1_epi8(-128);
 158   #endif
 159
 160   #elif defined(USE_MMX)
 161       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 162       const __m64 k0x80s = _mm_set1_pi8(-128);
 163
 164   #elif defined(USE_NEON)
 165       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
 166       const int8x8_t kZero = {0};
 167   #endif
 168
 169       const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
 170       for (IndexType p = 0; p < 2; ++p) {
 171         const IndexType offset = kHalfDimensions * p;
 172
 173   #if defined(USE_AVX2)
 174         auto out = reinterpret_cast<__m256i*>(&output[offset]);
 175         for (IndexType j = 0; j < kNumChunks; ++j) {
 176           __m256i sum0 = _mm256_loadA_si256(
 177               &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
 178           __m256i sum1 = _mm256_loadA_si256(
 179             &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
 180           _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
 181               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
 182         }
 183
 184   #elif defined(USE_SSE2)
 185         auto out = reinterpret_cast<__m128i*>(&output[offset]);
 186         for (IndexType j = 0; j < kNumChunks; ++j) {
 187           __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
 188               accumulation[perspectives[p]][0])[j * 2 + 0]);
 189           __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
 190               accumulation[perspectives[p]][0])[j * 2 + 1]);
 191       const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
 192
 193           _mm_store_si128(&out[j],
 194
 195   #ifdef USE_SSE41
 196             _mm_max_epi8(packedbytes, kZero)
 197   #else
 198             _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
 199   #endif
 200
 201           );
 202         }
 203
 204   #elif defined(USE_MMX)
 205         auto out = reinterpret_cast<__m64*>(&output[offset]);
 206         for (IndexType j = 0; j < kNumChunks; ++j) {
 207           __m64 sum0 = *(&reinterpret_cast<const __m64*>(
 208               accumulation[perspectives[p]][0])[j * 2 + 0]);
 209           __m64 sum1 = *(&reinterpret_cast<const __m64*>(
 210               accumulation[perspectives[p]][0])[j * 2 + 1]);
 211           const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
 212           out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
 213         }
 214
 215   #elif defined(USE_NEON)
 216         const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
 217         for (IndexType j = 0; j < kNumChunks; ++j) {
 218           int16x8_t sum = reinterpret_cast<const int16x8_t*>(
 219               accumulation[perspectives[p]][0])[j];
 220           out[j] = vmax_s8(vqmovn_s16(sum), kZero);
 221         }
 222
 223   #else
 224         for (IndexType j = 0; j < kHalfDimensions; ++j) {
 225           BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
 226           output[offset + j] = static_cast<OutputType>(
 227               std::max<int>(0, std::min<int>(127, sum)));
 228         }
 229   #endif
 230
 231       }
 232   #if defined(USE_MMX)
 233       _mm_empty();
 234   #endif
 235     }
 236
 237    private:
 238     // Calculate cumulative value without using difference calculation
 239     void RefreshAccumulator(const Position& pos) const {
 240
 241       auto& accumulator = pos.state()->accumulator;
 242       IndexType i = 0;
 243       Features::IndexList active_indices[2];
 244       RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
 245                                        active_indices);
 246       for (Color perspective : { WHITE, BLACK }) {
 247   #ifdef TILING
 248         for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
 249           auto biasesTile = reinterpret_cast<const vec_t*>(
 250               &biases_[j * kTileHeight]);
 251           auto accTile = reinterpret_cast<vec_t*>(
 252               &accumulator.accumulation[perspective][i][j * kTileHeight]);
 253           vec_t acc[kNumRegs];
 254
 255           for (unsigned k = 0; k < kNumRegs; ++k)
 256             acc[k] = biasesTile[k];
 257
 258           for (const auto index : active_indices[perspective]) {
 259             const IndexType offset = kHalfDimensions * index + j * kTileHeight;
 260             auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
 261
 262             for (unsigned k = 0; k < kNumRegs; ++k)
 263               acc[k] = vec_add_16(acc[k], column[k]);
 264           }
 265
 266           for (unsigned k = 0; k < kNumRegs; k++)
 267             vec_store(&accTile[k], acc[k]);
 268         }
 269   #else
 270         std::memcpy(accumulator.accumulation[perspective][i], biases_,
 271             kHalfDimensions * sizeof(BiasType));
 272
 273         for (const auto index : active_indices[perspective]) {
 274           const IndexType offset = kHalfDimensions * index;
 275
 276           for (IndexType j = 0; j < kHalfDimensions; ++j)
 277             accumulator.accumulation[perspective][i][j] += weights_[offset + j];
 278         }
 279   #endif
 280       }
 281
 282   #if defined(USE_MMX)
 283       _mm_empty();
 284   #endif
 285
 286       accumulator.computed_accumulation = true;
 287     }
 288
 289     // Calculate cumulative value using difference calculation
 290     void UpdateAccumulator(const Position& pos) const {
 291
 292       const auto prev_accumulator = pos.state()->previous->accumulator;
 293       auto& accumulator = pos.state()->accumulator;
 294       IndexType i = 0;
 295       Features::IndexList removed_indices[2], added_indices[2];
 296       bool reset[2];
 297       RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
 298                                         removed_indices, added_indices, reset);
 299
 300   #ifdef TILING
 301       for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
 302         for (Color perspective : { WHITE, BLACK }) {
 303           auto accTile = reinterpret_cast<vec_t*>(
 304               &accumulator.accumulation[perspective][i][j * kTileHeight]);
 305           vec_t acc[kNumRegs];
 306
 307           if (reset[perspective]) {
 308             auto biasesTile = reinterpret_cast<const vec_t*>(
 309                 &biases_[j * kTileHeight]);
 310             for (unsigned k = 0; k < kNumRegs; ++k)
 311               acc[k] = biasesTile[k];
 312           } else {
 313             auto prevAccTile = reinterpret_cast<const vec_t*>(
 314                 &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
 315             for (IndexType k = 0; k < kNumRegs; ++k)
 316               acc[k] = vec_load(&prevAccTile[k]);
 317
 318             // Difference calculation for the deactivated features
 319             for (const auto index : removed_indices[perspective]) {
 320               const IndexType offset = kHalfDimensions * index + j * kTileHeight;
 321               auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
 322
 323               for (IndexType k = 0; k < kNumRegs; ++k)
 324                 acc[k] = vec_sub_16(acc[k], column[k]);
 325             }
 326           }
 327           { // Difference calculation for the activated features
 328             for (const auto index : added_indices[perspective]) {
 329               const IndexType offset = kHalfDimensions * index + j * kTileHeight;
 330               auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
 331
 332               for (IndexType k = 0; k < kNumRegs; ++k)
 333                 acc[k] = vec_add_16(acc[k], column[k]);
 334             }
 335           }
 336
 337           for (IndexType k = 0; k < kNumRegs; ++k)
 338             vec_store(&accTile[k], acc[k]);
 339         }
 340       }
 341   #if defined(USE_MMX)
 342       _mm_empty();
 343   #endif
 344
 345   #else
 346       for (Color perspective : { WHITE, BLACK }) {
 347
 348         if (reset[perspective]) {
 349           std::memcpy(accumulator.accumulation[perspective][i], biases_,
 350                       kHalfDimensions * sizeof(BiasType));
 351         } else {
 352           std::memcpy(accumulator.accumulation[perspective][i],
 353                       prev_accumulator.accumulation[perspective][i],
 354                       kHalfDimensions * sizeof(BiasType));
 355           // Difference calculation for the deactivated features
 356           for (const auto index : removed_indices[perspective]) {
 357             const IndexType offset = kHalfDimensions * index;
 358
 359             for (IndexType j = 0; j < kHalfDimensions; ++j)
 360               accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
 361           }
 362         }
 363         { // Difference calculation for the activated features
 364           for (const auto index : added_indices[perspective]) {
 365             const IndexType offset = kHalfDimensions * index;
 366
 367             for (IndexType j = 0; j < kHalfDimensions; ++j)
 368               accumulator.accumulation[perspective][i][j] += weights_[offset + j];
 369           }
 370         }
 371       }
 372   #endif
 373
 374       accumulator.computed_accumulation = true;
 375     }
 376
 377     using BiasType = std::int16_t;
 378     using WeightType = std::int16_t;
 379
 380     alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
 381     alignas(kCacheLineSize)
 382         WeightType weights_[kHalfDimensions * kInputDimensions];
 383   };
 384
 385 }  // namespace Eval::NNUE
 386
 387 #endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED