git.sesse.net Git - stockfish/blob - src/nnue/nnue_feature_transformer.h

   1 /*
   2   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
   3   Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
   4
   5   Stockfish is free software: you can redistribute it and/or modify
   6   it under the terms of the GNU General Public License as published by
   7   the Free Software Foundation, either version 3 of the License, or
   8   (at your option) any later version.
   9
  10   Stockfish is distributed in the hope that it will be useful,
  11   but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13   GNU General Public License for more details.
  14
  15   You should have received a copy of the GNU General Public License
  16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17 */
  18
  19 // A class that converts the input features of the NNUE evaluation function
  20
  21 #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
  22 #define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
  23
  24 #include "nnue_common.h"
  25 #include "nnue_architecture.h"
  26 #include "features/index_list.h"
  27
  28 #include <cstring> // std::memcpy()
  29
  30 namespace Eval::NNUE {
  31
  32   // If vector instructions are enabled, we update and refresh the
  33   // accumulator tile by tile such that each tile fits in the CPU's
  34   // vector registers.
  35   #define TILING
  36   // Each backend below provides vec_t, vec_load/vec_store, vec_add_16/vec_sub_16 and kNumRegs.
  37   #ifdef USE_AVX512
  38   typedef __m512i vec_t;
  39   #define vec_load(a) _mm512_loadA_si512(a) // NOTE(review): the "...A..." names are not standard intrinsics; presumably aliases defined in nnue_common.h - confirm
  40   #define vec_store(a,b) _mm512_storeA_si512(a,b)
  41   #define vec_add_16(a,b) _mm512_add_epi16(a,b)
  42   #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
  43   static constexpr IndexType kNumRegs = 8; // only 8 are needed
  44
  45   #elif USE_AVX2
  46   typedef __m256i vec_t;
  47   #define vec_load(a) _mm256_loadA_si256(a) // NOTE(review): same remark as for the AVX512 load/store names above
  48   #define vec_store(a,b) _mm256_storeA_si256(a,b)
  49   #define vec_add_16(a,b) _mm256_add_epi16(a,b)
  50   #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
  51   static constexpr IndexType kNumRegs = 16;
  52
  53   #elif USE_SSE2
  54   typedef __m128i vec_t;
  55   #define vec_load(a) (*(a)) // plain dereference instead of a load intrinsic
  56   #define vec_store(a,b) *(a)=(b) // plain assignment instead of a store intrinsic
  57   #define vec_add_16(a,b) _mm_add_epi16(a,b)
  58   #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
  59   static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8; // register budget depends on the build's word size
  60
  61   #elif USE_MMX
  62   typedef __m64 vec_t;
  63   #define vec_load(a) (*(a))
  64   #define vec_store(a,b) *(a)=(b)
  65   #define vec_add_16(a,b) _mm_add_pi16(a,b)
  66   #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
  67   static constexpr IndexType kNumRegs = 8;
  68
  69   #elif USE_NEON
  70   typedef int16x8_t vec_t;
  71   #define vec_load(a) (*(a))
  72   #define vec_store(a,b) *(a)=(b)
  73   #define vec_add_16(a,b) vaddq_s16(a,b)
  74   #define vec_sub_16(a,b) vsubq_s16(a,b)
  75   static constexpr IndexType kNumRegs = 16;
  76
  77   #else
  78   #undef TILING // no vector backend selected: the scalar (#else) code paths of the class are compiled
  79
  80   #endif
  81
  82   // Input feature converter: keeps the accumulator stored in the position's state up to date and clamps it into OutputType values
  83   class FeatureTransformer {
  84     // The accumulator is maintained tile by tile when TILING is defined, otherwise with plain scalar loops.
  85    private:
  86     // Number of output dimensions for one side
  87     static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
  88
  89     #ifdef TILING
  90     static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2; // number of 2-byte elements that fit in kNumRegs registers
  91     static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
  92     #endif
  93
  94    public:
  95     // Output type
  96     using OutputType = TransformedFeatureType;
  97
  98     // Number of input/output dimensions
  99     static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
 100     static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
 101
 102     // Size of forward propagation buffer
 103     static constexpr std::size_t kBufferSize =
 104         kOutputDimensions * sizeof(OutputType);
 105
 106     // Hash value embedded in the evaluation file
 107     static constexpr std::uint32_t GetHashValue() {
 108       // XOR of the raw feature set's hash value with the number of output dimensions
 109       return RawFeatures::kHashValue ^ kOutputDimensions;
 110     }
 111
 112     // Read network parameters
 113     bool ReadParameters(std::istream& stream) {
 114       // Reads kHalfDimensions biases, then kHalfDimensions * kInputDimensions weights; returns false if the stream has failed afterwards
 115       for (std::size_t i = 0; i < kHalfDimensions; ++i)
 116         biases_[i] = read_little_endian<BiasType>(stream);
 117       for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
 118         weights_[i] = read_little_endian<WeightType>(stream);
 119       return !stream.fail();
 120     }
 121
 122     // Proceed with the difference calculation if possible
 123     bool UpdateAccumulatorIfPossible(const Position& pos) const {
 124       // Returns true when the current accumulator is already computed or could be updated here; false means the caller has to refresh it
 125       const auto now = pos.state();
 126       if (now->accumulator.computed_accumulation)
 127         return true;
 128       // Otherwise look for a computed accumulator one or two states back to update from
 129       const auto prev = now->previous;
 130       if (prev) {
 131         if (prev->accumulator.computed_accumulation) {
 132           UpdateAccumulator(pos);
 133           return true;
 134         } else if (prev->previous && prev->previous->accumulator.computed_accumulation) { // two states back; UpdateAccumulator picks that ancestor itself
 135           UpdateAccumulator(pos);
 136           return true;
 137         }
 138       }
 139
 140       return false;
 141     }
 142
 143     // Convert input features
 144     void Transform(const Position& pos, OutputType* output) const {
 145       // Writes kOutputDimensions values to output: the side to move's half first, then the other side's, each clamped to [0, 127]
 146       if (!UpdateAccumulatorIfPossible(pos))
 147         RefreshAccumulator(pos);
 148       // From here on the accumulator of the current state is computed
 149       const auto& accumulation = pos.state()->accumulator.accumulation;
 150
 151   #if defined(USE_AVX2)
 152       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 153       constexpr int kControl = 0b11011000; // selects 64-bit lanes in the order 0,2,1,3 for the permute below
 154       const __m256i kZero = _mm256_setzero_si256();
 155
 156   #elif defined(USE_SSE2)
 157       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 158
 159   #ifdef USE_SSE41
 160       const __m128i kZero = _mm_setzero_si128();
 161   #else
 162       const __m128i k0x80s = _mm_set1_epi8(-128);
 163   #endif
 164
 165   #elif defined(USE_MMX)
 166       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 167       const __m64 k0x80s = _mm_set1_pi8(-128);
 168
 169   #elif defined(USE_NEON)
 170       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
 171       const int8x8_t kZero = {0};
 172   #endif
 173
 174       const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()}; // p = 0: side to move, p = 1: the other side
 175       for (IndexType p = 0; p < 2; ++p) {
 176         const IndexType offset = kHalfDimensions * p;
 177
 178   #if defined(USE_AVX2)
 179         auto out = reinterpret_cast<__m256i*>(&output[offset]);
 180         for (IndexType j = 0; j < kNumChunks; ++j) {
 181           __m256i sum0 = _mm256_loadA_si256(
 182               &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
 183           __m256i sum1 = _mm256_loadA_si256(
 184             &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
 185           _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
 186               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
 187         }
 188
 189   #elif defined(USE_SSE2)
 190         auto out = reinterpret_cast<__m128i*>(&output[offset]);
 191         for (IndexType j = 0; j < kNumChunks; ++j) {
 192           __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
 193               accumulation[perspectives[p]][0])[j * 2 + 0]);
 194           __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
 195               accumulation[perspectives[p]][0])[j * 2 + 1]);
 196       const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
 197
 198           _mm_store_si128(&out[j],
 199
 200   #ifdef USE_SSE41
 201             _mm_max_epi8(packedbytes, kZero)
 202   #else
 203             _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) // saturating add then subtract of -128 maps negative bytes to 0
 204   #endif
 205
 206           );
 207         }
 208
 209   #elif defined(USE_MMX)
 210         auto out = reinterpret_cast<__m64*>(&output[offset]);
 211         for (IndexType j = 0; j < kNumChunks; ++j) {
 212           __m64 sum0 = *(&reinterpret_cast<const __m64*>(
 213               accumulation[perspectives[p]][0])[j * 2 + 0]);
 214           __m64 sum1 = *(&reinterpret_cast<const __m64*>(
 215               accumulation[perspectives[p]][0])[j * 2 + 1]);
 216           const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
 217           out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s); // same clamp-at-zero trick as the SSE2 path without SSE4.1
 218         }
 219
 220   #elif defined(USE_NEON)
 221         const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
 222         for (IndexType j = 0; j < kNumChunks; ++j) {
 223           int16x8_t sum = reinterpret_cast<const int16x8_t*>(
 224               accumulation[perspectives[p]][0])[j];
 225           out[j] = vmax_s8(vqmovn_s16(sum), kZero); // saturating narrow to 8 bits, then clamp below at 0
 226         }
 227
 228   #else
 229         for (IndexType j = 0; j < kHalfDimensions; ++j) {
 230           BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
 231           output[offset + j] = static_cast<OutputType>(
 232               std::max<int>(0, std::min<int>(127, sum)));
 233         }
 234   #endif
 235
 236       }
 237   #if defined(USE_MMX)
 238       _mm_empty(); // empties the MMX state after the __m64 work above
 239   #endif
 240     }
 241
 242    private:
 243     // Calculate cumulative value without using difference calculation
 244     void RefreshAccumulator(const Position& pos) const {
 245       // For both perspectives: biases plus the weight column of every active feature; marks the accumulator as computed at the end
 246       auto& accumulator = pos.state()->accumulator;
 247       IndexType i = 0; // only refresh trigger 0 / accumulation slot 0 is handled here
 248       Features::IndexList active_indices[2];
 249       RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
 250                                        active_indices);
 251       for (Color perspective : { WHITE, BLACK }) {
 252   #ifdef TILING
 253         for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
 254           auto biasesTile = reinterpret_cast<const vec_t*>(
 255               &biases_[j * kTileHeight]);
 256           auto accTile = reinterpret_cast<vec_t*>(
 257               &accumulator.accumulation[perspective][i][j * kTileHeight]);
 258           vec_t acc[kNumRegs];
 259           // Start the tile from the biases
 260           for (unsigned k = 0; k < kNumRegs; ++k)
 261             acc[k] = biasesTile[k];
 262           // Add this tile's slice of the weight column of each active feature
 263           for (const auto index : active_indices[perspective]) {
 264             const IndexType offset = kHalfDimensions * index + j * kTileHeight;
 265             auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
 266
 267             for (unsigned k = 0; k < kNumRegs; ++k)
 268               acc[k] = vec_add_16(acc[k], column[k]);
 269           }
 270           // Write the finished tile back into the accumulator
 271           for (unsigned k = 0; k < kNumRegs; k++)
 272             vec_store(&accTile[k], acc[k]);
 273         }
 274   #else
 275         std::memcpy(accumulator.accumulation[perspective][i], biases_,
 276             kHalfDimensions * sizeof(BiasType));
 277
 278         for (const auto index : active_indices[perspective]) {
 279           const IndexType offset = kHalfDimensions * index;
 280
 281           for (IndexType j = 0; j < kHalfDimensions; ++j)
 282             accumulator.accumulation[perspective][i][j] += weights_[offset + j];
 283         }
 284   #endif
 285       }
 286
 287   #if defined(USE_MMX)
 288       _mm_empty();
 289   #endif
 290
 291       accumulator.computed_accumulation = true;
 292     }
 293
 294     // Calculate cumulative value using difference calculation
 295     void UpdateAccumulator(const Position& pos) const {
 296       // Derives the accumulator from the nearest computed ancestor, one or two states back; its existence is only asserted, not checked
 297       Accumulator* prev_accumulator;
 298       assert(pos.state()->previous);
 299       if (pos.state()->previous->accumulator.computed_accumulation) {
 300         prev_accumulator = &pos.state()->previous->accumulator;
 301       }
 302       else {
 303         assert(pos.state()->previous->previous);
 304         assert(pos.state()->previous->previous->accumulator.computed_accumulation);
 305         prev_accumulator = &pos.state()->previous->previous->accumulator;
 306       }
 307
 308       auto& accumulator = pos.state()->accumulator;
 309       IndexType i = 0; // only refresh trigger 0 / accumulation slot 0 is handled here
 310       Features::IndexList removed_indices[2], added_indices[2];
 311       bool reset[2] = { false, false }; // presumably set by AppendChangedIndices; a true entry restarts that perspective from the biases
 312       RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
 313                                         removed_indices, added_indices, reset);
 314
 315   #ifdef TILING
 316       for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
 317         for (Color perspective : { WHITE, BLACK }) {
 318           auto accTile = reinterpret_cast<vec_t*>(
 319               &accumulator.accumulation[perspective][i][j * kTileHeight]);
 320           vec_t acc[kNumRegs];
 321           // Seed the tile from the biases (reset) or from the ancestor's accumulator
 322           if (reset[perspective]) {
 323             auto biasesTile = reinterpret_cast<const vec_t*>(
 324                 &biases_[j * kTileHeight]);
 325             for (unsigned k = 0; k < kNumRegs; ++k)
 326               acc[k] = biasesTile[k];
 327           } else {
 328             auto prevAccTile = reinterpret_cast<const vec_t*>(
 329                 &prev_accumulator->accumulation[perspective][i][j * kTileHeight]);
 330             for (IndexType k = 0; k < kNumRegs; ++k)
 331               acc[k] = vec_load(&prevAccTile[k]);
 332
 333             // Difference calculation for the deactivated features
 334             for (const auto index : removed_indices[perspective]) {
 335               const IndexType offset = kHalfDimensions * index + j * kTileHeight;
 336               auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
 337
 338               for (IndexType k = 0; k < kNumRegs; ++k)
 339                 acc[k] = vec_sub_16(acc[k], column[k]);
 340             }
 341           }
 342           { // Difference calculation for the activated features
 343             for (const auto index : added_indices[perspective]) {
 344               const IndexType offset = kHalfDimensions * index + j * kTileHeight;
 345               auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
 346
 347               for (IndexType k = 0; k < kNumRegs; ++k)
 348                 acc[k] = vec_add_16(acc[k], column[k]);
 349             }
 350           }
 351           // Write the finished tile into the current accumulator
 352           for (IndexType k = 0; k < kNumRegs; ++k)
 353             vec_store(&accTile[k], acc[k]);
 354         }
 355       }
 356   #if defined(USE_MMX)
 357       _mm_empty();
 358   #endif
 359
 360   #else
 361       for (Color perspective : { WHITE, BLACK }) {
 362         // Scalar version of the same steps: seed, subtract removed columns, add added columns
 363         if (reset[perspective]) {
 364           std::memcpy(accumulator.accumulation[perspective][i], biases_,
 365                       kHalfDimensions * sizeof(BiasType));
 366         } else {
 367           std::memcpy(accumulator.accumulation[perspective][i],
 368                       prev_accumulator->accumulation[perspective][i],
 369                       kHalfDimensions * sizeof(BiasType));
 370           // Difference calculation for the deactivated features
 371           for (const auto index : removed_indices[perspective]) {
 372             const IndexType offset = kHalfDimensions * index;
 373
 374             for (IndexType j = 0; j < kHalfDimensions; ++j)
 375               accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
 376           }
 377         }
 378         { // Difference calculation for the activated features
 379           for (const auto index : added_indices[perspective]) {
 380             const IndexType offset = kHalfDimensions * index;
 381
 382             for (IndexType j = 0; j < kHalfDimensions; ++j)
 383               accumulator.accumulation[perspective][i][j] += weights_[offset + j];
 384           }
 385         }
 386       }
 387   #endif
 388
 389       accumulator.computed_accumulation = true;
 390     }
 391
 392     using BiasType = std::int16_t;
 393     using WeightType = std::int16_t;
 394     // Parameters filled by ReadParameters; weights_ stores kHalfDimensions consecutive values per input feature index
 395     alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
 396     alignas(kCacheLineSize)
 397         WeightType weights_[kHalfDimensions * kInputDimensions];
 398   };
 399
 400 }  // namespace Eval::NNUE
 401
 402 #endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED