/*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file)

  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
19 // A class that converts the input features of the NNUE evaluation function
21 #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
22 #define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
24 #include "nnue_common.h"
25 #include "nnue_architecture.h"
26 #include "features/index_list.h"
28 #include <cstring> // std::memset()
30 namespace Stockfish::Eval::NNUE {
32 // If vector instructions are enabled, we update and refresh the
33 // accumulator tile by tile such that each tile fits in the CPU's
38 typedef __m512i vec_t;
39 #define vec_load(a) _mm512_load_si512(a)
40 #define vec_store(a,b) _mm512_store_si512(a,b)
41 #define vec_add_16(a,b) _mm512_add_epi16(a,b)
42 #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
43 static constexpr IndexType NumRegs = 8; // only 8 are needed
46 typedef __m256i vec_t;
47 #define vec_load(a) _mm256_load_si256(a)
48 #define vec_store(a,b) _mm256_store_si256(a,b)
49 #define vec_add_16(a,b) _mm256_add_epi16(a,b)
50 #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
51 static constexpr IndexType NumRegs = 16;
54 typedef __m128i vec_t;
55 #define vec_load(a) (*(a))
56 #define vec_store(a,b) *(a)=(b)
57 #define vec_add_16(a,b) _mm_add_epi16(a,b)
58 #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
59 static constexpr IndexType NumRegs = Is64Bit ? 16 : 8;
63 #define vec_load(a) (*(a))
64 #define vec_store(a,b) *(a)=(b)
65 #define vec_add_16(a,b) _mm_add_pi16(a,b)
66 #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
67 static constexpr IndexType NumRegs = 8;
70 typedef int16x8_t vec_t;
71 #define vec_load(a) (*(a))
72 #define vec_store(a,b) *(a)=(b)
73 #define vec_add_16(a,b) vaddq_s16(a,b)
74 #define vec_sub_16(a,b) vsubq_s16(a,b)
75 static constexpr IndexType NumRegs = 16;
82 // Input feature converter
83 class FeatureTransformer {
86 // Number of output dimensions for one side
87 static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;
90 static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2;
91 static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions");
96 using OutputType = TransformedFeatureType;
98 // Number of input/output dimensions
99 static constexpr IndexType InputDimensions = RawFeatures::Dimensions;
100 static constexpr IndexType OutputDimensions = HalfDimensions * 2;
102 // Size of forward propagation buffer
103 static constexpr std::size_t BufferSize =
104 OutputDimensions * sizeof(OutputType);
106 // Hash value embedded in the evaluation file
107 static constexpr std::uint32_t get_hash_value() {
108 return RawFeatures::HashValue ^ OutputDimensions;
111 // Read network parameters
112 bool read_parameters(std::istream& stream) {
113 for (std::size_t i = 0; i < HalfDimensions; ++i)
114 biases[i] = read_little_endian<BiasType>(stream);
115 for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i)
116 weights[i] = read_little_endian<WeightType>(stream);
117 return !stream.fail();
120 // Convert input features
121 void transform(const Position& pos, OutputType* output) const {
122 update_accumulator(pos, WHITE);
123 update_accumulator(pos, BLACK);
125 const auto& accumulation = pos.state()->accumulator.accumulation;
127 #if defined(USE_AVX512)
128 constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
129 static_assert(HalfDimensions % (SimdWidth * 2) == 0);
130 const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
131 const __m512i Zero = _mm512_setzero_si512();
133 #elif defined(USE_AVX2)
134 constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
135 constexpr int Control = 0b11011000;
136 const __m256i Zero = _mm256_setzero_si256();
138 #elif defined(USE_SSE2)
139 constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
142 const __m128i Zero = _mm_setzero_si128();
144 const __m128i k0x80s = _mm_set1_epi8(-128);
147 #elif defined(USE_MMX)
148 constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
149 const __m64 k0x80s = _mm_set1_pi8(-128);
151 #elif defined(USE_NEON)
152 constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2);
153 const int8x8_t Zero = {0};
156 const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
157 for (IndexType p = 0; p < 2; ++p) {
158 const IndexType offset = HalfDimensions * p;
160 #if defined(USE_AVX512)
161 auto out = reinterpret_cast<__m512i*>(&output[offset]);
162 for (IndexType j = 0; j < NumChunks; ++j) {
163 __m512i sum0 = _mm512_load_si512(
164 &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
165 __m512i sum1 = _mm512_load_si512(
166 &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
167 _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
168 _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
171 #elif defined(USE_AVX2)
172 auto out = reinterpret_cast<__m256i*>(&output[offset]);
173 for (IndexType j = 0; j < NumChunks; ++j) {
174 __m256i sum0 = _mm256_load_si256(
175 &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
176 __m256i sum1 = _mm256_load_si256(
177 &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
178 _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
179 _mm256_packs_epi16(sum0, sum1), Zero), Control));
182 #elif defined(USE_SSE2)
183 auto out = reinterpret_cast<__m128i*>(&output[offset]);
184 for (IndexType j = 0; j < NumChunks; ++j) {
185 __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
186 accumulation[perspectives[p]][0])[j * 2 + 0]);
187 __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
188 accumulation[perspectives[p]][0])[j * 2 + 1]);
189 const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
191 _mm_store_si128(&out[j],
194 _mm_max_epi8(packedbytes, Zero)
196 _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
202 #elif defined(USE_MMX)
203 auto out = reinterpret_cast<__m64*>(&output[offset]);
204 for (IndexType j = 0; j < NumChunks; ++j) {
205 __m64 sum0 = *(&reinterpret_cast<const __m64*>(
206 accumulation[perspectives[p]][0])[j * 2 + 0]);
207 __m64 sum1 = *(&reinterpret_cast<const __m64*>(
208 accumulation[perspectives[p]][0])[j * 2 + 1]);
209 const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
210 out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
213 #elif defined(USE_NEON)
214 const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
215 for (IndexType j = 0; j < NumChunks; ++j) {
216 int16x8_t sum = reinterpret_cast<const int16x8_t*>(
217 accumulation[perspectives[p]][0])[j];
218 out[j] = vmax_s8(vqmovn_s16(sum), Zero);
222 for (IndexType j = 0; j < HalfDimensions; ++j) {
223 BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
224 output[offset + j] = static_cast<OutputType>(
225 std::max<int>(0, std::min<int>(127, sum)));
236 void update_accumulator(const Position& pos, const Color c) const {
239 // Gcc-10.2 unnecessarily spills AVX2 registers if this array
240 // is defined in the VECTOR code below, once in each branch
244 // Look for a usable accumulator of an earlier position. We keep track
245 // of the estimated gain in terms of features to be added/subtracted.
246 StateInfo *st = pos.state(), *next = nullptr;
247 int gain = pos.count<ALL_PIECES>() - 2;
248 while (st->accumulator.state[c] == EMPTY)
250 auto& dp = st->dirtyPiece;
251 // The first condition tests whether an incremental update is
252 // possible at all: if this side's king has moved, it is not possible.
253 static_assert(std::is_same_v<RawFeatures::SortedTriggerSet,
254 Features::CompileTimeList<Features::TriggerEvent, Features::TriggerEvent::FriendKingMoved>>,
255 "Current code assumes that only FriendlyKingMoved refresh trigger is being used.");
256 if ( dp.piece[0] == make_piece(c, KING)
257 || (gain -= dp.dirty_num + 1) < 0)
263 if (st->accumulator.state[c] == COMPUTED)
268 // Update incrementally in two steps. First, we update the "next"
269 // accumulator. Then, we update the current accumulator (pos.state()).
271 // Gather all features to be updated. This code assumes HalfKP features
272 // only and doesn't support refresh triggers.
273 static_assert(std::is_same_v<Features::FeatureSet<Features::HalfKP<Features::Side::Friend>>,
275 Features::IndexList removed[2], added[2];
276 Features::HalfKP<Features::Side::Friend>::append_changed_indices(pos,
277 next->dirtyPiece, c, &removed[0], &added[0]);
278 for (StateInfo *st2 = pos.state(); st2 != next; st2 = st2->previous)
279 Features::HalfKP<Features::Side::Friend>::append_changed_indices(pos,
280 st2->dirtyPiece, c, &removed[1], &added[1]);
282 // Mark the accumulators as computed.
283 next->accumulator.state[c] = COMPUTED;
284 pos.state()->accumulator.state[c] = COMPUTED;
286 // Now update the accumulators listed in info[], where the last element is a sentinel.
288 { next, next == pos.state() ? nullptr : pos.state(), nullptr };
290 for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
293 auto accTile = reinterpret_cast<vec_t*>(
294 &st->accumulator.accumulation[c][0][j * TileHeight]);
295 for (IndexType k = 0; k < NumRegs; ++k)
296 acc[k] = vec_load(&accTile[k]);
298 for (IndexType i = 0; info[i]; ++i)
300 // Difference calculation for the deactivated features
301 for (const auto index : removed[i])
303 const IndexType offset = HalfDimensions * index + j * TileHeight;
304 auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
305 for (IndexType k = 0; k < NumRegs; ++k)
306 acc[k] = vec_sub_16(acc[k], column[k]);
309 // Difference calculation for the activated features
310 for (const auto index : added[i])
312 const IndexType offset = HalfDimensions * index + j * TileHeight;
313 auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
314 for (IndexType k = 0; k < NumRegs; ++k)
315 acc[k] = vec_add_16(acc[k], column[k]);
319 accTile = reinterpret_cast<vec_t*>(
320 &info[i]->accumulator.accumulation[c][0][j * TileHeight]);
321 for (IndexType k = 0; k < NumRegs; ++k)
322 vec_store(&accTile[k], acc[k]);
327 for (IndexType i = 0; info[i]; ++i)
329 std::memcpy(info[i]->accumulator.accumulation[c][0],
330 st->accumulator.accumulation[c][0],
331 HalfDimensions * sizeof(BiasType));
334 // Difference calculation for the deactivated features
335 for (const auto index : removed[i])
337 const IndexType offset = HalfDimensions * index;
339 for (IndexType j = 0; j < HalfDimensions; ++j)
340 st->accumulator.accumulation[c][0][j] -= weights[offset + j];
343 // Difference calculation for the activated features
344 for (const auto index : added[i])
346 const IndexType offset = HalfDimensions * index;
348 for (IndexType j = 0; j < HalfDimensions; ++j)
349 st->accumulator.accumulation[c][0][j] += weights[offset + j];
356 // Refresh the accumulator
357 auto& accumulator = pos.state()->accumulator;
358 accumulator.state[c] = COMPUTED;
359 Features::IndexList active;
360 Features::HalfKP<Features::Side::Friend>::append_active_indices(pos, c, &active);
363 for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
365 auto biasesTile = reinterpret_cast<const vec_t*>(
366 &biases[j * TileHeight]);
367 for (IndexType k = 0; k < NumRegs; ++k)
368 acc[k] = biasesTile[k];
370 for (const auto index : active)
372 const IndexType offset = HalfDimensions * index + j * TileHeight;
373 auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
375 for (unsigned k = 0; k < NumRegs; ++k)
376 acc[k] = vec_add_16(acc[k], column[k]);
379 auto accTile = reinterpret_cast<vec_t*>(
380 &accumulator.accumulation[c][0][j * TileHeight]);
381 for (unsigned k = 0; k < NumRegs; k++)
382 vec_store(&accTile[k], acc[k]);
386 std::memcpy(accumulator.accumulation[c][0], biases,
387 HalfDimensions * sizeof(BiasType));
389 for (const auto index : active)
391 const IndexType offset = HalfDimensions * index;
393 for (IndexType j = 0; j < HalfDimensions; ++j)
394 accumulator.accumulation[c][0][j] += weights[offset + j];
404 using BiasType = std::int16_t;
405 using WeightType = std::int16_t;
407 alignas(CacheLineSize) BiasType biases[HalfDimensions];
408 alignas(CacheLineSize)
409 WeightType weights[HalfDimensions * InputDimensions];
412 } // namespace Stockfish::Eval::NNUE
414 #endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED