Bug fix in do_null_move() and NNUE simplification.
[stockfish] / src / nnue / nnue_feature_transformer.h
1 /*
2   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
3   Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
4
5   Stockfish is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation, either version 3 of the License, or
8   (at your option) any later version.
9
10   Stockfish is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 // A class that converts the input features of the NNUE evaluation function
20
21 #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
22 #define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
23
24 #include "nnue_common.h"
25 #include "nnue_architecture.h"
26 #include "features/index_list.h"
27
28 #include <cstring> // std::memcpy()
29
30 namespace Eval::NNUE {
31
32   // Input feature converter
33   class FeatureTransformer {
34
35    private:
36     // Number of output dimensions for one side
37     static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
38
39    public:
40     // Output type
41     using OutputType = TransformedFeatureType;
42
43     // Number of input/output dimensions
44     static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
45     static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
46
47     // Size of forward propagation buffer
48     static constexpr std::size_t kBufferSize =
49         kOutputDimensions * sizeof(OutputType);
50
51     // Hash value embedded in the evaluation file
52     static constexpr std::uint32_t GetHashValue() {
53
54       return RawFeatures::kHashValue ^ kOutputDimensions;
55     }
56
57     // Read network parameters
58     bool ReadParameters(std::istream& stream) {
59
60       for (std::size_t i = 0; i < kHalfDimensions; ++i)
61         biases_[i] = read_little_endian<BiasType>(stream);
62       for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
63         weights_[i] = read_little_endian<WeightType>(stream);
64       return !stream.fail();
65     }
66
67     // Proceed with the difference calculation if possible
68     bool UpdateAccumulatorIfPossible(const Position& pos) const {
69
70       const auto now = pos.state();
71       if (now->accumulator.computed_accumulation)
72         return true;
73
74       const auto prev = now->previous;
75       if (prev && prev->accumulator.computed_accumulation) {
76         UpdateAccumulator(pos);
77         return true;
78       }
79
80       return false;
81     }
82
83     // Convert input features
84     void Transform(const Position& pos, OutputType* output) const {
85
86       if (!UpdateAccumulatorIfPossible(pos))
87         RefreshAccumulator(pos);
88
89       const auto& accumulation = pos.state()->accumulator.accumulation;
90
91   #if defined(USE_AVX2)
92       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
93       constexpr int kControl = 0b11011000;
94       const __m256i kZero = _mm256_setzero_si256();
95
96   #elif defined(USE_SSE2)
97       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
98
99   #ifdef USE_SSE41
100       const __m128i kZero = _mm_setzero_si128();
101   #else
102       const __m128i k0x80s = _mm_set1_epi8(-128);
103   #endif
104
105   #elif defined(USE_MMX)
106       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
107       const __m64 k0x80s = _mm_set1_pi8(-128);
108
109   #elif defined(USE_NEON)
110       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
111       const int8x8_t kZero = {0};
112   #endif
113
114       const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
115       for (IndexType p = 0; p < 2; ++p) {
116         const IndexType offset = kHalfDimensions * p;
117
118   #if defined(USE_AVX2)
119         auto out = reinterpret_cast<__m256i*>(&output[offset]);
120         for (IndexType j = 0; j < kNumChunks; ++j) {
121           __m256i sum0 = _mm256_loadA_si256(
122               &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
123           __m256i sum1 = _mm256_loadA_si256(
124             &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
125           _mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
126               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
127         }
128
129   #elif defined(USE_SSE2)
130         auto out = reinterpret_cast<__m128i*>(&output[offset]);
131         for (IndexType j = 0; j < kNumChunks; ++j) {
132           __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
133               accumulation[perspectives[p]][0])[j * 2 + 0]);
134           __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
135               accumulation[perspectives[p]][0])[j * 2 + 1]);
136       const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
137
138           _mm_store_si128(&out[j],
139
140   #ifdef USE_SSE41
141             _mm_max_epi8(packedbytes, kZero)
142   #else
143             _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
144   #endif
145
146           );
147         }
148
149   #elif defined(USE_MMX)
150         auto out = reinterpret_cast<__m64*>(&output[offset]);
151         for (IndexType j = 0; j < kNumChunks; ++j) {
152           __m64 sum0 = *(&reinterpret_cast<const __m64*>(
153               accumulation[perspectives[p]][0])[j * 2 + 0]);
154           __m64 sum1 = *(&reinterpret_cast<const __m64*>(
155               accumulation[perspectives[p]][0])[j * 2 + 1]);
156           const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
157           out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
158         }
159
160   #elif defined(USE_NEON)
161         const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
162         for (IndexType j = 0; j < kNumChunks; ++j) {
163           int16x8_t sum = reinterpret_cast<const int16x8_t*>(
164               accumulation[perspectives[p]][0])[j];
165           out[j] = vmax_s8(vqmovn_s16(sum), kZero);
166         }
167
168   #else
169         for (IndexType j = 0; j < kHalfDimensions; ++j) {
170           BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
171           output[offset + j] = static_cast<OutputType>(
172               std::max<int>(0, std::min<int>(127, sum)));
173         }
174   #endif
175
176       }
177   #if defined(USE_MMX)
178       _mm_empty();
179   #endif
180     }
181
182    private:
183     // Calculate cumulative value without using difference calculation
184     void RefreshAccumulator(const Position& pos) const {
185
186       auto& accumulator = pos.state()->accumulator;
187       IndexType i = 0;
188       Features::IndexList active_indices[2];
189       RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
190                                        active_indices);
191       for (Color perspective : { WHITE, BLACK }) {
192         std::memcpy(accumulator.accumulation[perspective][i], biases_,
193                    kHalfDimensions * sizeof(BiasType));
194         for (const auto index : active_indices[perspective]) {
195           const IndexType offset = kHalfDimensions * index;
196   #if defined(USE_AVX512)
197           auto accumulation = reinterpret_cast<__m512i*>(
198               &accumulator.accumulation[perspective][i][0]);
199           auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
200           constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
201           for (IndexType j = 0; j < kNumChunks; ++j)
202             _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
203
204   #elif defined(USE_AVX2)
205           auto accumulation = reinterpret_cast<__m256i*>(
206               &accumulator.accumulation[perspective][i][0]);
207           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
208           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
209           for (IndexType j = 0; j < kNumChunks; ++j)
210             _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
211
212   #elif defined(USE_SSE2)
213           auto accumulation = reinterpret_cast<__m128i*>(
214               &accumulator.accumulation[perspective][i][0]);
215           auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
216           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
217           for (IndexType j = 0; j < kNumChunks; ++j)
218             accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
219
220   #elif defined(USE_MMX)
221           auto accumulation = reinterpret_cast<__m64*>(
222               &accumulator.accumulation[perspective][i][0]);
223           auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
224           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
225           for (IndexType j = 0; j < kNumChunks; ++j)
226             accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
227
228   #elif defined(USE_NEON)
229           auto accumulation = reinterpret_cast<int16x8_t*>(
230               &accumulator.accumulation[perspective][i][0]);
231           auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
232           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
233           for (IndexType j = 0; j < kNumChunks; ++j)
234             accumulation[j] = vaddq_s16(accumulation[j], column[j]);
235
236   #else
237           for (IndexType j = 0; j < kHalfDimensions; ++j)
238             accumulator.accumulation[perspective][i][j] += weights_[offset + j];
239   #endif
240
241         }
242       }
243   #if defined(USE_MMX)
244       _mm_empty();
245   #endif
246
247       accumulator.computed_accumulation = true;
248     }
249
250     // Calculate cumulative value using difference calculation
251     void UpdateAccumulator(const Position& pos) const {
252
253       const auto prev_accumulator = pos.state()->previous->accumulator;
254       auto& accumulator = pos.state()->accumulator;
255       IndexType i = 0;
256       Features::IndexList removed_indices[2], added_indices[2];
257       bool reset[2];
258       RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
259                                         removed_indices, added_indices, reset);
260       for (Color perspective : { WHITE, BLACK }) {
261
262   #if defined(USE_AVX2)
263         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
264         auto accumulation = reinterpret_cast<__m256i*>(
265             &accumulator.accumulation[perspective][i][0]);
266
267   #elif defined(USE_SSE2)
268         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
269         auto accumulation = reinterpret_cast<__m128i*>(
270             &accumulator.accumulation[perspective][i][0]);
271
272   #elif defined(USE_MMX)
273         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
274         auto accumulation = reinterpret_cast<__m64*>(
275             &accumulator.accumulation[perspective][i][0]);
276
277   #elif defined(USE_NEON)
278         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
279         auto accumulation = reinterpret_cast<int16x8_t*>(
280             &accumulator.accumulation[perspective][i][0]);
281   #endif
282
283         if (reset[perspective]) {
284           std::memcpy(accumulator.accumulation[perspective][i], biases_,
285                       kHalfDimensions * sizeof(BiasType));
286         } else {
287           std::memcpy(accumulator.accumulation[perspective][i],
288                       prev_accumulator.accumulation[perspective][i],
289                       kHalfDimensions * sizeof(BiasType));
290           // Difference calculation for the deactivated features
291           for (const auto index : removed_indices[perspective]) {
292             const IndexType offset = kHalfDimensions * index;
293
294   #if defined(USE_AVX2)
295             auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
296             for (IndexType j = 0; j < kNumChunks; ++j)
297               accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
298
299   #elif defined(USE_SSE2)
300             auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
301             for (IndexType j = 0; j < kNumChunks; ++j)
302               accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
303
304   #elif defined(USE_MMX)
305             auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
306             for (IndexType j = 0; j < kNumChunks; ++j)
307               accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
308
309   #elif defined(USE_NEON)
310             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
311             for (IndexType j = 0; j < kNumChunks; ++j)
312               accumulation[j] = vsubq_s16(accumulation[j], column[j]);
313
314   #else
315             for (IndexType j = 0; j < kHalfDimensions; ++j)
316               accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
317   #endif
318
319           }
320         }
321         { // Difference calculation for the activated features
322           for (const auto index : added_indices[perspective]) {
323             const IndexType offset = kHalfDimensions * index;
324
325   #if defined(USE_AVX2)
326             auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
327             for (IndexType j = 0; j < kNumChunks; ++j)
328               accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
329
330   #elif defined(USE_SSE2)
331             auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
332             for (IndexType j = 0; j < kNumChunks; ++j)
333               accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
334
335   #elif defined(USE_MMX)
336             auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
337             for (IndexType j = 0; j < kNumChunks; ++j)
338               accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
339
340   #elif defined(USE_NEON)
341             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
342             for (IndexType j = 0; j < kNumChunks; ++j)
343               accumulation[j] = vaddq_s16(accumulation[j], column[j]);
344
345   #else
346             for (IndexType j = 0; j < kHalfDimensions; ++j)
347               accumulator.accumulation[perspective][i][j] += weights_[offset + j];
348   #endif
349
350           }
351         }
352       }
353   #if defined(USE_MMX)
354       _mm_empty();
355   #endif
356
357       accumulator.computed_accumulation = true;
358     }
359
360     using BiasType = std::int16_t;
361     using WeightType = std::int16_t;
362
363     alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
364     alignas(kCacheLineSize)
365         WeightType weights_[kHalfDimensions * kInputDimensions];
366   };
367
368 }  // namespace Eval::NNUE
369
370 #endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED