Avoid special casing for MinGW
[stockfish] / src / nnue / nnue_feature_transformer.h
1 /*
2   Stockfish, a UCI chess playing engine derived from Glaurung 2.1
3   Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
4
5   Stockfish is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation, either version 3 of the License, or
8   (at your option) any later version.
9
10   Stockfish is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   GNU General Public License for more details.
14
15   You should have received a copy of the GNU General Public License
16   along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 // A class that converts the input features of the NNUE evaluation function
20
21 #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
22 #define NNUE_FEATURE_TRANSFORMER_H_INCLUDED
23
24 #include "nnue_common.h"
25 #include "nnue_architecture.h"
26 #include "features/index_list.h"
27
28 #include <cstring> // std::memcpy()
29
30 namespace Eval::NNUE {
31
32   // Input feature converter
33   class FeatureTransformer {
34
35    private:
36     // Number of output dimensions for one side
37     static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
38
39    public:
40     // Output type
41     using OutputType = TransformedFeatureType;
42
43     // Number of input/output dimensions
44     static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
45     static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
46
47     // Size of forward propagation buffer
48     static constexpr std::size_t kBufferSize =
49         kOutputDimensions * sizeof(OutputType);
50
51     // Hash value embedded in the evaluation file
52     static constexpr std::uint32_t GetHashValue() {
53       return RawFeatures::kHashValue ^ kOutputDimensions;
54     }
55
56     // Read network parameters
57     bool ReadParameters(std::istream& stream) {
58       stream.read(reinterpret_cast<char*>(biases_),
59                   kHalfDimensions * sizeof(BiasType));
60       stream.read(reinterpret_cast<char*>(weights_),
61                   kHalfDimensions * kInputDimensions * sizeof(WeightType));
62       return !stream.fail();
63     }
64
65     // Proceed with the difference calculation if possible
66     bool UpdateAccumulatorIfPossible(const Position& pos) const {
67       const auto now = pos.state();
68       if (now->accumulator.computed_accumulation) {
69         return true;
70       }
71       const auto prev = now->previous;
72       if (prev && prev->accumulator.computed_accumulation) {
73         UpdateAccumulator(pos);
74         return true;
75       }
76       return false;
77     }
78
79     // Convert input features
80     void Transform(const Position& pos, OutputType* output, bool refresh) const {
81       if (refresh || !UpdateAccumulatorIfPossible(pos)) {
82         RefreshAccumulator(pos);
83       }
84       const auto& accumulation = pos.state()->accumulator.accumulation;
85
86   #if defined(USE_AVX2)
87       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
88       constexpr int kControl = 0b11011000;
89       const __m256i kZero = _mm256_setzero_si256();
90
91   #elif defined(USE_SSSE3)
92       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
93
94   #ifdef USE_SSE41
95       const __m128i kZero = _mm_setzero_si128();
96   #else
97       const __m128i k0x80s = _mm_set1_epi8(-128);
98   #endif
99
100   #elif defined(USE_NEON)
101       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
102       const int8x8_t kZero = {0};
103   #endif
104
105       const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
106       for (IndexType p = 0; p < 2; ++p) {
107         const IndexType offset = kHalfDimensions * p;
108
109   #if defined(USE_AVX2)
110         auto out = reinterpret_cast<__m256i*>(&output[offset]);
111         for (IndexType j = 0; j < kNumChunks; ++j) {
112           __m256i sum0 =
113             _mm256_load_si256(&reinterpret_cast<const __m256i*>(
114               accumulation[perspectives[p]][0])[j * 2 + 0]);
115           __m256i sum1 =
116             _mm256_load_si256(&reinterpret_cast<const __m256i*>(
117               accumulation[perspectives[p]][0])[j * 2 + 1]);
118           _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
119               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
120         }
121
122   #elif defined(USE_SSSE3)
123         auto out = reinterpret_cast<__m128i*>(&output[offset]);
124         for (IndexType j = 0; j < kNumChunks; ++j) {
125           __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
126               accumulation[perspectives[p]][0])[j * 2 + 0]);
127           __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
128               accumulation[perspectives[p]][0])[j * 2 + 1]);
129       const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
130
131           _mm_store_si128(&out[j],
132
133   #ifdef USE_SSE41
134             _mm_max_epi8(packedbytes, kZero)
135   #else
136             _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
137   #endif
138
139           );
140         }
141
142   #elif defined(USE_NEON)
143         const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
144         for (IndexType j = 0; j < kNumChunks; ++j) {
145           int16x8_t sum = reinterpret_cast<const int16x8_t*>(
146               accumulation[perspectives[p]][0])[j];
147           out[j] = vmax_s8(vqmovn_s16(sum), kZero);
148         }
149
150   #else
151         for (IndexType j = 0; j < kHalfDimensions; ++j) {
152           BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
153           output[offset + j] = static_cast<OutputType>(
154               std::max<int>(0, std::min<int>(127, sum)));
155         }
156   #endif
157
158       }
159     }
160
161    private:
162     // Calculate cumulative value without using difference calculation
163     void RefreshAccumulator(const Position& pos) const {
164       auto& accumulator = pos.state()->accumulator;
165       IndexType i = 0;
166       Features::IndexList active_indices[2];
167       RawFeatures::AppendActiveIndices(pos, kRefreshTriggers[i],
168                                        active_indices);
169       for (Color perspective : { WHITE, BLACK }) {
170         std::memcpy(accumulator.accumulation[perspective][i], biases_,
171                    kHalfDimensions * sizeof(BiasType));
172         for (const auto index : active_indices[perspective]) {
173           const IndexType offset = kHalfDimensions * index;
174
175   #if defined(USE_AVX2)
176           auto accumulation = reinterpret_cast<__m256i*>(
177               &accumulator.accumulation[perspective][i][0]);
178           auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
179           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
180           for (IndexType j = 0; j < kNumChunks; ++j) {
181             accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
182           }
183
184   #elif defined(USE_SSE2)
185           auto accumulation = reinterpret_cast<__m128i*>(
186               &accumulator.accumulation[perspective][i][0]);
187           auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
188           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
189           for (IndexType j = 0; j < kNumChunks; ++j) {
190             accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
191           }
192
193   #elif defined(USE_NEON)
194           auto accumulation = reinterpret_cast<int16x8_t*>(
195               &accumulator.accumulation[perspective][i][0]);
196           auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
197           constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
198           for (IndexType j = 0; j < kNumChunks; ++j) {
199             accumulation[j] = vaddq_s16(accumulation[j], column[j]);
200           }
201
202   #else
203           for (IndexType j = 0; j < kHalfDimensions; ++j) {
204             accumulator.accumulation[perspective][i][j] += weights_[offset + j];
205           }
206   #endif
207
208         }
209       }
210
211       accumulator.computed_accumulation = true;
212       accumulator.computed_score = false;
213     }
214
215     // Calculate cumulative value using difference calculation
216     void UpdateAccumulator(const Position& pos) const {
217       const auto prev_accumulator = pos.state()->previous->accumulator;
218       auto& accumulator = pos.state()->accumulator;
219       IndexType i = 0;
220       Features::IndexList removed_indices[2], added_indices[2];
221       bool reset[2];
222       RawFeatures::AppendChangedIndices(pos, kRefreshTriggers[i],
223                                         removed_indices, added_indices, reset);
224       for (Color perspective : { WHITE, BLACK }) {
225
226   #if defined(USE_AVX2)
227         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
228         auto accumulation = reinterpret_cast<__m256i*>(
229             &accumulator.accumulation[perspective][i][0]);
230
231   #elif defined(USE_SSE2)
232         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
233         auto accumulation = reinterpret_cast<__m128i*>(
234             &accumulator.accumulation[perspective][i][0]);
235
236   #elif defined(USE_NEON)
237         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
238         auto accumulation = reinterpret_cast<int16x8_t*>(
239             &accumulator.accumulation[perspective][i][0]);
240   #endif
241
242         if (reset[perspective]) {
243           std::memcpy(accumulator.accumulation[perspective][i], biases_,
244                       kHalfDimensions * sizeof(BiasType));
245         } else {
246           std::memcpy(accumulator.accumulation[perspective][i],
247                       prev_accumulator.accumulation[perspective][i],
248                       kHalfDimensions * sizeof(BiasType));
249           // Difference calculation for the deactivated features
250           for (const auto index : removed_indices[perspective]) {
251             const IndexType offset = kHalfDimensions * index;
252
253   #if defined(USE_AVX2)
254             auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
255             for (IndexType j = 0; j < kNumChunks; ++j) {
256               accumulation[j] = _mm256_sub_epi16(accumulation[j], column[j]);
257             }
258
259   #elif defined(USE_SSE2)
260             auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
261             for (IndexType j = 0; j < kNumChunks; ++j) {
262               accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
263             }
264
265   #elif defined(USE_NEON)
266             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
267             for (IndexType j = 0; j < kNumChunks; ++j) {
268               accumulation[j] = vsubq_s16(accumulation[j], column[j]);
269             }
270
271   #else
272             for (IndexType j = 0; j < kHalfDimensions; ++j) {
273               accumulator.accumulation[perspective][i][j] -=
274                   weights_[offset + j];
275             }
276   #endif
277
278           }
279         }
280         { // Difference calculation for the activated features
281           for (const auto index : added_indices[perspective]) {
282             const IndexType offset = kHalfDimensions * index;
283
284   #if defined(USE_AVX2)
285             auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
286             for (IndexType j = 0; j < kNumChunks; ++j) {
287               accumulation[j] = _mm256_add_epi16(accumulation[j], column[j]);
288             }
289
290   #elif defined(USE_SSE2)
291             auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
292             for (IndexType j = 0; j < kNumChunks; ++j) {
293               accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
294             }
295
296   #elif defined(USE_NEON)
297             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
298             for (IndexType j = 0; j < kNumChunks; ++j) {
299               accumulation[j] = vaddq_s16(accumulation[j], column[j]);
300             }
301
302   #else
303             for (IndexType j = 0; j < kHalfDimensions; ++j) {
304               accumulator.accumulation[perspective][i][j] +=
305                   weights_[offset + j];
306             }
307   #endif
308
309           }
310         }
311       }
312
313       accumulator.computed_accumulation = true;
314       accumulator.computed_score = false;
315     }
316
317     using BiasType = std::int16_t;
318     using WeightType = std::int16_t;
319
320     alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
321     alignas(kCacheLineSize)
322         WeightType weights_[kHalfDimensions * kInputDimensions];
323   };
324
325 }  // namespace Eval::NNUE
326
327 #endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED