X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Fnnue%2Fnnue_feature_transformer.h;h=1e0b0e6da55112dbcddf78bddeb31668564f5e66;hb=7dcfc50bde9ed7499d43b469e12ba7f456bf0d25;hp=c3f012e412ec2415b726a469d080378b8049526d;hpb=3f6451eff7c62e8d4a33c5b11f055a81b3da8387;p=stockfish diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index c3f012e4..1e0b0e6d 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -1,6 +1,6 @@ /* Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file) + Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file) Stockfish is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -27,7 +27,7 @@ #include // std::memset() -namespace Eval::NNUE { +namespace Stockfish::Eval::NNUE { // If vector instructions are enabled, we update and refresh the // accumulator tile by tile such that each tile fits in the CPU's @@ -127,7 +127,13 @@ namespace Eval::NNUE { const auto& accumulation = pos.state()->accumulator.accumulation; - #if defined(USE_AVX2) + #if defined(USE_AVX512) + constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2); + static_assert(kHalfDimensions % (kSimdWidth * 2) == 0); + const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + const __m512i kZero = _mm512_setzero_si512(); + + #elif defined(USE_AVX2) constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth; constexpr int kControl = 0b11011000; const __m256i kZero = _mm256_setzero_si256(); @@ -154,13 +160,24 @@ namespace Eval::NNUE { for (IndexType p = 0; p < 2; ++p) { const IndexType offset = kHalfDimensions * p; - #if defined(USE_AVX2) + #if defined(USE_AVX512) + auto out = reinterpret_cast<__m512i*>(&output[offset]); + for (IndexType j = 0; j < kNumChunks; ++j) { + __m512i sum0 = _mm512_load_si512( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); + __m512i sum1 = _mm512_load_si512( + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl, + _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero))); + } + + #elif defined(USE_AVX2) auto out = reinterpret_cast<__m256i*>(&output[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { __m256i sum0 = _mm256_load_si256( &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 0]); __m256i sum1 = _mm256_load_si256( - &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); + &reinterpret_cast(accumulation[perspectives[p]][0])[j * 2 + 1]); _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8( _mm256_packs_epi16(sum0, sum1), kZero), kControl)); } @@ -177,9 +194,9 @@ namespace Eval::NNUE { _mm_store_si128(&out[j], #ifdef USE_SSE41 - _mm_max_epi8(packedbytes, kZero) + _mm_max_epi8(packedbytes, kZero) #else - _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) + _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s) #endif ); @@ -230,7 +247,7 @@ namespace Eval::NNUE { // Look for a usable accumulator of an earlier position. We keep track // of the estimated gain in terms of features to be added/subtracted. StateInfo *st = pos.state(), *next = nullptr; - int gain = popcount(pos.pieces()) - 2; + int gain = pos.count() - 2; while (st->accumulator.state[c] == EMPTY) { auto& dp = st->dirtyPiece; @@ -395,6 +412,6 @@ namespace Eval::NNUE { WeightType weights_[kHalfDimensions * kInputDimensions]; }; -} // namespace Eval::NNUE +} // namespace Stockfish::Eval::NNUE #endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED