From: Steinar H. Gunderson Date: Thu, 9 Sep 2021 16:40:19 +0000 (+0200) Subject: Merge remote-tracking branch 'upstream/master' into HEAD X-Git-Url: https://git.sesse.net/?p=stockfish;a=commitdiff_plain;h=4be5348194e12801abd556f7ced60c3e3f3c7c8b;hp=d558f8a673b56b32ab6da8050f41b9e02fe1758b Merge remote-tracking branch 'upstream/master' into HEAD --- diff --git a/.github/workflows/stockfish.yml b/.github/workflows/stockfish.yml index 8970fcd1..54b0cb12 100644 --- a/.github/workflows/stockfish.yml +++ b/.github/workflows/stockfish.yml @@ -25,29 +25,88 @@ jobs: os: ubuntu-20.04, compiler: g++, comp: gcc, - run_expensive_tests: true + run_expensive_tests: true, + run_32bit_tests: true, + run_64bit_tests: true, + shell: 'bash {0}' } - { name: "Ubuntu 20.04 Clang", os: ubuntu-20.04, compiler: clang++, comp: clang, - run_expensive_tests: false + run_expensive_tests: false, + run_32bit_tests: true, + run_64bit_tests: true, + shell: 'bash {0}' + } + - { + name: "MacOS 10.15 Apple Clang", + os: macos-10.15, + compiler: clang++, + comp: clang, + run_expensive_tests: false, + run_32bit_tests: false, + run_64bit_tests: true, + shell: 'bash {0}' + } + - { + name: "MacOS 10.15 GCC 10", + os: macos-10.15, + compiler: g++-10, + comp: gcc, + run_expensive_tests: false, + run_32bit_tests: false, + run_64bit_tests: true, + shell: 'bash {0}' + } + - { + name: "Windows 2019 Mingw-w64 GCC x86_64", + os: windows-2019, + compiler: g++, + comp: gcc, + run_expensive_tests: false, + run_32bit_tests: false, + run_64bit_tests: true, + msys_sys: 'mingw64', + msys_env: 'x86_64', + shell: 'msys2 {0}' + } + - { + name: "Windows 2019 Mingw-w64 GCC i686", + os: windows-2019, + compiler: g++, + comp: gcc, + run_expensive_tests: false, + run_32bit_tests: true, + run_64bit_tests: false, + msys_sys: 'mingw32', + msys_env: 'i686', + shell: 'msys2 {0}' } defaults: run: working-directory: src + shell: ${{ matrix.config.shell }} steps: - uses: actions/checkout@v2 with: fetch-depth: 0 - - name: Download required packages + - name: Download required linux packages + if: runner.os == 'Linux' run: | sudo apt update sudo apt install expect valgrind g++-multilib + - name: Setup msys and install required packages + if: runner.os == 'Windows' + uses: msys2/setup-msys2@v2 + with: + msystem: ${{matrix.config.msys_sys}} + install: mingw-w64-${{matrix.config.msys_env}}-gcc make git expect + - name: Download the used network from the fishtest framework run: | make net @@ -68,6 +127,7 @@ jobs: # x86-32 tests - name: Test debug x86-32 build + if: ${{ matrix.config.run_32bit_tests }} run: | export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG" make clean @@ -75,24 +135,28 @@ jobs: ../tests/signature.sh $benchref - name: Test x86-32 build + if: ${{ matrix.config.run_32bit_tests }} run: | make clean make -j2 ARCH=x86-32 build ../tests/signature.sh $benchref - name: Test x86-32-sse41-popcnt build + if: ${{ matrix.config.run_32bit_tests }} run: | make clean make -j2 ARCH=x86-32-sse41-popcnt build ../tests/signature.sh $benchref - name: Test x86-32-sse2 build + if: ${{ matrix.config.run_32bit_tests }} run: | make clean make -j2 ARCH=x86-32-sse2 build ../tests/signature.sh $benchref - name: Test general-32 build + if: ${{ matrix.config.run_32bit_tests }} run: | make clean make -j2 ARCH=general-32 build @@ -101,6 +165,7 @@ jobs: # x86-64 tests - name: Test debug x86-64-modern build + if: ${{ matrix.config.run_64bit_tests }} run: | export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG" make clean @@ -108,30 +173,35 @@ jobs: ../tests/signature.sh $benchref - name: Test x86-64-modern build + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64-modern build ../tests/signature.sh $benchref - name: Test x86-64-ssse3 build + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64-ssse3 build ../tests/signature.sh $benchref - name: Test x86-64-sse3-popcnt build + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64-sse3-popcnt build ../tests/signature.sh $benchref - name: Test x86-64 build + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64 build ../tests/signature.sh $benchref - name: Test general-64 build + if: matrix.config.run_64bit_tests run: | make clean make -j2 ARCH=general-64 build @@ -140,26 +210,31 @@ jobs: # x86-64 with newer extensions tests - name: Compile x86-64-avx2 build + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64-avx2 build - name: Compile x86-64-bmi2 build + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64-bmi2 build - name: Compile x86-64-avx512 build + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64-avx512 build - name: Compile x86-64-vnni512 build + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64-vnni512 build - name: Compile x86-64-vnni256 build + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64-vnni256 build @@ -167,6 +242,7 @@ jobs: # Other tests - name: Check perft and search reproducibility + if: ${{ matrix.config.run_64bit_tests }} run: | make clean make -j2 ARCH=x86-64-modern build diff --git a/AUTHORS b/AUTHORS index 7e63591a..5b5bbf22 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,4 @@ -# List of authors for Stockfish, as of June 14, 2021 +# List of authors for Stockfish # Founders of the Stockfish project and fishtest infrastructure Tord Romstad (romstad) @@ -69,6 +69,7 @@ gamander Gary Heckman (gheckman) George Sobala (gsobala) gguliash +Giacomo Lorenzetti (G-Lorenz) Gian-Carlo Pascutto (gcp) Gontran Lemaire (gonlem) Goodkov Vasiliy Aleksandrovich (goodkov) @@ -107,6 +108,7 @@ Kojirion Krystian Kuzniarek (kuzkry) Leonardo Ljubičić (ICCF World Champion) Leonid Pechenik (lp--) +Liam Keegan (lkeegan) Linus Arver (listx) loco-loco Lub van den Berg (ElbertoOne) @@ -184,6 +186,7 @@ Tom Truscott Tom Vijlbrief (tomtor) Tomasz Sobczyk (Sopel97) Torsten Franz (torfranz, tfranzer) +Torsten Hellwig (Torom) Tracey Emery (basepr1me) tttak Unai Corzo (unaiic) diff --git a/src/Makefile b/src/Makefile index e85cba59..cf4f4ecf 100644 --- a/src/Makefile +++ b/src/Makefile @@ -41,7 +41,7 @@ endif SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \ material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \ search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \ - nnue/evaluate_nnue.cpp nnue/features/half_ka_v2.cpp \ + nnue/evaluate_nnue.cpp nnue/features/half_ka_v2_hm.cpp \ hashprobe.grpc.pb.cc hashprobe.pb.cc CLISRCS = client.cpp hashprobe.grpc.pb.cc hashprobe.pb.cc uci.cpp @@ -371,7 +371,7 @@ ifeq ($(COMP),mingw) CXX=g++ endif - CXXFLAGS += -Wextra -Wshadow + CXXFLAGS += -pedantic -Wextra -Wshadow LDFLAGS += -static endif @@ -408,8 +408,12 @@ ifeq ($(COMP),clang) endif ifeq ($(KERNEL),Darwin) - CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.14 - LDFLAGS += -arch $(arch) -mmacosx-version-min=10.14 + CXXFLAGS += -mmacosx-version-min=10.14 + LDFLAGS += -mmacosx-version-min=10.14 + ifneq ($(arch),any) + CXXFLAGS += -arch $(arch) + LDFLAGS += -arch $(arch) + endif XCRUN = xcrun endif @@ -937,7 +941,7 @@ client: $(CLIOBJS) # Other stuff -.depend: +.depend: $(SRCS) -@$(CXX) $(DEPENDFLAGS) -MM $(SRCS) > $@ 2> /dev/null -include .depend diff --git a/src/evaluate.cpp b/src/evaluate.cpp index 538214d3..62d4be84 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -78,6 +78,8 @@ namespace Eval { return; string eval_file = string(Options["EvalFile"]); + if (eval_file.empty()) + eval_file = EvalFileDefaultName; #if defined(DEFAULT_NNUE_DIRECTORY) #define stringify2(x) #x @@ -118,16 +120,16 @@ namespace Eval { void NNUE::verify() { string eval_file = string(Options["EvalFile"]); + if (eval_file.empty()) + eval_file = EvalFileDefaultName; if (useNNUE && eval_file_loaded != eval_file) { - UCI::OptionsMap defaults; - UCI::init(defaults); string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available."; string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully."; string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file."; - string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + string(defaults["EvalFile"]); + string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(EvalFileDefaultName); string msg5 = "The engine will be terminated now."; sync_cout << "info string ERROR: " << msg1 << sync_endl; @@ -1090,7 +1092,7 @@ Value Eval::evaluate(const Position& pos) { // Scale and shift NNUE for compatibility with search and classical evaluation auto adjusted_NNUE = [&]() { - int scale = 903 + int scale = 883 + 32 * pos.count() + 32 * pos.non_pawn_material() / 1024; @@ -1106,7 +1108,7 @@ Value Eval::evaluate(const Position& pos) { // NNUE eval faster when shuffling or if the material on the board is high. int r50 = pos.rule50_count(); Value psq = Value(abs(eg_value(pos.psq_score()))); - bool classical = psq * 5 > (750 + pos.non_pawn_material() / 64) * (5 + r50); + bool classical = psq * 5 > (850 + pos.non_pawn_material() / 64) * (5 + r50); v = classical ? Evaluation(pos).value() // classical : adjusted_NNUE(); // NNUE diff --git a/src/evaluate.h b/src/evaluate.h index 54f20baf..d7cc6e29 100644 --- a/src/evaluate.h +++ b/src/evaluate.h @@ -39,7 +39,7 @@ namespace Eval { // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue // for the build process (profile-build and fishtest) to work. Do not change the // name of the macro, as it is used in the Makefile. - #define EvalFileDefaultName "nn-9e3c6298299a.nnue" + #define EvalFileDefaultName "nn-6762d36ad265.nnue" namespace NNUE { diff --git a/src/misc.cpp b/src/misc.cpp index feaf9b1a..4cac7e98 100644 --- a/src/misc.cpp +++ b/src/misc.cpp @@ -110,7 +110,14 @@ public: static Logger l; - if (!fname.empty() && !l.file.is_open()) + if (l.file.is_open()) + { + cout.rdbuf(l.out.buf); + cin.rdbuf(l.in.buf); + l.file.close(); + } + + if (!fname.empty()) { l.file.open(fname, ifstream::out); @@ -123,12 +130,6 @@ public: cin.rdbuf(&l.in); cout.rdbuf(&l.out); } - else if (fname.empty() && l.file.is_open()) - { - cout.rdbuf(l.out.buf); - cin.rdbuf(l.in.buf); - l.file.close(); - } } }; @@ -379,6 +380,7 @@ void std_aligned_free(void* ptr) { static void* aligned_large_pages_alloc_windows(size_t allocSize) { #if !defined(_WIN64) + (void)allocSize; // suppress unused-parameter compiler warning return nullptr; #else diff --git a/src/movegen.cpp b/src/movegen.cpp index 5f3ba90a..5095bb74 100644 --- a/src/movegen.cpp +++ b/src/movegen.cpp @@ -52,9 +52,9 @@ namespace { constexpr Direction UpRight = (Us == WHITE ? NORTH_EAST : SOUTH_WEST); constexpr Direction UpLeft = (Us == WHITE ? NORTH_WEST : SOUTH_EAST); - const Bitboard emptySquares = Type == QUIETS || Type == QUIET_CHECKS ? target : ~pos.pieces(); - const Bitboard enemies = Type == EVASIONS ? pos.checkers() - : Type == CAPTURES ? target : pos.pieces(Them); + const Bitboard emptySquares = ~pos.pieces(); + const Bitboard enemies = Type == EVASIONS ? pos.checkers() + : pos.pieces(Them); Bitboard pawnsOn7 = pos.pieces(Us, PAWN) & TRank7BB; Bitboard pawnsNotOn7 = pos.pieces(Us, PAWN) & ~TRank7BB; diff --git a/src/movepick.cpp b/src/movepick.cpp index 4ff4cff4..20640fe2 100644 --- a/src/movepick.cpp +++ b/src/movepick.cpp @@ -111,7 +111,7 @@ void MovePicker::score() { + (*continuationHistory[1])[pos.moved_piece(m)][to_sq(m)] + (*continuationHistory[3])[pos.moved_piece(m)][to_sq(m)] + (*continuationHistory[5])[pos.moved_piece(m)][to_sq(m)] - + (ply < MAX_LPH ? std::min(4, depth / 3) * (*lowPlyHistory)[ply][from_to(m)] : 0); + + (ply < MAX_LPH ? 6 * (*lowPlyHistory)[ply][from_to(m)] : 0); else // Type == EVASIONS { diff --git a/src/nnue/features/half_ka_v2.cpp b/src/nnue/features/half_ka_v2_hm.cpp similarity index 68% rename from src/nnue/features/half_ka_v2.cpp rename to src/nnue/features/half_ka_v2_hm.cpp index 57f43e50..098a6d60 100644 --- a/src/nnue/features/half_ka_v2.cpp +++ b/src/nnue/features/half_ka_v2_hm.cpp @@ -16,31 +16,32 @@ along with this program. If not, see . */ -//Definition of input features HalfKAv2 of NNUE evaluation function +//Definition of input features HalfKAv2_hm of NNUE evaluation function -#include "half_ka_v2.h" +#include "half_ka_v2_hm.h" #include "../../position.h" namespace Stockfish::Eval::NNUE::Features { // Orient a square according to perspective (rotates by 180 for black) - inline Square HalfKAv2::orient(Color perspective, Square s) { - return Square(int(s) ^ (bool(perspective) * 56)); + inline Square HalfKAv2_hm::orient(Color perspective, Square s, Square ksq) { + return Square(int(s) ^ (bool(perspective) * SQ_A8) ^ ((file_of(ksq) < FILE_E) * SQ_H1)); } // Index of a feature for a given king position and another piece on some square - inline IndexType HalfKAv2::make_index(Color perspective, Square s, Piece pc, Square ksq) { - return IndexType(orient(perspective, s) + PieceSquareIndex[perspective][pc] + PS_NB * ksq); + inline IndexType HalfKAv2_hm::make_index(Color perspective, Square s, Piece pc, Square ksq) { + Square o_ksq = orient(perspective, ksq, ksq); + return IndexType(orient(perspective, s, ksq) + PieceSquareIndex[perspective][pc] + PS_NB * KingBuckets[o_ksq]); } // Get a list of indices for active features - void HalfKAv2::append_active_indices( + void HalfKAv2_hm::append_active_indices( const Position& pos, Color perspective, ValueListInserter active ) { - Square ksq = orient(perspective, pos.square(perspective)); + Square ksq = pos.square(perspective); Bitboard bb = pos.pieces(); while (bb) { @@ -52,7 +53,7 @@ namespace Stockfish::Eval::NNUE::Features { // append_changed_indices() : get a list of indices for recently changed features - void HalfKAv2::append_changed_indices( + void HalfKAv2_hm::append_changed_indices( Square ksq, StateInfo* st, Color perspective, @@ -60,25 +61,24 @@ namespace Stockfish::Eval::NNUE::Features { ValueListInserter added ) { const auto& dp = st->dirtyPiece; - Square oriented_ksq = orient(perspective, ksq); for (int i = 0; i < dp.dirty_num; ++i) { Piece pc = dp.piece[i]; if (dp.from[i] != SQ_NONE) - removed.push_back(make_index(perspective, dp.from[i], pc, oriented_ksq)); + removed.push_back(make_index(perspective, dp.from[i], pc, ksq)); if (dp.to[i] != SQ_NONE) - added.push_back(make_index(perspective, dp.to[i], pc, oriented_ksq)); + added.push_back(make_index(perspective, dp.to[i], pc, ksq)); } } - int HalfKAv2::update_cost(StateInfo* st) { + int HalfKAv2_hm::update_cost(StateInfo* st) { return st->dirtyPiece.dirty_num; } - int HalfKAv2::refresh_cost(const Position& pos) { + int HalfKAv2_hm::refresh_cost(const Position& pos) { return pos.count(); } - bool HalfKAv2::requires_refresh(StateInfo* st, Color perspective) { + bool HalfKAv2_hm::requires_refresh(StateInfo* st, Color perspective) { return st->dirtyPiece.piece[0] == make_piece(perspective, KING); } diff --git a/src/nnue/features/half_ka_v2.h b/src/nnue/features/half_ka_v2_hm.h similarity index 80% rename from src/nnue/features/half_ka_v2.h rename to src/nnue/features/half_ka_v2_hm.h index e4b2edd9..2c1144f6 100644 --- a/src/nnue/features/half_ka_v2.h +++ b/src/nnue/features/half_ka_v2_hm.h @@ -18,8 +18,8 @@ //Definition of input features HalfKP of NNUE evaluation function -#ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED -#define NNUE_FEATURES_HALF_KA_V2_H_INCLUDED +#ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED +#define NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED #include "../nnue_common.h" @@ -32,9 +32,9 @@ namespace Stockfish { namespace Stockfish::Eval::NNUE::Features { - // Feature HalfKAv2: Combination of the position of own king - // and the position of pieces - class HalfKAv2 { + // Feature HalfKAv2_hm: Combination of the position of own king + // and the position of pieces. Position mirrored such that king always on e..h files. + class HalfKAv2_hm { // unique number for each piece type on each square enum { @@ -63,21 +63,32 @@ namespace Stockfish::Eval::NNUE::Features { }; // Orient a square according to perspective (rotates by 180 for black) - static Square orient(Color perspective, Square s); + static Square orient(Color perspective, Square s, Square ksq); // Index of a feature for a given king position and another piece on some square static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq); public: // Feature name - static constexpr const char* Name = "HalfKAv2(Friend)"; + static constexpr const char* Name = "HalfKAv2_hm(Friend)"; // Hash value embedded in the evaluation file - static constexpr std::uint32_t HashValue = 0x5f234cb8u; + static constexpr std::uint32_t HashValue = 0x7f234cb8u; // Number of feature dimensions static constexpr IndexType Dimensions = - static_cast(SQUARE_NB) * static_cast(PS_NB); + static_cast(SQUARE_NB) * static_cast(PS_NB) / 2; + + static constexpr int KingBuckets[64] = { + -1, -1, -1, -1, 31, 30, 29, 28, + -1, -1, -1, -1, 27, 26, 25, 24, + -1, -1, -1, -1, 23, 22, 21, 20, + -1, -1, -1, -1, 19, 18, 17, 16, + -1, -1, -1, -1, 15, 14, 13, 12, + -1, -1, -1, -1, 11, 10, 9, 8, + -1, -1, -1, -1, 7, 6, 5, 4, + -1, -1, -1, -1, 3, 2, 1, 0 + }; // Maximum number of simultaneously active features. static constexpr IndexType MaxActiveDimensions = 32; @@ -108,4 +119,4 @@ namespace Stockfish::Eval::NNUE::Features { } // namespace Stockfish::Eval::NNUE::Features -#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED +#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 9a3b778e..b2871278 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -22,13 +22,141 @@ #define NNUE_LAYERS_AFFINE_TRANSFORM_H_INCLUDED #include +#include +#include #include "../nnue_common.h" +#include "../../simd.h" + +/* + This file contains the definition for a fully connected layer (aka affine transform). + Two approaches are employed, depending on the sizes of the transform. + + Approach 1: + - used when the PaddedInputDimensions >= 128 + - uses AVX512 if possible + - processes inputs in batches of 2*InputSimdWidth + - so in batches of 128 for AVX512 + - the weight blocks of size InputSimdWidth are transposed such that + access is sequential + - N columns of the weight matrix are processed a time, where N + depends on the architecture (the amount of registers) + - accumulate + hadd is used + + Approach 2: + - used when the PaddedInputDimensions < 128 + - does not use AVX512 + - expected use-case is for when PaddedInputDimensions == 32 and InputDimensions <= 32. + - that's why AVX512 is hard to implement + - expected use-case is small layers + - not optimized as well as the approach 1 + - inputs are processed in chunks of 4, weights are respectively transposed + - accumulation happens directly to int32s +*/ namespace Stockfish::Eval::NNUE::Layers { - // Affine transformation layer +// Fallback implementation for older/other architectures. +// Identical for both approaches. Requires the input to be padded to at least 16 values. +#if !defined(USE_SSSE3) + template + static void affine_transform_non_ssse3(std::int32_t* output, const std::int8_t* weights, const std::int32_t* biases, const std::uint8_t* input) + { +# if defined(USE_SSE2) + // At least a multiple of 16, with SSE2. + static_assert(PaddedInputDimensions % 16 == 0); + constexpr IndexType NumChunks = PaddedInputDimensions / 16; + const __m128i Zeros = _mm_setzero_si128(); + const auto inputVector = reinterpret_cast(input); + +# elif defined(USE_MMX) + static_assert(InputDimensions % 8 == 0); + constexpr IndexType NumChunks = InputDimensions / 8; + const __m64 Zeros = _mm_setzero_si64(); + const auto inputVector = reinterpret_cast(input); + +# elif defined(USE_NEON) + static_assert(PaddedInputDimensions % 16 == 0); + constexpr IndexType NumChunks = PaddedInputDimensions / 16; + const auto inputVector = reinterpret_cast(input); +# endif + + for (IndexType i = 0; i < OutputDimensions; ++i) { + const IndexType offset = i * PaddedInputDimensions; + +# if defined(USE_SSE2) + __m128i sumLo = _mm_cvtsi32_si128(biases[i]); + __m128i sumHi = Zeros; + const auto row = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumChunks; ++j) { + __m128i row_j = _mm_load_si128(&row[j]); + __m128i input_j = _mm_load_si128(&inputVector[j]); + __m128i extendedRowLo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8); + __m128i extendedRowHi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8); + __m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros); + __m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros); + __m128i productLo = _mm_madd_epi16(extendedRowLo, extendedInputLo); + __m128i productHi = _mm_madd_epi16(extendedRowHi, extendedInputHi); + sumLo = _mm_add_epi32(sumLo, productLo); + sumHi = _mm_add_epi32(sumHi, productHi); + } + __m128i sum = _mm_add_epi32(sumLo, sumHi); + __m128i sumHigh_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sumHigh_64); + __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); + sum = _mm_add_epi32(sum, sum_second_32); + output[i] = _mm_cvtsi128_si32(sum); + +# elif defined(USE_MMX) + __m64 sumLo = _mm_cvtsi32_si64(biases[i]); + __m64 sumHi = Zeros; + const auto row = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumChunks; ++j) { + __m64 row_j = row[j]; + __m64 input_j = inputVector[j]; + __m64 extendedRowLo = _mm_srai_pi16(_mm_unpacklo_pi8(row_j, row_j), 8); + __m64 extendedRowHi = _mm_srai_pi16(_mm_unpackhi_pi8(row_j, row_j), 8); + __m64 extendedInputLo = _mm_unpacklo_pi8(input_j, Zeros); + __m64 extendedInputHi = _mm_unpackhi_pi8(input_j, Zeros); + __m64 productLo = _mm_madd_pi16(extendedRowLo, extendedInputLo); + __m64 productHi = _mm_madd_pi16(extendedRowHi, extendedInputHi); + sumLo = _mm_add_pi32(sumLo, productLo); + sumHi = _mm_add_pi32(sumHi, productHi); + } + __m64 sum = _mm_add_pi32(sumLo, sumHi); + sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); + output[i] = _mm_cvtsi64_si32(sum); + +# elif defined(USE_NEON) + int32x4_t sum = {biases[i]}; + const auto row = reinterpret_cast(&weights[offset]); + for (IndexType j = 0; j < NumChunks; ++j) { + int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]); + product = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]); + sum = vpadalq_s16(sum, product); + } + output[i] = sum[0] + sum[1] + sum[2] + sum[3]; + +# else + std::int32_t sum = biases[i]; + for (IndexType j = 0; j < InputDimensions; ++j) { + sum += weights[offset + j] * input[j]; + } + output[i] = sum; +# endif + } + +# if defined(USE_MMX) + _mm_empty(); +# endif + } +#endif + + template + class AffineTransform; + + // A specialization for large inputs. template - class AffineTransform { + class AffineTransform= 2*64-1)>> { public: // Input/output type using InputType = typename PreviousLayer::OutputType; @@ -36,24 +164,49 @@ namespace Stockfish::Eval::NNUE::Layers { static_assert(std::is_same::value, ""); // Number of input/output dimensions - static constexpr IndexType InputDimensions = - PreviousLayer::OutputDimensions; + static constexpr IndexType InputDimensions = PreviousLayer::OutputDimensions; static constexpr IndexType OutputDimensions = OutDims; + static constexpr IndexType PaddedInputDimensions = - ceil_to_multiple(InputDimensions, MaxSimdWidth); + ceil_to_multiple(InputDimensions, MaxSimdWidth); + + static_assert(PaddedInputDimensions >= 128, "Something went wrong. This specialization should not have been chosen."); + #if defined (USE_AVX512) - static constexpr const IndexType OutputSimdWidth = SimdWidth / 2; + static constexpr const IndexType InputSimdWidth = 64; + static constexpr const IndexType MaxNumOutputRegs = 16; +#elif defined (USE_AVX2) + static constexpr const IndexType InputSimdWidth = 32; + static constexpr const IndexType MaxNumOutputRegs = 8; #elif defined (USE_SSSE3) - static constexpr const IndexType OutputSimdWidth = SimdWidth / 4; + static constexpr const IndexType InputSimdWidth = 16; + static constexpr const IndexType MaxNumOutputRegs = 8; +#else + // The fallback implementation will not have permuted weights. + // We define these to avoid a lot of ifdefs later. + static constexpr const IndexType InputSimdWidth = 1; + static constexpr const IndexType MaxNumOutputRegs = 1; #endif + // A big block is a region in the weight matrix of the size [PaddedInputDimensions, NumOutputRegs]. + // A small block is a region of size [InputSimdWidth, 1] + + static constexpr const IndexType NumOutputRegs = std::min(MaxNumOutputRegs, OutputDimensions); + static constexpr const IndexType SmallBlockSize = InputSimdWidth; + static constexpr const IndexType BigBlockSize = NumOutputRegs * PaddedInputDimensions; + static constexpr const IndexType NumSmallBlocksInBigBlock = BigBlockSize / SmallBlockSize; + static constexpr const IndexType NumSmallBlocksPerOutput = PaddedInputDimensions / SmallBlockSize; + static constexpr const IndexType NumBigBlocks = OutputDimensions / NumOutputRegs; + + static_assert(OutputDimensions % NumOutputRegs == 0); + // Size of forward propagation buffer used in this layer static constexpr std::size_t SelfBufferSize = - ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize); + ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize); // Size of the forward propagation buffer used from the input layer to this layer static constexpr std::size_t BufferSize = - PreviousLayer::BufferSize + SelfBufferSize; + PreviousLayer::BufferSize + SelfBufferSize; // Hash value embedded in the evaluation file static constexpr std::uint32_t get_hash_value() { @@ -64,21 +217,35 @@ namespace Stockfish::Eval::NNUE::Layers { return hashValue; } + /* + Transposes the small blocks within a block. + Effectively means that weights can be traversed sequentially during inference. + */ + static IndexType get_weight_index(IndexType i) + { + const IndexType smallBlock = (i / SmallBlockSize) % NumSmallBlocksInBigBlock; + const IndexType smallBlockCol = smallBlock / NumSmallBlocksPerOutput; + const IndexType smallBlockRow = smallBlock % NumSmallBlocksPerOutput; + const IndexType bigBlock = i / BigBlockSize; + const IndexType rest = i % SmallBlockSize; + + const IndexType idx = + bigBlock * BigBlockSize + + smallBlockRow * SmallBlockSize * NumOutputRegs + + smallBlockCol * SmallBlockSize + + rest; + + return idx; + } + // Read network parameters bool read_parameters(std::istream& stream) { if (!previousLayer.read_parameters(stream)) return false; for (std::size_t i = 0; i < OutputDimensions; ++i) biases[i] = read_little_endian(stream); + for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) -#if !defined (USE_SSSE3) - weights[i] = read_little_endian(stream); -#else - weights[ - (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + - i / PaddedInputDimensions * 4 + - i % 4 - ] = read_little_endian(stream); -#endif + weights[get_weight_index(i)] = read_little_endian(stream); return !stream.fail(); } @@ -88,23 +255,9 @@ namespace Stockfish::Eval::NNUE::Layers { if (!previousLayer.write_parameters(stream)) return false; for (std::size_t i = 0; i < OutputDimensions; ++i) write_little_endian(stream, biases[i]); -#if !defined (USE_SSSE3) - for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) - write_little_endian(stream, weights[i]); -#else - std::unique_ptr unscrambledWeights = std::make_unique(OutputDimensions * PaddedInputDimensions); - for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) { - unscrambledWeights[i] = - weights[ - (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + - i / PaddedInputDimensions * 4 + - i % 4 - ]; - } for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) - write_little_endian(stream, unscrambledWeights[i]); -#endif + write_little_endian(stream, weights[get_weight_index(i)]); return !stream.fail(); } @@ -113,308 +266,274 @@ namespace Stockfish::Eval::NNUE::Layers { const OutputType* propagate( const TransformedFeatureType* transformedFeatures, char* buffer) const { const auto input = previousLayer.propagate( - transformedFeatures, buffer + SelfBufferSize); - -#if defined (USE_AVX512) - - [[maybe_unused]] const __m512i Ones512 = _mm512_set1_epi16(1); - - [[maybe_unused]] auto m512_hadd = [](__m512i sum, int bias) -> int { - return _mm512_reduce_add_epi32(sum) + bias; - }; - - [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) { -#if defined (USE_VNNI) - acc = _mm512_dpbusd_epi32(acc, a, b); -#else - __m512i product0 = _mm512_maddubs_epi16(a, b); - product0 = _mm512_madd_epi16(product0, Ones512); - acc = _mm512_add_epi32(acc, product0); -#endif - }; - - [[maybe_unused]] auto m512_add_dpbusd_epi32x4 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1, - __m512i a2, __m512i b2, __m512i a3, __m512i b3) { -#if defined (USE_VNNI) - acc = _mm512_dpbusd_epi32(acc, a0, b0); - acc = _mm512_dpbusd_epi32(acc, a1, b1); - acc = _mm512_dpbusd_epi32(acc, a2, b2); - acc = _mm512_dpbusd_epi32(acc, a3, b3); -#else - __m512i product0 = _mm512_maddubs_epi16(a0, b0); - __m512i product1 = _mm512_maddubs_epi16(a1, b1); - __m512i product2 = _mm512_maddubs_epi16(a2, b2); - __m512i product3 = _mm512_maddubs_epi16(a3, b3); - product0 = _mm512_adds_epi16(product0, product1); - product0 = _mm512_madd_epi16(product0, Ones512); - product2 = _mm512_adds_epi16(product2, product3); - product2 = _mm512_madd_epi16(product2, Ones512); - acc = _mm512_add_epi32(acc, _mm512_add_epi32(product0, product2)); -#endif - }; - -#endif -#if defined (USE_AVX2) - - [[maybe_unused]] const __m256i Ones256 = _mm256_set1_epi16(1); - - [[maybe_unused]] auto m256_hadd = [](__m256i sum, int bias) -> int { - __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); - sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); - return _mm_cvtsi128_si32(sum128) + bias; - }; - - [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) { -#if defined (USE_VNNI) - acc = _mm256_dpbusd_epi32(acc, a, b); -#else - __m256i product0 = _mm256_maddubs_epi16(a, b); - product0 = _mm256_madd_epi16(product0, Ones256); - acc = _mm256_add_epi32(acc, product0); -#endif - }; - - [[maybe_unused]] auto m256_add_dpbusd_epi32x4 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1, - __m256i a2, __m256i b2, __m256i a3, __m256i b3) { -#if defined (USE_VNNI) - acc = _mm256_dpbusd_epi32(acc, a0, b0); - acc = _mm256_dpbusd_epi32(acc, a1, b1); - acc = _mm256_dpbusd_epi32(acc, a2, b2); - acc = _mm256_dpbusd_epi32(acc, a3, b3); -#else - __m256i product0 = _mm256_maddubs_epi16(a0, b0); - __m256i product1 = _mm256_maddubs_epi16(a1, b1); - __m256i product2 = _mm256_maddubs_epi16(a2, b2); - __m256i product3 = _mm256_maddubs_epi16(a3, b3); - product0 = _mm256_adds_epi16(product0, product1); - product0 = _mm256_madd_epi16(product0, Ones256); - product2 = _mm256_adds_epi16(product2, product3); - product2 = _mm256_madd_epi16(product2, Ones256); - acc = _mm256_add_epi32(acc, _mm256_add_epi32(product0, product2)); -#endif - }; - -#endif -#if defined (USE_SSSE3) - - [[maybe_unused]] const __m128i Ones128 = _mm_set1_epi16(1); - - [[maybe_unused]] auto m128_hadd = [](__m128i sum, int bias) -> int { - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC - sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB - return _mm_cvtsi128_si32(sum) + bias; - }; - - [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) { - __m128i product0 = _mm_maddubs_epi16(a, b); - product0 = _mm_madd_epi16(product0, Ones128); - acc = _mm_add_epi32(acc, product0); - }; - - [[maybe_unused]] auto m128_add_dpbusd_epi32x4 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1, - __m128i a2, __m128i b2, __m128i a3, __m128i b3) { - __m128i product0 = _mm_maddubs_epi16(a0, b0); - __m128i product1 = _mm_maddubs_epi16(a1, b1); - __m128i product2 = _mm_maddubs_epi16(a2, b2); - __m128i product3 = _mm_maddubs_epi16(a3, b3); - product0 = _mm_adds_epi16(product0, product1); - product0 = _mm_madd_epi16(product0, Ones128); - product2 = _mm_adds_epi16(product2, product3); - product2 = _mm_madd_epi16(product2, Ones128); - acc = _mm_add_epi32(acc, _mm_add_epi32(product0, product2)); - }; - -#endif + transformedFeatures, buffer + SelfBufferSize); + OutputType* output = reinterpret_cast(buffer); #if defined (USE_AVX512) using vec_t = __m512i; #define vec_setzero _mm512_setzero_si512 #define vec_set_32 _mm512_set1_epi32 - auto& vec_add_dpbusd_32 = m512_add_dpbusd_epi32; - auto& vec_add_dpbusd_32x4 = m512_add_dpbusd_epi32x4; - auto& vec_hadd = m512_hadd; + #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32 + #define vec_add_dpbusd_32x2 Simd::m512_add_dpbusd_epi32x2 + #define vec_hadd Simd::m512_hadd + #define vec_haddx4 Simd::m512_haddx4 #elif defined (USE_AVX2) using vec_t = __m256i; #define vec_setzero _mm256_setzero_si256 #define vec_set_32 _mm256_set1_epi32 - auto& vec_add_dpbusd_32 = m256_add_dpbusd_epi32; - auto& vec_add_dpbusd_32x4 = m256_add_dpbusd_epi32x4; - auto& vec_hadd = m256_hadd; + #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 + #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2 + #define vec_hadd Simd::m256_hadd + #define vec_haddx4 Simd::m256_haddx4 #elif defined (USE_SSSE3) using vec_t = __m128i; #define vec_setzero _mm_setzero_si128 #define vec_set_32 _mm_set1_epi32 - auto& vec_add_dpbusd_32 = m128_add_dpbusd_epi32; - auto& vec_add_dpbusd_32x4 = m128_add_dpbusd_epi32x4; - auto& vec_hadd = m128_hadd; + #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 + #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2 + #define vec_hadd Simd::m128_hadd + #define vec_haddx4 Simd::m128_haddx4 #endif #if defined (USE_SSSE3) - // Different layout, we process 4 inputs at a time, always. - static_assert(InputDimensions % 4 == 0); + const vec_t* invec = reinterpret_cast(input); - const auto output = reinterpret_cast(buffer); - const auto inputVector = reinterpret_cast(input); - static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1); - - // OutputDimensions is either 1 or a multiple of SimdWidth - // because then it is also an input dimension. - if constexpr (OutputDimensions % OutputSimdWidth == 0) + // Perform accumulation to registers for each big block + for (IndexType bigBlock = 0; bigBlock < NumBigBlocks; ++bigBlock) { - constexpr IndexType NumChunks = InputDimensions / 4; + vec_t acc[NumOutputRegs] = { vec_setzero() }; + + // Each big block has NumOutputRegs small blocks in each "row", one per register. + // We process two small blocks at a time to save on one addition without VNNI. + for (IndexType smallBlock = 0; smallBlock < NumSmallBlocksPerOutput; smallBlock += 2) + { + const vec_t* weightvec = + reinterpret_cast( + weights + + bigBlock * BigBlockSize + + smallBlock * SmallBlockSize * NumOutputRegs); + + const vec_t in0 = invec[smallBlock + 0]; + const vec_t in1 = invec[smallBlock + 1]; + + for (IndexType k = 0; k < NumOutputRegs; ++k) + vec_add_dpbusd_32x2(acc[k], in0, weightvec[k], in1, weightvec[k + NumOutputRegs]); + } - const auto input32 = reinterpret_cast(input); - vec_t* outptr = reinterpret_cast(output); - std::memcpy(output, biases, OutputDimensions * sizeof(OutputType)); + // Horizontally add all accumulators. + if constexpr (NumOutputRegs % 4 == 0) + { + __m128i* outputvec = reinterpret_cast<__m128i*>(output); + const __m128i* biasvec = reinterpret_cast(biases); - for (int i = 0; i < (int)NumChunks - 3; i += 4) - { - const vec_t in0 = vec_set_32(input32[i + 0]); - const vec_t in1 = vec_set_32(input32[i + 1]); - const vec_t in2 = vec_set_32(input32[i + 2]); - const vec_t in3 = vec_set_32(input32[i + 3]); - const auto col0 = reinterpret_cast(&weights[(i + 0) * OutputDimensions * 4]); - const auto col1 = reinterpret_cast(&weights[(i + 1) * OutputDimensions * 4]); - const auto col2 = reinterpret_cast(&weights[(i + 2) * OutputDimensions * 4]); - const auto col3 = reinterpret_cast(&weights[(i + 3) * OutputDimensions * 4]); - for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j) - vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]); - } - } - else if constexpr (OutputDimensions == 1) - { -#if defined (USE_AVX512) - if constexpr (PaddedInputDimensions % (SimdWidth * 2) != 0) + for (IndexType k = 0; k < NumOutputRegs; k += 4) { - constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; - const auto inputVector256 = reinterpret_cast(input); - - __m256i sum0 = _mm256_setzero_si256(); - const auto row0 = reinterpret_cast(&weights[0]); - - for (int j = 0; j < (int)NumChunks; ++j) - { - const __m256i in = inputVector256[j]; - m256_add_dpbusd_epi32(sum0, in, row0[j]); - } - output[0] = m256_hadd(sum0, biases[0]); + const IndexType idx = (bigBlock * NumOutputRegs + k) / 4; + outputvec[idx] = vec_haddx4(acc[k+0], acc[k+1], acc[k+2], acc[k+3], biasvec[idx]); } - else -#endif + } + else + { + for (IndexType k = 0; k < NumOutputRegs; ++k) { -#if defined (USE_AVX512) - constexpr IndexType NumChunks = PaddedInputDimensions / (SimdWidth * 2); -#else - constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; -#endif - vec_t sum0 = vec_setzero(); - const auto row0 = reinterpret_cast(&weights[0]); - - for (int j = 0; j < (int)NumChunks; ++j) - { - const vec_t in = inputVector[j]; - vec_add_dpbusd_32(sum0, in, row0[j]); - } - output[0] = vec_hadd(sum0, biases[0]); + const IndexType idx = (bigBlock * NumOutputRegs + k); + output[idx] = vec_hadd(acc[k], biases[idx]); } + } } +# undef vec_setzero +# undef vec_set_32 +# undef vec_add_dpbusd_32 +# undef vec_add_dpbusd_32x2 +# undef vec_hadd +# undef vec_haddx4 #else + // Use old implementation for the other architectures. + affine_transform_non_ssse3< + InputDimensions, + PaddedInputDimensions, + OutputDimensions>(output, weights, biases, input); + +#endif -// Use old implementation for the other architectures. + return output; + } - auto output = reinterpret_cast(buffer); + private: + using BiasType = OutputType; + using WeightType = std::int8_t; -#if defined(USE_SSE2) - // At least a multiple of 16, with SSE2. - static_assert(InputDimensions % SimdWidth == 0); - constexpr IndexType NumChunks = InputDimensions / SimdWidth; - const __m128i Zeros = _mm_setzero_si128(); - const auto inputVector = reinterpret_cast(input); + PreviousLayer previousLayer; -#elif defined(USE_MMX) - static_assert(InputDimensions % SimdWidth == 0); - constexpr IndexType NumChunks = InputDimensions / SimdWidth; - const __m64 Zeros = _mm_setzero_si64(); - const auto inputVector = reinterpret_cast(input); + alignas(CacheLineSize) BiasType biases[OutputDimensions]; + alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions]; + }; -#elif defined(USE_NEON) - static_assert(InputDimensions % SimdWidth == 0); - constexpr IndexType NumChunks = InputDimensions / SimdWidth; - const auto inputVector = reinterpret_cast(input); + template + class AffineTransform> { + public: + // Input/output type + using InputType = typename PreviousLayer::OutputType; + using OutputType = std::int32_t; + static_assert(std::is_same::value, ""); + + // Number of input/output dimensions + static constexpr IndexType InputDimensions = + PreviousLayer::OutputDimensions; + static constexpr IndexType OutputDimensions = OutDims; + static constexpr IndexType PaddedInputDimensions = + ceil_to_multiple(InputDimensions, MaxSimdWidth); + + static_assert(PaddedInputDimensions < 128, "Something went wrong. This specialization should not have been chosen."); + +#if defined (USE_SSSE3) + static constexpr const IndexType OutputSimdWidth = SimdWidth / 4; + static constexpr const IndexType InputSimdWidth = SimdWidth; #endif - for (IndexType i = 0; i < OutputDimensions; ++i) { - const IndexType offset = i * PaddedInputDimensions; - -#if defined(USE_SSE2) - __m128i sumLo = _mm_cvtsi32_si128(biases[i]); - __m128i sumHi = Zeros; - const auto row = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m128i row_j = _mm_load_si128(&row[j]); - __m128i input_j = _mm_load_si128(&inputVector[j]); - __m128i extendedRowLo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8); - __m128i extendedRowHi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8); - __m128i extendedInputLo = _mm_unpacklo_epi8(input_j, Zeros); - __m128i extendedInputHi = _mm_unpackhi_epi8(input_j, Zeros); - __m128i productLo = _mm_madd_epi16(extendedRowLo, extendedInputLo); - __m128i productHi = _mm_madd_epi16(extendedRowHi, extendedInputHi); - sumLo = _mm_add_epi32(sumLo, productLo); - sumHi = _mm_add_epi32(sumHi, productHi); - } - __m128i sum = _mm_add_epi32(sumLo, sumHi); - __m128i sumHigh_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)); - sum = _mm_add_epi32(sum, sumHigh_64); - __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2)); - sum = _mm_add_epi32(sum, sum_second_32); - output[i] = _mm_cvtsi128_si32(sum); - -#elif defined(USE_MMX) - __m64 sumLo = _mm_cvtsi32_si64(biases[i]); - __m64 sumHi = Zeros; - const auto row = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - __m64 row_j = row[j]; - __m64 input_j = inputVector[j]; - __m64 extendedRowLo = _mm_srai_pi16(_mm_unpacklo_pi8(row_j, row_j), 8); - __m64 extendedRowHi = _mm_srai_pi16(_mm_unpackhi_pi8(row_j, row_j), 8); - __m64 extendedInputLo = _mm_unpacklo_pi8(input_j, Zeros); - __m64 extendedInputHi = _mm_unpackhi_pi8(input_j, Zeros); - __m64 productLo = _mm_madd_pi16(extendedRowLo, extendedInputLo); - __m64 productHi = _mm_madd_pi16(extendedRowHi, extendedInputHi); - sumLo = _mm_add_pi32(sumLo, productLo); - sumHi = _mm_add_pi32(sumHi, productHi); - } - __m64 sum = _mm_add_pi32(sumLo, sumHi); - sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum)); - output[i] = _mm_cvtsi64_si32(sum); - -#elif defined(USE_NEON) - int32x4_t sum = {biases[i]}; - const auto row = reinterpret_cast(&weights[offset]); - for (IndexType j = 0; j < NumChunks; ++j) { - int16x8_t product = vmull_s8(inputVector[j * 2], row[j * 2]); - product = vmlal_s8(product, inputVector[j * 2 + 1], row[j * 2 + 1]); - sum = vpadalq_s16(sum, product); - } - output[i] = sum[0] + sum[1] + sum[2] + sum[3]; + // Size of forward propagation buffer used in this layer + static constexpr std::size_t SelfBufferSize = + ceil_to_multiple(OutputDimensions * sizeof(OutputType), CacheLineSize); + + // Size of the forward propagation buffer used from the input layer to this layer + static constexpr std::size_t BufferSize = + PreviousLayer::BufferSize + SelfBufferSize; + + // Hash value embedded in the evaluation file + static constexpr std::uint32_t get_hash_value() { + std::uint32_t hashValue = 0xCC03DAE4u; + hashValue += OutputDimensions; + hashValue ^= PreviousLayer::get_hash_value() >> 1; + hashValue ^= PreviousLayer::get_hash_value() << 31; + return hashValue; + } + static IndexType get_weight_index_scrambled(IndexType i) + { + return + (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 + + i / PaddedInputDimensions * 4 + + i % 4; + } + + static IndexType get_weight_index(IndexType i) + { +#if defined (USE_SSSE3) + return get_weight_index_scrambled(i); #else - OutputType sum = biases[i]; - for (IndexType j = 0; j < InputDimensions; ++j) { - sum += weights[offset + j] * input[j]; - } - output[i] = sum; + return i; #endif + } - } -#if defined(USE_MMX) - _mm_empty(); + // Read network parameters + bool read_parameters(std::istream& stream) { + if (!previousLayer.read_parameters(stream)) return false; + for (std::size_t i = 0; i < OutputDimensions; ++i) + biases[i] = read_little_endian(stream); + for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + weights[get_weight_index(i)] = read_little_endian(stream); + + return !stream.fail(); + } + + // Write network parameters + bool write_parameters(std::ostream& stream) const { + if (!previousLayer.write_parameters(stream)) return false; + for (std::size_t i = 0; i < OutputDimensions; ++i) + write_little_endian(stream, biases[i]); + + for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + write_little_endian(stream, weights[get_weight_index(i)]); + + return !stream.fail(); + } + // Forward propagation + const OutputType* propagate( + const TransformedFeatureType* transformedFeatures, char* buffer) const { + const auto input = previousLayer.propagate( + transformedFeatures, buffer + SelfBufferSize); + const auto output = reinterpret_cast(buffer); + +#if defined (USE_AVX2) + using vec_t = __m256i; + #define vec_setzero _mm256_setzero_si256 + #define vec_set_32 _mm256_set1_epi32 + #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32 + #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2 + #define vec_add_dpbusd_32x4 Simd::m256_add_dpbusd_epi32x4 + #define vec_hadd Simd::m256_hadd + #define vec_haddx4 Simd::m256_haddx4 +#elif defined (USE_SSSE3) + using vec_t = __m128i; + #define vec_setzero _mm_setzero_si128 + #define vec_set_32 _mm_set1_epi32 + #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32 + #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2 + #define vec_add_dpbusd_32x4 Simd::m128_add_dpbusd_epi32x4 + #define vec_hadd Simd::m128_hadd + #define vec_haddx4 Simd::m128_haddx4 #endif +#if defined (USE_SSSE3) + const auto inputVector = reinterpret_cast(input); + + static_assert(InputDimensions % 8 == 0); + static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1); + + if constexpr (OutputDimensions % OutputSimdWidth == 0) + { + constexpr IndexType NumChunks = InputDimensions / 4; + constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth; + + const auto input32 = reinterpret_cast(input); + const vec_t* biasvec = reinterpret_cast(biases); + vec_t acc[NumRegs]; + for (IndexType k = 0; k < NumRegs; ++k) + acc[k] = biasvec[k]; + + for (IndexType i = 0; i < NumChunks; i += 2) + { + const vec_t in0 = vec_set_32(input32[i + 0]); + const vec_t in1 = vec_set_32(input32[i + 1]); + const auto col0 = reinterpret_cast(&weights[(i + 0) * OutputDimensions * 4]); + const auto col1 = reinterpret_cast(&weights[(i + 1) * OutputDimensions * 4]); + for (IndexType k = 0; k < NumRegs; ++k) + vec_add_dpbusd_32x2(acc[k], in0, col0[k], in1, col1[k]); + } + + vec_t* outptr = reinterpret_cast(output); + for (IndexType k = 0; k < NumRegs; ++k) + outptr[k] = acc[k]; + } + else if constexpr (OutputDimensions == 1) + { + constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth; + vec_t sum0 = vec_setzero(); + const auto row0 = reinterpret_cast(&weights[0]); + + for (int j = 0; j < (int)NumChunks; ++j) + { + const vec_t in = inputVector[j]; + vec_add_dpbusd_32(sum0, in, row0[j]); + } + output[0] = vec_hadd(sum0, biases[0]); + } + +# undef vec_setzero +# undef vec_set_32 +# undef vec_add_dpbusd_32 +# undef vec_add_dpbusd_32x2 +# undef vec_add_dpbusd_32x4 +# undef vec_hadd +# undef vec_haddx4 +#else + // Use old implementation for the other architectures. + affine_transform_non_ssse3< + InputDimensions, + PaddedInputDimensions, + OutputDimensions>(output, weights, biases, input); #endif return output; diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index 65455df4..c6f3ccad 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -35,9 +35,10 @@ namespace Stockfish::Eval::NNUE::Layers { static_assert(std::is_same::value, ""); // Number of input/output dimensions - static constexpr IndexType InputDimensions = - PreviousLayer::OutputDimensions; + static constexpr IndexType InputDimensions = PreviousLayer::OutputDimensions; static constexpr IndexType OutputDimensions = InputDimensions; + static constexpr IndexType PaddedOutputDimensions = + ceil_to_multiple(OutputDimensions, 32); // Size of forward propagation buffer used in this layer static constexpr std::size_t SelfBufferSize = @@ -179,6 +180,15 @@ namespace Stockfish::Eval::NNUE::Layers { output[i] = static_cast( std::max(0, std::min(127, input[i] >> WeightScaleBits))); } + + // Affine transform layers expect that there is at least + // ceil_to_multiple(OutputDimensions, 32) initialized values. + // We cannot do this in the affine transform because it requires + // preallocating space here. + for (IndexType i = OutputDimensions; i < PaddedOutputDimensions; ++i) { + output[i] = 0; + } + return output; } diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h index 879a39cd..193a197d 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -23,7 +23,7 @@ #include "nnue_common.h" -#include "features/half_ka_v2.h" +#include "features/half_ka_v2_hm.h" #include "layers/input_slice.h" #include "layers/affine_transform.h" @@ -32,10 +32,10 @@ namespace Stockfish::Eval::NNUE { // Input features used in evaluation function - using FeatureSet = Features::HalfKAv2; + using FeatureSet = Features::HalfKAv2_hm; // Number of input feature dimensions after conversion - constexpr IndexType TransformedFeatureDimensions = 512; + constexpr IndexType TransformedFeatureDimensions = 1024; constexpr IndexType PSQTBuckets = 8; constexpr IndexType LayerStacks = 8; @@ -43,7 +43,7 @@ namespace Stockfish::Eval::NNUE { // Define network structure using InputLayer = InputSlice; - using HiddenLayer1 = ClippedReLU>; + using HiddenLayer1 = ClippedReLU>; using HiddenLayer2 = ClippedReLU>; using OutputLayer = AffineTransform; diff --git a/src/search.cpp b/src/search.cpp index fef1b518..ed5dd6c2 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -779,8 +779,10 @@ namespace { ? ss->staticEval > (ss-4)->staticEval || (ss-4)->staticEval == VALUE_NONE : ss->staticEval > (ss-2)->staticEval; - // Step 7. Futility pruning: child node (~50 Elo) + // Step 7. Futility pruning: child node (~50 Elo). + // The depth condition is important for mate finding. if ( !PvNode + && depth < 9 && eval - futility_margin(depth, improving) >= beta && eval < VALUE_KNOWN_WIN) // Do not return unproven wins return eval; @@ -799,7 +801,7 @@ namespace { assert(eval - beta >= 0); // Null move dynamic reduction based on depth and value - Depth R = (1090 + 81 * depth) / 256 + std::min(int(eval - beta) / 205, 3); + Depth R = std::min(int(eval - beta) / 205, 3) + depth / 3 + 4; ss->currentMove = MOVE_NULL; ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0]; @@ -907,7 +909,7 @@ namespace { && !ttMove) depth -= 2; -moves_loop: // When in check, search starts from here +moves_loop: // When in check, search starts here ttCapture = ttMove && pos.capture_or_promotion(ttMove); @@ -989,7 +991,7 @@ moves_loop: // When in check, search starts from here // Calculate new depth for this move newDepth = depth - 1; - // Step 13. Pruning at shallow depth (~200 Elo) + // Step 13. Pruning at shallow depth (~200 Elo). Depth conditions are important for mate finding. if ( !rootNode && pos.non_pawn_material(us) && bestValue > VALUE_TB_LOSS_IN_MAX_PLY) @@ -1016,22 +1018,20 @@ moves_loop: // When in check, search starts from here else { // Continuation history based pruning (~20 Elo) - if ( lmrDepth < 5 - && (*contHist[0])[movedPiece][to_sq(move)] < CounterMovePruneThreshold - && (*contHist[1])[movedPiece][to_sq(move)] < CounterMovePruneThreshold) + if (lmrDepth < 5 + && (*contHist[0])[movedPiece][to_sq(move)] + + (*contHist[1])[movedPiece][to_sq(move)] + + (*contHist[3])[movedPiece][to_sq(move)] < -3000 * depth + 3000) continue; // Futility pruning: parent node (~5 Elo) if ( !ss->inCheck - && ss->staticEval + 174 + 157 * lmrDepth <= alpha - && (*contHist[0])[movedPiece][to_sq(move)] - + (*contHist[1])[movedPiece][to_sq(move)] - + (*contHist[3])[movedPiece][to_sq(move)] - + (*contHist[5])[movedPiece][to_sq(move)] / 3 < 28255) + && lmrDepth < 7 + && ss->staticEval + 174 + 157 * lmrDepth <= alpha) continue; // Prune moves with negative SEE (~20 Elo) - if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth))) + if (!pos.see_ge(move, Value(-21 * lmrDepth * lmrDepth - 21 * lmrDepth))) continue; } } @@ -1094,6 +1094,14 @@ moves_loop: // When in check, search starts from here return beta; } } + + // Capture extensions for PvNodes and cutNodes + else if ( (PvNode || cutNode) + && captureOrPromotion + && moveCount != 1) + extension = 1; + + // Check extensions else if ( givesCheck && depth > 6 && abs(ss->staticEval) > Value(100)) @@ -1159,27 +1167,23 @@ moves_loop: // When in check, search starts from here if (cutNode && move != ss->killers[0]) r += 2; - if (!captureOrPromotion) - { - // Increase reduction if ttMove is a capture (~3 Elo) - if (ttCapture) - r++; - - ss->statScore = thisThread->mainHistory[us][from_to(move)] - + (*contHist[0])[movedPiece][to_sq(move)] - + (*contHist[1])[movedPiece][to_sq(move)] - + (*contHist[3])[movedPiece][to_sq(move)] - - 4923; - - // Decrease/increase reduction for moves with a good/bad history (~30 Elo) - if (!ss->inCheck) - r -= ss->statScore / 14721; - } + // Increase reduction if ttMove is a capture (~3 Elo) + if (ttCapture) + r++; + + ss->statScore = thisThread->mainHistory[us][from_to(move)] + + (*contHist[0])[movedPiece][to_sq(move)] + + (*contHist[1])[movedPiece][to_sq(move)] + + (*contHist[3])[movedPiece][to_sq(move)] + - 4923; + + // Decrease/increase reduction for moves with a good/bad history (~30 Elo) + r -= ss->statScore / 14721; // In general we want to cap the LMR depth search at newDepth. But if // reductions are really negative and movecount is low, we allow this move - // to be searched deeper than the first move, unless ttMove was extended by 2. - Depth d = std::clamp(newDepth - r, 1, newDepth + (r < -1 && moveCount <= 5 && !doubleExtension)); + // to be searched deeper than the first move in specific cases. + Depth d = std::clamp(newDepth - r, 1, newDepth + (r < -1 && (moveCount <= 5 || (depth > 6 && PvNode)) && !doubleExtension)); value = -search(pos, ss+1, -(alpha+1), -alpha, d, true); @@ -1321,7 +1325,7 @@ moves_loop: // When in check, search starts from here // Bonus for prior countermove that caused the fail low else if ( (depth >= 3 || PvNode) && !priorCapture) - update_continuation_histories(ss-1, pos.piece_on(prevSq), prevSq, stat_bonus(depth)); + update_continuation_histories(ss-1, pos.piece_on(prevSq), prevSq, stat_bonus(depth) * (1 + (PvNode || cutNode))); if (PvNode) bestValue = std::min(bestValue, maxValue); @@ -1472,6 +1476,10 @@ moves_loop: // When in check, search starts from here { assert(is_ok(move)); + // Check for legality + if (!pos.legal(move)) + continue; + givesCheck = pos.gives_check(move); captureOrPromotion = pos.capture_or_promotion(move); @@ -1510,13 +1518,6 @@ moves_loop: // When in check, search starts from here // Speculative prefetch as early as possible prefetch(TT.first_entry(pos.key_after(move))); - // Check for legality just before making the move - if (!pos.legal(move)) - { - moveCount--; - continue; - } - ss->currentMove = move; ss->continuationHistory = &thisThread->continuationHistory[ss->inCheck] [captureOrPromotion] diff --git a/src/simd.h b/src/simd.h new file mode 100644 index 00000000..584148f1 --- /dev/null +++ b/src/simd.h @@ -0,0 +1,341 @@ +/* + Stockfish, a UCI chess playing engine derived from Glaurung 2.1 + Copyright (C) 2004-2021 The Stockfish developers (see AUTHORS file) + + Stockfish is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Stockfish is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef STOCKFISH_SIMD_H_INCLUDED +#define STOCKFISH_SIMD_H_INCLUDED + +#if defined(USE_AVX2) +# include + +#elif defined(USE_SSE41) +# include + +#elif defined(USE_SSSE3) +# include + +#elif defined(USE_SSE2) +# include + +#elif defined(USE_MMX) +# include + +#elif defined(USE_NEON) +# include +#endif + +// The inline asm is only safe for GCC, where it is necessary to get good codegen. +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101693 +// Clang does fine without it. +// Play around here: https://godbolt.org/z/7EWqrYq51 +#if (defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)) +#define USE_INLINE_ASM +#endif + +namespace Stockfish::Simd { + +#if defined (USE_AVX512) + + [[maybe_unused]] static int m512_hadd(__m512i sum, int bias) { + return _mm512_reduce_add_epi32(sum) + bias; + } + + /* + Parameters: + sum0 = [zmm0.i128[0], zmm0.i128[1], zmm0.i128[2], zmm0.i128[3]] + sum1 = [zmm1.i128[0], zmm1.i128[1], zmm1.i128[2], zmm1.i128[3]] + sum2 = [zmm2.i128[0], zmm2.i128[1], zmm2.i128[2], zmm2.i128[3]] + sum3 = [zmm3.i128[0], zmm3.i128[1], zmm3.i128[2], zmm3.i128[3]] + + Returns: + ret = [ + reduce_add_epi32(zmm0.i128[0]), reduce_add_epi32(zmm1.i128[0]), reduce_add_epi32(zmm2.i128[0]), reduce_add_epi32(zmm3.i128[0]), + reduce_add_epi32(zmm0.i128[1]), reduce_add_epi32(zmm1.i128[1]), reduce_add_epi32(zmm2.i128[1]), reduce_add_epi32(zmm3.i128[1]), + reduce_add_epi32(zmm0.i128[2]), reduce_add_epi32(zmm1.i128[2]), reduce_add_epi32(zmm2.i128[2]), reduce_add_epi32(zmm3.i128[2]), + reduce_add_epi32(zmm0.i128[3]), reduce_add_epi32(zmm1.i128[3]), reduce_add_epi32(zmm2.i128[3]), reduce_add_epi32(zmm3.i128[3]) + ] + */ + [[maybe_unused]] static __m512i m512_hadd128x16_interleave( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) { + + __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1); + __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1); + + __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3); + __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3); + + __m512i sum01 = _mm512_add_epi32(sum01a, sum01b); + __m512i sum23 = _mm512_add_epi32(sum23a, sum23b); + + __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23); + __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23); + + return _mm512_add_epi32(sum0123a, sum0123b); + } + + [[maybe_unused]] static __m128i m512_haddx4( + __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, + __m128i bias) { + + __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3); + + __m256i sum256lo = _mm512_castsi512_si256(sum); + __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1); + + sum256lo = _mm256_add_epi32(sum256lo, sum256hi); + + __m128i sum128lo = _mm256_castsi256_si128(sum256lo); + __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1); + + return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); + } + + [[maybe_unused]] static void m512_add_dpbusd_epi32( + __m512i& acc, + __m512i a, + __m512i b) { + +# if defined (USE_VNNI) +# if defined (USE_INLINE_ASM) + asm( + "vpdpbusd %[b], %[a], %[acc]\n\t" + : [acc]"+v"(acc) + : [a]"v"(a), [b]"vm"(b) + ); +# else + acc = _mm512_dpbusd_epi32(acc, a, b); +# endif +# else +# if defined (USE_INLINE_ASM) + __m512i tmp = _mm512_maddubs_epi16(a, b); + asm( + "vpmaddwd %[tmp], %[ones], %[tmp]\n\t" + "vpaddd %[acc], %[tmp], %[acc]\n\t" + : [acc]"+v"(acc), [tmp]"+&v"(tmp) + : [ones]"v"(_mm512_set1_epi16(1)) + ); +# else + __m512i product0 = _mm512_maddubs_epi16(a, b); + product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1)); + acc = _mm512_add_epi32(acc, product0); +# endif +# endif + } + + [[maybe_unused]] static void m512_add_dpbusd_epi32x2( + __m512i& acc, + __m512i a0, __m512i b0, + __m512i a1, __m512i b1) { + +# if defined (USE_VNNI) +# if defined (USE_INLINE_ASM) + asm( + "vpdpbusd %[b0], %[a0], %[acc]\n\t" + "vpdpbusd %[b1], %[a1], %[acc]\n\t" + : [acc]"+v"(acc) + : [a0]"v"(a0), [b0]"vm"(b0), [a1]"v"(a1), [b1]"vm"(b1) + ); +# else + acc = _mm512_dpbusd_epi32(acc, a0, b0); + acc = _mm512_dpbusd_epi32(acc, a1, b1); +# endif +# else +# if defined (USE_INLINE_ASM) + __m512i tmp0 = _mm512_maddubs_epi16(a0, b0); + __m512i tmp1 = _mm512_maddubs_epi16(a1, b1); + asm( + "vpaddsw %[tmp0], %[tmp1], %[tmp0]\n\t" + "vpmaddwd %[tmp0], %[ones], %[tmp0]\n\t" + "vpaddd %[acc], %[tmp0], %[acc]\n\t" + : [acc]"+v"(acc), [tmp0]"+&v"(tmp0) + : [tmp1]"v"(tmp1), [ones]"v"(_mm512_set1_epi16(1)) + ); +# else + __m512i product0 = _mm512_maddubs_epi16(a0, b0); + __m512i product1 = _mm512_maddubs_epi16(a1, b1); + product0 = _mm512_adds_epi16(product0, product1); + product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1)); + acc = _mm512_add_epi32(acc, product0); +# endif +# endif + } + +#endif + +#if defined (USE_AVX2) + + [[maybe_unused]] static int m256_hadd(__m256i sum, int bias) { + __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); + sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB)); + return _mm_cvtsi128_si32(sum128) + bias; + } + + [[maybe_unused]] static __m128i m256_haddx4( + __m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, + __m128i bias) { + + sum0 = _mm256_hadd_epi32(sum0, sum1); + sum2 = _mm256_hadd_epi32(sum2, sum3); + + sum0 = _mm256_hadd_epi32(sum0, sum2); + + __m128i sum128lo = _mm256_castsi256_si128(sum0); + __m128i sum128hi = _mm256_extracti128_si256(sum0, 1); + + return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); + } + + [[maybe_unused]] static void m256_add_dpbusd_epi32( + __m256i& acc, + __m256i a, + __m256i b) { + +# if defined (USE_VNNI) +# if defined (USE_INLINE_ASM) + asm( + "vpdpbusd %[b], %[a], %[acc]\n\t" + : [acc]"+v"(acc) + : [a]"v"(a), [b]"vm"(b) + ); +# else + acc = _mm256_dpbusd_epi32(acc, a, b); +# endif +# else +# if defined (USE_INLINE_ASM) + __m256i tmp = _mm256_maddubs_epi16(a, b); + asm( + "vpmaddwd %[tmp], %[ones], %[tmp]\n\t" + "vpaddd %[acc], %[tmp], %[acc]\n\t" + : [acc]"+v"(acc), [tmp]"+&v"(tmp) + : [ones]"v"(_mm256_set1_epi16(1)) + ); +# else + __m256i product0 = _mm256_maddubs_epi16(a, b); + product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1)); + acc = _mm256_add_epi32(acc, product0); +# endif +# endif + } + + [[maybe_unused]] static void m256_add_dpbusd_epi32x2( + __m256i& acc, + __m256i a0, __m256i b0, + __m256i a1, __m256i b1) { + +# if defined (USE_VNNI) +# if defined (USE_INLINE_ASM) + asm( + "vpdpbusd %[b0], %[a0], %[acc]\n\t" + "vpdpbusd %[b1], %[a1], %[acc]\n\t" + : [acc]"+v"(acc) + : [a0]"v"(a0), [b0]"vm"(b0), [a1]"v"(a1), [b1]"vm"(b1) + ); +# else + acc = _mm256_dpbusd_epi32(acc, a0, b0); + acc = _mm256_dpbusd_epi32(acc, a1, b1); +# endif +# else +# if defined (USE_INLINE_ASM) + __m256i tmp0 = _mm256_maddubs_epi16(a0, b0); + __m256i tmp1 = _mm256_maddubs_epi16(a1, b1); + asm( + "vpaddsw %[tmp0], %[tmp1], %[tmp0]\n\t" + "vpmaddwd %[tmp0], %[ones], %[tmp0]\n\t" + "vpaddd %[acc], %[tmp0], %[acc]\n\t" + : [acc]"+v"(acc), [tmp0]"+&v"(tmp0) + : [tmp1]"v"(tmp1), [ones]"v"(_mm256_set1_epi16(1)) + ); +# else + __m256i product0 = _mm256_maddubs_epi16(a0, b0); + __m256i product1 = _mm256_maddubs_epi16(a1, b1); + product0 = _mm256_adds_epi16(product0, product1); + product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1)); + acc = _mm256_add_epi32(acc, product0); +# endif +# endif + } + +#endif + +#if defined (USE_SSSE3) + + [[maybe_unused]] static int m128_hadd(__m128i sum, int bias) { + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC + sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB + return _mm_cvtsi128_si32(sum) + bias; + } + + [[maybe_unused]] static __m128i m128_haddx4( + __m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, + __m128i bias) { + + sum0 = _mm_hadd_epi32(sum0, sum1); + sum2 = _mm_hadd_epi32(sum2, sum3); + sum0 = _mm_hadd_epi32(sum0, sum2); + return _mm_add_epi32(sum0, bias); + } + + [[maybe_unused]] static void m128_add_dpbusd_epi32( + __m128i& acc, + __m128i a, + __m128i b) { + +# if defined (USE_INLINE_ASM) + __m128i tmp = _mm_maddubs_epi16(a, b); + asm( + "pmaddwd %[ones], %[tmp]\n\t" + "paddd %[tmp], %[acc]\n\t" + : [acc]"+v"(acc), [tmp]"+&v"(tmp) + : [ones]"v"(_mm_set1_epi16(1)) + ); +# else + __m128i product0 = _mm_maddubs_epi16(a, b); + product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1)); + acc = _mm_add_epi32(acc, product0); +# endif + } + + [[maybe_unused]] static void m128_add_dpbusd_epi32x2( + __m128i& acc, + __m128i a0, __m128i b0, + __m128i a1, __m128i b1) { + +# if defined (USE_INLINE_ASM) + __m128i tmp0 = _mm_maddubs_epi16(a0, b0); + __m128i tmp1 = _mm_maddubs_epi16(a1, b1); + asm( + "paddsw %[tmp1], %[tmp0]\n\t" + "pmaddwd %[ones], %[tmp0]\n\t" + "paddd %[tmp0], %[acc]\n\t" + : [acc]"+v"(acc), [tmp0]"+&v"(tmp0) + : [tmp1]"v"(tmp1), [ones]"v"(_mm_set1_epi16(1)) + ); +# else + __m128i product0 = _mm_maddubs_epi16(a0, b0); + __m128i product1 = _mm_maddubs_epi16(a1, b1); + product0 = _mm_adds_epi16(product0, product1); + product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1)); + acc = _mm_add_epi32(acc, product0); +# endif + } + +#endif + +} + +#endif // STOCKFISH_SIMD_H_INCLUDED diff --git a/src/ucioption.cpp b/src/ucioption.cpp index a63db9e4..029137b7 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -174,7 +174,7 @@ Option& Option::operator=(const string& v) { assert(!type.empty()); - if ( (type != "button" && v.empty()) + if ( (type != "button" && type != "string" && v.empty()) || (type == "check" && v != "true" && v != "false") || (type == "spin" && (stof(v) < min || stof(v) > max))) return *this;