From: Tomasz Sobczyk Date: Wed, 9 Jun 2021 09:21:55 +0000 (+0200) Subject: Read NNUE net faster X-Git-Url: https://git.sesse.net/?p=stockfish;a=commitdiff_plain;h=b84fa04db6ea5fc6d7d714539c11537bde64538b Read NNUE net faster Load feature transformer weights in bulk on little-endian machines. This is in particular useful to test new nets with c-chess-cli, see https://github.com/lucasart/c-chess-cli/issues/44 ``` $ time ./stockfish.exe uci Before : 0m0.914s After : 0m0.483s ``` No functional change --- diff --git a/src/misc.h b/src/misc.h index 59ca6e37..dae37cd9 100644 --- a/src/misc.h +++ b/src/misc.h @@ -66,9 +66,10 @@ std::ostream& operator<<(std::ostream&, SyncCout); #define sync_cout std::cout << IO_LOCK #define sync_endl std::endl << IO_UNLOCK -// `ptr` must point to an array of size at least -// `sizeof(T) * N + alignment` bytes, where `N` is the -// number of elements in the array. + +// align_ptr_up() : get the first aligned element of an array. +// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes, +// where N is the number of elements in the array. template T* align_ptr_up(T* ptr) { @@ -78,6 +79,12 @@ T* align_ptr_up(T* ptr) return reinterpret_cast(reinterpret_cast((ptrint + (Alignment - 1)) / Alignment * Alignment)); } + +// IsLittleEndian : true if and only if the binary is compiled on a little endian machine +static inline const union { uint32_t i; char c[4]; } Le = { 0x01020304 }; +static inline const bool IsLittleEndian = (Le.c[0] == 4); + + template class ValueListInserter { public: diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h index dc700061..390f61c3 100644 --- a/src/nnue/nnue_common.h +++ b/src/nnue/nnue_common.h @@ -24,6 +24,8 @@ #include #include +#include "../misc.h" // for IsLittleEndian + #if defined(USE_AVX2) #include @@ -86,37 +88,77 @@ namespace Stockfish::Eval::NNUE { // necessary to return a result with the byte ordering of the compiling machine. template inline IntType read_little_endian(std::istream& stream) { - IntType result; - std::uint8_t u[sizeof(IntType)]; - typename std::make_unsigned::type v = 0; - stream.read(reinterpret_cast(u), sizeof(IntType)); - for (std::size_t i = 0; i < sizeof(IntType); ++i) - v = (v << 8) | u[sizeof(IntType) - i - 1]; + if (IsLittleEndian) + stream.read(reinterpret_cast(&result), sizeof(IntType)); + else + { + std::uint8_t u[sizeof(IntType)]; + typename std::make_unsigned::type v = 0; + + stream.read(reinterpret_cast(u), sizeof(IntType)); + for (std::size_t i = 0; i < sizeof(IntType); ++i) + v = (v << 8) | u[sizeof(IntType) - i - 1]; + + std::memcpy(&result, &v, sizeof(IntType)); + } - std::memcpy(&result, &v, sizeof(IntType)); return result; } + // write_little_endian() is our utility to write an integer (signed or unsigned, any size) + // to a stream in little-endian order. We swap the byte order before the write if + // necessary to always write in little endian order, independantly of the byte + // ordering of the compiling machine. template inline void write_little_endian(std::ostream& stream, IntType value) { - std::uint8_t u[sizeof(IntType)]; - typename std::make_unsigned::type v = value; - - std::size_t i = 0; - // if constexpr to silence the warning about shift by 8 - if constexpr (sizeof(IntType) > 1) { - for (; i + 1 < sizeof(IntType); ++i) { - u[i] = v; - v >>= 8; - } + if (IsLittleEndian) + stream.write(reinterpret_cast(&value), sizeof(IntType)); + else + { + std::uint8_t u[sizeof(IntType)]; + typename std::make_unsigned::type v = value; + + std::size_t i = 0; + // if constexpr to silence the warning about shift by 8 + if constexpr (sizeof(IntType) > 1) + { + for (; i + 1 < sizeof(IntType); ++i) + { + u[i] = v; + v >>= 8; + } + } + u[i] = v; + + stream.write(reinterpret_cast(u), sizeof(IntType)); } - u[i] = v; + } + + // read_little_endian(s, out, N) : read integers in bulk from a little indian stream. + // This reads N integers from stream s and put them in array out. + template + inline void read_little_endian(std::istream& stream, IntType* out, std::size_t count) { + if (IsLittleEndian) + stream.read(reinterpret_cast(out), sizeof(IntType) * count); + else + for (std::size_t i = 0; i < count; ++i) + out[i] = read_little_endian(stream); + } - stream.write(reinterpret_cast(u), sizeof(IntType)); + // write_little_endian(s, out, N) : write integers in bulk to a little indian stream. + // This takes N integers from array values and writes them on stream s. + template + inline void write_little_endian(std::ostream& stream, const IntType* values, std::size_t count) { + if (IsLittleEndian) + stream.write(reinterpret_cast(values), sizeof(IntType) * count); + else + for (std::size_t i = 0; i < count; ++i) + write_little_endian(stream, values[i]); } + } // namespace Stockfish::Eval::NNUE #endif // #ifndef NNUE_COMMON_H_INCLUDED diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 10b226b3..300ce367 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -24,8 +24,6 @@ #include "nnue_common.h" #include "nnue_architecture.h" -#include "../misc.h" - #include // std::memset() namespace Stockfish::Eval::NNUE { @@ -150,23 +148,21 @@ namespace Stockfish::Eval::NNUE { // Read network parameters bool read_parameters(std::istream& stream) { - for (std::size_t i = 0; i < HalfDimensions; ++i) - biases[i] = read_little_endian(stream); - for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i) - weights[i] = read_little_endian(stream); - for (std::size_t i = 0; i < PSQTBuckets * InputDimensions; ++i) - psqtWeights[i] = read_little_endian(stream); + + read_little_endian(stream, biases , HalfDimensions ); + read_little_endian(stream, weights , HalfDimensions * InputDimensions); + read_little_endian(stream, psqtWeights, PSQTBuckets * InputDimensions); + return !stream.fail(); } // Write network parameters bool write_parameters(std::ostream& stream) const { - for (std::size_t i = 0; i < HalfDimensions; ++i) - write_little_endian(stream, biases[i]); - for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i) - write_little_endian(stream, weights[i]); - for (std::size_t i = 0; i < PSQTBuckets * InputDimensions; ++i) - write_little_endian(stream, psqtWeights[i]); + + write_little_endian(stream, biases , HalfDimensions ); + write_little_endian(stream, weights , HalfDimensions * InputDimensions); + write_little_endian(stream, psqtWeights, PSQTBuckets * InputDimensions); + return !stream.fail(); } diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp index 831c8259..a0ac727f 100644 --- a/src/syzygy/tbprobe.cpp +++ b/src/syzygy/tbprobe.cpp @@ -105,9 +105,6 @@ template<> inline void swap_endian(uint8_t&) {} template T number(void* addr) { - static const union { uint32_t i; char c[4]; } Le = { 0x01020304 }; - static const bool IsLittleEndian = (Le.c[0] == 4); - T v; if ((uintptr_t)addr & (alignof(T) - 1)) // Unaligned pointer (very rare)