From 5f781d366e0f4369ec12e36c9978ad63ffa32235 Mon Sep 17 00:00:00 2001 From: mstembera Date: Wed, 23 Feb 2022 18:19:36 -0800 Subject: [PATCH] Clean up and simplify some nnue code. Remove some unnecessary code and its execution during inference. Also the change on line 49 in nnue_architecture.h results in a more efficient SIMD code path through ClippedReLU::propagate(). passed STC: https://tests.stockfishchess.org/tests/view/6217d3bfda649bba32ef25d5 LLR: 2.94 (-2.94,2.94) <-2.25,0.25> Total: 12056 W: 3281 L: 3092 D: 5683 Ptnml(0-2): 55, 1213, 3312, 1384, 64 passed STC SMP: https://tests.stockfishchess.org/tests/view/6217f344da649bba32ef295e LLR: 2.94 (-2.94,2.94) <-2.25,0.25> Total: 27376 W: 7295 L: 7137 D: 12944 Ptnml(0-2): 52, 2859, 7715, 3003, 59 closes https://github.com/official-stockfish/Stockfish/pull/3944 No functional change bench: 6820724 --- src/nnue/evaluate_nnue.cpp | 6 +++--- src/nnue/layers/affine_transform.h | 16 ++++++++-------- src/nnue/layers/clipped_relu.h | 8 -------- src/nnue/nnue_architecture.h | 19 ++++++++++--------- src/nnue/nnue_common.h | 4 ++-- src/nnue/nnue_feature_transformer.h | 6 ++++-- 6 files changed, 27 insertions(+), 32 deletions(-) diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp index 0fd58462..9254e36f 100644 --- a/src/nnue/evaluate_nnue.cpp +++ b/src/nnue/evaluate_nnue.cpp @@ -109,7 +109,7 @@ namespace Stockfish::Eval::NNUE { { write_little_endian(stream, Version); write_little_endian(stream, hashValue); - write_little_endian(stream, desc.size()); + write_little_endian(stream, (std::uint32_t)desc.size()); stream.write(&desc[0], desc.size()); return !stream.fail(); } @@ -157,7 +157,7 @@ namespace Stockfish::Eval::NNUE { ASSERT_ALIGNED(transformedFeatures, alignment); - const std::size_t bucket = (pos.count() - 1) / 4; + const int bucket = (pos.count() - 1) / 4; const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket); const auto positional = 
network[bucket]->propagate(transformedFeatures); @@ -197,7 +197,7 @@ namespace Stockfish::Eval::NNUE { NnueEvalTrace t{}; t.correctBucket = (pos.count() - 1) / 4; - for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket) { + for (IndexType bucket = 0; bucket < LayerStacks; ++bucket) { const auto materialist = featureTransformer->transform(pos, transformedFeatures, bucket); const auto positional = network[bucket]->propagate(transformedFeatures); diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 22451915..9a992608 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -235,10 +235,10 @@ namespace Stockfish::Eval::NNUE::Layers { // Read network parameters bool read_parameters(std::istream& stream) { - for (std::size_t i = 0; i < OutputDimensions; ++i) + for (IndexType i = 0; i < OutputDimensions; ++i) biases[i] = read_little_endian(stream); - for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) weights[get_weight_index(i)] = read_little_endian(stream); return !stream.fail(); @@ -246,10 +246,10 @@ namespace Stockfish::Eval::NNUE::Layers { // Write network parameters bool write_parameters(std::ostream& stream) const { - for (std::size_t i = 0; i < OutputDimensions; ++i) + for (IndexType i = 0; i < OutputDimensions; ++i) write_little_endian(stream, biases[i]); - for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) write_little_endian(stream, weights[get_weight_index(i)]); return !stream.fail(); @@ -422,9 +422,9 @@ namespace Stockfish::Eval::NNUE::Layers { // Read network parameters bool read_parameters(std::istream& stream) { - for (std::size_t i = 0; i < OutputDimensions; ++i) + for (IndexType i = 0; i < OutputDimensions; ++i) biases[i] = read_little_endian(stream); - for (std::size_t i = 0; i < 
OutputDimensions * PaddedInputDimensions; ++i) + for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) weights[get_weight_index(i)] = read_little_endian(stream); return !stream.fail(); @@ -432,10 +432,10 @@ namespace Stockfish::Eval::NNUE::Layers { // Write network parameters bool write_parameters(std::ostream& stream) const { - for (std::size_t i = 0; i < OutputDimensions; ++i) + for (IndexType i = 0; i < OutputDimensions; ++i) write_little_endian(stream, biases[i]); - for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) + for (IndexType i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) write_little_endian(stream, weights[get_weight_index(i)]); return !stream.fail(); diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h index ffd2e3b7..f94d3082 100644 --- a/src/nnue/layers/clipped_relu.h +++ b/src/nnue/layers/clipped_relu.h @@ -171,14 +171,6 @@ namespace Stockfish::Eval::NNUE::Layers { std::max(0, std::min(127, input[i] >> WeightScaleBits))); } - // Affine transform layers expect that there is at least - // ceil_to_multiple(OutputDimensions, 32) initialized values. - // We cannot do this in the affine transform because it requires - // preallocating space here. 
- for (IndexType i = OutputDimensions; i < PaddedOutputDimensions; ++i) { - output[i] = 0; - } - return output; } }; diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h index 725b40fb..b4f65364 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -46,7 +46,7 @@ struct Network static constexpr int FC_1_OUTPUTS = 32; Layers::AffineTransform fc_0; - Layers::ClippedReLU ac_0; + Layers::ClippedReLU ac_0; Layers::AffineTransform fc_1; Layers::ClippedReLU ac_1; Layers::AffineTransform fc_2; @@ -97,14 +97,19 @@ struct Network alignas(CacheLineSize) decltype(fc_1)::OutputBuffer fc_1_out; alignas(CacheLineSize) decltype(ac_1)::OutputBuffer ac_1_out; alignas(CacheLineSize) decltype(fc_2)::OutputBuffer fc_2_out; + + Buffer() + { + std::memset(this, 0, sizeof(*this)); + } }; #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) - char bufferRaw[sizeof(Buffer) + alignment]; - char* bufferRawAligned = align_ptr_up(&bufferRaw[0]); - Buffer& buffer = *(new (bufferRawAligned) Buffer); + static thread_local char bufferRaw[sizeof(Buffer) + alignment]; + static thread_local char* bufferRawAligned = align_ptr_up(&bufferRaw[0]); + static thread_local Buffer& buffer = *(new (bufferRawAligned) Buffer); #else - alignas(alignment) Buffer buffer; + alignas(alignment) static thread_local Buffer buffer; #endif fc_0.propagate(transformedFeatures, buffer.fc_0_out); @@ -118,10 +123,6 @@ struct Network std::int32_t fwdOut = int(buffer.fc_0_out[FC_0_OUTPUTS]) * (600*OutputScale) / (127*(1<>= 8; } } - u[i] = v; + u[i] = (std::uint8_t)v; stream.write(reinterpret_cast(u), sizeof(IntType)); } diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index fb867421..85598018 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -123,8 +123,10 @@ namespace Stockfish::Eval::NNUE { // We use __m* types as template arguments, which causes GCC to emit warnings // about losing some attribute 
information. This is irrelevant to us as we // only take their size, so the following pragma are harmless. + #if defined(__GNUC__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wignored-attributes" + #endif template (); static constexpr int NumPsqtRegs = BestRegisterCount(); - + #if defined(__GNUC__) #pragma GCC diagnostic pop - + #endif #endif -- 2.39.2