X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Fnnue%2Fnnue_architecture.h;h=cac8373075e368b37e3c72493490c7e23920afcc;hb=4568f6369bfb94d8a105a8074e43e8d79c57eaee;hp=725b40fb43d2e87d9ab484603528f9965a06f59d;hpb=cb9c2594fcedc881ae8f8bfbfdf130cf89840e4c;p=stockfish diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h index 725b40fb..cac83730 100644 --- a/src/nnue/nnue_architecture.h +++ b/src/nnue/nnue_architecture.h @@ -21,12 +21,15 @@ #ifndef NNUE_ARCHITECTURE_H_INCLUDED #define NNUE_ARCHITECTURE_H_INCLUDED +#include + #include "nnue_common.h" #include "features/half_ka_v2_hm.h" #include "layers/affine_transform.h" #include "layers/clipped_relu.h" +#include "layers/sqr_clipped_relu.h" #include "../misc.h" @@ -46,8 +49,9 @@ struct Network static constexpr int FC_1_OUTPUTS = 32; Layers::AffineTransform fc_0; - Layers::ClippedReLU ac_0; - Layers::AffineTransform fc_1; + Layers::SqrClippedReLU ac_sqr_0; + Layers::ClippedReLU ac_0; + Layers::AffineTransform fc_1; Layers::ClippedReLU ac_1; Layers::AffineTransform fc_2; @@ -88,28 +92,35 @@ struct Network std::int32_t propagate(const TransformedFeatureType* transformedFeatures) { - constexpr uint64_t alignment = CacheLineSize; - - struct Buffer + struct alignas(CacheLineSize) Buffer { alignas(CacheLineSize) decltype(fc_0)::OutputBuffer fc_0_out; + alignas(CacheLineSize) decltype(ac_sqr_0)::OutputType ac_sqr_0_out[ceil_to_multiple(FC_0_OUTPUTS * 2, 32)]; alignas(CacheLineSize) decltype(ac_0)::OutputBuffer ac_0_out; alignas(CacheLineSize) decltype(fc_1)::OutputBuffer fc_1_out; alignas(CacheLineSize) decltype(ac_1)::OutputBuffer ac_1_out; alignas(CacheLineSize) decltype(fc_2)::OutputBuffer fc_2_out; + + Buffer() + { + std::memset(this, 0, sizeof(*this)); + } }; -#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN) - char bufferRaw[sizeof(Buffer) + alignment]; - char* bufferRawAligned = align_ptr_up(&bufferRaw[0]); - Buffer& buffer = *(new (bufferRawAligned) Buffer); +#if defined(__clang__) && (__APPLE__) + // workaround for a bug reported with xcode 12 + static thread_local auto tlsBuffer = std::make_unique(); + // Access TLS only once, cache result. + Buffer& buffer = *tlsBuffer; #else - alignas(alignment) Buffer buffer; + alignas(CacheLineSize) static thread_local Buffer buffer; #endif fc_0.propagate(transformedFeatures, buffer.fc_0_out); + ac_sqr_0.propagate(buffer.fc_0_out, buffer.ac_sqr_0_out); ac_0.propagate(buffer.fc_0_out, buffer.ac_0_out); - fc_1.propagate(buffer.ac_0_out, buffer.fc_1_out); + std::memcpy(buffer.ac_sqr_0_out + FC_0_OUTPUTS, buffer.ac_0_out, FC_0_OUTPUTS * sizeof(decltype(ac_0)::OutputType)); + fc_1.propagate(buffer.ac_sqr_0_out, buffer.fc_1_out); ac_1.propagate(buffer.fc_1_out, buffer.ac_1_out); fc_2.propagate(buffer.ac_1_out, buffer.fc_2_out); @@ -118,10 +129,6 @@ struct Network std::int32_t fwdOut = int(buffer.fc_0_out[FC_0_OUTPUTS]) * (600*OutputScale) / (127*(1<