From 26edf9534ad571a6d26bf9db47d21776cbf45d54 Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Date: Tue, 27 Jul 2021 22:12:14 +0200
Subject: [PATCH] Avoid unnecessary stores in the affine transform

This patch improves the codegen in the AffineTransform::forward function for architectures >=SSSE3. Current code works directly on memory and the compiler cannot see that the stores through outptr do not alias the loads through weights and input32. The solution implemented is to perform the affine transform with local variables as accumulators and only store the result to memory at the end. The number of accumulators required is OutputDimensions / OutputSimdWidth, which means that for the 1024->16 affine transform it requires 4 registers with SSSE3, 2 with AVX2, 1 with AVX512. It also cuts the number of stores required by NumRegs * 256 for each node evaluated. The local accumulators are expected to be assigned to registers, but even if this cannot be done in some case due to register pressure it will help the compiler to see that there is no aliasing between the loads and stores and may still result in better codegen.

See https://godbolt.org/z/59aTKbbYc for codegen comparison.

passed STC:
LLR: 2.94 (-2.94,2.94) <-0.50,2.50>
Total: 140328 W: 10635 L: 10358 D: 119335
Ptnml(0-2): 302, 8339, 52636, 8554, 333

closes https://github.com/official-stockfish/Stockfish/pull/3634

No functional change
---
 src/nnue/layers/affine_transform.h | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 9a3b778e..9a5f62c0 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -251,9 +251,6 @@ namespace Stockfish::Eval::NNUE::Layers {
 #endif
 
 #if defined (USE_SSSE3)
-      // Different layout, we process 4 inputs at a time, always.
-      static_assert(InputDimensions % 4 == 0);
-
       const auto output = reinterpret_cast<OutputType*>(buffer);
       const auto inputVector = reinterpret_cast<const vec_t*>(input);
 
@@ -263,13 +260,18 @@ namespace Stockfish::Eval::NNUE::Layers {
       // because then it is also an input dimension.
       if constexpr (OutputDimensions % OutputSimdWidth == 0)
       {
+          static_assert(InputDimensions % 16 == 0);
+
           constexpr IndexType NumChunks = InputDimensions / 4;
+          constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
 
           const auto input32 = reinterpret_cast<const std::int32_t*>(input);
-          vec_t* outptr = reinterpret_cast<vec_t*>(output);
-          std::memcpy(output, biases, OutputDimensions * sizeof(OutputType));
+          const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
+          vec_t outs[NumRegs];
+          for (IndexType k = 0; k < NumRegs; ++k)
+              outs[k] = biasvec[k];
 
-          for (int i = 0; i < (int)NumChunks - 3; i += 4)
+          for (IndexType i = 0; i < NumChunks; i += 4)
           {
               const vec_t in0 = vec_set_32(input32[i + 0]);
               const vec_t in1 = vec_set_32(input32[i + 1]);
@@ -279,12 +281,18 @@ namespace Stockfish::Eval::NNUE::Layers {
               const auto col1 = reinterpret_cast<const vec_t*>(&weights[(i + 1) * OutputDimensions * 4]);
               const auto col2 = reinterpret_cast<const vec_t*>(&weights[(i + 2) * OutputDimensions * 4]);
               const auto col3 = reinterpret_cast<const vec_t*>(&weights[(i + 3) * OutputDimensions * 4]);
-              for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j)
-                  vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]);
+              for (IndexType k = 0; k < NumRegs; ++k)
+                  vec_add_dpbusd_32x4(outs[k], in0, col0[k], in1, col1[k], in2, col2[k], in3, col3[k]);
           }
+
+          vec_t* outptr = reinterpret_cast<vec_t*>(output);
+          for (IndexType k = 0; k < NumRegs; ++k)
+              outptr[k] = outs[k];
       }
       else if constexpr (OutputDimensions == 1)
       {
+          static_assert(InputDimensions % 4 == 0);
+
 #if defined (USE_AVX512)
           if constexpr (PaddedInputDimensions % (SimdWidth * 2) != 0)
           {
-- 
2.39.2