From 4766dfc3956f78d853c5e0c4636d6f90fd93df9a Mon Sep 17 00:00:00 2001
From: Tomasz Sobczyk
Date: Thu, 2 Dec 2021 12:29:11 +0100
Subject: [PATCH] Optimize FT activation and affine transform for NEON.

This patch optimizes the NEON implementation in two ways.

The activation layer after the feature transformer is rewritten to make
it easier for the compiler to see through dependencies and unroll. This
in itself is a minimal but positive improvement. Other architectures
could benefit from this too in the future. This is not an algorithmic
change.

The affine transform for large matrices (the first layer after the FT)
on NEON now utilizes the same optimized code path as >=SSSE3, which
makes the memory accesses more sequential and makes better use of the
available registers, which in turn allows for code with longer
dependency chains. (A hedged sketch of the accumulation scheme follows
the patch.)

Benchmarks from Redshift#161, profile-build with apple clang

george@Georges-MacBook-Air nets % ./stockfish-b82d93 bench 2>&1 | tail -4 (current master)
===========================
Total time (ms) : 2167
Nodes searched  : 4667742
Nodes/second    : 2154011

george@Georges-MacBook-Air nets % ./stockfish-7377b8 bench 2>&1 | tail -4 (this patch)
===========================
Total time (ms) : 1842
Nodes searched  : 4667742
Nodes/second    : 2534061

This is a solid 18% improvement overall; the gain is larger in an
NNUE-only bench than in a mixed one. An improvement of around 5% is
also observed on armv7-neon (Raspberry Pi and older phones). No changes
for architectures other than NEON.

closes https://github.com/official-stockfish/Stockfish/pull/3837

No functional changes.
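[Editor's note] To make the scheduling argument concrete, here is a
minimal standalone sketch of the rewritten activation loop. The function
name activate, the constant HalfDims, and the buffer sizes are
assumptions for illustration only; the real loop is in the
nnue_feature_transformer.h hunk below.

// Hedged sketch of the batched NEON activation: clamp int16 accumulator
// values to [0, 127] and narrow them to int8, eight lanes at a time.
#include <arm_neon.h>
#include <cstdint>

constexpr int HalfDims     = 1024;          // assumed accumulator size
constexpr int NumChunks    = HalfDims / 8;  // number of int16x8_t chunks
constexpr int UnrollFactor = 16;            // chunks handled per iteration
static_assert(NumChunks % UnrollFactor == 0);

void activate(const std::int16_t* acc, std::int8_t* out8) {
    const int8x8_t Zero = {0};
    const auto in  = reinterpret_cast<const int16x8_t*>(acc);
    const auto out = reinterpret_cast<int8x8_t*>(out8);
    for (int j = 0; j < NumChunks; j += UnrollFactor)
    {
        // Issue a whole batch of independent loads first...
        int16x8_t sums[UnrollFactor];
        for (int i = 0; i < UnrollFactor; ++i)
            sums[i] = in[j + i];
        // ...then saturate-narrow and clamp the whole batch.
        for (int i = 0; i < UnrollFactor; ++i)
            out[j + i] = vmax_s8(vqmovn_s16(sums[i]), Zero);
    }
}

The original loop carried a load -> narrow -> store chain through every
iteration; splitting it this way exposes UnrollFactor independent loads
up front, which the compiler can unroll and interleave freely.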
---
 src/Makefile                        |  8 +++-
 src/nnue/layers/affine_transform.h  | 64 +++++++++++++++++------------
 src/nnue/nnue_feature_transformer.h | 13 ++++--
 src/simd.h                          | 39 ++++++++++++++++++
 4 files changed, 94 insertions(+), 30 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index a9333a22..3cf97873 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -128,6 +128,7 @@ avx512 = no
 vnni256 = no
 vnni512 = no
 neon = no
+arm_version = 0
 STRIP = strip
 
 ### 2.2 Architecture specific
@@ -275,6 +276,7 @@ ifeq ($(ARCH),armv7)
 	arch = armv7
 	prefetch = yes
 	bits = 32
+	arm_version = 7
 endif
 
 ifeq ($(ARCH),armv7-neon)
@@ -283,6 +285,7 @@ ifeq ($(ARCH),armv7-neon)
 	popcnt = yes
 	neon = yes
 	bits = 32
+	arm_version = 7
 endif
 
 ifeq ($(ARCH),armv8)
@@ -290,6 +293,7 @@ ifeq ($(ARCH),armv8)
 	prefetch = yes
 	popcnt = yes
 	neon = yes
+	arm_version = 8
 endif
 
 ifeq ($(ARCH),apple-silicon)
@@ -297,6 +301,7 @@ ifeq ($(ARCH),apple-silicon)
 	prefetch = yes
 	popcnt = yes
 	neon = yes
+	arm_version = 8
 endif
 
 ifeq ($(ARCH),ppc-32)
@@ -614,7 +619,7 @@ ifeq ($(mmx),yes)
 endif
 
 ifeq ($(neon),yes)
-	CXXFLAGS += -DUSE_NEON
+	CXXFLAGS += -DUSE_NEON=$(arm_version)
 	ifeq ($(KERNEL),Linux)
 	ifneq ($(COMP),ndk)
 	ifneq ($(arch),armv8)
@@ -863,6 +868,7 @@ config-sanity: net
 	@echo "vnni256: '$(vnni256)'"
 	@echo "vnni512: '$(vnni512)'"
 	@echo "neon: '$(neon)'"
+	@echo "arm_version: '$(arm_version)'"
 	@echo ""
 	@echo "Flags:"
 	@echo "CXX: $(CXX)"
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index b2871278..11038d69 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -75,8 +75,7 @@ namespace Stockfish::Eval::NNUE::Layers {
       const auto inputVector = reinterpret_cast<const __m128i*>(input);
 
 # elif defined(USE_NEON)
-      static_assert(PaddedInputDimensions % 16 == 0);
-      constexpr IndexType NumChunks = PaddedInputDimensions / 16;
+      constexpr IndexType NumChunks = (InputDimensions + 15) / 16;
       const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
 
 # endif
@@ -181,6 +180,9 @@ namespace Stockfish::Eval::NNUE::Layers {
 #elif defined (USE_SSSE3)
     static constexpr const IndexType InputSimdWidth = 16;
     static constexpr const IndexType MaxNumOutputRegs = 8;
+#elif defined (USE_NEON)
+    static constexpr const IndexType InputSimdWidth = 8;
+    static constexpr const IndexType MaxNumOutputRegs = 8;
 #else
     // The fallback implementation will not have permuted weights.
     // We define these to avoid a lot of ifdefs later.
@@ -270,52 +272,64 @@ namespace Stockfish::Eval::NNUE::Layers {
       OutputType* output = reinterpret_cast<OutputType*>(buffer);
 
 #if defined (USE_AVX512)
-      using vec_t = __m512i;
-      #define vec_setzero _mm512_setzero_si512
-      #define vec_set_32 _mm512_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
+      using acc_vec_t = __m512i;
+      using bias_vec_t = __m128i;
+      using weight_vec_t = __m512i;
+      using in_vec_t = __m512i;
+      #define vec_zero _mm512_setzero_si512()
       #define vec_add_dpbusd_32x2 Simd::m512_add_dpbusd_epi32x2
       #define vec_hadd Simd::m512_hadd
       #define vec_haddx4 Simd::m512_haddx4
 #elif defined (USE_AVX2)
-      using vec_t = __m256i;
-      #define vec_setzero _mm256_setzero_si256
-      #define vec_set_32 _mm256_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
+      using acc_vec_t = __m256i;
+      using bias_vec_t = __m128i;
+      using weight_vec_t = __m256i;
+      using in_vec_t = __m256i;
+      #define vec_zero _mm256_setzero_si256()
       #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
       #define vec_hadd Simd::m256_hadd
       #define vec_haddx4 Simd::m256_haddx4
 #elif defined (USE_SSSE3)
-      using vec_t = __m128i;
-      #define vec_setzero _mm_setzero_si128
-      #define vec_set_32 _mm_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
+      using acc_vec_t = __m128i;
+      using bias_vec_t = __m128i;
+      using weight_vec_t = __m128i;
+      using in_vec_t = __m128i;
+      #define vec_zero _mm_setzero_si128()
       #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
       #define vec_hadd Simd::m128_hadd
       #define vec_haddx4 Simd::m128_haddx4
+#elif defined (USE_NEON)
+      using acc_vec_t = int32x4_t;
+      using bias_vec_t = int32x4_t;
+      using weight_vec_t = int8x8_t;
+      using in_vec_t = int8x8_t;
+      #define vec_zero {0}
+      #define vec_add_dpbusd_32x2 Simd::neon_m128_add_dpbusd_epi32x2
+      #define vec_hadd Simd::neon_m128_hadd
+      #define vec_haddx4 Simd::neon_m128_haddx4
 #endif
 
-#if defined (USE_SSSE3)
-      const vec_t* invec = reinterpret_cast<const vec_t*>(input);
+#if defined (USE_SSSE3) || defined (USE_NEON)
+      const in_vec_t* invec = reinterpret_cast<const in_vec_t*>(input);
 
       // Perform accumulation to registers for each big block
       for (IndexType bigBlock = 0; bigBlock < NumBigBlocks; ++bigBlock)
       {
-          vec_t acc[NumOutputRegs] = { vec_setzero() };
+          acc_vec_t acc[NumOutputRegs] = { vec_zero };
 
           // Each big block has NumOutputRegs small blocks in each "row", one per register.
           // We process two small blocks at a time to save on one addition without VNNI.
           for (IndexType smallBlock = 0; smallBlock < NumSmallBlocksPerOutput; smallBlock += 2)
           {
-              const vec_t* weightvec =
-                  reinterpret_cast<const vec_t*>(
+              const weight_vec_t* weightvec =
+                  reinterpret_cast<const weight_vec_t*>(
                       weights
                       + bigBlock * BigBlockSize
                       + smallBlock * SmallBlockSize * NumOutputRegs);
 
-              const vec_t in0 = invec[smallBlock + 0];
-              const vec_t in1 = invec[smallBlock + 1];
+              const in_vec_t in0 = invec[smallBlock + 0];
+              const in_vec_t in1 = invec[smallBlock + 1];
 
               for (IndexType k = 0; k < NumOutputRegs; ++k)
                   vec_add_dpbusd_32x2(acc[k], in0, weightvec[k], in1, weightvec[k + NumOutputRegs]);
@@ -324,8 +338,8 @@ namespace Stockfish::Eval::NNUE::Layers {
           // Horizontally add all accumulators.
           if constexpr (NumOutputRegs % 4 == 0)
           {
-              __m128i* outputvec = reinterpret_cast<__m128i*>(output);
-              const __m128i* biasvec = reinterpret_cast<const __m128i*>(biases);
+              bias_vec_t* outputvec = reinterpret_cast<bias_vec_t*>(output);
+              const bias_vec_t* biasvec = reinterpret_cast<const bias_vec_t*>(biases);
 
               for (IndexType k = 0; k < NumOutputRegs; k += 4)
               {
@@ -343,9 +357,7 @@ namespace Stockfish::Eval::NNUE::Layers {
           }
       }
 
-# undef vec_setzero
-# undef vec_set_32
-# undef vec_add_dpbusd_32
+# undef vec_zero
 # undef vec_add_dpbusd_32x2
 # undef vec_hadd
 # undef vec_haddx4
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 0297b323..4f6a174a 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -336,10 +336,17 @@ namespace Stockfish::Eval::NNUE {
       {
           const IndexType offset = HalfDimensions * p;
           const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j)
+
+          constexpr IndexType UnrollFactor = 16;
+          static_assert(NumChunks % UnrollFactor == 0);
+          for (IndexType j = 0; j < NumChunks; j += UnrollFactor)
           {
-              int16x8_t sum = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j];
-              out[j] = vmax_s8(vqmovn_s16(sum), Zero);
+              int16x8_t sums[UnrollFactor];
+              for (IndexType i = 0; i < UnrollFactor; ++i)
+                  sums[i] = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j+i];
+
+              for (IndexType i = 0; i < UnrollFactor; ++i)
+                  out[j+i] = vmax_s8(vqmovn_s16(sums[i]), Zero);
           }
       }
       return psqt;
diff --git a/src/simd.h b/src/simd.h
index 1ac98067..ffa54d96 100644
--- a/src/simd.h
+++ b/src/simd.h
@@ -343,6 +343,45 @@ namespace Stockfish::Simd {
 
 #endif
 
+#if defined (USE_NEON)
+
+    [[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
+# if USE_NEON >= 8
+      return vaddvq_s32(s);
+# else
+      return s[0] + s[1] + s[2] + s[3];
+# endif
+    }
+
+    [[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
+      return neon_m128_reduce_add_epi32(sum) + bias;
+    }
+
+    [[maybe_unused]] static int32x4_t neon_m128_haddx4(
+        int32x4_t sum0, int32x4_t sum1, int32x4_t sum2, int32x4_t sum3,
+        int32x4_t bias) {
+
+      int32x4_t hsums {
+        neon_m128_reduce_add_epi32(sum0),
+        neon_m128_reduce_add_epi32(sum1),
+        neon_m128_reduce_add_epi32(sum2),
+        neon_m128_reduce_add_epi32(sum3)
+      };
+      return vaddq_s32(hsums, bias);
+    }
+
+    [[maybe_unused]] static void neon_m128_add_dpbusd_epi32x2(
+        int32x4_t& acc,
+        int8x8_t a0, int8x8_t b0,
+        int8x8_t a1, int8x8_t b1) {
+
+      int16x8_t product = vmull_s8(a0, b0);
+      product = vmlal_s8(product, a1, b1);
+      acc = vpadalq_s16(acc, product);
+    }
+
+#endif
+
 }
 
 #endif // STOCKFISH_SIMD_H_INCLUDED
-- 
2.39.2
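[Editor's note] As referenced in the commit message, here is a hedged
usage sketch of the accumulation scheme used by the NEON affine
transform path, reduced to a single 16-element dot product. Only the
intrinsic sequence mirrors the simd.h hunk above; dot16, the fixed
buffer size, and the local helper copy are assumptions for illustration.

// Hedged sketch: a 16-element int8 dot product plus bias using the
// patch's NEON accumulation scheme.
#include <arm_neon.h>
#include <cstdint>

// Local copy of the patch's helper so the example is self-contained.
static void add_dpbusd_epi32x2(int32x4_t& acc,
                               int8x8_t a0, int8x8_t b0,
                               int8x8_t a1, int8x8_t b1) {
    // Two small blocks are fused per call, saving one accumulate step.
    // With one operand clipped to [0, 127] (as Stockfish's inputs are),
    // the sum of two widened products still fits in int16.
    int16x8_t product = vmull_s8(a0, b0);    // 8 widening int16 products
    product = vmlal_s8(product, a1, b1);     // fused second block
    acc = vpadalq_s16(acc, product);         // pairwise add into int32 lanes
}

int dot16(const std::int8_t in[16], const std::int8_t w[16], int bias) {
    int32x4_t acc = {0};                     // same shape as the patch's vec_zero
    const auto iv = reinterpret_cast<const int8x8_t*>(in);
    const auto wv = reinterpret_cast<const int8x8_t*>(w);
    add_dpbusd_epi32x2(acc, iv[0], wv[0], iv[1], wv[1]);
    // Horizontal reduction; vaddvq_s32 requires ARMv8, which is why the
    // patch threads arm_version through the build as USE_NEON.
    return vaddvq_s32(acc) + bias;
}

In the real layer this pattern runs over NumOutputRegs accumulators at
once, which is what makes the weight reads sequential and keeps the
registers busy.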