Optimize FT activation and affine transform for NEON.

author Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>

Thu, 2 Dec 2021 11:29:11 +0000 (12:29 +0100)

committer Joost VandeVondele <Joost.VandeVondele@gmail.com>

Tue, 7 Dec 2021 17:08:54 +0000 (18:08 +0100)
author Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
Thu, 2 Dec 2021 11:29:11 +0000 (12:29 +0100)
committer Joost VandeVondele <Joost.VandeVondele@gmail.com>
Tue, 7 Dec 2021 17:08:54 +0000 (18:08 +0100)
diff --git a/src/Makefile b/src/Makefile

index a9333a22f46b7ac0928acf7ddfed53cd53257bb6..3cf97873d8a7411eaf26a8c68a1b1fb0b7c0d02c 100644 (file)
--- a/src/Makefile
+++ b/src/Makefile
@@ -128,6 +128,7 @@ avx512 = no
  vnni256 = no
  vnni512 = no
  neon = no
+arm_version = 0
  STRIP = strip
  
  ### 2.2 Architecture specific
@@ -275,6 +276,7 @@ ifeq ($(ARCH),armv7)
         arch = armv7
         prefetch = yes
         bits = 32
+       arm_version = 7
  endif
  
  ifeq ($(ARCH),armv7-neon)
@@ -283,6 +285,7 @@ ifeq ($(ARCH),armv7-neon)
         popcnt = yes
         neon = yes
         bits = 32
+       arm_version = 7
  endif
  
  ifeq ($(ARCH),armv8)
@@ -290,6 +293,7 @@ ifeq ($(ARCH),armv8)
         prefetch = yes
         popcnt = yes
         neon = yes
+       arm_version = 8
  endif
  
  ifeq ($(ARCH),apple-silicon)
@@ -297,6 +301,7 @@ ifeq ($(ARCH),apple-silicon)
         prefetch = yes
         popcnt = yes
         neon = yes
+       arm_version = 8
  endif
  
  ifeq ($(ARCH),ppc-32)
@@ -614,7 +619,7 @@ ifeq ($(mmx),yes)
  endif
  
  ifeq ($(neon),yes)
-       CXXFLAGS += -DUSE_NEON
+       CXXFLAGS += -DUSE_NEON=$(arm_version)
         ifeq ($(KERNEL),Linux)
         ifneq ($(COMP),ndk)
         ifneq ($(arch),armv8)
@@ -863,6 +868,7 @@ config-sanity: net
         @echo "vnni256: '$(vnni256)'"
         @echo "vnni512: '$(vnni512)'"
         @echo "neon: '$(neon)'"
+       @echo "arm_version: '$(arm_version)'"
         @echo ""
         @echo "Flags:"
         @echo "CXX: $(CXX)"
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h

index b28712780b2684868bc2c2937628e4112b72b69c..11038d69b1c7ff40b948bace675266d73af7b12d 100644 (file)
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -75,8 +75,7 @@ namespace Stockfish::Eval::NNUE::Layers {
      const auto inputVector = reinterpret_cast<const __m64*>(input);
  
  # elif defined(USE_NEON)
-    static_assert(PaddedInputDimensions % 16 == 0);
-    constexpr IndexType NumChunks = PaddedInputDimensions / 16;
+    constexpr IndexType NumChunks = (InputDimensions + 15) / 16;
      const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
  # endif
  
@@ -181,6 +180,9 @@ namespace Stockfish::Eval::NNUE::Layers {
  #elif defined (USE_SSSE3)
      static constexpr const IndexType InputSimdWidth = 16;
      static constexpr const IndexType MaxNumOutputRegs = 8;
+#elif defined (USE_NEON)
+    static constexpr const IndexType InputSimdWidth = 8;
+    static constexpr const IndexType MaxNumOutputRegs = 8;
  #else
      // The fallback implementation will not have permuted weights.
      // We define these to avoid a lot of ifdefs later.
@@ -270,52 +272,64 @@ namespace Stockfish::Eval::NNUE::Layers {
        OutputType* output = reinterpret_cast<OutputType*>(buffer);
  
  #if defined (USE_AVX512)
-      using vec_t = __m512i;
-      #define vec_setzero _mm512_setzero_si512
-      #define vec_set_32 _mm512_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m512_add_dpbusd_epi32
+      using acc_vec_t = __m512i;
+      using bias_vec_t = __m128i;
+      using weight_vec_t = __m512i;
+      using in_vec_t = __m512i;
+      #define vec_zero _mm512_setzero_si512()
        #define vec_add_dpbusd_32x2 Simd::m512_add_dpbusd_epi32x2
        #define vec_hadd Simd::m512_hadd
        #define vec_haddx4 Simd::m512_haddx4
  #elif defined (USE_AVX2)
-      using vec_t = __m256i;
-      #define vec_setzero _mm256_setzero_si256
-      #define vec_set_32 _mm256_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m256_add_dpbusd_epi32
+      using acc_vec_t = __m256i;
+      using bias_vec_t = __m128i;
+      using weight_vec_t = __m256i;
+      using in_vec_t = __m256i;
+      #define vec_zero _mm256_setzero_si256()
        #define vec_add_dpbusd_32x2 Simd::m256_add_dpbusd_epi32x2
        #define vec_hadd Simd::m256_hadd
        #define vec_haddx4 Simd::m256_haddx4
  #elif defined (USE_SSSE3)
-      using vec_t = __m128i;
-      #define vec_setzero _mm_setzero_si128
-      #define vec_set_32 _mm_set1_epi32
-      #define vec_add_dpbusd_32 Simd::m128_add_dpbusd_epi32
+      using acc_vec_t = __m128i;
+      using bias_vec_t = __m128i;
+      using weight_vec_t = __m128i;
+      using in_vec_t = __m128i;
+      #define vec_zero _mm_setzero_si128()
        #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
        #define vec_hadd Simd::m128_hadd
        #define vec_haddx4 Simd::m128_haddx4
+#elif defined (USE_NEON)
+      using acc_vec_t = int32x4_t;
+      using bias_vec_t = int32x4_t;
+      using weight_vec_t = int8x8_t;
+      using in_vec_t = int8x8_t;
+      #define vec_zero {0}
+      #define vec_add_dpbusd_32x2 Simd::neon_m128_add_dpbusd_epi32x2
+      #define vec_hadd Simd::neon_m128_hadd
+      #define vec_haddx4 Simd::neon_m128_haddx4
  #endif
  
-#if defined (USE_SSSE3)
-      const vec_t* invec = reinterpret_cast<const vec_t*>(input);
+#if defined (USE_SSSE3) || defined (USE_NEON)
+      const in_vec_t* invec = reinterpret_cast<const in_vec_t*>(input);
  
  
        // Perform accumulation to registers for each big block
        for (IndexType bigBlock = 0; bigBlock < NumBigBlocks; ++bigBlock)
        {
-        vec_t acc[NumOutputRegs] = { vec_setzero() };
+        acc_vec_t acc[NumOutputRegs] = { vec_zero };
  
          // Each big block has NumOutputRegs small blocks in each "row", one per register.
          // We process two small blocks at a time to save on one addition without VNNI.
          for (IndexType smallBlock = 0; smallBlock < NumSmallBlocksPerOutput; smallBlock += 2)
          {
-          const vec_t* weightvec =
-            reinterpret_cast<const vec_t*>(
+          const weight_vec_t* weightvec =
+            reinterpret_cast<const weight_vec_t*>(
                  weights
                + bigBlock * BigBlockSize
                + smallBlock * SmallBlockSize * NumOutputRegs);
  
-          const vec_t in0 = invec[smallBlock + 0];
-          const vec_t in1 = invec[smallBlock + 1];
+          const in_vec_t in0 = invec[smallBlock + 0];
+          const in_vec_t in1 = invec[smallBlock + 1];
  
            for (IndexType k = 0; k < NumOutputRegs; ++k)
              vec_add_dpbusd_32x2(acc[k], in0, weightvec[k], in1, weightvec[k + NumOutputRegs]);
@@ -324,8 +338,8 @@ namespace Stockfish::Eval::NNUE::Layers {
          // Horizontally add all accumulators.
          if constexpr (NumOutputRegs % 4 == 0)
          {
-          __m128i* outputvec = reinterpret_cast<__m128i*>(output);
-          const __m128i* biasvec = reinterpret_cast<const __m128i*>(biases);
+          bias_vec_t* outputvec = reinterpret_cast<bias_vec_t*>(output);
+          const bias_vec_t* biasvec = reinterpret_cast<const bias_vec_t*>(biases);
  
            for (IndexType k = 0; k < NumOutputRegs; k += 4)
            {
@@ -343,9 +357,7 @@ namespace Stockfish::Eval::NNUE::Layers {
          }
        }
  
-# undef vec_setzero
-# undef vec_set_32
-# undef vec_add_dpbusd_32
+# undef vec_zero
  # undef vec_add_dpbusd_32x2
  # undef vec_hadd
  # undef vec_haddx4
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h

index 0297b3233a1e42f032b557558788de528f960e35..4f6a174a486667ea23acab3e3d99c4f1fe13d438 100644 (file)
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -336,10 +336,17 @@ namespace Stockfish::Eval::NNUE {
        {
            const IndexType offset = HalfDimensions * p;
            const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-          for (IndexType j = 0; j < NumChunks; ++j)
+
+          constexpr IndexType UnrollFactor = 16;
+          static_assert(UnrollFactor % UnrollFactor == 0);
+          for (IndexType j = 0; j < NumChunks; j += UnrollFactor)
            {
-              int16x8_t sum = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j];
-              out[j] = vmax_s8(vqmovn_s16(sum), Zero);
+              int16x8_t sums[UnrollFactor];
+              for (IndexType i = 0; i < UnrollFactor; ++i)
+                sums[i] = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j+i];
+
+              for (IndexType i = 0; i < UnrollFactor; ++i)
+                out[j+i] = vmax_s8(vqmovn_s16(sums[i]), Zero);
            }
        }
        return psqt;
diff --git a/src/simd.h b/src/simd.h

index 1ac98067f5e5ecd9e3309ae4215ec161d0357dd1..ffa54d9627bd684a01057bd21bf3a0104f19a0cf 100644 (file)
--- a/src/simd.h
+++ b/src/simd.h
@@ -343,6 +343,45 @@ namespace Stockfish::Simd {
  
  #endif
  
+#if defined (USE_NEON)
+
+    [[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
+#   if USE_NEON >= 8
+      return vaddvq_s32(s);
+#   else
+      return s[0] + s[1] + s[2] + s[3];
+#   endif
+    }
+
+    [[maybe_unused]] static int neon_m128_hadd(int32x4_t sum, int bias) {
+      return neon_m128_reduce_add_epi32(sum) + bias;
+    }
+
+    [[maybe_unused]] static int32x4_t neon_m128_haddx4(
+        int32x4_t sum0, int32x4_t sum1, int32x4_t sum2, int32x4_t sum3,
+        int32x4_t bias) {
+
+      int32x4_t hsums {
+        neon_m128_reduce_add_epi32(sum0),
+        neon_m128_reduce_add_epi32(sum1),
+        neon_m128_reduce_add_epi32(sum2),
+        neon_m128_reduce_add_epi32(sum3)
+      };
+      return vaddq_s32(hsums, bias);
+    }
+
+    [[maybe_unused]] static void neon_m128_add_dpbusd_epi32x2(
+        int32x4_t& acc,
+        int8x8_t a0, int8x8_t b0,
+        int8x8_t a1, int8x8_t b1) {
+
+      int16x8_t product = vmull_s8(a0, b0);
+      product = vmlal_s8(product, a1, b1);
+      acc = vpadalq_s16(acc, product);
+    }
+
+#endif
+
  }
  
  #endif // STOCKFISH_SIMD_H_INCLUDED
author	Tomasz Sobczyk <tomasz.sobczyk1997@gmail.com>
	Thu, 2 Dec 2021 11:29:11 +0000 (12:29 +0100)
committer	Joost VandeVondele <Joost.VandeVondele@gmail.com>
	Tue, 7 Dec 2021 17:08:54 +0000 (18:08 +0100)
src/Makefile		patch \| blob \| history
src/nnue/layers/affine_transform.h		patch \| blob \| history
src/nnue/nnue_feature_transformer.h		patch \| blob \| history
src/simd.h		patch \| blob \| history