Add support for ARM dot product instructions

author Sebastian Buchwald <UniQP@web.de>

Tue, 21 Feb 2023 21:18:17 +0000 (22:18 +0100)

committer Joost VandeVondele <Joost.VandeVondele@gmail.com>

Thu, 23 Feb 2023 12:22:03 +0000 (13:22 +0100)
author Sebastian Buchwald <UniQP@web.de>
Tue, 21 Feb 2023 21:18:17 +0000 (22:18 +0100)
committer Joost VandeVondele <Joost.VandeVondele@gmail.com>
Thu, 23 Feb 2023 12:22:03 +0000 (13:22 +0100)
diff --git a/src/Makefile b/src/Makefile

index 775c72c36e9fcf5ce558d346ee359510f1e16f1d..3d6432fd96b033c9c6964f6b11b0768968f625ad 100644 (file)
--- a/src/Makefile
+++ b/src/Makefile
@@ -69,32 +69,33 @@ VPATH = syzygy:nnue:nnue/features
  ### Section 2. High-level Configuration
  ### ==========================================================================
  #
-# flag                --- Comp switch      --- Description
+# flag                --- Comp switch        --- Description
  # ----------------------------------------------------------------------------
  #
-# debug = yes/no      --- -DNDEBUG         --- Enable/Disable debug mode
+# debug = yes/no      --- -DNDEBUG           --- Enable/Disable debug mode
  # sanitize = none/<sanitizer> ... (-fsanitize )
-#                     --- ( undefined )    --- enable undefined behavior checks
-#                     --- ( thread    )    --- enable threading error checks
-#                     --- ( address   )    --- enable memory access checks
-#                     --- ...etc...        --- see compiler documentation for supported sanitizers
-# optimize = yes/no   --- (-O3/-fast etc.) --- Enable/Disable optimizations
-# arch = (name)       --- (-arch)          --- Target architecture
-# bits = 64/32        --- -DIS_64BIT       --- 64-/32-bit operating system
-# prefetch = yes/no   --- -DUSE_PREFETCH   --- Use prefetch asm-instruction
-# popcnt = yes/no     --- -DUSE_POPCNT     --- Use popcnt asm-instruction
-# pext = yes/no       --- -DUSE_PEXT       --- Use pext x86_64 asm-instruction
-# sse = yes/no        --- -msse            --- Use Intel Streaming SIMD Extensions
-# mmx = yes/no        --- -mmmx            --- Use Intel MMX instructions
-# sse2 = yes/no       --- -msse2           --- Use Intel Streaming SIMD Extensions 2
-# ssse3 = yes/no      --- -mssse3          --- Use Intel Supplemental Streaming SIMD Extensions 3
-# sse41 = yes/no      --- -msse4.1         --- Use Intel Streaming SIMD Extensions 4.1
-# avx2 = yes/no       --- -mavx2           --- Use Intel Advanced Vector Extensions 2
-# avxvnni = yes/no    --- -mavxvnni        --- Use Intel Vector Neural Network Instructions AVX
-# avx512 = yes/no     --- -mavx512bw       --- Use Intel Advanced Vector Extensions 512
-# vnni256 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 256
-# vnni512 = yes/no    --- -mavx512vnni     --- Use Intel Vector Neural Network Instructions 512
-# neon = yes/no       --- -DUSE_NEON       --- Use ARM SIMD architecture
+#                     --- ( undefined )      --- enable undefined behavior checks
+#                     --- ( thread    )      --- enable threading error checks
+#                     --- ( address   )      --- enable memory access checks
+#                     --- ...etc...          --- see compiler documentation for supported sanitizers
+# optimize = yes/no   --- (-O3/-fast etc.)   --- Enable/Disable optimizations
+# arch = (name)       --- (-arch)            --- Target architecture
+# bits = 64/32        --- -DIS_64BIT         --- 64-/32-bit operating system
+# prefetch = yes/no   --- -DUSE_PREFETCH     --- Use prefetch asm-instruction
+# popcnt = yes/no     --- -DUSE_POPCNT       --- Use popcnt asm-instruction
+# pext = yes/no       --- -DUSE_PEXT         --- Use pext x86_64 asm-instruction
+# sse = yes/no        --- -msse              --- Use Intel Streaming SIMD Extensions
+# mmx = yes/no        --- -mmmx              --- Use Intel MMX instructions
+# sse2 = yes/no       --- -msse2             --- Use Intel Streaming SIMD Extensions 2
+# ssse3 = yes/no      --- -mssse3            --- Use Intel Supplemental Streaming SIMD Extensions 3
+# sse41 = yes/no      --- -msse4.1           --- Use Intel Streaming SIMD Extensions 4.1
+# avx2 = yes/no       --- -mavx2             --- Use Intel Advanced Vector Extensions 2
+# avxvnni = yes/no    --- -mavxvnni          --- Use Intel Vector Neural Network Instructions AVX
+# avx512 = yes/no     --- -mavx512bw         --- Use Intel Advanced Vector Extensions 512
+# vnni256 = yes/no    --- -mavx512vnni       --- Use Intel Vector Neural Network Instructions 256
+# vnni512 = yes/no    --- -mavx512vnni       --- Use Intel Vector Neural Network Instructions 512
+# neon = yes/no       --- -DUSE_NEON         --- Use ARM SIMD architecture
+# dotprod = yes/no    --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions
  #
  # Note that Makefile is space sensitive, so when adding new architectures
  # or modifying existing flags, you have to make sure there are no extra spaces
@@ -116,7 +117,7 @@ ifeq ($(ARCH), $(filter $(ARCH), \
                   x86-64-vnni512 x86-64-vnni256 x86-64-avx512 x86-64-avxvnni x86-64-bmi2 \
                   x86-64-avx2 x86-64-sse41-popcnt x86-64-modern x86-64-ssse3 x86-64-sse3-popcnt \
                   x86-64 x86-32-sse41-popcnt x86-32-sse2 x86-32 ppc-64 ppc-32 e2k \
-                 armv7 armv7-neon armv8 apple-silicon general-64 general-32 riscv64))
+                 armv7 armv7-neon armv8 armv8-dotprod apple-silicon general-64 general-32 riscv64))
     SUPPORTED_ARCH=true
  else
     SUPPORTED_ARCH=false
@@ -140,6 +141,7 @@ avx512 = no
  vnni256 = no
  vnni512 = no
  neon = no
+dotprod = no
  arm_version = 0
  STRIP = strip
  
@@ -308,11 +310,21 @@ ifeq ($(ARCH),armv8)
         arm_version = 8
  endif
  
+ifeq ($(ARCH),armv8-dotprod)
+       arch = armv8
+       prefetch = yes
+       popcnt = yes
+       neon = yes
+       dotprod = yes
+       arm_version = 8
+endif
+
  ifeq ($(ARCH),apple-silicon)
         arch = arm64
         prefetch = yes
         popcnt = yes
         neon = yes
+       dotprod = yes
         arm_version = 8
  endif
  
@@ -675,6 +687,10 @@ ifeq ($(neon),yes)
         endif
  endif
  
+ifeq ($(dotprod),yes)
+       CXXFLAGS += -march=armv8.2-a+dotprod -DUSE_NEON_DOTPROD
+endif
+
  ### 3.7 pext
  ifeq ($(pext),yes)
         CXXFLAGS += -DUSE_PEXT
@@ -776,6 +792,7 @@ help:
         @echo "armv7                   > ARMv7 32-bit"
         @echo "armv7-neon              > ARMv7 32-bit with popcnt and neon"
         @echo "armv8                   > ARMv8 64-bit with popcnt and neon"
+       @echo "armv8-dotprod           > ARMv8 64-bit with popcnt, neon and dot product support"
         @echo "e2k                     > Elbrus 2000"
         @echo "apple-silicon           > Apple silicon ARM64"
         @echo "general-64              > unspecified 64-bit"
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h

index 363b4916e37b40e80bbb719d9483f881929d75d2..63b58af33c39777cca2d435f051d3950db3f8b42 100644 (file)
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -72,6 +72,10 @@ namespace Stockfish::Eval::NNUE::Layers {
      const __m64 Zeros = _mm_setzero_si64();
      const auto inputVector = reinterpret_cast<const __m64*>(input);
  
+# elif defined(USE_NEON_DOTPROD)
+    constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
+    const auto inputVector = reinterpret_cast<const int8x16_t*>(input);
+
  # elif defined(USE_NEON)
      constexpr IndexType NumChunks = ceil_to_multiple<IndexType>(InputDimensions, 16) / 16;
      const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
@@ -123,6 +127,14 @@ namespace Stockfish::Eval::NNUE::Layers {
        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
        output[i] = _mm_cvtsi64_si32(sum);
  
+# elif defined(USE_NEON_DOTPROD)
+      int32x4_t sum = {biases[i]};
+      const auto row = reinterpret_cast<const int8x16_t*>(&weights[offset]);
+      for (IndexType j = 0; j < NumChunks; ++j) {
+        sum = vdotq_s32(sum, inputVector[j], row[j]);
+      }
+      output[i] = vaddvq_s32(sum);
+
  # elif defined(USE_NEON)
        int32x4_t sum = {biases[i]};
        const auto row = reinterpret_cast<const int8x8_t*>(&weights[offset]);
@@ -187,6 +199,9 @@ namespace Stockfish::Eval::NNUE::Layers {
  #elif defined (USE_SSSE3)
      static constexpr IndexType InputSimdWidth = 16;
      static constexpr IndexType MaxNumOutputRegs = 8;
+#elif defined (USE_NEON_DOTPROD)
+    static constexpr IndexType InputSimdWidth = 16;
+    static constexpr IndexType MaxNumOutputRegs = 8;
  #elif defined (USE_NEON)
      static constexpr IndexType InputSimdWidth = 8;
      static constexpr IndexType MaxNumOutputRegs = 8;
@@ -292,6 +307,15 @@ namespace Stockfish::Eval::NNUE::Layers {
        #define vec_add_dpbusd_32x2 Simd::m128_add_dpbusd_epi32x2
        #define vec_hadd Simd::m128_hadd
        #define vec_haddx4 Simd::m128_haddx4
+#elif defined (USE_NEON_DOTPROD)
+      using acc_vec_t = int32x4_t;
+      using bias_vec_t = int32x4_t;
+      using weight_vec_t = int8x16_t;
+      using in_vec_t = int8x16_t;
+      #define vec_zero {0}
+      #define vec_add_dpbusd_32x2 Simd::dotprod_m128_add_dpbusd_epi32x2
+      #define vec_hadd Simd::neon_m128_hadd
+      #define vec_haddx4 Simd::neon_m128_haddx4
  #elif defined (USE_NEON)
        using acc_vec_t = int32x4_t;
        using bias_vec_t = int32x4_t;
diff --git a/src/nnue/layers/simd.h b/src/nnue/layers/simd.h

index 381e7a68f8eeab8595092119329b5023e06aa795..22c51980eccd5de5b253b91609377d38df418c08 100644 (file)
--- a/src/nnue/layers/simd.h
+++ b/src/nnue/layers/simd.h
@@ -346,6 +346,19 @@ namespace Stockfish::Simd {
  
  #endif
  
+#if defined (USE_NEON_DOTPROD)
+
+    [[maybe_unused]] static void dotprod_m128_add_dpbusd_epi32x2(
+        int32x4_t& acc,
+        int8x16_t a0, int8x16_t b0,
+        int8x16_t a1, int8x16_t b1) {
+
+        acc = vdotq_s32(acc, a0, b0);
+        acc = vdotq_s32(acc, a1, b1);
+    }
+
+#endif
+
  #if defined (USE_NEON)
  
      [[maybe_unused]] static int neon_m128_reduce_add_epi32(int32x4_t s) {
author	Sebastian Buchwald <UniQP@web.de>
	Tue, 21 Feb 2023 21:18:17 +0000 (22:18 +0100)
committer	Joost VandeVondele <Joost.VandeVondele@gmail.com>
	Thu, 23 Feb 2023 12:22:03 +0000 (13:22 +0100)
src/Makefile		patch | blob | history
src/nnue/layers/affine_transform.h		patch | blob | history
src/nnue/layers/simd.h		patch | blob | history