Provide vectorized NNUE code for SSE2 and MMX targets
author: Fanael Linithien <fanael4@gmail.com>
Sun, 9 Aug 2020 14:20:45 +0000 (16:20 +0200)
committer: Joost VandeVondele <Joost.VandeVondele@gmail.com>
Mon, 10 Aug 2020 17:17:57 +0000 (19:17 +0200)
This patch allows old x86 CPUs, from the AMD K8 (which the x86-64 baseline
targets) all the way down to the Pentium MMX, to benefit from NNUE, with a
performance hit relative to the hand-written eval comparable to that seen on
more modern processors.

NPS of the bench with NNUE enabled on a Pentium III 1.13 GHz (using the
MMX code):
  master: 38951
  this patch: 80586

NPS of the bench with NNUE enabled using the baseline x86-64 arch, which is
how Linux distros are likely to package Stockfish, on a modern CPU
(using the SSE2 code):
  master: 882584
  this patch: 1203945

closes https://github.com/official-stockfish/Stockfish/pull/2956

No functional change.

AUTHORS
src/Makefile
src/misc.cpp
src/nnue/layers/affine_transform.h
src/nnue/layers/clipped_relu.h
src/nnue/nnue_common.h
src/nnue/nnue_feature_transformer.h

diff --git a/AUTHORS b/AUTHORS
index 21ef3e509711c80e1f1c7c755edba13bdd30a1fe..41b89705c973dad1aa82771a9075fab342c762c3 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -53,6 +53,7 @@ Ernesto Gatti
 Linmiao Xu (linrock)
 Fabian Beuke (madnight)
 Fabian Fichter (ianfab)
+Fanael Linithien (Fanael)
 fanon
 Fauzi Akram Dabat (FauziAkram)
 Felix Wittmann
index a48e7dcb3ccd481ae6868b8e430d15ba00756c41..3d84f4826547076ef241e5f07da09c231d63282b 100644 (file)
@@ -86,6 +86,7 @@ sanitize = no
 bits = 64
 prefetch = no
 popcnt = no
+mmx = no
 sse = no
 ssse3 = no
 sse41 = no
@@ -110,6 +111,7 @@ ifeq ($(ARCH),x86-32)
        arch = i386
        bits = 32
        prefetch = yes
+       mmx = yes
        sse = yes
 endif
 
@@ -250,7 +252,7 @@ ifeq ($(COMP),gcc)
        ifneq ($(KERNEL),Darwin)
           LDFLAGS += -Wl,--no-as-needed
        endif
-       
+
        gccversion = $(shell $(CXX) --version)
        gccisclang = $(findstring clang,$(gccversion))
 endif
@@ -432,6 +434,13 @@ ifeq ($(ssse3),yes)
        endif
 endif
 
+ifeq ($(mmx),yes)
+       CXXFLAGS += -DUSE_MMX
+       ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+               CXXFLAGS += -mmmx
+       endif
+endif
+
 ifeq ($(neon),yes)
        CXXFLAGS += -DUSE_NEON
 endif
@@ -516,7 +525,7 @@ help:
        @echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
        @echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
        @echo "x86-64                  > x86 64-bit generic"
-       @echo "x86-32                  > x86 32-bit (also enables SSE)"
+       @echo "x86-32                  > x86 32-bit (also enables MMX and SSE)"
        @echo "x86-32-old              > x86 32-bit fall back for old hardware"
        @echo "ppc-64                  > PPC 64-bit"
        @echo "ppc-32                  > PPC 32-bit"
index 5061ae13a1742bb4f4b10fd8c306164ccb63787d..401a65058fc8ddc058fa64223148a7b861b5600c 100644 (file)
@@ -228,6 +228,9 @@ const std::string compiler_info() {
   #endif
     compiler += (HasPext ? " BMI2" : "");
     compiler += (HasPopCnt ? " POPCNT" : "");
+  #if defined(USE_MMX)
+    compiler += " MMX";
+  #endif
   #if !defined(NDEBUG)
     compiler += " DEBUG";
   #endif
index 89cfaad7dbd8ff847bbccf8bd451c08927860292..985ee71a4193e571f9ecdddfc144ca4c2c571aea 100644 (file)
@@ -87,11 +87,20 @@ namespace Eval::NNUE::Layers {
       const __m256i kOnes = _mm256_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m256i*>(input);
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+  #ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+  #else
       const __m128i kOnes = _mm_set1_epi16(1);
+  #endif
       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+      const __m64 kZeros = _mm_setzero_si64();
+      const auto input_vector = reinterpret_cast<const __m64*>(input);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
@@ -155,6 +164,51 @@ namespace Eval::NNUE::Layers {
         sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
         output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
 
+  #elif defined(USE_SSE2)
+        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i row_j = _mm_load_si128(&row[j]);
+          __m128i input_j = _mm_load_si128(&input_vector[j]);
+          __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+          __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+          __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+          __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+          __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+          __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+          __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_epi32(sum_lo, product_lo);
+          sum_hi = _mm_add_epi32(sum_hi, product_hi);
+        }
+        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+        __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_high_64);
+        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_second_32);
+        output[i] = _mm_cvtsi128_si32(sum);
+
+  #elif defined(USE_MMX)
+        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+        __m64 sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 row_j = row[j];
+          __m64 input_j = input_vector[j];
+          __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+          __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+          __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+          __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+          __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+          __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+          __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_pi32(sum_lo, product_lo);
+          sum_hi = _mm_add_pi32(sum_hi, product_hi);
+        }
+        __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+        output[i] = _mm_cvtsi64_si32(sum);
+
   #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
@@ -174,6 +228,9 @@ namespace Eval::NNUE::Layers {
   #endif
 
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
       return output;
     }
 
index 13196ec28b49d133afeb0c0f704e644b86583b8d..44d8a7def4c92ff55f7fcf1150b9553b37a90458 100644 (file)
@@ -84,7 +84,7 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -115,6 +115,24 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+      const auto in = reinterpret_cast<const __m64*>(input);
+      const auto out = reinterpret_cast<__m64*>(output);
+      for (IndexType i = 0; i < kNumChunks; ++i) {
+        const __m64 words0 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+            kWeightScaleBits);
+        const __m64 words1 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+            kWeightScaleBits);
+        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+      }
+      _mm_empty();
+      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
index ff33cc7974b12d616d6f6f49c3dcbcaa5d252e5e..cb1251c58aa37923a23e87fb4e6dbaa668d24b8d 100644 (file)
@@ -33,6 +33,9 @@
 #elif defined(USE_SSE2)
 #include <emmintrin.h>
 
+#elif defined(USE_MMX)
+#include <mmintrin.h>
+
 #elif defined(USE_NEON)
 #include <arm_neon.h>
 #endif
@@ -79,6 +82,9 @@ namespace Eval::NNUE {
   #elif defined(USE_SSE2)
   constexpr std::size_t kSimdWidth = 16;
 
+  #elif defined(USE_MMX)
+  constexpr std::size_t kSimdWidth = 8;
+
   #elif defined(USE_NEON)
   constexpr std::size_t kSimdWidth = 16;
   #endif
index 3818e444b6af9710110dff8eba49b4148d55b53b..40f2603d9d5c8a8fe212a1a40b465876f822cf6e 100644 (file)
@@ -88,7 +88,7 @@ namespace Eval::NNUE {
       constexpr int kControl = 0b11011000;
       const __m256i kZero = _mm256_setzero_si256();
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -97,6 +97,10 @@ namespace Eval::NNUE {
       const __m128i k0x80s = _mm_set1_epi8(-128);
   #endif
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
@@ -117,7 +121,7 @@ namespace Eval::NNUE {
               _mm256_packs_epi16(sum0, sum1), kZero), kControl));
         }
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
         auto out = reinterpret_cast<__m128i*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
           __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@@ -137,6 +141,17 @@ namespace Eval::NNUE {
           );
         }
 
+  #elif defined(USE_MMX)
+        auto out = reinterpret_cast<__m64*>(&output[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 0]);
+          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+              accumulation[perspectives[p]][0])[j * 2 + 1]);
+          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+        }
+
   #elif defined(USE_NEON)
         const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -154,6 +169,9 @@ namespace Eval::NNUE {
   #endif
 
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
     }
 
    private:
@@ -193,6 +211,15 @@ namespace Eval::NNUE {
           for (IndexType j = 0; j < kNumChunks; ++j)
             accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
 
+  #elif defined(USE_MMX)
+          auto accumulation = reinterpret_cast<__m64*>(
+              &accumulator.accumulation[perspective][i][0]);
+          auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+          constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+          for (IndexType j = 0; j < kNumChunks; ++j) {
+            accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+          }
+
   #elif defined(USE_NEON)
           auto accumulation = reinterpret_cast<int16x8_t*>(
               &accumulator.accumulation[perspective][i][0]);
@@ -208,6 +235,9 @@ namespace Eval::NNUE {
 
         }
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
 
       accumulator.computed_accumulation = true;
       accumulator.computed_score = false;
@@ -234,6 +264,11 @@ namespace Eval::NNUE {
         auto accumulation = reinterpret_cast<__m128i*>(
             &accumulator.accumulation[perspective][i][0]);
 
+  #elif defined(USE_MMX)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m64*>(
+            &accumulator.accumulation[perspective][i][0]);
+
   #elif defined(USE_NEON)
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
         auto accumulation = reinterpret_cast<int16x8_t*>(
@@ -263,6 +298,12 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
             }
 
+  #elif defined(USE_MMX)
+            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
+            }
+
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -294,6 +335,12 @@ namespace Eval::NNUE {
               accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
             }
 
+  #elif defined(USE_MMX)
+            auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+            for (IndexType j = 0; j < kNumChunks; ++j) {
+              accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+            }
+
   #elif defined(USE_NEON)
             auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
             for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -310,6 +357,9 @@ namespace Eval::NNUE {
           }
         }
       }
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
 
       accumulator.computed_accumulation = true;
       accumulator.computed_score = false;