From 21df37d7fd4dcc9b4a9c319382cc43685c0259c8 Mon Sep 17 00:00:00 2001
From: Fanael Linithien
Date: Sun, 9 Aug 2020 16:20:45 +0200
Subject: [PATCH] Provide vectorized NNUE code for SSE2 and MMX targets

This patch allows old x86 CPUs, from the AMD K8 (which the x86-64
baseline targets) all the way down to the Pentium MMX, to benefit from
NNUE with a performance hit, relative to the hand-written eval,
comparable to that on more modern processors.

NPS of the bench with NNUE enabled on a Pentium III 1.13 GHz (using the
MMX code):
master:     38951
this patch: 80586

NPS of the bench with NNUE enabled using the baseline x86-64 arch, which
is how Linux distros are likely to package Stockfish, on a modern CPU
(using the SSE2 code):
master:      882584
this patch: 1203945

closes https://github.com/official-stockfish/Stockfish/pull/2956

No functional change.
---
 AUTHORS                             |  1 +
 src/Makefile                        | 13 ++++++-
 src/misc.cpp                        |  3 ++
 src/nnue/layers/affine_transform.h  | 59 ++++++++++++++++++++++++++++-
 src/nnue/layers/clipped_relu.h      | 20 +++++++++-
 src/nnue/nnue_common.h              |  6 +++
 src/nnue/nnue_feature_transformer.h | 54 +++++++++++++++++++++++++-
 7 files changed, 150 insertions(+), 6 deletions(-)

diff --git a/AUTHORS b/AUTHORS
index 21ef3e50..41b89705 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -53,6 +53,7 @@ Ernesto Gatti
 Linmiao Xu (linrock)
 Fabian Beuke (madnight)
 Fabian Fichter (ianfab)
+Fanael Linithien (Fanael)
 fanon
 Fauzi Akram Dabat (FauziAkram)
 Felix Wittmann
diff --git a/src/Makefile b/src/Makefile
index a48e7dcb..3d84f482 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -86,6 +86,7 @@ sanitize = no
 bits = 64
 prefetch = no
 popcnt = no
+mmx = no
 sse = no
 ssse3 = no
 sse41 = no
@@ -110,6 +111,7 @@ ifeq ($(ARCH),x86-32)
 	arch = i386
 	bits = 32
 	prefetch = yes
+	mmx = yes
 	sse = yes
 endif
 
@@ -250,7 +252,7 @@ ifeq ($(COMP),gcc)
 	ifneq ($(KERNEL),Darwin)
 	   LDFLAGS += -Wl,--no-as-needed
 	endif
-
+
 	gccversion = $(shell $(CXX) --version)
 	gccisclang = $(findstring clang,$(gccversion))
 endif
@@ -432,6 +434,13 @@ ifeq ($(ssse3),yes)
 	endif
 endif
 
+ifeq ($(mmx),yes)
+	CXXFLAGS += -DUSE_MMX
+	ifeq ($(comp),$(filter $(comp),gcc clang mingw))
+		CXXFLAGS += -mmmx
+	endif
+endif
+
 ifeq ($(neon),yes)
 	CXXFLAGS += -DUSE_NEON
 endif
@@ -516,7 +525,7 @@ help:
 	@echo "x86-64-ssse3            > x86 64-bit with ssse3 support"
 	@echo "x86-64-sse3-popcnt      > x86 64-bit with sse3 and popcnt support"
 	@echo "x86-64                  > x86 64-bit generic"
-	@echo "x86-32                  > x86 32-bit (also enables SSE)"
+	@echo "x86-32                  > x86 32-bit (also enables MMX and SSE)"
 	@echo "x86-32-old              > x86 32-bit fall back for old hardware"
 	@echo "ppc-64                  > PPC 64-bit"
 	@echo "ppc-32                  > PPC 32-bit"
diff --git a/src/misc.cpp b/src/misc.cpp
index 5061ae13..401a6505 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -228,6 +228,9 @@ const std::string compiler_info() {
   #endif
   compiler += (HasPext ? " BMI2" : "");
   compiler += (HasPopCnt ? " POPCNT" : "");
+  #if defined(USE_MMX)
+      compiler += " MMX";
+  #endif
   #if !defined(NDEBUG)
     compiler += " DEBUG";
   #endif
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 89cfaad7..985ee71a 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -87,11 +87,20 @@ namespace Eval::NNUE::Layers {
       const __m256i kOnes = _mm256_set1_epi16(1);
       const auto input_vector = reinterpret_cast<const __m256i*>(input);
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+  #ifndef USE_SSSE3
+      const __m128i kZeros = _mm_setzero_si128();
+  #else
       const __m128i kOnes = _mm_set1_epi16(1);
+  #endif
       const auto input_vector = reinterpret_cast<const __m128i*>(input);
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
+      const __m64 kZeros = _mm_setzero_si64();
+      const auto input_vector = reinterpret_cast<const __m64*>(input);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
       const auto input_vector = reinterpret_cast<const int8x8_t*>(input);
@@ -155,6 +164,51 @@ namespace Eval::NNUE::Layers {
         sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
         output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
 
+  #elif defined(USE_SSE2)
+        __m128i sum_lo = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m128i row_j = _mm_load_si128(&row[j]);
+          __m128i input_j = _mm_load_si128(&input_vector[j]);
+          __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j);
+          __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs);
+          __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs);
+          __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros);
+          __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros);
+          __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo);
+          __m128i product_hi = _mm_madd_epi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_epi32(sum_lo, product_lo);
+          sum_hi = _mm_add_epi32(sum_hi, product_hi);
+        }
+        __m128i sum = _mm_add_epi32(sum_lo, sum_hi);
+        __m128i sum_high_64 = _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_high_64);
+        __m128i sum_second_32 = _mm_shufflelo_epi16(sum, _MM_SHUFFLE(1, 0, 3, 2));
+        sum = _mm_add_epi32(sum, sum_second_32);
+        output[i] = _mm_cvtsi128_si32(sum);
+
+  #elif defined(USE_MMX)
+        __m64 sum_lo = _mm_cvtsi32_si64(biases_[i]);
+        __m64 sum_hi = kZeros;
+        const auto row = reinterpret_cast<const __m64*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          __m64 row_j = row[j];
+          __m64 input_j = input_vector[j];
+          __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j);
+          __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs);
+          __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs);
+          __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros);
+          __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros);
+          __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo);
+          __m64 product_hi = _mm_madd_pi16(extended_row_hi, extended_input_hi);
+          sum_lo = _mm_add_pi32(sum_lo, product_lo);
+          sum_hi = _mm_add_pi32(sum_hi, product_hi);
+        }
+        __m64 sum = _mm_add_pi32(sum_lo, sum_hi);
+        sum = _mm_add_pi32(sum, _mm_unpackhi_pi32(sum, sum));
+        output[i] = _mm_cvtsi64_si32(sum);
+
   #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
         const auto row = reinterpret_cast<const int8x8_t*>(&weights_[offset]);
@@ -174,6 +228,9 @@ namespace Eval::NNUE::Layers {
   #endif
 
       }
 
+  #if defined(USE_MMX)
+      _mm_empty();
+  #endif
       return output;
     }
diff --git a/src/nnue/layers/clipped_relu.h b/src/nnue/layers/clipped_relu.h
index 13196ec2..44d8a7de 100644
--- a/src/nnue/layers/clipped_relu.h
+++ b/src/nnue/layers/clipped_relu.h
@@ -84,7 +84,7 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -115,6 +115,24 @@ namespace Eval::NNUE::Layers {
       }
       constexpr IndexType kStart = kNumChunks * kSimdWidth;
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kInputDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+      const auto in = reinterpret_cast<const __m64*>(input);
+      const auto out = reinterpret_cast<__m64*>(output);
+      for (IndexType i = 0; i < kNumChunks; ++i) {
+        const __m64 words0 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 0], in[i * 4 + 1]),
+            kWeightScaleBits);
+        const __m64 words1 = _mm_srai_pi16(
+            _mm_packs_pi32(in[i * 4 + 2], in[i * 4 + 3]),
+            kWeightScaleBits);
+        const __m64 packedbytes = _mm_packs_pi16(words0, words1);
+        out[i] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+      }
+      _mm_empty();
+      constexpr IndexType kStart = kNumChunks * kSimdWidth;
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kInputDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index ff33cc79..cb1251c5 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -33,6 +33,9 @@
 #elif defined(USE_SSE2)
 #include <emmintrin.h>
 
+#elif defined(USE_MMX)
+#include <mmintrin.h>
+
 #elif defined(USE_NEON)
 #include <arm_neon.h>
 #endif
@@ -79,6 +82,9 @@ namespace Eval::NNUE {
   #elif defined(USE_SSE2)
   constexpr std::size_t kSimdWidth = 16;
 
+  #elif defined(USE_MMX)
+  constexpr std::size_t kSimdWidth = 8;
+
   #elif defined(USE_NEON)
   constexpr std::size_t kSimdWidth = 16;
   #endif
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 3818e444..40f2603d 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -88,7 +88,7 @@ namespace Eval::NNUE {
       constexpr int kControl = 0b11011000;
       const __m256i kZero = _mm256_setzero_si256();
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 
   #ifdef USE_SSE41
@@ -97,6 +97,10 @@ namespace Eval::NNUE {
       const __m128i k0x80s = _mm_set1_epi8(-128);
   #endif
 
+  #elif defined(USE_MMX)
+      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+      const __m64 k0x80s = _mm_set1_pi8(-128);
+
   #elif defined(USE_NEON)
       constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
       const int8x8_t kZero = {0};
@@ -117,7 +121,7 @@ namespace Eval::NNUE {
             _mm256_packs_epi16(sum0, sum1), kZero), kControl));
       }
 
-  #elif defined(USE_SSSE3)
+  #elif defined(USE_SSE2)
       auto out = reinterpret_cast<__m128i*>(&output[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
         __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@@ -137,6 +141,17 @@ namespace Eval::NNUE {
         );
       }
 
+  #elif defined(USE_MMX)
+      auto out = reinterpret_cast<__m64*>(&output[offset]);
+      for (IndexType j = 0; j < kNumChunks; ++j) {
+        __m64 sum0 = *(&reinterpret_cast<const __m64*>(
+            accumulation[perspectives[p]][0])[j * 2 + 0]);
+        __m64 sum1 = *(&reinterpret_cast<const __m64*>(
+            accumulation[perspectives[p]][0])[j * 2 + 1]);
+        const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+        out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+      }
+
   #elif defined(USE_NEON)
       const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
       for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -154,6 +169,9 @@ namespace Eval::NNUE {
   #endif
       }
 
+  #if defined(USE_MMX)
+    _mm_empty();
+  #endif
     }
 
    private:
@@ -193,6 +211,15 @@ namespace Eval::NNUE {
         for (IndexType j = 0; j < kNumChunks; ++j)
           accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
 
+  #elif defined(USE_MMX)
+        auto accumulation = reinterpret_cast<__m64*>(
+            &accumulator.accumulation[perspective][i][0]);
+        auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+        }
+
   #elif defined(USE_NEON)
         auto accumulation = reinterpret_cast<int16x8_t*>(
             &accumulator.accumulation[perspective][i][0]);
@@ -208,6 +235,9 @@ namespace Eval::NNUE {
       }
     }
 
+  #if defined(USE_MMX)
+    _mm_empty();
+  #endif
     accumulator.computed_accumulation = true;
     accumulator.computed_score = false;
 
@@ -234,6 +264,11 @@ namespace Eval::NNUE {
         auto accumulation = reinterpret_cast<__m128i*>(
             &accumulator.accumulation[perspective][i][0]);
 
+  #elif defined(USE_MMX)
+        constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
+        auto accumulation = reinterpret_cast<__m64*>(
+            &accumulator.accumulation[perspective][i][0]);
+
   #elif defined(USE_NEON)
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
         auto accumulation = reinterpret_cast<int16x8_t*>(
@@ -263,6 +298,12 @@ namespace Eval::NNUE {
           accumulation[j] = _mm_sub_epi16(accumulation[j], column[j]);
         }
 
+  #elif defined(USE_MMX)
+        auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          accumulation[j] = _mm_sub_pi16(accumulation[j], column[j]);
+        }
+
   #elif defined(USE_NEON)
         auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -294,6 +335,12 @@ namespace Eval::NNUE {
           accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
         }
 
+  #elif defined(USE_MMX)
+        auto column = reinterpret_cast<const __m64*>(&weights_[offset]);
+        for (IndexType j = 0; j < kNumChunks; ++j) {
+          accumulation[j] = _mm_add_pi16(accumulation[j], column[j]);
+        }
+
   #elif defined(USE_NEON)
         auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
         for (IndexType j = 0; j < kNumChunks; ++j) {
@@ -310,6 +357,9 @@ namespace Eval::NNUE {
         }
       }
     }
+  #if defined(USE_MMX)
+    _mm_empty();
+  #endif
 
     accumulator.computed_accumulation = true;
     accumulator.computed_score = false;
-- 
2.39.2
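
Background on the SSE2/MMX dot product used in affine_transform.h above: SSSE3 provides _mm_maddubs_epi16, which multiplies unsigned input bytes by signed weight bytes directly, but SSE2 and MMX have no such instruction. The new paths therefore build the sign mask of the weight bytes with a compare, widen the weights to 16 bits by interleaving them with that mask, zero-extend the (never negative) input bytes, and fall back to the ordinary 16-bit multiply-add. The standalone C++ sketch below is illustrative only; it is not part of the patch, and the function name, the single 16-byte chunk, and the unaligned loads are assumptions made for brevity. It shows the same pattern, including the final horizontal add of the four 32-bit lanes:

#include <emmintrin.h>  // SSE2 intrinsics
#include <cstdint>

// Dot product of 16 signed weight bytes with 16 unsigned input bytes
// (inputs are in [0, 127], as produced by the clipped ReLU).
int dot16_sse2(const std::int8_t* weights, const std::uint8_t* inputs) {
    const __m128i zeros = _mm_setzero_si128();
    const __m128i w = _mm_loadu_si128(reinterpret_cast<const __m128i*>(weights));
    const __m128i x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(inputs));
    // 0xFF where a weight byte is negative, 0x00 elsewhere.
    const __m128i w_signs = _mm_cmpgt_epi8(zeros, w);
    // Interleaving a byte with its sign mask sign-extends it to int16.
    const __m128i w_lo = _mm_unpacklo_epi8(w, w_signs);
    const __m128i w_hi = _mm_unpackhi_epi8(w, w_signs);
    // The inputs are non-negative, so interleaving with zero is enough.
    const __m128i x_lo = _mm_unpacklo_epi8(x, zeros);
    const __m128i x_hi = _mm_unpackhi_epi8(x, zeros);
    // 16-bit multiplies with pairwise 32-bit accumulation (pmaddwd).
    __m128i sum = _mm_add_epi32(_mm_madd_epi16(w_lo, x_lo),
                                _mm_madd_epi16(w_hi, x_hi));
    // Horizontal add of the four 32-bit lanes.
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_cvtsi128_si32(sum);
}

The MMX path is the same idea on 64-bit __m64 registers; because MMX registers alias the x87 stack, each such code path in the patch ends with _mm_empty() (EMMS) before any floating-point code can run.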
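
One more note, on the clamp used by the new clipped-ReLU and feature-transformer code: _mm_packs_epi16 (and _mm_packs_pi16 on MMX) already saturates the 16-bit sums to [-128, 127], and the following saturating add and subtract of 0x80 folds every non-positive byte to zero, leaving the required [0, 127] range. A minimal illustrative sketch of that step in isolation (the function name is hypothetical, not from the patch):

#include <emmintrin.h>

// Clamp two vectors of int16 values to [0, 127] and pack them to int8.
__m128i pack_and_clamp_0_127(__m128i words0, __m128i words1) {
    // Signed saturation already clamps to [-128, 127].
    const __m128i packed = _mm_packs_epi16(words0, words1);
    const __m128i k0x80s = _mm_set1_epi8(-128);
    // adds(x, -128) saturates every non-positive byte at -128;
    // subs(..., -128) shifts back up, so negative bytes become 0 and
    // positive bytes come through unchanged.
    return _mm_subs_epi8(_mm_adds_epi8(packed, k0x80s), k0x80s);
}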