From 701b2427bd84d112376ce858b66befc5b66c4bb2 Mon Sep 17 00:00:00 2001 From: mstembera Date: Thu, 20 Aug 2020 16:59:27 -0700 Subject: [PATCH] Support VNNI on 256bit vectors due to downclocking on current chips (tested up to cascade lake) supporting avx512 and vnni512, it is better to use avx2 or vnni256 in multithreaded (in particular hyperthreaded) engine use. In single threaded use, the picture is different. gcc compilation for vnni256 requires a toolchain for gcc >= 9. closes https://github.com/official-stockfish/Stockfish/pull/3038 No functional change --- .travis.yml | 6 +++-- src/Makefile | 39 ++++++++++++++++++++++++------ src/nnue/layers/affine_transform.h | 8 +++++- 3 files changed, 42 insertions(+), 11 deletions(-) diff --git a/.travis.yml b/.travis.yml index a029c4fc..c1e6d6df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -77,8 +77,10 @@ script: # compile only for some more advanced architectures (might not run in travis) - make clean && make -j2 ARCH=x86-64-avx2 build - make clean && make -j2 ARCH=x86-64-bmi2 build - # needs gcc 10 to compile - - if [[ "$COMPILER" != "g++-8" ]]; then make clean && make -j2 ARCH=x86-64-avx512 build; fi + - make clean && make -j2 ARCH=x86-64-avx512 build + - make clean && make -j2 ARCH=x86-64-vnni512 build + # requires gcc 9 or higher + - if [[ "$COMPILER" != "g++-8" ]]; make clean && make -j2 ARCH=x86-64-vnni256 build; fi # # Check perft and reproducible search diff --git a/src/Makefile b/src/Makefile index 3e1b7c35..228ea851 100644 --- a/src/Makefile +++ b/src/Makefile @@ -75,7 +75,8 @@ endif # sse41 = yes/no --- -msse4.1 --- Use Intel Streaming SIMD Extensions 4.1 # avx2 = yes/no --- -mavx2 --- Use Intel Advanced Vector Extensions 2 # avx512 = yes/no --- -mavx512bw --- Use Intel Advanced Vector Extensions 512 -# vnni = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 +# vnni256 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 256 +# vnni512 = yes/no --- -mavx512vnni --- Use Intel Vector Neural Network Instructions 512 # neon = yes/no --- -DUSE_NEON --- Use ARM SIMD architecture # # Note that Makefile is space sensitive, so when adding new architectures @@ -102,7 +103,8 @@ ssse3 = no sse41 = no avx2 = no avx512 = no -vnni = no +vnni256 = no +vnni512 = no neon = no ARCH = x86-64-modern STRIP = strip @@ -192,7 +194,18 @@ ifeq ($(findstring -avx512,$(ARCH)),-avx512) avx512 = yes endif -ifeq ($(findstring -vnni,$(ARCH)),-vnni) +ifeq ($(findstring -vnni256,$(ARCH)),-vnni256) + popcnt = yes + sse = yes + sse2 = yes + ssse3 = yes + sse41 = yes + avx2 = yes + pext = yes + vnni256 = yes +endif + +ifeq ($(findstring -vnni512,$(ARCH)),-vnni512) popcnt = yes sse = yes sse2 = yes @@ -201,7 +214,7 @@ ifeq ($(findstring -vnni,$(ARCH)),-vnni) avx2 = yes pext = yes avx512 = yes - vnni = yes + vnni512 = yes endif ifeq ($(sse),yes) @@ -500,7 +513,14 @@ ifeq ($(avx512),yes) endif endif -ifeq ($(vnni),yes) +ifeq ($(vnni256),yes) + CXXFLAGS += -DUSE_VNNI + ifeq ($(comp),$(filter $(comp),gcc clang mingw)) + CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl -mprefer-vector-width=256 + endif +endif + +ifeq ($(vnni512),yes) CXXFLAGS += -DUSE_VNNI ifeq ($(comp),$(filter $(comp),gcc clang mingw)) CXXFLAGS += -mavx512vnni -mavx512dq -mavx512vl @@ -623,7 +643,8 @@ help: @echo "" @echo "Supported archs:" @echo "" - @echo "x86-64-vnni > x86 64-bit with vnni support" + @echo "x86-64-vnni512 > x86 64-bit with vnni support 512bit wide" + @echo "x86-64-vnni256 > x86 64-bit with vnni support 256bit wide" @echo "x86-64-avx512 > x86 64-bit with avx512 support" @echo "x86-64-bmi2 > x86 64-bit with bmi2 support" @echo "x86-64-avx2 > x86 64-bit with avx2 support" @@ -767,7 +788,8 @@ config-sanity: @echo "sse41: '$(sse41)'" @echo "avx2: '$(avx2)'" @echo "avx512: '$(avx512)'" - @echo "vnni: '$(vnni)'" + @echo "vnni256: '$(vnni256)'" + @echo "vnni512: '$(vnni512)'" @echo "neon: '$(neon)'" @echo "" @echo "Flags:" @@ -794,7 +816,8 @@ config-sanity: @test "$(sse41)" = "yes" || test "$(sse41)" = "no" @test "$(avx2)" = "yes" || test "$(avx2)" = "no" @test "$(avx512)" = "yes" || test "$(avx512)" = "no" - @test "$(vnni)" = "yes" || test "$(vnni)" = "no" + @test "$(vnni256)" = "yes" || test "$(vnni256)" = "no" + @test "$(vnni512)" = "yes" || test "$(vnni512)" = "no" @test "$(neon)" = "yes" || test "$(neon)" = "no" @test "$(comp)" = "gcc" || test "$(comp)" = "icc" || test "$(comp)" = "mingw" || test "$(comp)" = "clang" \ || test "$(comp)" = "armv7a-linux-androideabi16-clang" || test "$(comp)" = "aarch64-linux-android21-clang" diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index 7ac5a1c0..94d0b5a9 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -85,8 +85,10 @@ namespace Eval::NNUE::Layers { #elif defined(USE_AVX2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; - const __m256i kOnes = _mm256_set1_epi16(1); const auto input_vector = reinterpret_cast(input); + #if !defined(USE_VNNI) + const __m256i kOnes = _mm256_set1_epi16(1); + #endif #elif defined(USE_SSE2) constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth; @@ -145,9 +147,13 @@ namespace Eval::NNUE::Layers { __m256i sum = _mm256_setzero_si256(); const auto row = reinterpret_cast(&weights_[offset]); for (IndexType j = 0; j < kNumChunks; ++j) { + #if defined(USE_VNNI) + sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); + #else __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j])); product = _mm256_madd_epi16(product, kOnes); sum = _mm256_add_epi32(sum, product); + #endif } __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1)); sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC)); -- 2.39.2