From f948cd008d3a289ebbadc463271f84888e8069ba Mon Sep 17 00:00:00 2001
From: mstembera
Date: Sun, 9 Aug 2020 16:23:33 -0700
Subject: [PATCH] Cleanup and optimize SSE/AVX code

AVX512: +4% faster
AVX2:   +1% faster
SSSE3:  +5% faster

Passed non-regression STC:
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 17576 W: 2344 L: 2245 D: 12987
Ptnml(0-2): 127, 1570, 5292, 1675, 124
https://tests.stockfishchess.org/tests/view/5f31249f90816720665374f6

closes https://github.com/official-stockfish/Stockfish/pull/2962

No functional change
---
 src/nnue/layers/affine_transform.h  | 46 +++++++++++++++--------------
 src/nnue/nnue_accumulator.h         |  2 +-
 src/nnue/nnue_common.h              |  6 ++--
 src/nnue/nnue_feature_transformer.h | 21 +++++++------
 4 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 20ec2f12..89cfaad7 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -108,24 +108,19 @@ namespace Eval::NNUE::Layers {
           product = _mm512_madd_epi16(product, kOnes);
           sum = _mm512_add_epi32(sum, product);
         }
-        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
         // Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
         // As a result kPaddedInputDimensions may not be an even multiple of 64(512bit)
         // and we have to do one more 256bit chunk.
         if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2) {
-          const auto iv_256 = reinterpret_cast<const __m256i*>(input);
-          const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
-          int j = kNumChunks * 2;
-          __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
-          sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
-          sum256 = _mm256_hadd_epi32(sum256, sum256);
-          sum256 = _mm256_hadd_epi32(sum256, sum256);
-          const __m128i lo = _mm256_extracti128_si256(sum256, 0);
-          const __m128i hi = _mm256_extracti128_si256(sum256, 1);
-          output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+          const auto iv256 = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
+          const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+          __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+          product256 = _mm256_madd_epi16(product256, _mm256_set1_epi16(1));
+          sum = _mm512_add_epi32(sum, _mm512_zextsi256_si512(product256));
         }
+        output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
 
 #elif defined(USE_AVX2)
         __m256i sum = _mm256_setzero_si256();
         const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
@@ -135,23 +130,30 @@ namespace Eval::NNUE::Layers {
           product = _mm256_madd_epi16(product, kOnes);
           sum = _mm256_add_epi32(sum, product);
         }
-        sum = _mm256_hadd_epi32(sum, sum);
-        sum = _mm256_hadd_epi32(sum, sum);
-        const __m128i lo = _mm256_extracti128_si256(sum, 0);
-        const __m128i hi = _mm256_extracti128_si256(sum, 1);
-        output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i];
+        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+        output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
 
 #elif defined(USE_SSSE3)
-        __m128i sum = _mm_cvtsi32_si128(biases_[i]);
+        __m128i sum = _mm_setzero_si128();
         const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
-          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+        for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
+          __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+          product0 = _mm_madd_epi16(product0, kOnes);
+          sum = _mm_add_epi32(sum, product0);
+          __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
+          product1 = _mm_madd_epi16(product1, kOnes);
+          sum = _mm_add_epi32(sum, product1);
+        }
+        if (kNumChunks & 0x1) {
+          __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
           product = _mm_madd_epi16(product, kOnes);
           sum = _mm_add_epi32(sum, product);
         }
-        sum = _mm_hadd_epi32(sum, sum);
-        sum = _mm_hadd_epi32(sum, sum);
-        output[i] = _mm_cvtsi128_si32(sum);
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+        sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+        output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
 
 #elif defined(USE_NEON)
         int32x4_t sum = {biases_[i]};
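Background on the reduction rewrite above: on most x86 cores a hadd instruction typically decodes into two shuffles plus an add, so the two dependent _mm_hadd_epi32 calls cost more than the shuffle/add ladder that replaces them; the bias is likewise moved out of the vector accumulator and added as a plain scalar at the end. A minimal standalone sketch of the two reductions (illustrative only, not part of the patch; the helper names are made up):

// hsum_demo.cpp - illustrative only; compile with e.g. g++ -mssse3 hsum_demo.cpp
#include <immintrin.h>
#include <cstdio>

// Old approach: two horizontal adds, each a multi-uop instruction.
static int hsum_hadd(__m128i v) {
  v = _mm_hadd_epi32(v, v);
  v = _mm_hadd_epi32(v, v);
  return _mm_cvtsi128_si32(v);
}

// New approach: swap the 64-bit halves and add, then swap the 32-bit
// pairs and add; every lane then holds the full sum.
static int hsum_shuffle(__m128i v) {
  v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0x4E)); // lane order BADC
  v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0xB1)); // lane order CDAB
  return _mm_cvtsi128_si32(v);
}

int main() {
  __m128i v = _mm_setr_epi32(1, 2, 3, 4);
  std::printf("%d %d\n", hsum_hadd(v), hsum_shuffle(v)); // prints "10 10"
}

The AVX2 path applies the same idea one level up: fold the 256-bit sum to 128 bits with a single extract and add, then finish with the two shuffle/add steps.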
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index 2a354a3c..69dfaad2 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -26,7 +26,7 @@ namespace Eval::NNUE {
 
   // Class that holds the result of affine transformation of input features
-  struct alignas(32) Accumulator {
+  struct alignas(kCacheLineSize) Accumulator {
     std::int16_t
         accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
     Value score;
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index e7ce84f7..ff33cc79 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -52,9 +52,11 @@
 #if defined(USE_AVX512)
 
 #if defined(__GNUC__ ) && (__GNUC__ < 9)
-#define _mm512_loadA_si512   _mm512_loadu_si512
+#define _mm512_loadA_si512   _mm512_loadu_si512
+#define _mm512_storeA_si512  _mm512_storeu_si512
 #else
-#define _mm512_loadA_si512   _mm512_load_si512
+#define _mm512_loadA_si512   _mm512_load_si512
+#define _mm512_storeA_si512  _mm512_store_si512
 #endif
 
 #endif
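The storeA macros added above complete the existing loadA pattern: GCC older than 9 falls back to the unaligned 512-bit intrinsics (presumably to sidestep a codegen problem in those compilers), everything else gets the aligned forms. The aligned forms are only legal because Accumulator is now alignas(kCacheLineSize), i.e. 64-byte aligned, rather than alignas(32). A minimal sketch of the pattern under those assumptions (the demo_ names and the hard-coded 64-byte constant are illustrative, not from the patch):

// align_demo.cpp - illustrative only; compile with e.g. g++ -mavx512f -c align_demo.cpp
#include <immintrin.h>
#include <cstddef>
#include <cstdint>

// Same dispatch as nnue_common.h: unaligned fallback on old GCC.
#if defined(__GNUC__) && (__GNUC__ < 9)
#define demo_storeA_si512 _mm512_storeu_si512
#else
#define demo_storeA_si512 _mm512_store_si512
#endif

constexpr std::size_t kDemoCacheLineSize = 64; // assumption: x86 cache line

// 64-byte alignment is what makes the aligned 512-bit store legal,
// which is the point of alignas(kCacheLineSize) on Accumulator.
alignas(kDemoCacheLineSize) std::int16_t demo_accumulation[32];

void demo_clear() {
  demo_storeA_si512(reinterpret_cast<__m512i*>(demo_accumulation), _mm512_setzero_si512());
}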
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index cbcc26f3..3818e444 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -169,38 +169,41 @@ namespace Eval::NNUE {
              kHalfDimensions * sizeof(BiasType));
       for (const auto index : active_indices[perspective]) {
         const IndexType offset = kHalfDimensions * index;
+  #if defined(USE_AVX512)
+        auto accumulation = reinterpret_cast<__m512i*>(
+            &accumulator.accumulation[perspective][i][0]);
+        auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
+        constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+        for (IndexType j = 0; j < kNumChunks; ++j)
+          _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
 
-  #if defined(USE_AVX2)
+  #elif defined(USE_AVX2)
         auto accumulation = reinterpret_cast<__m256i*>(
             &accumulator.accumulation[perspective][i][0]);
         auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
+        for (IndexType j = 0; j < kNumChunks; ++j)
           _mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
-        }
 
   #elif defined(USE_SSE2)
         auto accumulation = reinterpret_cast<__m128i*>(
             &accumulator.accumulation[perspective][i][0]);
         auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
+        for (IndexType j = 0; j < kNumChunks; ++j)
           accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
-        }
 
   #elif defined(USE_NEON)
         auto accumulation = reinterpret_cast<int16x8_t*>(
            &accumulator.accumulation[perspective][i][0]);
         auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
         constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
-        for (IndexType j = 0; j < kNumChunks; ++j) {
+        for (IndexType j = 0; j < kNumChunks; ++j)
           accumulation[j] = vaddq_s16(accumulation[j], column[j]);
-        }
 
   #else
-        for (IndexType j = 0; j < kHalfDimensions; ++j) {
+        for (IndexType j = 0; j < kHalfDimensions; ++j)
           accumulator.accumulation[perspective][i][j] += weights_[offset + j];
-        }
   #endif
       }
 
-- 
2.39.2
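Every branch of the rewritten loop in nnue_feature_transformer.h performs the same operation, element-wise int16 addition of one weight column into the accumulator, with only the register width changing; dropping the braces is purely cosmetic once each loop body is a single line. A self-contained sketch of the AVX2 branch next to the scalar #else reference (kDims and the function names are illustrative; the real code derives kNumChunks from kHalfDimensions and kSimdWidth, and can use the aligned loadA/storeA wrappers because the accumulator is cache-line aligned):

// accumulate_demo.cpp - illustrative only; compile with e.g. g++ -mavx2 -c accumulate_demo.cpp
#include <immintrin.h>
#include <cstdint>

constexpr int kDims = 256;              // stand-in for kHalfDimensions
constexpr int kNumChunks = kDims / 16;  // 16 int16 lanes per __m256i

// Vector version: 16 additions per iteration. Unaligned accesses keep the
// sketch safe for arbitrary buffers.
void add_column(std::int16_t* acc, const std::int16_t* column) {
  auto* a = reinterpret_cast<__m256i*>(acc);
  const auto* c = reinterpret_cast<const __m256i*>(column);
  for (int j = 0; j < kNumChunks; ++j)
    _mm256_storeu_si256(&a[j],
        _mm256_add_epi16(_mm256_loadu_si256(&a[j]), _mm256_loadu_si256(&c[j])));
}

// Scalar reference (the #else branch): both must produce identical results.
void add_column_scalar(std::int16_t* acc, const std::int16_t* column) {
  for (int j = 0; j < kDims; ++j)
    acc[j] += column[j];
}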