summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1: cb05040)
AVX512 +4% faster
AVX2 +1% faster
SSSE3 +5% faster
passed non-regression STC:
STC https://tests.stockfishchess.org/tests/view/5f31249f90816720665374f6
LLR: 2.96 (-2.94,2.94) {-1.50,0.50}
Total: 17576 W: 2344 L: 2245 D: 12987
Ptnml(0-2): 127, 1570, 5292, 1675, 124
closes https://github.com/official-stockfish/Stockfish/pull/2962
No functional change
product = _mm512_madd_epi16(product, kOnes);
sum = _mm512_add_epi32(sum, product);
}
product = _mm512_madd_epi16(product, kOnes);
sum = _mm512_add_epi32(sum, product);
}
- output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
// Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
// As a result, kPaddedInputDimensions may not be an exact multiple of 64 (512 bits),
// and we have to process one more 256-bit chunk.
if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
{
// Note: Changing kMaxSimdWidth from 32 to 64 breaks loading existing networks.
// As a result, kPaddedInputDimensions may not be an exact multiple of 64 (512 bits),
// and we have to process one more 256-bit chunk.
if (kPaddedInputDimensions != kNumChunks * kSimdWidth * 2)
{
- const auto iv_256 = reinterpret_cast<const __m256i*>(input);
- const auto row_256 = reinterpret_cast<const __m256i*>(&weights_[offset]);
- int j = kNumChunks * 2;
- __m256i sum256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv_256[j]), _mm256_load_si256(&row_256[j]));
- sum256 = _mm256_madd_epi16(sum256, _mm256_set1_epi16(1));
- sum256 = _mm256_hadd_epi32(sum256, sum256);
- sum256 = _mm256_hadd_epi32(sum256, sum256);
- const __m128i lo = _mm256_extracti128_si256(sum256, 0);
- const __m128i hi = _mm256_extracti128_si256(sum256, 1);
- output[i] += _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi);
+ const auto iv256 = reinterpret_cast<const __m256i*>(&input_vector[kNumChunks]);
+ const auto row256 = reinterpret_cast<const __m256i*>(&row[kNumChunks]);
+ __m256i product256 = _mm256_maddubs_epi16(_mm256_loadA_si256(&iv256[0]), _mm256_load_si256(&row256[0]));
+ product256 = _mm256_madd_epi16(product256, _mm256_set1_epi16(1));
+ sum = _mm512_add_epi32(sum, _mm512_zextsi256_si512(product256));
+ output[i] = _mm512_reduce_add_epi32(sum) + biases_[i];
#elif defined(USE_AVX2)
__m256i sum = _mm256_setzero_si256();
#elif defined(USE_AVX2)
__m256i sum = _mm256_setzero_si256();
product = _mm256_madd_epi16(product, kOnes);
sum = _mm256_add_epi32(sum, product);
}
product = _mm256_madd_epi16(product, kOnes);
sum = _mm256_add_epi32(sum, product);
}
- sum = _mm256_hadd_epi32(sum, sum);
- sum = _mm256_hadd_epi32(sum, sum);
- const __m128i lo = _mm256_extracti128_si256(sum, 0);
- const __m128i hi = _mm256_extracti128_si256(sum, 1);
- output[i] = _mm_cvtsi128_si32(lo) + _mm_cvtsi128_si32(hi) + biases_[i];
+ __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+ sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));
+ sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_CDAB));
+ output[i] = _mm_cvtsi128_si32(sum128) + biases_[i];
- __m128i sum = _mm_cvtsi32_si128(biases_[i]);
+ __m128i sum = _mm_setzero_si128();
const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
const auto row = reinterpret_cast<const __m128i*>(&weights_[offset]);
- for (IndexType j = 0; j < kNumChunks; ++j) {
- __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+ for (int j = 0; j < (int)kNumChunks - 1; j += 2) {
+ __m128i product0 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j]), _mm_load_si128(&row[j]));
+ product0 = _mm_madd_epi16(product0, kOnes);
+ sum = _mm_add_epi32(sum, product0);
+ __m128i product1 = _mm_maddubs_epi16(_mm_load_si128(&input_vector[j+1]), _mm_load_si128(&row[j+1]));
+ product1 = _mm_madd_epi16(product1, kOnes);
+ sum = _mm_add_epi32(sum, product1);
+ }
+ if (kNumChunks & 0x1) {
+ __m128i product = _mm_maddubs_epi16(_mm_load_si128(&input_vector[kNumChunks-1]), _mm_load_si128(&row[kNumChunks-1]));
product = _mm_madd_epi16(product, kOnes);
sum = _mm_add_epi32(sum, product);
}
product = _mm_madd_epi16(product, kOnes);
sum = _mm_add_epi32(sum, product);
}
- sum = _mm_hadd_epi32(sum, sum);
- sum = _mm_hadd_epi32(sum, sum);
- output[i] = _mm_cvtsi128_si32(sum);
+ sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0x4E)); //_MM_PERM_BADC
+ sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 0xB1)); //_MM_PERM_CDAB
+ output[i] = _mm_cvtsi128_si32(sum) + biases_[i];
#elif defined(USE_NEON)
int32x4_t sum = {biases_[i]};
#elif defined(USE_NEON)
int32x4_t sum = {biases_[i]};
namespace Eval::NNUE {
// Class that holds the result of affine transformation of input features
namespace Eval::NNUE {
// Class that holds the result of affine transformation of input features
- struct alignas(32) Accumulator {
+ struct alignas(kCacheLineSize) Accumulator {
std::int16_t
accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
Value score;
std::int16_t
accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
Value score;
#if defined(USE_AVX512)
#if defined(__GNUC__ ) && (__GNUC__ < 9)
#if defined(USE_AVX512)
#if defined(__GNUC__ ) && (__GNUC__ < 9)
-#define _mm512_loadA_si512 _mm512_loadu_si512
+#define _mm512_loadA_si512 _mm512_loadu_si512
+#define _mm512_storeA_si512 _mm512_storeu_si512
-#define _mm512_loadA_si512 _mm512_load_si512
+#define _mm512_loadA_si512 _mm512_load_si512
+#define _mm512_storeA_si512 _mm512_store_si512
kHalfDimensions * sizeof(BiasType));
for (const auto index : active_indices[perspective]) {
const IndexType offset = kHalfDimensions * index;
kHalfDimensions * sizeof(BiasType));
for (const auto index : active_indices[perspective]) {
const IndexType offset = kHalfDimensions * index;
+ #if defined(USE_AVX512)
+ auto accumulation = reinterpret_cast<__m512i*>(
+ &accumulator.accumulation[perspective][i][0]);
+ auto column = reinterpret_cast<const __m512i*>(&weights_[offset]);
+ constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
+ for (IndexType j = 0; j < kNumChunks; ++j)
+ _mm512_storeA_si512(&accumulation[j], _mm512_add_epi16(_mm512_loadA_si512(&accumulation[j]), column[j]));
+ #elif defined(USE_AVX2)
auto accumulation = reinterpret_cast<__m256i*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
auto accumulation = reinterpret_cast<__m256i*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m256i*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
- for (IndexType j = 0; j < kNumChunks; ++j) {
+ for (IndexType j = 0; j < kNumChunks; ++j)
_mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
_mm256_storeA_si256(&accumulation[j], _mm256_add_epi16(_mm256_loadA_si256(&accumulation[j]), column[j]));
#elif defined(USE_SSE2)
auto accumulation = reinterpret_cast<__m128i*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
#elif defined(USE_SSE2)
auto accumulation = reinterpret_cast<__m128i*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const __m128i*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
- for (IndexType j = 0; j < kNumChunks; ++j) {
+ for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
accumulation[j] = _mm_add_epi16(accumulation[j], column[j]);
#elif defined(USE_NEON)
auto accumulation = reinterpret_cast<int16x8_t*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
#elif defined(USE_NEON)
auto accumulation = reinterpret_cast<int16x8_t*>(
&accumulator.accumulation[perspective][i][0]);
auto column = reinterpret_cast<const int16x8_t*>(&weights_[offset]);
constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
- for (IndexType j = 0; j < kNumChunks; ++j) {
+ for (IndexType j = 0; j < kNumChunks; ++j)
accumulation[j] = vaddq_s16(accumulation[j], column[j]);
accumulation[j] = vaddq_s16(accumulation[j], column[j]);
- for (IndexType j = 0; j < kHalfDimensions; ++j) {
+ for (IndexType j = 0; j < kHalfDimensions; ++j)
accumulator.accumulation[perspective][i][j] += weights_[offset + j];
accumulator.accumulation[perspective][i][j] += weights_[offset + j];