From 95fe2b9a9d33811a7fcad1cdfea79c54e8fdb074 Mon Sep 17 00:00:00 2001 From: mstembera Date: Thu, 21 Sep 2023 19:26:11 -0700 Subject: [PATCH] Reduce SIMD register count from 32 to 16 in the case of avx512 and vnni512 archs. Up to 17% speedup, depending on the compiler, e.g. ``` AMD pro 7840u (zen4 phoenix apu 4nm) bash bench_parallel.sh ./stockfish_avx512_gcc13 ./stockfish_avx512_pr_gcc13 20 10 sf_base = 1077737 +/- 8446 (95%) sf_test = 1264268 +/- 8543 (95%) diff = 186531 +/- 4280 (95%) speedup = 17.308% +/- 0.397% (95%) ``` Prior to this patch, it appears gcc spills registers. closes https://github.com/official-stockfish/Stockfish/pull/4796 No functional change --- src/nnue/nnue_feature_transformer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h index 56442bac..902918b2 100644 --- a/src/nnue/nnue_feature_transformer.h +++ b/src/nnue/nnue_feature_transformer.h @@ -69,7 +69,7 @@ namespace Stockfish::Eval::NNUE { #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b) #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b) #define vec_zero_psqt() _mm256_setzero_si256() - #define NumRegistersSIMD 32 + #define NumRegistersSIMD 16 #define MaxChunkSize 64 #elif USE_AVX2 -- 2.39.2