From 2c36d1e7e7374b8babb3cc503c0bc07ceb83dbf8 Mon Sep 17 00:00:00 2001 From: MinetaS Date: Mon, 13 Feb 2023 11:54:59 +0900 Subject: [PATCH] Fix overflow in add_dpbusd_epi32x2 This patch fixes 16bit overflow in *_add_dpbusd_epi32x2 functions, that can be triggered in rare cases depending on the NNUE weights. While the code leads to some slowdown on affected architectures (most notably avx2), the fix is simpler than some of the other options discussed in https://github.com/official-stockfish/Stockfish/pull/4394 Code suggested by Sopel97. Result of "bench 4096 1 30 default depth nnue": | Architecture | master | patch (gcc) | patch (clang) | |---------------------|-----------|-------------|---------------| | x86-64-vnni512 | 762122798 | 762122798 | 762122798 | | x86-64-avx512 | 769723503 | 762122798 | 762122798 | | x86-64-bmi2 | 769723503 | 762122798 | 762122798 | | x86-64-ssse3 | 769723503 | 762122798 | 762122798 | | x86-64 | 762122798 | 762122798 | 762122798 | Following architectures will experience ~4% slowdown due to an additional instruction in the middle of hot path: * x86-64-avx512 * x86-64-bmi2 * x86-64-avx2 * x86-64-sse41-popcnt (x86-64-modern) * x86-64-ssse3 * x86-32-sse41-popcnt This patch clearly loses Elo against master with both STC and LTC. Failed non-regression STC (256bit fix only): LLR: -2.95 (-2.94,2.94) <-1.75,0.25> Total: 33528 W: 8769 L: 9049 D: 15710 Ptnml(0-2): 96, 3616, 9600, 3376, 76 https://tests.stockfishchess.org/tests/view/63e6a5b44299542b1e26a485 60+0.6 @ 30000 games: Elo: -1.67 +-1.7 (95%) LOS: 2.8% Total: 30000 W: 7848 L: 7992 D: 14160 Ptnml(0-2): 12, 2847, 9436, 2683, 22 nElo: -3.84 +-3.9 (95%) PairsRatio: 0.95 https://tests.stockfishchess.org/tests/view/63e7ac716d0e1db55f35a660 However, a test against nn-a3dc078bafc7.nnue, which is the latest "safe" network not causing the bug, passed with regular bounds. Passed STC: LLR: 2.94 (-2.94,2.94) <0.00,2.00> Total: 160456 W: 42658 L: 42175 D: 75623 Ptnml(0-2): 487, 17638, 43469, 18173, 461 https://tests.stockfishchess.org/tests/view/63e89836d62a5d02b0fa82c8 closes https://github.com/official-stockfish/Stockfish/pull/4391 closes https://github.com/official-stockfish/Stockfish/pull/4394 No functional change --- src/nnue/layers/simd.h | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/nnue/layers/simd.h b/src/nnue/layers/simd.h index 231f7891..381e7a68 100644 --- a/src/nnue/layers/simd.h +++ b/src/nnue/layers/simd.h @@ -165,18 +165,19 @@ namespace Stockfish::Simd { __m512i tmp0 = _mm512_maddubs_epi16(a0, b0); __m512i tmp1 = _mm512_maddubs_epi16(a1, b1); asm( - "vpaddsw %[tmp0], %[tmp1], %[tmp0]\n\t" "vpmaddwd %[tmp0], %[ones], %[tmp0]\n\t" + "vpmaddwd %[tmp1], %[ones], %[tmp1]\n\t" + "vpaddd %[tmp0], %[tmp1], %[tmp0]\n\t" "vpaddd %[acc], %[tmp0], %[acc]\n\t" - : [acc]"+v"(acc), [tmp0]"+&v"(tmp0) - : [tmp1]"v"(tmp1), [ones]"v"(_mm512_set1_epi16(1)) + : [acc]"+v"(acc), [tmp0]"+&v"(tmp0), [tmp1]"+&v"(tmp1) + : [ones]"v"(_mm512_set1_epi16(1)) ); # else __m512i product0 = _mm512_maddubs_epi16(a0, b0); __m512i product1 = _mm512_maddubs_epi16(a1, b1); - product0 = _mm512_adds_epi16(product0, product1); product0 = _mm512_madd_epi16(product0, _mm512_set1_epi16(1)); - acc = _mm512_add_epi32(acc, product0); + product1 = _mm512_madd_epi16(product1, _mm512_set1_epi16(1)); + acc = _mm512_add_epi32(acc, _mm512_add_epi32(product0, product1)); # endif # endif } @@ -261,18 +262,19 @@ namespace Stockfish::Simd { __m256i tmp0 = _mm256_maddubs_epi16(a0, b0); __m256i tmp1 = _mm256_maddubs_epi16(a1, b1); asm( - "vpaddsw %[tmp0], %[tmp1], %[tmp0]\n\t" "vpmaddwd %[tmp0], %[ones], %[tmp0]\n\t" + "vpmaddwd %[tmp1], %[ones], %[tmp1]\n\t" + "vpaddd %[tmp0], %[tmp1], %[tmp0]\n\t" "vpaddd %[acc], %[tmp0], %[acc]\n\t" - : [acc]"+v"(acc), [tmp0]"+&v"(tmp0) - : [tmp1]"v"(tmp1), [ones]"v"(_mm256_set1_epi16(1)) + : [acc]"+v"(acc), [tmp0]"+&v"(tmp0), [tmp1]"+&v"(tmp1) + : [ones]"v"(_mm256_set1_epi16(1)) ); # else __m256i product0 = _mm256_maddubs_epi16(a0, b0); __m256i product1 = _mm256_maddubs_epi16(a1, b1); - product0 = _mm256_adds_epi16(product0, product1); product0 = _mm256_madd_epi16(product0, _mm256_set1_epi16(1)); - acc = _mm256_add_epi32(acc, product0); + product1 = _mm256_madd_epi16(product1, _mm256_set1_epi16(1)); + acc = _mm256_add_epi32(acc, _mm256_add_epi32(product0, product1)); # endif # endif } @@ -326,18 +328,19 @@ namespace Stockfish::Simd { __m128i tmp0 = _mm_maddubs_epi16(a0, b0); __m128i tmp1 = _mm_maddubs_epi16(a1, b1); asm( - "paddsw %[tmp1], %[tmp0]\n\t" "pmaddwd %[ones], %[tmp0]\n\t" + "pmaddwd %[ones], %[tmp1]\n\t" + "paddd %[tmp1], %[tmp0]\n\t" "paddd %[tmp0], %[acc]\n\t" - : [acc]"+v"(acc), [tmp0]"+&v"(tmp0) - : [tmp1]"v"(tmp1), [ones]"v"(_mm_set1_epi16(1)) + : [acc]"+v"(acc), [tmp0]"+&v"(tmp0), [tmp1]"+&v"(tmp1) + : [ones]"v"(_mm_set1_epi16(1)) ); # else __m128i product0 = _mm_maddubs_epi16(a0, b0); __m128i product1 = _mm_maddubs_epi16(a1, b1); - product0 = _mm_adds_epi16(product0, product1); product0 = _mm_madd_epi16(product0, _mm_set1_epi16(1)); - acc = _mm_add_epi32(acc, product0); + product1 = _mm_madd_epi16(product1, _mm_set1_epi16(1)); + acc = _mm_add_epi32(acc, _mm_add_epi32(product0, product1)); # endif } -- 2.39.2