From: Steinar H. Gunderson Date: Mon, 14 Dec 2020 23:33:25 +0000 (+0100) Subject: Merge remote-tracking branch 'upstream/master' into HEAD X-Git-Url: https://git.sesse.net/?p=stockfish;a=commitdiff_plain;h=a6e771dff1768b177d69dffa058839d075db8679;hp=aa75388ec136a8cf83b09da2328c5fefd5a010bd Merge remote-tracking branch 'upstream/master' into HEAD --- diff --git a/src/evaluate.cpp b/src/evaluate.cpp index a1b04316..c507aa06 100644 --- a/src/evaluate.cpp +++ b/src/evaluate.cpp @@ -594,7 +594,7 @@ namespace { int kingFlankDefense = popcount(b3); kingDanger += kingAttackersCount[Them] * kingAttackersWeight[Them] // (~10 Elo) - + 185 * popcount(kingRing[Us] & weak) // (~15 Elo) + + 183 * popcount(kingRing[Us] & weak) // (~15 Elo) + 148 * popcount(unsafeChecks) // (~4 Elo) + 98 * popcount(pos.blockers_for_king(Us)) // (~2 Elo) + 69 * kingAttacksCount[Them] // (~0.5 Elo) @@ -844,6 +844,8 @@ namespace { behind |= shift(behind); behind |= shift(behind); + // Compute space score based on the number of safe squares and number of our pieces + // increased with number of total blocked pawns in position. int bonus = popcount(safe) + popcount(behind & safe & ~attackedBy[Them][ALL_PIECES]); int weight = pos.count(Us) - 3 + std::min(pe->blocked_count(), 9); Score score = make_score(bonus * weight * weight / 16, 0); @@ -905,24 +907,36 @@ namespace { { if (pos.opposite_bishops()) { + // For pure opposite colored bishops endgames use scale factor + // based on the number of passed pawns of the strong side. if ( pos.non_pawn_material(WHITE) == BishopValueMg && pos.non_pawn_material(BLACK) == BishopValueMg) sf = 18 + 4 * popcount(pe->passed_pawns(strongSide)); + // For every other opposite colored bishops endgames use scale factor + // based on the number of all pieces of the strong side. else sf = 22 + 3 * pos.count(strongSide); } + // For rook endgames with strong side not having overwhelming pawn number advantage + // and its pawns being on one flank and weak side protecting its pieces with a king + // use lower scale factor. else if ( pos.non_pawn_material(WHITE) == RookValueMg && pos.non_pawn_material(BLACK) == RookValueMg && pos.count(strongSide) - pos.count(~strongSide) <= 1 && bool(KingSide & pos.pieces(strongSide, PAWN)) != bool(QueenSide & pos.pieces(strongSide, PAWN)) && (attacks_bb(pos.square(~strongSide)) & pos.pieces(~strongSide, PAWN))) sf = 36; + // For queen vs no queen endgames use scale factor + // based on number of minors of side that doesn't have queen. else if (pos.count() == 1) sf = 37 + 3 * (pos.count(WHITE) == 1 ? pos.count(BLACK) + pos.count(BLACK) : pos.count(WHITE) + pos.count(WHITE)); + // In every other case use scale factor based on + // the number of pawns of the strong side reduced if pawns are on a single flank. else sf = std::min(sf, 36 + 7 * pos.count(strongSide)) - 4 * !pawnsOnBothFlanks; + // Reduce scale factor in case of pawns being on a single flank sf -= 4 * !pawnsOnBothFlanks; } @@ -1046,6 +1060,8 @@ Value Eval::evaluate(const Position& pos) { bool largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50; bool classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB)); + // Use classical evaluation for really low piece endgames. + // The most critical case is a bishop + A/H file pawn vs naked king draw. bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count() < 2; v = classical || strongClassical ? 
Evaluation(pos).value() : adjusted_NNUE(); diff --git a/src/material.cpp b/src/material.cpp index 870a5e11..f77972e3 100644 --- a/src/material.cpp +++ b/src/material.cpp @@ -25,31 +25,34 @@ using namespace std; namespace { + #define S(mg, eg) make_score(mg, eg) // Polynomial material imbalance parameters - constexpr int QuadraticOurs[][PIECE_TYPE_NB] = { + constexpr Score QuadraticOurs[][PIECE_TYPE_NB] = { // OUR PIECES // pair pawn knight bishop rook queen - {1438 }, // Bishop pair - { 40, 38 }, // Pawn - { 32, 255, -62 }, // Knight OUR PIECES - { 0, 104, 4, 0 }, // Bishop - { -26, -2, 47, 105, -208 }, // Rook - {-189, 24, 117, 133, -134, -6 } // Queen + {S(1419, 1455) }, // Bishop pair + {S( 101, 28), S( 37, 39) }, // Pawn + {S( 57, 64), S(249, 187), S(-49, -62) }, // Knight OUR PIECES + {S( 0, 0), S(118, 137), S( 10, 27), S( 0, 0) }, // Bishop + {S( -63, -68), S( -5, 3), S(100, 81), S(132, 118), S(-246, -244) }, // Rook + {S(-210, -211), S( 37, 14), S(147, 141), S(161, 105), S(-158, -174), S(-9,-31) } // Queen }; - constexpr int QuadraticTheirs[][PIECE_TYPE_NB] = { + constexpr Score QuadraticTheirs[][PIECE_TYPE_NB] = { // THEIR PIECES // pair pawn knight bishop rook queen - { }, // Bishop pair - { 36, }, // Pawn - { 9, 63, }, // Knight OUR PIECES - { 59, 65, 42, }, // Bishop - { 46, 39, 24, -24, }, // Rook - { 97, 100, -42, 137, 268, } // Queen + { }, // Bishop pair + {S( 33, 30) }, // Pawn + {S( 46, 18), S(106, 84) }, // Knight OUR PIECES + {S( 75, 35), S( 59, 44), S( 60, 15) }, // Bishop + {S( 26, 35), S( 6, 22), S( 38, 39), S(-12, -2) }, // Rook + {S( 97, 93), S(100, 163), S(-58, -91), S(112, 192), S(276, 225) } // Queen }; + #undef S + // Endgame evaluation and scaling functions are accessed directly and not through // the function maps because they correspond to more than one material hash key. Endgame EvaluateKXK[] = { Endgame(WHITE), Endgame(BLACK) }; @@ -82,11 +85,11 @@ namespace { /// piece type for both colors. template - int imbalance(const int pieceCount[][PIECE_TYPE_NB]) { + Score imbalance(const int pieceCount[][PIECE_TYPE_NB]) { constexpr Color Them = ~Us; - int bonus = 0; + Score bonus = SCORE_ZERO; // Second-degree polynomial material imbalance, by Tord Romstad for (int pt1 = NO_PIECE_TYPE; pt1 <= QUEEN; ++pt1) @@ -213,7 +216,7 @@ Entry* probe(const Position& pos) { { pos.count(BLACK) > 1, pos.count(BLACK), pos.count(BLACK), pos.count(BLACK) , pos.count(BLACK), pos.count(BLACK) } }; - e->value = int16_t((imbalance(pieceCount) - imbalance(pieceCount)) / 16); + e->score = (imbalance(pieceCount) - imbalance(pieceCount)) / 16; return e; } diff --git a/src/material.h b/src/material.h index 80d01655..28da59db 100644 --- a/src/material.h +++ b/src/material.h @@ -37,8 +37,8 @@ namespace Material { struct Entry { - Score imbalance() const { return make_score(value, value); } - Phase game_phase() const { return gamePhase; } + Score imbalance() const { return score; } + Phase game_phase() const { return (Phase)gamePhase; } bool specialized_eval_exists() const { return evaluationFunction != nullptr; } Value evaluate(const Position& pos) const { return (*evaluationFunction)(pos); } @@ -57,9 +57,9 @@ struct Entry { const EndgameBase* evaluationFunction; const EndgameBase* scalingFunction[COLOR_NB]; // Could be one for each // side (e.g. 
KPKP, KBPsK) - int16_t value; + Score score; + int16_t gamePhase; uint8_t factor[COLOR_NB]; - Phase gamePhase; }; typedef HashTable Table; diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h index caf315b2..a715ca85 100644 --- a/src/nnue/layers/affine_transform.h +++ b/src/nnue/layers/affine_transform.h @@ -66,6 +66,53 @@ namespace Eval::NNUE::Layers { biases_[i] = read_little_endian(stream); for (std::size_t i = 0; i < kOutputDimensions * kPaddedInputDimensions; ++i) weights_[i] = read_little_endian(stream); + +#if defined (USE_SSSE3) + // Determine if quadruplets of weight and input products can be summed using 16bits + // without saturation. We assume worst case combinations of 0 and 127 for all inputs. + if (!stream.fail()) + { + auto can_saturate = [](const WeightType* w, int idx[4]) { + int pSum = 0, nSum = 0; + for (int p = 0; p < 4; ++p) + if (w[idx[p]] > 0) + pSum += w[idx[p]]; + else + nSum += w[idx[p]]; + + return pSum > 258 || nSum < -258; + }; + + for (IndexType i = 0; i < kOutputDimensions; ++i) + { + canSaturate16[i] = false; + const WeightType* w = &weights_[i * kPaddedInputDimensions]; +#if defined (USE_AVX512) + for (IndexType j = 0; j < (kPaddedInputDimensions & ~127) && !canSaturate16[i]; j += 128) + for (int k = 0; k < 64 && !canSaturate16[i]; k += 2) + { + int spacing[4] = { 0, 1, 64, 65 }; + canSaturate16[i] = can_saturate(&w[j + k], spacing); + } +#elif defined (USE_AVX2) + for (IndexType j = 0; j < (kPaddedInputDimensions & ~63) && !canSaturate16[i]; j += 64) + for (int k = 0; k < 32 && !canSaturate16[i]; k += 2) + { + int spacing[4] = { 0, 1, 32, 33 }; + canSaturate16[i] = can_saturate(&w[j + k], spacing); + } +#elif defined (USE_SSSE3) + for (IndexType j = 0; j < (kPaddedInputDimensions & ~31) && !canSaturate16[i]; j += 32) + for (int k = 0; k < 16 && !canSaturate16[i]; k += 2) + { + int spacing[4] = { 0, 1, 16, 17 }; + canSaturate16[i] = can_saturate(&w[j + k], spacing); + } +#endif + } + } +#endif + return !stream.fail(); } @@ -181,13 +228,26 @@ namespace Eval::NNUE::Layers { return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias); }; -#if defined (USE_VNNI) [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) { +#if defined (USE_VNNI) acc = _mm512_dpbusd_epi32(acc, a, b); #else - [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i { __m512i product0 = _mm512_maddubs_epi16(a, b); - return _mm512_madd_epi16(product0, kOnes512); + product0 = _mm512_madd_epi16(product0, kOnes512); + acc = _mm512_add_epi32(acc, product0); +#endif + }; + + [[maybe_unused]] auto m512_add_dpbusd_epi32x2 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1) { +#if defined (USE_VNNI) + acc = _mm512_dpbusd_epi32(acc, a0, b0); + acc = _mm512_dpbusd_epi32(acc, a1, b1); +#else + __m512i product0 = _mm512_maddubs_epi16(a0, b0); + __m512i product1 = _mm512_maddubs_epi16(a1, b1); + product0 = _mm512_adds_epi16(product0, product1); + product0 = _mm512_madd_epi16(product0, kOnes512); + acc = _mm512_add_epi32(acc, product0); #endif }; @@ -214,13 +274,27 @@ namespace Eval::NNUE::Layers { return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias); }; -#if defined (USE_VNNI) + [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) { +#if defined (USE_VNNI) acc = _mm256_dpbusd_epi32(acc, a, b); #else - [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i { __m256i product0 = _mm256_maddubs_epi16(a, b); - return 
_mm256_madd_epi16(product0, kOnes256); + product0 = _mm256_madd_epi16(product0, kOnes256); + acc = _mm256_add_epi32(acc, product0); +#endif + }; + + [[maybe_unused]] auto m256_add_dpbusd_epi32x2 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1) { +#if defined (USE_VNNI) + acc = _mm256_dpbusd_epi32(acc, a0, b0); + acc = _mm256_dpbusd_epi32(acc, a1, b1); +#else + __m256i product0 = _mm256_maddubs_epi16(a0, b0); + __m256i product1 = _mm256_maddubs_epi16(a1, b1); + product0 = _mm256_adds_epi16(product0, product1); + product0 = _mm256_madd_epi16(product0, kOnes256); + acc = _mm256_add_epi32(acc, product0); #endif }; @@ -245,9 +319,18 @@ namespace Eval::NNUE::Layers { return _mm_add_epi32(sum0, bias); }; - [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i { + [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) { __m128i product0 = _mm_maddubs_epi16(a, b); - return _mm_madd_epi16(product0, kOnes128); + product0 = _mm_madd_epi16(product0, kOnes128); + acc = _mm_add_epi32(acc, product0); + }; + + [[maybe_unused]] auto m128_add_dpbusd_epi32x2 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1) { + __m128i product0 = _mm_maddubs_epi16(a0, b0); + __m128i product1 = _mm_maddubs_epi16(a1, b1); + product0 = _mm_adds_epi16(product0, product1); + product0 = _mm_madd_epi16(product0, kOnes128); + acc = _mm_add_epi32(acc, product0); }; #endif @@ -291,6 +374,15 @@ namespace Eval::NNUE::Layers { const __m512i bias = *reinterpret_cast(&biases_[i]); __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]); + __m512i sum01a = _mm512_setzero_si512(); + __m512i sum23a = _mm512_setzero_si512(); + __m512i sum45a = _mm512_setzero_si512(); + __m512i sum67a = _mm512_setzero_si512(); + __m512i sum01b = _mm512_setzero_si512(); + __m512i sum23b = _mm512_setzero_si512(); + __m512i sum45b = _mm512_setzero_si512(); + __m512i sum67b = _mm512_setzero_si512(); + const auto row01a = *reinterpret_cast(&weights_[offset01a]); const auto row23a = *reinterpret_cast(&weights_[offset23a]); const auto row45a = *reinterpret_cast(&weights_[offset45a]); @@ -303,16 +395,6 @@ namespace Eval::NNUE::Layers { const __m256i in256 = input_vector256[0]; const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1); -#if defined (USE_VNNI) - __m512i sum01a = _mm512_setzero_si512(); - __m512i sum23a = _mm512_setzero_si512(); - __m512i sum45a = _mm512_setzero_si512(); - __m512i sum67a = _mm512_setzero_si512(); - __m512i sum01b = _mm512_setzero_si512(); - __m512i sum23b = _mm512_setzero_si512(); - __m512i sum45b = _mm512_setzero_si512(); - __m512i sum67b = _mm512_setzero_si512(); - m512_add_dpbusd_epi32(sum01a, in, row01a); m512_add_dpbusd_epi32(sum23a, in, row23a); m512_add_dpbusd_epi32(sum45a, in, row45a); @@ -321,16 +403,6 @@ namespace Eval::NNUE::Layers { m512_add_dpbusd_epi32(sum23b, in, row23b); m512_add_dpbusd_epi32(sum45b, in, row45b); m512_add_dpbusd_epi32(sum67b, in, row67b); -#else - __m512i sum01a = m512_dpbusd_epi32(in, row01a); - __m512i sum23a = m512_dpbusd_epi32(in, row23a); - __m512i sum45a = m512_dpbusd_epi32(in, row45a); - __m512i sum67a = m512_dpbusd_epi32(in, row67a); - __m512i sum01b = m512_dpbusd_epi32(in, row01b); - __m512i sum23b = m512_dpbusd_epi32(in, row23b); - __m512i sum45b = m512_dpbusd_epi32(in, row45b); - __m512i sum67b = m512_dpbusd_epi32(in, row67b); -#endif *outptr = m512_hadd256x16( sum01a, sum23a, sum45a, sum67a, @@ -351,80 +423,62 @@ namespace Eval::NNUE::Layers { if constexpr (kPaddedInputDimensions % 
(kSimdWidth * 2) == 0) { - const auto row0 = reinterpret_cast(&weights_[offset0]); - const auto row1 = reinterpret_cast(&weights_[offset1]); - const auto row2 = reinterpret_cast(&weights_[offset2]); - const auto row3 = reinterpret_cast(&weights_[offset3]); - -#if defined (USE_VNNI) __m512i sum0 = _mm512_setzero_si512(); __m512i sum1 = _mm512_setzero_si512(); __m512i sum2 = _mm512_setzero_si512(); __m512i sum3 = _mm512_setzero_si512(); - const IndexType kStart = 0; -#else - __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]); - __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]); - __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]); - __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]); - const IndexType kStart = 1; -#endif - for (IndexType j = kStart; j < kNumChunks512; ++j) + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + int j = 0; + if (!canSaturate16x4[i / 4]) + { + for (; j < (int)kNumChunks512 - 1; j += 2) + { + const __m512i in0 = input_vector512[j]; + const __m512i in1 = input_vector512[j + 1]; + + m512_add_dpbusd_epi32x2(sum0, in0, row0[j], in1, row0[j + 1]); + m512_add_dpbusd_epi32x2(sum1, in0, row1[j], in1, row1[j + 1]); + m512_add_dpbusd_epi32x2(sum2, in0, row2[j], in1, row2[j + 1]); + m512_add_dpbusd_epi32x2(sum3, in0, row3[j], in1, row3[j + 1]); + } + } + for (; j < (int)kNumChunks512; ++j) { const __m512i in = input_vector512[j]; -#if defined (USE_VNNI) m512_add_dpbusd_epi32(sum0, in, row0[j]); m512_add_dpbusd_epi32(sum1, in, row1[j]); m512_add_dpbusd_epi32(sum2, in, row2[j]); m512_add_dpbusd_epi32(sum3, in, row3[j]); -#else - sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j])); - sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j])); - sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j])); - sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j])); -#endif } *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias); } else { - const auto row0 = reinterpret_cast(&weights_[offset0]); - const auto row1 = reinterpret_cast(&weights_[offset1]); - const auto row2 = reinterpret_cast(&weights_[offset2]); - const auto row3 = reinterpret_cast(&weights_[offset3]); - -#if defined (USE_VNNI) __m256i sum0 = _mm256_setzero_si256(); __m256i sum1 = _mm256_setzero_si256(); __m256i sum2 = _mm256_setzero_si256(); __m256i sum3 = _mm256_setzero_si256(); - const IndexType kStart = 0; -#else - __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]); - __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]); - __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]); - __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]); - const IndexType kStart = 1; -#endif - for (IndexType j = kStart; j < kNumChunks256; ++j) + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + for (IndexType j = 0; j < kNumChunks256; ++j) { const __m256i in = input_vector256[j]; -#if defined (USE_VNNI) m256_add_dpbusd_epi32(sum0, in, row0[j]); m256_add_dpbusd_epi32(sum1, in, row1[j]); m256_add_dpbusd_epi32(sum2, in, row2[j]); m256_add_dpbusd_epi32(sum3, in, row3[j]); -#else - sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j])); - sum1 = _mm256_add_epi32(sum1, 
m256_dpbusd_epi32(in, row1[j])); - sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j])); - sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j])); -#endif } *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); @@ -435,50 +489,30 @@ namespace Eval::NNUE::Layers { { if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0) { - const auto row0 = reinterpret_cast(&weights_[0]); - -#if defined (USE_VNNI) __m512i sum0 = _mm512_setzero_si512(); - const IndexType kStart = 0; -#else - __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]); - const IndexType kStart = 1; -#endif - for (IndexType j = kStart; j < kNumChunks512; ++j) + const auto row0 = reinterpret_cast(&weights_[0]); + + for (IndexType j = 0; j < kNumChunks512; ++j) { const __m512i in = input_vector512[j]; -#if defined (USE_VNNI) m512_add_dpbusd_epi32(sum0, in, row0[j]); -#else - sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j])); -#endif } output[0] = m512_hadd(sum0, biases_[0]); } else { - const auto row0 = reinterpret_cast(&weights_[0]); - -#if defined (USE_VNNI) __m256i sum0 = _mm256_setzero_si256(); - const IndexType kStart = 0; -#else - __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]); - const IndexType kStart = 1; -#endif - for (IndexType j = kStart; j < kNumChunks256; ++j) + const auto row0 = reinterpret_cast(&weights_[0]); + + for (IndexType j = 0; j < kNumChunks256; ++j) { const __m256i in = input_vector256[j]; -#if defined (USE_VNNI) m256_add_dpbusd_epi32(sum0, in, row0[j]); -#else - sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j])); -#endif } output[0] = m256_hadd(sum0, biases_[0]); @@ -512,40 +546,38 @@ namespace Eval::NNUE::Layers { const __m128i bias = *reinterpret_cast(&biases_[i]); __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); - const auto row0 = reinterpret_cast(&weights_[offset0]); - const auto row1 = reinterpret_cast(&weights_[offset1]); - const auto row2 = reinterpret_cast(&weights_[offset2]); - const auto row3 = reinterpret_cast(&weights_[offset3]); - -#if defined (USE_VNNI) __m256i sum0 = _mm256_setzero_si256(); __m256i sum1 = _mm256_setzero_si256(); __m256i sum2 = _mm256_setzero_si256(); __m256i sum3 = _mm256_setzero_si256(); - const IndexType kStart = 0; -#else - __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]); - __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]); - __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]); - __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]); - const IndexType kStart = 1; -#endif - for (IndexType j = kStart; j < kNumChunks; ++j) + const auto row0 = reinterpret_cast(&weights_[offset0]); + const auto row1 = reinterpret_cast(&weights_[offset1]); + const auto row2 = reinterpret_cast(&weights_[offset2]); + const auto row3 = reinterpret_cast(&weights_[offset3]); + + int j = 0; + if (!canSaturate16x4[i / 4]) { - const __m256i in = input_vector[j]; + for (; j < (int)kNumChunks - 1; j += 2) + { + const __m256i in0 = input_vector[j]; + const __m256i in1 = input_vector[j + 1]; + + m256_add_dpbusd_epi32x2(sum0, in0, row0[j], in1, row0[j + 1]); + m256_add_dpbusd_epi32x2(sum1, in0, row1[j], in1, row1[j + 1]); + m256_add_dpbusd_epi32x2(sum2, in0, row2[j], in1, row2[j + 1]); + m256_add_dpbusd_epi32x2(sum3, in0, row3[j], in1, row3[j + 1]); + } + } + for (; j < (int)kNumChunks; ++j) + { + const __m256i in = input_vector[j]; -#if defined (USE_VNNI) - m256_add_dpbusd_epi32(sum0, in, row0[j]); - m256_add_dpbusd_epi32(sum1, in, row1[j]); - m256_add_dpbusd_epi32(sum2, in, row2[j]); - 
m256_add_dpbusd_epi32(sum3, in, row3[j]); -#else - sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j])); - sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j])); - sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j])); - sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j])); -#endif + m256_add_dpbusd_epi32(sum0, in, row0[j]); + m256_add_dpbusd_epi32(sum1, in, row1[j]); + m256_add_dpbusd_epi32(sum2, in, row2[j]); + m256_add_dpbusd_epi32(sum3, in, row3[j]); } *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias); @@ -553,25 +585,15 @@ namespace Eval::NNUE::Layers { } else if constexpr (kOutputDimensions == 1) { - const auto row0 = reinterpret_cast(&weights_[0]); - -#if defined (USE_VNNI) __m256i sum0 = _mm256_setzero_si256(); - const IndexType kStart = 0; -#else - __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]); - const IndexType kStart = 1; -#endif - for (IndexType j = kStart; j < kNumChunks; ++j) + const auto row0 = reinterpret_cast(&weights_[0]); + + for (IndexType j = 0; j < kNumChunks; ++j) { - const __m256i in = input_vector[j]; + const __m256i in = input_vector[j]; -#if defined (USE_VNNI) - m256_add_dpbusd_epi32(sum0, in, row0[j]); -#else - sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j])); -#endif + m256_add_dpbusd_epi32(sum0, in, row0[j]); } output[0] = m256_hadd(sum0, biases_[0]); @@ -604,24 +626,38 @@ namespace Eval::NNUE::Layers { const __m128i bias = *reinterpret_cast(&biases_[i]); __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]); + __m128i sum0 = _mm_setzero_si128(); + __m128i sum1 = _mm_setzero_si128(); + __m128i sum2 = _mm_setzero_si128(); + __m128i sum3 = _mm_setzero_si128(); + const auto row0 = reinterpret_cast(&weights_[offset0]); const auto row1 = reinterpret_cast(&weights_[offset1]); const auto row2 = reinterpret_cast(&weights_[offset2]); const auto row3 = reinterpret_cast(&weights_[offset3]); - __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]); - __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]); - __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]); - __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]); - - for (int j = 1; j < (int)kNumChunks; ++j) + int j = 0; + if (!canSaturate16x4[i / 4]) + { + for (; j < (int)kNumChunks - 1; j += 2) + { + const __m128i in0 = input_vector[j]; + const __m128i in1 = input_vector[j + 1]; + + m128_add_dpbusd_epi32x2(sum0, in0, row0[j], in1, row0[j + 1]); + m128_add_dpbusd_epi32x2(sum1, in0, row1[j], in1, row1[j + 1]); + m128_add_dpbusd_epi32x2(sum2, in0, row2[j], in1, row2[j + 1]); + m128_add_dpbusd_epi32x2(sum3, in0, row3[j], in1, row3[j + 1]); + } + } + for (; j < (int)kNumChunks; ++j) { - const __m128i in = input_vector[j]; + const __m128i in = input_vector[j]; - sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j])); - sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j])); - sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j])); - sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j])); + m128_add_dpbusd_epi32(sum0, in, row0[j]); + m128_add_dpbusd_epi32(sum1, in, row1[j]); + m128_add_dpbusd_epi32(sum2, in, row2[j]); + m128_add_dpbusd_epi32(sum3, in, row3[j]); } *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias); @@ -629,12 +665,16 @@ namespace Eval::NNUE::Layers { } else if constexpr (kOutputDimensions == 1) { + __m128i sum0 = _mm_setzero_si128(); + const auto row0 = reinterpret_cast(&weights_[0]); - __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]); + for (int j = 0; j < (int)kNumChunks; ++j) + { + const __m128i 
in = input_vector[j]; - for (int j = 1; j < (int)kNumChunks; ++j) - sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j])); + m128_add_dpbusd_epi32(sum0, in, row0[j]); + } output[0] = m128_hadd(sum0, biases_[0]); } @@ -680,9 +720,8 @@ namespace Eval::NNUE::Layers { for (IndexType j = 0; j < kNumChunks; ++j) { __m128i row_j = _mm_load_si128(&row[j]); __m128i input_j = _mm_load_si128(&input_vector[j]); - __m128i row_signs = _mm_cmpgt_epi8(kZeros, row_j); - __m128i extended_row_lo = _mm_unpacklo_epi8(row_j, row_signs); - __m128i extended_row_hi = _mm_unpackhi_epi8(row_j, row_signs); + __m128i extended_row_lo = _mm_srai_epi16(_mm_unpacklo_epi8(row_j, row_j), 8); + __m128i extended_row_hi = _mm_srai_epi16(_mm_unpackhi_epi8(row_j, row_j), 8); __m128i extended_input_lo = _mm_unpacklo_epi8(input_j, kZeros); __m128i extended_input_hi = _mm_unpackhi_epi8(input_j, kZeros); __m128i product_lo = _mm_madd_epi16(extended_row_lo, extended_input_lo); @@ -704,9 +743,8 @@ namespace Eval::NNUE::Layers { for (IndexType j = 0; j < kNumChunks; ++j) { __m64 row_j = row[j]; __m64 input_j = input_vector[j]; - __m64 row_signs = _mm_cmpgt_pi8(kZeros, row_j); - __m64 extended_row_lo = _mm_unpacklo_pi8(row_j, row_signs); - __m64 extended_row_hi = _mm_unpackhi_pi8(row_j, row_signs); + __m64 extended_row_lo = _mm_srai_pi16(_mm_unpacklo_pi8(row_j, row_j), 8); + __m64 extended_row_hi = _mm_srai_pi16(_mm_unpackhi_pi8(row_j, row_j), 8); __m64 extended_input_lo = _mm_unpacklo_pi8(input_j, kZeros); __m64 extended_input_hi = _mm_unpackhi_pi8(input_j, kZeros); __m64 product_lo = _mm_madd_pi16(extended_row_lo, extended_input_lo); @@ -753,8 +791,11 @@ namespace Eval::NNUE::Layers { PreviousLayer previous_layer_; alignas(kCacheLineSize) BiasType biases_[kOutputDimensions]; - alignas(kCacheLineSize) - WeightType weights_[kOutputDimensions * kPaddedInputDimensions]; + alignas(kCacheLineSize) WeightType weights_[kOutputDimensions * kPaddedInputDimensions]; + union { + uint32_t canSaturate16x4[(kOutputDimensions + 3) / 4]; + bool canSaturate16[kOutputDimensions]; + }; }; } // namespace Eval::NNUE::Layers diff --git a/src/pawns.cpp b/src/pawns.cpp index b6d29003..16dbf27a 100644 --- a/src/pawns.cpp +++ b/src/pawns.cpp @@ -66,6 +66,7 @@ namespace { { V(-17), V( -13), V( 100), V( 4), V( 9), V(-16), V(-31) } }; + // KingOnFile[semi-open Us][semi-open Them] contains bonuses/penalties // for king when the king is on a semi-open or open file. constexpr Score KingOnFile[2][2] = {{ S(-19,12), S(-6, 7) }, diff --git a/src/search.cpp b/src/search.cpp index 52541868..cdbccb4c 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -676,6 +676,7 @@ namespace { ss->ttPv = PvNode || (ss->ttHit && tte->is_pv()); formerPv = ss->ttPv && !PvNode; + // Update low ply history for previous move if we are near root and position is or has been in PV if ( ss->ttPv && depth > 12 && ss->ply - 1 < MAX_LPH @@ -700,6 +701,7 @@ namespace { { if (ttValue >= beta) { + // Bonus for a quiet ttMove that fails high if (!pos.capture_or_promotion(ttMove)) update_quiet_stats(pos, ss, ttMove, stat_bonus(depth), depth); @@ -716,6 +718,8 @@ namespace { } } + // Partial workaround for the graph history interaction problem + // For high rule50 counts don't produce transposition table cutoffs. 
if (pos.rule50_count() < 90) return ttValue; } @@ -789,6 +793,7 @@ namespace { if (eval == VALUE_NONE) ss->staticEval = eval = evaluate(pos); + // Randomize draw evaluation if (eval == VALUE_DRAW) eval = value_draw(thisThread); @@ -799,20 +804,33 @@ namespace { } else { + // In case of null move search use previous static eval with a different sign + // and addition of two tempos if ((ss-1)->currentMove != MOVE_NULL) ss->staticEval = eval = evaluate(pos); else ss->staticEval = eval = -(ss-1)->staticEval + 2 * Tempo; + // Save static evaluation into transposition table tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE, eval); } + if ((ss-1)->moveCount > 1 && is_ok((ss-1)->currentMove) && !(ss-1)->inCheck && !priorCapture && depth < 7) + { + int bonus = std::clamp(- (depth+1) * 2 * int((ss-1)->staticEval + ss->staticEval - 2 * Tempo), -1000, 1000); + thisThread->mainHistory[~us][from_to((ss-1)->currentMove)] << bonus; + } + // Step 7. Razoring (~1 Elo) if ( !rootNode // The required rootNode PV handling is not available in qsearch && depth == 1 && eval <= alpha - RazorMargin) return qsearch(pos, ss, alpha, beta); + // Set up improving flag that is used in various pruning heuristics + // We define position as improving if static evaluation of position is better + // Than the previous static evaluation at our turn + // In case of us being in check at our previous move we look at move prior to it improving = (ss-2)->staticEval == VALUE_NONE ? ss->staticEval > (ss-4)->staticEval || (ss-4)->staticEval == VALUE_NONE : ss->staticEval > (ss-2)->staticEval; @@ -1170,9 +1188,10 @@ moves_loop: // When in check, search starts from here r -= 2; // Increase reduction at root and non-PV nodes when the best move does not change frequently - if ((rootNode || !PvNode) && depth > 10 && thisThread->bestMoveChanges <= 2) + if ((rootNode || !PvNode) && thisThread->rootDepth > 10 && thisThread->bestMoveChanges <= 2) r++; + // More reductions for late moves if position was not in previous PV if (moveCountPruning && !formerPv) r++; @@ -1248,6 +1267,7 @@ moves_loop: // When in check, search starts from here { value = -search(pos, ss+1, -(alpha+1), -alpha, newDepth, !cutNode); + // If the move passed LMR update its stats if (didLMR && !captureOrPromotion) { int bonus = value > alpha ? stat_bonus(newDepth) @@ -1299,8 +1319,7 @@ moves_loop: // When in check, search starts from here rm.pv.push_back(*m); // We record how often the best move has been changed in each - // iteration. This information is used for time management: when - // the best move changes frequently, we allocate some more time. + // iteration. This information is used for time management and LMR if (moveCount > 1) ++thisThread->bestMoveChanges; } @@ -1333,6 +1352,7 @@ moves_loop: // When in check, search starts from here } } + // If the move is worse than some previously searched move, remember it to update its stats later if (move != bestMove) { if (captureOrPromotion && captureCount < 32) @@ -1362,6 +1382,7 @@ moves_loop: // When in check, search starts from here bestValue = excludedMove ? alpha : ss->inCheck ? 
mated_in(ss->ply) : VALUE_DRAW; + // If there is a move which produces search value greater than alpha we update stats of searched moves else if (bestMove) update_all_stats(pos, ss, bestMove, bestValue, beta, prevSq, quietsSearched, quietCount, capturesSearched, captureCount, depth); @@ -1383,6 +1404,7 @@ moves_loop: // When in check, search starts from here else if (depth > 3) ss->ttPv = ss->ttPv && (ss+1)->ttPv; + // Write gathered information in transposition table if (!excludedMove && !(rootNode && thisThread->pvIdx)) tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv, bestValue >= beta ? BOUND_LOWER : @@ -1478,6 +1500,8 @@ moves_loop: // When in check, search starts from here bestValue = ttValue; } else + // In case of null move search use previous static eval with a different sign + // and addition of two tempos ss->staticEval = bestValue = (ss-1)->currentMove != MOVE_NULL ? evaluate(pos) : -(ss-1)->staticEval + 2 * Tempo; @@ -1485,6 +1509,7 @@ moves_loop: // When in check, search starts from here // Stand pat. Return immediately if static value is at least beta if (bestValue >= beta) { + // Save gathered info in transposition table if (!ss->ttHit) tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER, DEPTH_NONE, MOVE_NONE, ss->staticEval); @@ -1612,6 +1637,7 @@ moves_loop: // When in check, search starts from here return mated_in(ss->ply); // Plies to mate from the root } + // Save gathered info in transposition table tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit, bestValue >= beta ? BOUND_LOWER : PvNode && bestValue > oldAlpha ? BOUND_EXACT : BOUND_UPPER, @@ -1695,9 +1721,10 @@ moves_loop: // When in check, search starts from here if (!pos.capture_or_promotion(bestMove)) { + // Increase stats for the best move in case it was a quiet move update_quiet_stats(pos, ss, bestMove, bonus2, depth); - // Decrease all the non-best quiet moves + // Decrease stats for all non-best quiet moves for (int i = 0; i < quietCount; ++i) { thisThread->mainHistory[us][from_to(quietsSearched[i])] << -bonus2; @@ -1705,14 +1732,16 @@ moves_loop: // When in check, search starts from here } } else + // Increase stats for the best move in case it was a capture move captureHistory[moved_piece][to_sq(bestMove)][captured] << bonus1; - // Extra penalty for a quiet early move that was not a TT move or main killer move in previous ply when it gets refuted + // Extra penalty for a quiet early move that was not a TT move or + // main killer move in previous ply when it gets refuted. 
if ( ((ss-1)->moveCount == 1 + (ss-1)->ttHit || ((ss-1)->currentMove == (ss-1)->killers[0])) && !pos.captured_piece()) update_continuation_histories(ss-1, pos.piece_on(prevSq), prevSq, -bonus1); - // Decrease all the non-best capture moves + // Decrease stats for all non-best capture moves for (int i = 0; i < captureCount; ++i) { moved_piece = pos.moved_piece(capturesSearched[i]); @@ -1729,6 +1758,7 @@ moves_loop: // When in check, search starts from here for (int i : {1, 2, 4, 6}) { + // Only update first 2 continuation histories if we are in check if (ss->inCheck && i > 2) break; if (is_ok((ss-i)->currentMove)) @@ -1741,6 +1771,7 @@ moves_loop: // When in check, search starts from here void update_quiet_stats(const Position& pos, Stack* ss, Move move, int bonus, int depth) { + // Update killers if (ss->killers[0] != move) { ss->killers[1] = ss->killers[0]; @@ -1752,15 +1783,18 @@ moves_loop: // When in check, search starts from here thisThread->mainHistory[us][from_to(move)] << bonus; update_continuation_histories(ss, pos.moved_piece(move), to_sq(move), bonus); + // Penalty for reversed move in case of moved piece not being a pawn if (type_of(pos.moved_piece(move)) != PAWN) thisThread->mainHistory[us][from_to(reverse_move(move))] << -bonus; + // Update countermove history if (is_ok((ss-1)->currentMove)) { Square prevSq = to_sq((ss-1)->currentMove); thisThread->counterMoves[pos.piece_on(prevSq)][prevSq] = move; } + // Update low ply history if (depth > 11 && ss->ply < MAX_LPH) thisThread->lowPlyHistory[ss->ply][from_to(move)] << stat_bonus(depth - 7); }
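Notes on selected hunks above (illustrative sketches, not part of the patch):

The evaluate.cpp scaling hunks lower sf for several drawish configurations (pure opposite-colored bishops, certain rook endgames, single-flank pawns), but the hunk itself does not show what the number means. As a rough illustration only, and not the engine's exact blending code: a scale factor on a 0..64 scale damps just the endgame component of the tapered score, so sf = 36 roughly halves the strong side's endgame bonus while leaving the middlegame term alone. The constants and toy values below are assumptions made for the example.

// Simplified sketch (not Stockfish's exact code) of how a 0..64 scale factor
// damps only the endgame half of a tapered evaluation.
#include <cstdio>

constexpr int SCALE_FACTOR_NORMAL = 64;   // assumed normalisation, matching the sf values in the patch
constexpr int PHASE_MIDGAME       = 128;  // assumed phase range 0..128

int tapered(int mg, int eg, int phase, int sf) {
    eg = eg * sf / SCALE_FACTOR_NORMAL;   // scaling touches the endgame term only
    return (mg * phase + eg * (PHASE_MIDGAME - phase)) / PHASE_MIDGAME;
}

int main() {
    // A hypothetical edge that survives mostly in the endgame.
    int mg = 60, eg = 150, phase = 20;    // late-game position
    std::printf("sf=64: %d  sf=36: %d  sf=18: %d (internal units)\n",
                tapered(mg, eg, phase, 64),
                tapered(mg, eg, phase, 36),
                tapered(mg, eg, phase, 18));
}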
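The material.cpp/material.h hunks switch the imbalance tables from plain int (a single middlegame-style value) to Score, so every pairwise term carries separate middlegame and endgame weights, and Entry now stores that Score directly instead of duplicating one int16_t into both phases. The sketch below only shows the shape of that computation with a plain mg/eg struct, a toy two-piece set, and made-up coefficients; it is not Stockfish's packed Score type, and it omits the QuadraticTheirs term.

// Illustrative only: a second-degree "ours x ours" imbalance term computed as
// an (mg, eg) pair, mirroring the QuadraticOurs change in spirit.
#include <cstdio>

struct Score { int mg, eg; };
constexpr Score operator*(Score s, int n)   { return { s.mg * n, s.eg * n }; }
constexpr Score operator+(Score a, Score b) { return { a.mg + b.mg, a.eg + b.eg }; }

enum { PAWN, KNIGHT, PT_NB };                    // toy piece set
constexpr Score Quadratic[PT_NB][PT_NB] = {      // toy coefficients
    { {  4,  5 } },                              // pawn   * pawn
    { { 25, 19 }, { -6, -7 } },                  // knight * pawn, knight * knight
};

Score imbalance(const int count[PT_NB]) {
    Score bonus{ 0, 0 };
    for (int pt1 = 0; pt1 < PT_NB; ++pt1)
        for (int pt2 = 0; pt2 <= pt1; ++pt2)
            bonus = bonus + Quadratic[pt1][pt2] * (count[pt1] * count[pt2]);
    return bonus;
}

int main() {
    int counts[PT_NB] = { 7, 2 };                // seven pawns, two knights
    Score s = imbalance(counts);
    std::printf("imbalance: mg %d, eg %d (arbitrary units)\n", s.mg, s.eg);
}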
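In the affine_transform.h hunks, the former m512/m256/m128_dpbusd_epi32 helpers (which returned a product for the caller to add) become accumulate-in-place m*_add_dpbusd_epi32 helpers, so the VNNI and non-VNNI branches share one call site and every sum can start from a zeroed register. On hardware without VNNI the dot-product instruction is emulated with a maddubs/madd pair. The standalone sketch below reproduces that emulation for AVX2 and checks it against a scalar reference; the function name is mine, not the engine's, and it requires compiling with AVX2 enabled.

// Sketch of the non-VNNI emulation of a u8 x s8 dot-product accumulate,
// matching the idea behind m256_add_dpbusd_epi32 in the patch.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// acc += per-32-bit-lane dot product of unsigned 8-bit a with signed 8-bit b
static inline void add_dpbusd_epi32_avx2(__m256i& acc, __m256i a, __m256i b) {
    const __m256i ones = _mm256_set1_epi16(1);
    __m256i prod = _mm256_maddubs_epi16(a, b);   // u8*s8, adjacent pairs summed to s16 (saturating)
    prod = _mm256_madd_epi16(prod, ones);        // widen: adjacent s16 pairs summed to s32
    acc  = _mm256_add_epi32(acc, prod);
}

int main() {
    alignas(32) uint8_t in[32];
    alignas(32) int8_t  w[32];
    for (int i = 0; i < 32; ++i) { in[i] = uint8_t(i); w[i] = int8_t(i - 16); }

    __m256i acc = _mm256_setzero_si256();
    add_dpbusd_epi32_avx2(acc,
                          _mm256_load_si256(reinterpret_cast<const __m256i*>(in)),
                          _mm256_load_si256(reinterpret_cast<const __m256i*>(w)));

    alignas(32) int32_t out[8];
    _mm256_store_si256(reinterpret_cast<__m256i*>(out), acc);
    long simd = 0;
    for (int i = 0; i < 8; ++i) simd += out[i];  // horizontal add of the eight lane sums

    long ref = 0;
    for (int i = 0; i < 32; ++i) ref += int(in[i]) * int(w[i]);
    std::printf("simd %ld  ref %ld\n", simd, ref);
}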
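The canSaturate16 test added to ReadParameters exists because the new *_add_dpbusd_epi32x2 helpers fold two input chunks into one maddubs pass and merge them with a saturating 16-bit add. That is only exact if no group of four weights, multiplied by the maximum activation, can leave the signed 16-bit range; assuming the inputs are clipped-ReLU activations capped at 127, 32767 / 127 is about 258, which is where the patch's thresholds come from. Rows that fail the test fall back to the one-chunk-per-iteration path. The scalar sketch below restates the per-quadruplet check; the function name and example weights are made up.

// Scalar restatement of the headroom test behind canSaturate16.
#include <cstdint>
#include <cstdio>

bool quad_may_saturate16(const int8_t w[4]) {
    int pos = 0, neg = 0;
    for (int p = 0; p < 4; ++p)
        if (w[p] > 0) pos += w[p];
        else          neg += w[p];
    // 127 * 258 = 32766 still fits in int16_t; anything beyond may overflow.
    return pos > 258 || neg < -258;
}

int main() {
    const int8_t safe[4]  = {  60,  70, -50,  80 };   // positives sum to 210: fits
    const int8_t risky[4] = { 127, 127,  10, -20 };   // positives sum to 264: may overflow
    std::printf("safe: %d  risky: %d\n", quad_may_saturate16(safe), quad_may_saturate16(risky));
}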
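The union added at the bottom of the class overlays the per-row bool flags with a uint32_t view, so the propagation loops can veto the fast path for four output rows with a single load: canSaturate16x4[i / 4] is non-zero if any of rows i..i+3 is flagged. The sketch below demonstrates just the overlay; it assumes sizeof(bool) == 1 and relies on union type punning, which the mainstream compilers accept for this pattern even though strict ISO C++ does not guarantee it.

// Sketch of the canSaturate16 / canSaturate16x4 overlay in isolation.
#include <cstdint>
#include <cstdio>

constexpr int kOutputDimensions = 32;

union SaturationFlags {
    uint32_t x4[(kOutputDimensions + 3) / 4];
    bool     one[kOutputDimensions];
};
static_assert(sizeof(bool) == 1, "overlay assumes 1-byte bool");

int main() {
    SaturationFlags flags{};                 // zero-initialise: no row flagged
    flags.one[9] = true;                     // mark one risky row

    for (int i = 0; i < kOutputDimensions; i += 4)
        std::printf("rows %2d-%2d: %s path\n", i, i + 3,
                    flags.x4[i / 4] ? "safe one-chunk" : "fast two-chunk");
}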
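In the SSE2 and MMX fallback hunks, building a sign mask with cmpgt and unpacking against it is replaced by unpacking the byte vector with itself and doing an arithmetic right shift by 8: interleaving a byte with itself puts it in the high half of each 16-bit lane, and the arithmetic shift then replicates its sign bit, which saves one instruction and one temporary per chunk. The sketch below verifies the idiom in isolation on plain SSE2 (it is equivalent to _mm_cvtepi8_epi16, which needs SSE4.1).

// Sign-extending int8 lanes to int16 with SSE2 only, as in the patch.
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
    alignas(16) int8_t  src[16];
    alignas(16) int16_t dst[16];
    for (int i = 0; i < 16; ++i) src[i] = int8_t(i * 17 - 128);   // mix of signs

    __m128i v  = _mm_load_si128(reinterpret_cast<const __m128i*>(src));
    __m128i lo = _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8);
    __m128i hi = _mm_srai_epi16(_mm_unpackhi_epi8(v, v), 8);
    _mm_store_si128(reinterpret_cast<__m128i*>(dst),     lo);
    _mm_store_si128(reinterpret_cast<__m128i*>(dst + 8), hi);

    for (int i = 0; i < 16; ++i)
        if (dst[i] != src[i]) { std::printf("mismatch at %d\n", i); return 1; }
    std::puts("sign extension ok");
}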
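The new hunk after the static-evaluation block in search() adjusts the history of the previous move when it was quiet, non-checking, not the first move tried at that node, and the depth is below 7: the sum of the two static evaluations (with the two Tempo terms cancelled out) measures how much the move swung the position, and the previous move is rewarded if it improved its side's standing, penalised otherwise, clamped to +-1000. The sketch below just evaluates that expression for made-up numbers; Tempo = 28 is assumed from the classical evaluation of this Stockfish version and should be treated as an assumption.

// Worked example of the clamped history bonus for the previous move.
#include <algorithm>
#include <cstdio>

int main() {
    const int Tempo = 28;       // assumed side-to-move bonus
    int depth          = 5;
    int prevStaticEval = 61;    // (ss-1)->staticEval, opponent's point of view (made up)
    int ourStaticEval  = -17;   // ss->staticEval, our point of view (made up)

    int bonus = std::clamp(-(depth + 1) * 2 * (prevStaticEval + ourStaticEval - 2 * Tempo),
                           -1000, 1000);
    std::printf("history bonus for the previous move: %d\n", bonus);   // prints 144
}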