SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
- nnue/evaluate_nnue.cpp nnue/features/half_kp.cpp
+ nnue/evaluate_nnue.cpp nnue/features/half_ka_v2.cpp
OBJS = $(notdir $(SRCS:.cpp=.o))
if (filename.has_value())
actualFilename = filename.value();
- else
+ else
{
if (eval_file_loaded != EvalFileDefaultName)
{
// Scale and shift NNUE for compatibility with search and classical evaluation
auto adjusted_NNUE = [&]()
{
- int material = pos.non_pawn_material() + 4 * PawnValueMg * pos.count<PAWN>();
- int scale = 580
- + material / 32
- - 4 * pos.rule50_count();
+
+ int scale = 903 + 28 * pos.count<PAWN>() + 28 * pos.non_pawn_material() / 1024;
Value nnue = NNUE::evaluate(pos) * scale / 1024 + Time.tempoNNUE;
Value psq = Value(abs(eg_value(pos.psq_score())));
int r50 = 16 + pos.rule50_count();
bool largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
- bool classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
+ bool classical = largePsq;
// Use classical evaluation for really low piece endgames.
// One critical case is the draw for bishop + A/H file pawn vs naked king.
&& !lowPieceEndgame
&& ( abs(v) * 16 < NNUEThreshold2 * r50
|| ( pos.opposite_bishops()
- && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
- && !(pos.this_thread()->nodes & 0xB))))
+ && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50)))
v = adjusted_NNUE();
}
// The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
// for the build process (profile-build and fishtest) to work. Do not change the
// name of the macro, as it is used in the Makefile.
- #define EvalFileDefaultName "nn-62ef826d1a6d.nnue"
+ #define EvalFileDefaultName "nn-8a08400ed089.nnue"
namespace NNUE {
LargePagePtr<FeatureTransformer> featureTransformer;
// Evaluation function
- AlignedPtr<Network> network;
+ AlignedPtr<Network> network[LayerStacks];
// Evaluation function file name
std::string fileName;
void initialize() {
Detail::initialize(featureTransformer);
- Detail::initialize(network);
+ for (std::size_t i = 0; i < LayerStacks; ++i)
+ Detail::initialize(network[i]);
}
// Read network header
std::uint32_t version, size;
version = read_little_endian<std::uint32_t>(stream);
- *hashValue = read_little_endian<std::uint32_t>(stream);
+ *hashValue = read_little_endian<std::uint32_t>(stream);
size = read_little_endian<std::uint32_t>(stream);
if (!stream || version != Version) return false;
desc->resize(size);
if (!read_header(stream, &hashValue, &netDescription)) return false;
if (hashValue != HashValue) return false;
if (!Detail::read_parameters(stream, *featureTransformer)) return false;
- if (!Detail::read_parameters(stream, *network)) return false;
+ for (std::size_t i = 0; i < LayerStacks; ++i)
+ if (!Detail::read_parameters(stream, *(network[i]))) return false;
return stream && stream.peek() == std::ios::traits_type::eof();
}
if (!write_header(stream, HashValue, netDescription)) return false;
if (!Detail::write_parameters(stream, *featureTransformer)) return false;
- if (!Detail::write_parameters(stream, *network)) return false;
+ for (std::size_t i = 0; i < LayerStacks; ++i)
+ if (!Detail::write_parameters(stream, *(network[i]))) return false;
return (bool)stream;
}
ASSERT_ALIGNED(transformedFeatures, alignment);
ASSERT_ALIGNED(buffer, alignment);
- featureTransformer->transform(pos, transformedFeatures);
- const auto output = network->propagate(transformedFeatures, buffer);
+ const std::size_t bucket = (pos.count<ALL_PIECES>() - 1) / 4;
- return static_cast<Value>(output[0] / OutputScale);
+ const auto [psqt, lazy] = featureTransformer->transform(pos, transformedFeatures, bucket);
+ if (lazy) {
+ return static_cast<Value>(psqt / OutputScale);
+ } else {
+ const auto output = network[bucket]->propagate(transformedFeatures, buffer);
+ return static_cast<Value>((output[0] + psqt) / OutputScale);
+ }
}
// Load eval, from a file stream or a memory stream
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-//Definition of input features HalfKP of NNUE evaluation function
+// Definition of input features HalfKAv2 of NNUE evaluation function
-#include "half_kp.h"
+#include "half_ka_v2.h"
#include "../../position.h"
namespace Stockfish::Eval::NNUE::Features {
// Orient a square according to perspective (rotates by 180 for black)
- inline Square HalfKP::orient(Color perspective, Square s) {
- return Square(int(s) ^ (bool(perspective) * 63));
+ inline Square HalfKAv2::orient(Color perspective, Square s) {
+ return Square(int(s) ^ (bool(perspective) * 56));
}
// Index of a feature for a given king position and another piece on some square
- inline IndexType HalfKP::make_index(Color perspective, Square s, Piece pc, Square ksq) {
+ inline IndexType HalfKAv2::make_index(Color perspective, Square s, Piece pc, Square ksq) {
return IndexType(orient(perspective, s) + PieceSquareIndex[perspective][pc] + PS_NB * ksq);
}
// Get a list of indices for active features
- void HalfKP::append_active_indices(
+ void HalfKAv2::append_active_indices(
const Position& pos,
Color perspective,
ValueListInserter<IndexType> active
) {
Square ksq = orient(perspective, pos.square<KING>(perspective));
- Bitboard bb = pos.pieces() & ~pos.pieces(KING);
+ Bitboard bb = pos.pieces();
while (bb)
{
Square s = pop_lsb(bb);
// append_changed_indices() : get a list of indices for recently changed features
- void HalfKP::append_changed_indices(
+ void HalfKAv2::append_changed_indices(
Square ksq,
StateInfo* st,
Color perspective,
Square oriented_ksq = orient(perspective, ksq);
for (int i = 0; i < dp.dirty_num; ++i) {
Piece pc = dp.piece[i];
- if (type_of(pc) == KING) continue;
if (dp.from[i] != SQ_NONE)
removed.push_back(make_index(perspective, dp.from[i], pc, oriented_ksq));
if (dp.to[i] != SQ_NONE)
}
}
- int HalfKP::update_cost(StateInfo* st) {
+ int HalfKAv2::update_cost(StateInfo* st) {
return st->dirtyPiece.dirty_num;
}
- int HalfKP::refresh_cost(const Position& pos) {
- return pos.count<ALL_PIECES>() - 2;
+ int HalfKAv2::refresh_cost(const Position& pos) {
+ return pos.count<ALL_PIECES>();
}
- bool HalfKP::requires_refresh(StateInfo* st, Color perspective) {
+ bool HalfKAv2::requires_refresh(StateInfo* st, Color perspective) {
return st->dirtyPiece.piece[0] == make_piece(perspective, KING);
}
//Definition of input features HalfKAv2 of NNUE evaluation function
-#ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
-#define NNUE_FEATURES_HALF_KP_H_INCLUDED
+#ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
+#define NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
#include "../nnue_common.h"
namespace Stockfish::Eval::NNUE::Features {
- // Feature HalfKP: Combination of the position of own king
- // and the position of pieces other than kings
- class HalfKP {
+ // Feature HalfKAv2: Combination of the position of own king
+ // and the position of pieces
+ class HalfKAv2 {
// unique number for each piece type on each square
enum {
PS_NONE = 0,
- PS_W_PAWN = 1,
- PS_B_PAWN = 1 * SQUARE_NB + 1,
- PS_W_KNIGHT = 2 * SQUARE_NB + 1,
- PS_B_KNIGHT = 3 * SQUARE_NB + 1,
- PS_W_BISHOP = 4 * SQUARE_NB + 1,
- PS_B_BISHOP = 5 * SQUARE_NB + 1,
- PS_W_ROOK = 6 * SQUARE_NB + 1,
- PS_B_ROOK = 7 * SQUARE_NB + 1,
- PS_W_QUEEN = 8 * SQUARE_NB + 1,
- PS_B_QUEEN = 9 * SQUARE_NB + 1,
- PS_NB = 10 * SQUARE_NB + 1
+ PS_W_PAWN = 0,
+ PS_B_PAWN = 1 * SQUARE_NB,
+ PS_W_KNIGHT = 2 * SQUARE_NB,
+ PS_B_KNIGHT = 3 * SQUARE_NB,
+ PS_W_BISHOP = 4 * SQUARE_NB,
+ PS_B_BISHOP = 5 * SQUARE_NB,
+ PS_W_ROOK = 6 * SQUARE_NB,
+ PS_B_ROOK = 7 * SQUARE_NB,
+ PS_W_QUEEN = 8 * SQUARE_NB,
+ PS_B_QUEEN = 9 * SQUARE_NB,
+ PS_KING = 10 * SQUARE_NB,
+ PS_NB = 11 * SQUARE_NB
};
static constexpr IndexType PieceSquareIndex[COLOR_NB][PIECE_NB] = {
// convention: W - us, B - them
// viewed from other side, W and B are reversed
- { PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_NONE, PS_NONE,
- PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_NONE, PS_NONE },
- { PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_NONE, PS_NONE,
- PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_NONE, PS_NONE }
+ { PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_KING, PS_NONE,
+ PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_KING, PS_NONE },
+ { PS_NONE, PS_B_PAWN, PS_B_KNIGHT, PS_B_BISHOP, PS_B_ROOK, PS_B_QUEEN, PS_KING, PS_NONE,
+ PS_NONE, PS_W_PAWN, PS_W_KNIGHT, PS_W_BISHOP, PS_W_ROOK, PS_W_QUEEN, PS_KING, PS_NONE }
};
// Orient a square according to perspective (rotates by 180 for black)
public:
// Feature name
- static constexpr const char* Name = "HalfKP(Friend)";
+ static constexpr const char* Name = "HalfKAv2(Friend)";
// Hash value embedded in the evaluation file
- static constexpr std::uint32_t HashValue = 0x5D69D5B8u;
+ static constexpr std::uint32_t HashValue = 0x5f234cb8u;
// Number of feature dimensions
static constexpr IndexType Dimensions =
static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_NB);
- // Maximum number of simultaneously active features. 30 because kins are not included.
- static constexpr IndexType MaxActiveDimensions = 30;
+ // Maximum number of simultaneously active features.
+ static constexpr IndexType MaxActiveDimensions = 32;
// Get a list of indices for active features
static void append_active_indices(
} // namespace Stockfish::Eval::NNUE::Features
-#endif // #ifndef NNUE_FEATURES_HALF_KP_H_INCLUDED
+#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
if (!previousLayer.read_parameters(stream)) return false;
for (std::size_t i = 0; i < OutputDimensions; ++i)
biases[i] = read_little_endian<BiasType>(stream);
-#if !defined (USE_SSSE3)
for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
+#if !defined (USE_SSSE3)
weights[i] = read_little_endian<WeightType>(stream);
#else
- std::unique_ptr<uint32_t[]> indexMap = std::make_unique<uint32_t[]>(OutputDimensions * PaddedInputDimensions);
- for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i) {
- const uint32_t scrambledIdx =
+ weights[
(i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
i / PaddedInputDimensions * 4 +
- i % 4;
- weights[scrambledIdx] = read_little_endian<WeightType>(stream);
- indexMap[scrambledIdx] = i;
- }
-
- // Determine if eights of weight and input products can be summed using 16bits
- // without saturation. We assume worst case combinations of 0 and 127 for all inputs.
- if (OutputDimensions > 1 && !stream.fail())
- {
- canSaturate16.count = 0;
-#if !defined(USE_VNNI)
- for (IndexType i = 0; i < PaddedInputDimensions; i += 16)
- for (IndexType j = 0; j < OutputDimensions; ++j)
- for (int x = 0; x < 2; ++x)
- {
- WeightType* w = &weights[i * OutputDimensions + j * 4 + x * 2];
- int sum[2] = {0, 0};
- for (int k = 0; k < 8; ++k)
- {
- IndexType idx = k / 2 * OutputDimensions * 4 + k % 2;
- sum[w[idx] < 0] += w[idx];
- }
- for (int sign : { -1, 1 })
- while (sign * sum[sign == -1] > 258)
- {
- int maxK = 0, maxW = 0;
- for (int k = 0; k < 8; ++k)
- {
- IndexType idx = k / 2 * OutputDimensions * 4 + k % 2;
- if (maxW < sign * w[idx])
- maxK = k, maxW = sign * w[idx];
- }
-
- IndexType idx = maxK / 2 * OutputDimensions * 4 + maxK % 2;
- sum[sign == -1] -= w[idx];
- const uint32_t scrambledIdx = idx + i * OutputDimensions + j * 4 + x * 2;
- canSaturate16.add(j, i + maxK / 2 * 4 + maxK % 2 + x * 2, w[idx], indexMap[scrambledIdx]);
- w[idx] = 0;
- }
- }
-
- // Non functional optimization for faster more linear access
- std::sort(canSaturate16.ids, canSaturate16.ids + canSaturate16.count,
- [](const typename CanSaturate::Entry& e1, const typename CanSaturate::Entry& e2)
- { return e1.in == e2.in ? e1.out < e2.out : e1.in < e2.in; });
-#endif
- }
+ i % 4
+ ] = read_little_endian<WeightType>(stream);
#endif
return !stream.fail();
i % 4
];
}
- for (int i = 0; i < canSaturate16.count; ++i)
- unscrambledWeights[canSaturate16.ids[i].wIdx] = canSaturate16.ids[i].w;
for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
write_little_endian<WeightType>(stream, unscrambledWeights[i]);
__m512i product1 = _mm512_maddubs_epi16(a1, b1);
__m512i product2 = _mm512_maddubs_epi16(a2, b2);
__m512i product3 = _mm512_maddubs_epi16(a3, b3);
- product0 = _mm512_add_epi16(product0, product1);
- product2 = _mm512_add_epi16(product2, product3);
- product0 = _mm512_add_epi16(product0, product2);
+ product0 = _mm512_adds_epi16(product0, product1);
product0 = _mm512_madd_epi16(product0, Ones512);
- acc = _mm512_add_epi32(acc, product0);
+ product2 = _mm512_adds_epi16(product2, product3);
+ product2 = _mm512_madd_epi16(product2, Ones512);
+ acc = _mm512_add_epi32(acc, _mm512_add_epi32(product0, product2));
#endif
};
__m256i product1 = _mm256_maddubs_epi16(a1, b1);
__m256i product2 = _mm256_maddubs_epi16(a2, b2);
__m256i product3 = _mm256_maddubs_epi16(a3, b3);
- product0 = _mm256_add_epi16(product0, product1);
- product2 = _mm256_add_epi16(product2, product3);
- product0 = _mm256_add_epi16(product0, product2);
+ product0 = _mm256_adds_epi16(product0, product1);
product0 = _mm256_madd_epi16(product0, Ones256);
- acc = _mm256_add_epi32(acc, product0);
+ product2 = _mm256_adds_epi16(product2, product3);
+ product2 = _mm256_madd_epi16(product2, Ones256);
+ acc = _mm256_add_epi32(acc, _mm256_add_epi32(product0, product2));
#endif
};
__m128i product1 = _mm_maddubs_epi16(a1, b1);
__m128i product2 = _mm_maddubs_epi16(a2, b2);
__m128i product3 = _mm_maddubs_epi16(a3, b3);
- product0 = _mm_add_epi16(product0, product1);
- product2 = _mm_add_epi16(product2, product3);
- product0 = _mm_add_epi16(product0, product2);
+ product0 = _mm_adds_epi16(product0, product1);
product0 = _mm_madd_epi16(product0, Ones128);
- acc = _mm_add_epi32(acc, product0);
+ product2 = _mm_adds_epi16(product2, product3);
+ product2 = _mm_madd_epi16(product2, Ones128);
+ acc = _mm_add_epi32(acc, _mm_add_epi32(product0, product2));
};
#endif
#endif
#if defined (USE_SSSE3)
+ // Different layout: we always process 4 inputs at a time.
+ static_assert(InputDimensions % 4 == 0);
const auto output = reinterpret_cast<OutputType*>(buffer);
const auto inputVector = reinterpret_cast<const vec_t*>(input);
// because then it is also an input dimension.
if constexpr (OutputDimensions % OutputSimdWidth == 0)
{
- constexpr IndexType NumChunks = PaddedInputDimensions / 4;
+ constexpr IndexType NumChunks = InputDimensions / 4;
const auto input32 = reinterpret_cast<const std::int32_t*>(input);
vec_t* outptr = reinterpret_cast<vec_t*>(output);
for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j)
vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]);
}
- for (int i = 0; i < canSaturate16.count; ++i)
- output[canSaturate16.ids[i].out] += input[canSaturate16.ids[i].in] * canSaturate16.ids[i].w;
}
else if constexpr (OutputDimensions == 1)
{
auto output = reinterpret_cast<OutputType*>(buffer);
#if defined(USE_SSE2)
- constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
+ // At least a multiple of 16, with SSE2.
+ static_assert(InputDimensions % SimdWidth == 0);
+ constexpr IndexType NumChunks = InputDimensions / SimdWidth;
const __m128i Zeros = _mm_setzero_si128();
const auto inputVector = reinterpret_cast<const __m128i*>(input);
#elif defined(USE_MMX)
- constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
+ static_assert(InputDimensions % SimdWidth == 0);
+ constexpr IndexType NumChunks = InputDimensions / SimdWidth;
const __m64 Zeros = _mm_setzero_si64();
const auto inputVector = reinterpret_cast<const __m64*>(input);
#elif defined(USE_NEON)
- constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
+ static_assert(InputDimensions % SimdWidth == 0);
+ constexpr IndexType NumChunks = InputDimensions / SimdWidth;
const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
#endif
alignas(CacheLineSize) BiasType biases[OutputDimensions];
alignas(CacheLineSize) WeightType weights[OutputDimensions * PaddedInputDimensions];
-#if defined (USE_SSSE3)
- struct CanSaturate {
- int count;
- struct Entry {
- uint32_t wIdx;
- uint16_t out;
- uint16_t in;
- int8_t w;
- } ids[PaddedInputDimensions * OutputDimensions * 3 / 4];
-
- void add(int i, int j, int8_t w, uint32_t wIdx) {
- ids[count].wIdx = wIdx;
- ids[count].out = i;
- ids[count].in = j;
- ids[count].w = w;
- ++count;
- }
- } canSaturate16;
-#endif
};
} // namespace Stockfish::Eval::NNUE::Layers
const auto output = reinterpret_cast<OutputType*>(buffer);
#if defined(USE_AVX2)
- constexpr IndexType NumChunks = InputDimensions / SimdWidth;
- const __m256i Zero = _mm256_setzero_si256();
- const __m256i Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- const auto in = reinterpret_cast<const __m256i*>(input);
- const auto out = reinterpret_cast<__m256i*>(output);
- for (IndexType i = 0; i < NumChunks; ++i) {
- const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
- _mm256_load_si256(&in[i * 4 + 0]),
- _mm256_load_si256(&in[i * 4 + 1])), WeightScaleBits);
- const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
- _mm256_load_si256(&in[i * 4 + 2]),
- _mm256_load_si256(&in[i * 4 + 3])), WeightScaleBits);
- _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
- _mm256_packs_epi16(words0, words1), Zero), Offsets));
+ if constexpr (InputDimensions % SimdWidth == 0) {
+ constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+ const __m256i Zero = _mm256_setzero_si256();
+ const __m256i Offsets = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ const auto in = reinterpret_cast<const __m256i*>(input);
+ const auto out = reinterpret_cast<__m256i*>(output);
+ for (IndexType i = 0; i < NumChunks; ++i) {
+ const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
+ _mm256_load_si256(&in[i * 4 + 0]),
+ _mm256_load_si256(&in[i * 4 + 1])), WeightScaleBits);
+ const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
+ _mm256_load_si256(&in[i * 4 + 2]),
+ _mm256_load_si256(&in[i * 4 + 3])), WeightScaleBits);
+ _mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
+ _mm256_packs_epi16(words0, words1), Zero), Offsets));
+ }
+ } else {
+ constexpr IndexType NumChunks = InputDimensions / (SimdWidth / 2);
+ const __m128i Zero = _mm_setzero_si128();
+ const auto in = reinterpret_cast<const __m128i*>(input);
+ const auto out = reinterpret_cast<__m128i*>(output);
+ for (IndexType i = 0; i < NumChunks; ++i) {
+ const __m128i words0 = _mm_srai_epi16(_mm_packs_epi32(
+ _mm_load_si128(&in[i * 4 + 0]),
+ _mm_load_si128(&in[i * 4 + 1])), WeightScaleBits);
+ const __m128i words1 = _mm_srai_epi16(_mm_packs_epi32(
+ _mm_load_si128(&in[i * 4 + 2]),
+ _mm_load_si128(&in[i * 4 + 3])), WeightScaleBits);
+ const __m128i packedbytes = _mm_packs_epi16(words0, words1);
+ _mm_store_si128(&out[i], _mm_max_epi8(packedbytes, Zero));
+ }
}
- constexpr IndexType Start = NumChunks * SimdWidth;
+ constexpr IndexType Start =
+ InputDimensions % SimdWidth == 0
+ ? InputDimensions / SimdWidth * SimdWidth
+ : InputDimensions / (SimdWidth / 2) * (SimdWidth / 2);
#elif defined(USE_SSE2)
constexpr IndexType NumChunks = InputDimensions / SimdWidth;
// Class that holds the result of affine transformation of input features
struct alignas(CacheLineSize) Accumulator {
- std::int16_t
- accumulation[2][TransformedFeatureDimensions];
+ std::int16_t accumulation[2][TransformedFeatureDimensions];
+ std::int32_t psqtAccumulation[2][PSQTBuckets];
AccumulatorState state[2];
};
#include "nnue_common.h"
-#include "features/half_kp.h"
+#include "features/half_ka_v2.h"
#include "layers/input_slice.h"
#include "layers/affine_transform.h"
namespace Stockfish::Eval::NNUE {
// Input features used in evaluation function
- using FeatureSet = Features::HalfKP;
+ using FeatureSet = Features::HalfKAv2;
// Number of input feature dimensions after conversion
- constexpr IndexType TransformedFeatureDimensions = 256;
+ constexpr IndexType TransformedFeatureDimensions = 512;
+ constexpr IndexType PSQTBuckets = 8;
+ constexpr IndexType LayerStacks = 8;
namespace Layers {
// Define network structure
using InputLayer = InputSlice<TransformedFeatureDimensions * 2>;
- using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 32>>;
+ using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 16>>;
using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
using OutputLayer = AffineTransform<HiddenLayer2, 1>;
namespace Stockfish::Eval::NNUE {
// Version of the evaluation file
- constexpr std::uint32_t Version = 0x7AF32F16u;
+ constexpr std::uint32_t Version = 0x7AF32F20u;
// Constant used in evaluation value calculation
constexpr int OutputScale = 16;
// vector registers.
#define VECTOR
+ static_assert(PSQTBuckets == 8, "Assumed by the current choice of constants.");
+
#ifdef USE_AVX512
typedef __m512i vec_t;
+ typedef __m256i psqt_vec_t;
#define vec_load(a) _mm512_load_si512(a)
#define vec_store(a,b) _mm512_store_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+ #define vec_load_psqt(a) _mm256_load_si256(a)
+ #define vec_store_psqt(a,b) _mm256_store_si256(a,b)
+ #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
+ #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
+ #define vec_zero_psqt() _mm256_setzero_si256()
static constexpr IndexType NumRegs = 8; // only 8 are needed
+ static constexpr IndexType NumPsqtRegs = 1;
#elif USE_AVX2
typedef __m256i vec_t;
+ typedef __m256i psqt_vec_t;
#define vec_load(a) _mm256_load_si256(a)
#define vec_store(a,b) _mm256_store_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+ #define vec_load_psqt(a) _mm256_load_si256(a)
+ #define vec_store_psqt(a,b) _mm256_store_si256(a,b)
+ #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
+ #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
+ #define vec_zero_psqt() _mm256_setzero_si256()
static constexpr IndexType NumRegs = 16;
+ static constexpr IndexType NumPsqtRegs = 1;
#elif USE_SSE2
typedef __m128i vec_t;
+ typedef __m128i psqt_vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_epi16(a,b)
#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+ #define vec_load_psqt(a) (*(a))
+ #define vec_store_psqt(a,b) *(a)=(b)
+ #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
+ #define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
+ #define vec_zero_psqt() _mm_setzero_si128()
static constexpr IndexType NumRegs = Is64Bit ? 16 : 8;
+ static constexpr IndexType NumPsqtRegs = 2;
#elif USE_MMX
typedef __m64 vec_t;
+ typedef std::int32_t psqt_vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) _mm_add_pi16(a,b)
#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+ #define vec_load_psqt(a) (*(a))
+ #define vec_store_psqt(a,b) *(a)=(b)
+ #define vec_add_psqt_32(a,b) a+b
+ #define vec_sub_psqt_32(a,b) a-b
+ #define vec_zero_psqt() 0
static constexpr IndexType NumRegs = 8;
+ static constexpr IndexType NumPsqtRegs = 8;
#elif USE_NEON
typedef int16x8_t vec_t;
+ typedef int32x4_t psqt_vec_t;
#define vec_load(a) (*(a))
#define vec_store(a,b) *(a)=(b)
#define vec_add_16(a,b) vaddq_s16(a,b)
#define vec_sub_16(a,b) vsubq_s16(a,b)
+ #define vec_load_psqt(a) (*(a))
+ #define vec_store_psqt(a,b) *(a)=(b)
+ #define vec_add_psqt_32(a,b) vaddq_s32(a,b)
+ #define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
+ #define vec_zero_psqt() psqt_vec_t{0}
static constexpr IndexType NumRegs = 16;
+ static constexpr IndexType NumPsqtRegs = 2;
#else
#undef VECTOR
// Number of output dimensions for one side
static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;
+ static constexpr int LazyThreshold = 1400;
+
#ifdef VECTOR
static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2;
+ static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions");
+ static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets");
#endif
public:
biases[i] = read_little_endian<BiasType>(stream);
for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i)
weights[i] = read_little_endian<WeightType>(stream);
+ for (std::size_t i = 0; i < PSQTBuckets * InputDimensions; ++i)
+ psqtWeights[i] = read_little_endian<PSQTWeightType>(stream);
return !stream.fail();
}
}
// Convert input features
- void transform(const Position& pos, OutputType* output) const {
+ std::pair<std::int32_t, bool> transform(const Position& pos, OutputType* output, int bucket) const {
update_accumulator(pos, WHITE);
update_accumulator(pos, BLACK);
+ const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
const auto& accumulation = pos.state()->accumulator.accumulation;
+ const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation;
+
+ const auto psqt = (
+ psqtAccumulation[static_cast<int>(perspectives[0])][bucket]
+ - psqtAccumulation[static_cast<int>(perspectives[1])][bucket]
+ ) / 2;
+
+ if (abs(psqt) > LazyThreshold * OutputScale)
+ return { psqt, true };
#if defined(USE_AVX512)
constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
const int8x8_t Zero = {0};
#endif
- const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
for (IndexType p = 0; p < 2; ++p) {
const IndexType offset = HalfDimensions * p;
#if defined(USE_MMX)
_mm_empty();
#endif
+
+ return { psqt, false };
}
private:
// Gcc-10.2 unnecessarily spills AVX2 registers if this array
// is defined in the VECTOR code below, once in each branch
vec_t acc[NumRegs];
+ psqt_vec_t psqt[NumPsqtRegs];
#endif
// Look for a usable accumulator of an earlier position. We keep track
}
}
+ for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
+ {
+ // Load accumulator
+ auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
+ &st->accumulator.psqtAccumulation[perspective][j * PsqtTileHeight]);
+ for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+ psqt[k] = vec_load_psqt(&accTilePsqt[k]);
+
+ for (IndexType i = 0; states_to_update[i]; ++i)
+ {
+ // Difference calculation for the deactivated features
+ for (const auto index : removed[i])
+ {
+ const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
+ auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
+ for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+ psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
+ }
+
+ // Difference calculation for the activated features
+ for (const auto index : added[i])
+ {
+ const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
+ auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
+ for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+ psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+ }
+
+ // Store accumulator
+ accTilePsqt = reinterpret_cast<psqt_vec_t*>(
+ &states_to_update[i]->accumulator.psqtAccumulation[perspective][j * PsqtTileHeight]);
+ for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+ vec_store_psqt(&accTilePsqt[k], psqt[k]);
+ }
+ }
+
#else
for (IndexType i = 0; states_to_update[i]; ++i)
{
std::memcpy(states_to_update[i]->accumulator.accumulation[perspective],
st->accumulator.accumulation[perspective],
HalfDimensions * sizeof(BiasType));
+
+ for (std::size_t k = 0; k < PSQTBuckets; ++k)
+ states_to_update[i]->accumulator.psqtAccumulation[perspective][k] = st->accumulator.psqtAccumulation[perspective][k];
+
st = states_to_update[i];
// Difference calculation for the deactivated features
for (IndexType j = 0; j < HalfDimensions; ++j)
st->accumulator.accumulation[perspective][j] -= weights[offset + j];
+
+ for (std::size_t k = 0; k < PSQTBuckets; ++k)
+ st->accumulator.psqtAccumulation[perspective][k] -= psqtWeights[index * PSQTBuckets + k];
}
// Difference calculation for the activated features
for (IndexType j = 0; j < HalfDimensions; ++j)
st->accumulator.accumulation[perspective][j] += weights[offset + j];
+
+ for (std::size_t k = 0; k < PSQTBuckets; ++k)
+ st->accumulator.psqtAccumulation[perspective][k] += psqtWeights[index * PSQTBuckets + k];
}
}
#endif
vec_store(&accTile[k], acc[k]);
}
+ for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
+ {
+ for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+ psqt[k] = vec_zero_psqt();
+
+ for (const auto index : active)
+ {
+ const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
+ auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
+
+ for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+ psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+ }
+
+ auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
+ &accumulator.psqtAccumulation[perspective][j * PsqtTileHeight]);
+ for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+ vec_store_psqt(&accTilePsqt[k], psqt[k]);
+ }
+
#else
std::memcpy(accumulator.accumulation[perspective], biases,
HalfDimensions * sizeof(BiasType));
+ for (std::size_t k = 0; k < PSQTBuckets; ++k)
+ accumulator.psqtAccumulation[perspective][k] = 0;
+
for (const auto index : active)
{
const IndexType offset = HalfDimensions * index;
for (IndexType j = 0; j < HalfDimensions; ++j)
accumulator.accumulation[perspective][j] += weights[offset + j];
+
+ for (std::size_t k = 0; k < PSQTBuckets; ++k)
+ accumulator.psqtAccumulation[perspective][k] += psqtWeights[index * PSQTBuckets + k];
}
#endif
}
using BiasType = std::int16_t;
using WeightType = std::int16_t;
+ using PSQTWeightType = std::int32_t;
alignas(CacheLineSize) BiasType biases[HalfDimensions];
alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions];
+ alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];
};
} // namespace Stockfish::Eval::NNUE
// Futility margin
Value futility_margin(Depth d, bool improving) {
- return Value(234 * (d - improving));
+ return Value(231 * (d - improving));
}
// Reductions lookup table, initialized at startup
&& (ss-1)->statScore < 24185
&& eval >= beta
&& eval >= ss->staticEval
- && ss->staticEval >= beta - 24 * depth - 34 * improving + 162 * ss->ttPv + 159
+ && ss->staticEval >= beta - 22 * depth - 34 * improving + 162 * ss->ttPv + 159
&& !excludedMove
&& pos.non_pawn_material(us)
&& (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
+ (*contHist[0])[movedPiece][to_sq(move)]
+ (*contHist[1])[movedPiece][to_sq(move)]
+ (*contHist[3])[movedPiece][to_sq(move)]
- - 4741;
+ - 4791;
// Decrease/increase reduction for moves with a good/bad history (~30 Elo)
if (!ss->inCheck)