X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=src%2Ftt.cpp;h=53e78595045548bed917e75132dfc084c60ed88a;hb=HEAD;hp=d0e2d729c8c766e883092277bfc5fd97b2bb58e2;hpb=40548c9153ea89c0b27b198efb443c5bb9b9c490;p=stockfish diff --git a/src/tt.cpp b/src/tt.cpp index d0e2d729..4b55e53f 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -1,7 +1,6 @@ /* Stockfish, a UCI chess playing engine derived from Glaurung 2.1 - Copyright (C) 2004-2008 Tord Romstad (Glaurung author) - Copyright (C) 2008-2015 Marco Costalba, Joona Kiiski, Tord Romstad + Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file) Stockfish is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,97 +16,234 @@ along with this program. If not, see . */ -#include // For std::memset +#include "tt.h" + +#include +#include +#include +#include #include -#include "bitboard.h" -#include "tt.h" +#include "memory.h" +#include "misc.h" +#include "syzygy/tbprobe.h" +#include "thread.h" + +namespace Stockfish { + + +// TTEntry struct is the 10 bytes transposition table entry, defined as below: +// +// key 16 bit +// depth 8 bit +// generation 5 bit +// pv node 1 bit +// bound type 2 bit +// move 16 bit +// value 16 bit +// evaluation 16 bit +// +// These fields are in the same order as accessed by TT::probe(), since memory is fastest sequentially. +// Equally, the store order in save() matches this order. + +struct TTEntry { + + // Convert internal bitfields to external types + TTData read() const { + return TTData{Move(move16), Value(value16), + Value(eval16), Depth(depth8 + DEPTH_ENTRY_OFFSET), + Bound(genBound8 & 0x3), bool(genBound8 & 0x4)}; + } + + bool is_occupied() const; + void save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8); + // The returned age is a multiple of TranspositionTable::GENERATION_DELTA + uint8_t relative_age(const uint8_t generation8) const; + + private: + friend class TranspositionTable; + + uint16_t key16; + uint8_t depth8; + uint8_t genBound8; + Move move16; + int16_t value16; + int16_t eval16; +}; + +// `genBound8` is where most of the details are. We use the following constants to manipulate 5 leading generation bits +// and 3 trailing miscellaneous bits. + +// These bits are reserved for other things. +static constexpr unsigned GENERATION_BITS = 3; +// increment for generation field +static constexpr int GENERATION_DELTA = (1 << GENERATION_BITS); +// cycle length +static constexpr int GENERATION_CYCLE = 255 + GENERATION_DELTA; +// mask to pull out generation number +static constexpr int GENERATION_MASK = (0xFF << GENERATION_BITS) & 0xFF; + +// DEPTH_ENTRY_OFFSET exists because 1) we use `bool(depth8)` as the occupancy check, but +// 2) we need to store negative depths for QS. (`depth8` is the only field with "spare bits": +// we sacrifice the ability to store depths greater than 1<<8 less the offset, as asserted in `save`.) +bool TTEntry::is_occupied() const { return bool(depth8); } + +// Populates the TTEntry with a new node's data, possibly +// overwriting an old position. The update is not atomic and can be racy. +void TTEntry::save( + Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8) { + + // Preserve the old ttmove if we don't have a new one + if (m || uint16_t(k) != key16) + move16 = m; + + // Overwrite less valuable entries (cheapest checks first) + if (b == BOUND_EXACT || uint16_t(k) != key16 || d - DEPTH_ENTRY_OFFSET + 2 * pv > depth8 - 4 + || relative_age(generation8)) + { + assert(d > DEPTH_ENTRY_OFFSET); + assert(d < 256 + DEPTH_ENTRY_OFFSET); + + key16 = uint16_t(k); + depth8 = uint8_t(d - DEPTH_ENTRY_OFFSET); + genBound8 = uint8_t(generation8 | uint8_t(pv) << 2 | b); + value16 = int16_t(v); + eval16 = int16_t(ev); + } +} + + +uint8_t TTEntry::relative_age(const uint8_t generation8) const { + // Due to our packed storage format for generation and its cyclic + // nature we add GENERATION_CYCLE (256 is the modulus, plus what + // is needed to keep the unrelated lowest n bits from affecting + // the result) to calculate the entry age correctly even after + // generation8 overflows into the next cycle. + return (GENERATION_CYCLE + generation8 - genBound8) & GENERATION_MASK; +} + + +// TTWriter is but a very thin wrapper around the pointer +TTWriter::TTWriter(TTEntry* tte) : + entry(tte) {} + +void TTWriter::write( + Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8) { + entry->save(k, v, pv, b, d, m, ev, generation8); +} -TranspositionTable TT; // Our global transposition table +// A TranspositionTable is an array of Cluster, of size clusterCount. Each cluster consists of ClusterSize number +// of TTEntry. Each non-empty TTEntry contains information on exactly one position. The size of a Cluster should +// divide the size of a cache line for best performance, as the cacheline is prefetched when possible. -/// TranspositionTable::resize() sets the size of the transposition table, -/// measured in megabytes. Transposition table consists of a power of 2 number -/// of clusters and each cluster consists of ClusterSize number of TTEntry. +static constexpr int ClusterSize = 3; -void TranspositionTable::resize(size_t mbSize) { +struct Cluster { + TTEntry entry[ClusterSize]; + char padding[2]; // Pad to 32 bytes +}; - size_t newClusterCount = size_t(1) << msb((mbSize * 1024 * 1024) / sizeof(Cluster)); +static_assert(sizeof(Cluster) == 32, "Suboptimal Cluster size"); - if (newClusterCount == clusterCount) - return; - clusterCount = newClusterCount; +// Sets the size of the transposition table, +// measured in megabytes. Transposition table consists +// of clusters and each cluster consists of ClusterSize number of TTEntry. +void TranspositionTable::resize(size_t mbSize, ThreadPool& threads) { + aligned_large_pages_free(table); - free(mem); - mem = calloc(clusterCount * sizeof(Cluster) + CacheLineSize - 1, 1); + clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster); - if (!mem) - { - std::cerr << "Failed to allocate " << mbSize - << "MB for transposition table." << std::endl; - exit(EXIT_FAILURE); - } + table = static_cast(aligned_large_pages_alloc(clusterCount * sizeof(Cluster))); - table = (Cluster*)((uintptr_t(mem) + CacheLineSize - 1) & ~(CacheLineSize - 1)); + if (!table) + { + std::cerr << "Failed to allocate " << mbSize << "MB for transposition table." << std::endl; + exit(EXIT_FAILURE); + } + + clear(threads); } -/// TranspositionTable::clear() overwrites the entire transposition table -/// with zeros. It is called whenever the table is resized, or when the -/// user asks the program to clear the table (from the UCI interface). +// Initializes the entire transposition table to zero, +// in a multi-threaded way. +void TranspositionTable::clear(ThreadPool& threads) { + generation8 = 0; + const size_t threadCount = threads.num_threads(); + + for (size_t i = 0; i < threadCount; ++i) + { + threads.run_on_thread(i, [this, i, threadCount]() { + // Each thread will zero its part of the hash table + const size_t stride = clusterCount / threadCount; + const size_t start = stride * i; + const size_t len = i + 1 != threadCount ? stride : clusterCount - start; -void TranspositionTable::clear() { + std::memset(&table[start], 0, len * sizeof(Cluster)); + }); + } - std::memset(table, 0, clusterCount * sizeof(Cluster)); + for (size_t i = 0; i < threadCount; ++i) + threads.wait_on_thread(i); } -/// TranspositionTable::probe() looks up the current position in the transposition -/// table. It returns true and a pointer to the TTEntry if the position is found. -/// Otherwise, it returns false and a pointer to an empty or least valuable TTEntry -/// to be replaced later. A TTEntry t1 is considered to be more valuable than a -/// TTEntry t2 if t1 is from the current search and t2 is from a previous search, -/// or if the depth of t1 is bigger than the depth of t2. +// Returns an approximation of the hashtable +// occupation during a search. The hash is x permill full, as per UCI protocol. +// Only counts entries which match the current generation. +int TranspositionTable::hashfull() const { + + int cnt = 0; + for (int i = 0; i < 1000; ++i) + for (int j = 0; j < ClusterSize; ++j) + cnt += table[i].entry[j].is_occupied() + && (table[i].entry[j].genBound8 & GENERATION_MASK) == generation8; + + return cnt / ClusterSize; +} -TTEntry* TranspositionTable::probe(const Key key, bool& found) const { - TTEntry* const tte = first_entry(key); - const uint16_t key16 = key >> 48; // Use the high 16 bits as key inside the cluster +void TranspositionTable::new_search() { + // increment by delta to keep lower bits as is + generation8 += GENERATION_DELTA; +} - for (int i = 0; i < ClusterSize; ++i) - if (!tte[i].key16 || tte[i].key16 == key16) - { - if (tte[i].key16) - tte[i].genBound8 = uint8_t(generation8 | tte[i].bound()); // Refresh - return found = (bool)tte[i].key16, &tte[i]; - } +uint8_t TranspositionTable::generation() const { return generation8; } - // Find an entry to be replaced according to the replacement strategy - TTEntry* replace = tte; - for (int i = 1; i < ClusterSize; ++i) - if ( (( tte[i].genBound8 & 0xFC) == generation8 || tte[i].bound() == BOUND_EXACT) - - ((replace->genBound8 & 0xFC) == generation8) - - (tte[i].depth8 < replace->depth8) < 0) - replace = &tte[i]; - return found = false, replace; +// Looks up the current position in the transposition +// table. It returns true if the position is found. +// Otherwise, it returns false and a pointer to an empty or least valuable TTEntry +// to be replaced later. The replace value of an entry is calculated as its depth +// minus 8 times its relative age. TTEntry t1 is considered more valuable than +// TTEntry t2 if its replace value is greater than that of t2. +std::tuple TranspositionTable::probe(const Key key) const { + + TTEntry* const tte = first_entry(key); + const uint16_t key16 = uint16_t(key); // Use the low 16 bits as key inside the cluster + + for (int i = 0; i < ClusterSize; ++i) + if (tte[i].key16 == key16) + // This gap is the main place for read races. + // After `read()` completes that copy is final, but may be self-inconsistent. + return {tte[i].is_occupied(), tte[i].read(), TTWriter(&tte[i])}; + + // Find an entry to be replaced according to the replacement strategy + TTEntry* replace = tte; + for (int i = 1; i < ClusterSize; ++i) + if (replace->depth8 - replace->relative_age(generation8) * 2 + > tte[i].depth8 - tte[i].relative_age(generation8) * 2) + replace = &tte[i]; + + return {false, TTData(), TTWriter(replace)}; } -/// Returns an approximation of the hashtable occupation during a search. The -/// hash is x permill full, as per UCI protocol. - -int TranspositionTable::hashfull() const -{ - int cnt = 0; - for (int i = 0; i < 1000 / ClusterSize; i++) - { - const TTEntry* tte = &table[i].entry[0]; - for (int j = 0; j < ClusterSize; j++) - if ((tte[j].genBound8 & 0xFC) == generation8) - cnt++; - } - return cnt; +TTEntry* TranspositionTable::first_entry(const Key key) const { + return &table[mul_hi64(key, clusterCount)].entry[0]; } + +} // namespace Stockfish