From: mstembera Date: Mon, 15 Jun 2020 06:35:07 +0000 (-0700) Subject: Use 128 bit multiply for TT index X-Git-Url: https://git.sesse.net/?p=stockfish;a=commitdiff_plain;h=1ea488d34c0b6a03fa3d89d289fe72fd1408cafd Use 128 bit multiply for TT index Remove super cluster stuff from TT and just use a 128 bit multiply. STC https://tests.stockfishchess.org/tests/view/5ee719b3aae8aec816ab7548 LLR: 2.94 (-2.94,2.94) {-1.50,0.50} Total: 12736 W: 2502 L: 2333 D: 7901 Ptnml(0-2): 191, 1452, 2944, 1559, 222 LTC https://tests.stockfishchess.org/tests/view/5ee732d1aae8aec816ab7556 LLR: 2.93 (-2.94,2.94) {-1.50,0.50} Total: 27584 W: 3431 L: 3350 D: 20803 Ptnml(0-2): 173, 2500, 8400, 2511, 208 Scheme back to being derived from https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ Also the default optimized version of the index calculation now uses fewer instructions. https://godbolt.org/z/Tktxbv Might benefit from mulx (requires -mbmi2) closes https://github.com/official-stockfish/Stockfish/pull/2744 bench: 4320954 --- diff --git a/src/misc.h b/src/misc.h index 05bfc7de..373f1b77 100644 --- a/src/misc.h +++ b/src/misc.h @@ -110,6 +110,19 @@ public: { return T(rand64() & rand64() & rand64()); } }; +inline uint64_t mul_hi64(uint64_t a, uint64_t b) { +#if defined(__GNUC__) && defined(IS_64BIT) + __extension__ typedef unsigned __int128 uint128; + return ((uint128)a * (uint128)b) >> 64; +#else + uint64_t aL = (uint32_t)a, aH = a >> 32; + uint64_t bL = (uint32_t)b, bH = b >> 32; + uint64_t c1 = (aL * bL) >> 32; + uint64_t c2 = aH * bL + c1; + uint64_t c3 = aL * bH + (uint32_t)c2; + return aH * bH + (c2 >> 32) + (c3 >> 32); +#endif +} /// Under Windows it is not possible for a process to run on more than one /// logical processor group. This usually means to be limited to use max 64 diff --git a/src/search.cpp b/src/search.cpp index cf89a892..67339ed7 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -662,7 +662,7 @@ namespace { // search to overwrite a previous full search TT value, so we use a different // position key in case of an excluded move. excludedMove = ss->excludedMove; - posKey = pos.key() ^ Key(excludedMove << 16); // Isn't a very good hash + posKey = pos.key() ^ (Key(excludedMove) << 48); // Isn't a very good hash tte = TT.probe(posKey, ttHit); ttValue = ttHit ? value_from_tt(tte->value(), ss->ply, pos.rule50_count()) : VALUE_NONE; ttMove = rootNode ? thisThread->rootMoves[thisThread->pvIdx].pv[0] diff --git a/src/tt.cpp b/src/tt.cpp index 92aaee00..d0a5d4e0 100644 --- a/src/tt.cpp +++ b/src/tt.cpp @@ -36,17 +36,17 @@ TranspositionTable TT; // Our global transposition table void TTEntry::save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev) { // Preserve any existing move for the same position - if (m || (k >> 48) != key16) + if (m || (uint16_t)k != key16) move16 = (uint16_t)m; // Overwrite less valuable entries - if ( (k >> 48) != key16 + if ((uint16_t)k != key16 || d - DEPTH_OFFSET > depth8 - 4 || b == BOUND_EXACT) { assert(d >= DEPTH_OFFSET); - key16 = (uint16_t)(k >> 48); + key16 = (uint16_t)k; value16 = (int16_t)v; eval16 = (int16_t)ev; genBound8 = (uint8_t)(TT.generation8 | uint8_t(pv) << 2 | b); @@ -65,10 +65,8 @@ void TranspositionTable::resize(size_t mbSize) { aligned_ttmem_free(mem); - superClusterCount = mbSize * 1024 * 1024 / (sizeof(Cluster) * ClustersPerSuperCluster); - - table = static_cast( - aligned_ttmem_alloc(superClusterCount * ClustersPerSuperCluster * sizeof(Cluster), mem)); + clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster); + table = static_cast(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem)); if (!mem) { std::cerr << "Failed to allocate " << mbSize @@ -91,8 +89,6 @@ void TranspositionTable::clear() { { threads.emplace_back([this, idx]() { - const size_t clusterCount = superClusterCount * ClustersPerSuperCluster; - // Thread binding gives faster search on systems with a first-touch policy if (Options["Threads"] > 8) WinProcGroup::bindThisThread(idx); @@ -121,7 +117,7 @@ void TranspositionTable::clear() { TTEntry* TranspositionTable::probe(const Key key, bool& found) const { TTEntry* const tte = first_entry(key); - const uint16_t key16 = key >> 48; // Use the high 16 bits as key inside the cluster + const uint16_t key16 = (uint16_t)key; // Use the low 16 bits as key inside the cluster for (int i = 0; i < ClusterSize; ++i) if (!tte[i].key16 || tte[i].key16 == key16) diff --git a/src/tt.h b/src/tt.h index 76db03da..3e1d0e99 100644 --- a/src/tt.h +++ b/src/tt.h @@ -66,7 +66,6 @@ private: class TranspositionTable { static constexpr int ClusterSize = 3; - static constexpr int ClustersPerSuperCluster = 256; struct Cluster { TTEntry entry[ClusterSize]; @@ -84,20 +83,13 @@ public: void clear(); TTEntry* first_entry(const Key key) const { - - // The index is computed from - // Idx = (K48 * SCC) / 2^40, with K48 the 48 lowest bits swizzled. - - const uint64_t firstTerm = uint32_t(key) * uint64_t(superClusterCount); - const uint64_t secondTerm = (uint16_t(key >> 32) * uint64_t(superClusterCount)) >> 16; - - return &table[(firstTerm + secondTerm) >> 24].entry[0]; + return &table[mul_hi64(key, clusterCount)].entry[0]; } private: friend struct TTEntry; - size_t superClusterCount; + size_t clusterCount; Cluster* table; void* mem; uint8_t generation8; // Size must be not bigger than TTEntry::genBound8 diff --git a/src/ucioption.cpp b/src/ucioption.cpp index 90190b53..7037ea57 100644 --- a/src/ucioption.cpp +++ b/src/ucioption.cpp @@ -56,7 +56,6 @@ bool CaseInsensitiveLess::operator() (const string& s1, const string& s2) const void init(OptionsMap& o) { - // At most 2^32 superclusters. Supercluster = 8 kB constexpr int MaxHashMB = Is64Bit ? 33554432 : 2048; o["Debug Log File"] << Option("", on_logger);