+// Specialization: a DTZ table is built from the already initialized
+// TBTable<WDL> of the same material, copying the precomputed fields
+// instead of deriving them again from the TB file name.
+template<>
+TBTable<DTZ>::TBTable(const TBTable<WDL>& wdl) : TBTable() {
+
+ // Use the corresponding WDL table to avoid recalculating all from scratch
+ key = wdl.key;
+ key2 = wdl.key2; // key of the color-flipped material (cf. the "both colors" insert in TBTables::add)
+ pieceCount = wdl.pieceCount;
+ hasPawns = wdl.hasPawns;
+ hasUniquePieces = wdl.hasUniquePieces;
+ pawnCount[0] = wdl.pawnCount[0];
+ pawnCount[1] = wdl.pawnCount[1];
+}
+
+// class TBTables creates and keeps ownership of the TBTable objects, one for
+// each TB file found. It supports a fast, hash based, table lookup. Populated
+// at init time, accessed at probe time.
+class TBTables {
+
+ // One hash bucket: the material key plus non-owning pointers to the WDL
+ // and DTZ tables for that material. A bucket with a null wdl pointer is
+ // treated as empty (see insert() and get() below).
+ struct Entry
+ {
+ Key key;
+ TBTable<WDL>* wdl;
+ TBTable<DTZ>* dtz;
+
+ // Pick the WDL or DTZ pointer according to Type. The detour through
+ // void* lets one expression yield either pointer type without an
+ // invalid cast in the branch not taken.
+ template <TBType Type>
+ TBTable<Type>* get() const {
+ return (TBTable<Type>*)(Type == WDL ? (void*)wdl : (void*)dtz);
+ }
+ };
+
+ static constexpr int Size = 1 << 12; // 4K table, indexed by key's 12 lsb
+ static constexpr int Overflow = 1; // Number of elements allowed to map to the last bucket
+
+ Entry hashTable[Size + Overflow];
+
+ // Owning storage for the tables. std::deque is used so that growth never
+ // invalidates pointers to existing elements, keeping Entry pointers valid.
+ std::deque<TBTable<WDL>> wdlTable;
+ std::deque<TBTable<DTZ>> dtzTable;
+
+ // Insert (key -> wdl/dtz) into the open-addressed hash table, probing
+ // linearly from the key's home bucket. Terminates the program if the
+ // table cannot accommodate the entry.
+ void insert(Key key, TBTable<WDL>* wdl, TBTable<DTZ>* dtz) {
+ uint32_t homeBucket = (uint32_t)key & (Size - 1);
+ Entry entry{ key, wdl, dtz };
+
+ // Ensure last element is empty to avoid overflow when looking up
+ for (uint32_t bucket = homeBucket; bucket < Size + Overflow - 1; ++bucket) {
+ Key otherKey = hashTable[bucket].key;
+ if (otherKey == key || !hashTable[bucket].get<WDL>()) {
+ hashTable[bucket] = entry;
+ return;
+ }
+
+ // Robin Hood hashing: If we've probed for longer than this element,
+ // insert here and search for a new spot for the other element instead.
+ uint32_t otherHomeBucket = (uint32_t)otherKey & (Size - 1);
+ if (otherHomeBucket > homeBucket) {
+ std::swap(entry, hashTable[bucket]);
+ key = otherKey;
+ homeBucket = otherHomeBucket;
+ }
+ }
+ std::cerr << "TB hash table size too low!" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+public:
+ // Look up the table for the given material key. Returns nullptr when the
+ // key is not present: probing stops at the first empty bucket, which
+ // insert() guarantees always exists before the end of the array.
+ template<TBType Type>
+ TBTable<Type>* get(Key key) {
+ for (const Entry* entry = &hashTable[(uint32_t)key & (Size - 1)]; ; ++entry) {
+ if (entry->key == key || !entry->get<Type>())
+ return entry->get<Type>();
+ }
+ }
+
+ // Reset to the empty state: zero every bucket and destroy the owned tables.
+ void clear() {
+ memset(hashTable, 0, sizeof(hashTable));
+ wdlTable.clear();
+ dtzTable.clear();
+ }
+ size_t size() const { return wdlTable.size(); }
+ void add(const std::vector<PieceType>& pieces);
+};
+
+// The single global registry of all tablebase files found, populated at init time.
+TBTables TBTables;
+
+// If the corresponding file exists two new objects TBTable<WDL> and TBTable<DTZ>
+// are created and added to the lists and hash table. Called at init time.
+void TBTables::add(const std::vector<PieceType>& pieces) {
+
+ // Build the material signature, one character per piece, e.g. "KRK"
+ std::string code;
+ for (PieceType pt : pieces)
+ code += PieceToChar[pt];
+
+ // Insert the side separator before the second king: "KRK" -> "KRvK"
+ code.insert(code.find('K', 1), "v");
+
+ TBFile file(code + ".rtbw");
+
+ if (!file.is_open()) // Only WDL file is checked
+ return;
+
+ file.close();
+
+ MaxCardinality = std::max((int)pieces.size(), MaxCardinality);
+
+ wdlTable.emplace_back(code);
+ dtzTable.emplace_back(wdlTable.back());
+
+ TBTable<WDL>* wdl = &wdlTable.back();
+ TBTable<DTZ>* dtz = &dtzTable.back();
+
+ // Insert into the hash keys for both colors: KRvK with KR white and black
+ insert(wdl->key , wdl, dtz);
+ insert(wdl->key2, wdl, dtz);
+}
+
+// TB tables are compressed with canonical Huffman code. The compressed data is divided into
+// blocks of size d->sizeofBlock, and each block stores a variable number of symbols.
+// Each symbol represents either a WDL or a (remapped) DTZ value, or a pair of other symbols
+// (recursively). If you keep expanding the symbols in a block, you end up with up to 65536
+// WDL or DTZ values. Each symbol represents up to 256 values and will correspond after
+// Huffman coding to at least 1 bit. So a block of 32 bytes corresponds to at most
+// 32 x 8 x 256 = 65536 values. This maximum is only reached for tables that consist mostly
+// of draws or mostly of wins, but such tables are actually quite common. In principle, the
+// blocks in WDL tables are 64 bytes long (and will be aligned on cache lines). But for
+// mostly-draw or mostly-win tables this can leave many 64-byte blocks only half-filled, so
+// in such cases blocks are 32 bytes long. The blocks of DTZ tables are up to 1024 bytes long.
+// The generator picks the size that leads to the smallest table. The "book" of symbols and
+// Huffman codes is the same for all blocks in the table. A non-symmetric pawnless TB file
+// will have one table for wtm and one for btm; a TB file with pawns will have one table
+// per file a, b, c, d — in this case as well, one set for wtm and one for btm.
+int decompress_pairs(PairsData* d, uint64_t idx) {
+
+ // Special case where all table positions store the same value
+ if (d->flags & TBFlag::SingleValue)
+ return d->minSymLen;
+
+ // First we need to locate the right block that stores the value at index "idx".
+ // Because each block n stores blockLength[n] + 1 values, the index i of the block
+ // that contains the value at position idx is:
+ //
+ // for (i = -1, sum = 0; sum <= idx; i++)
+ // sum += blockLength[i + 1] + 1;
+ //
+ // This can be slow, so we use SparseIndex[] populated with a set of SparseEntry that
+ // point to known indices into blockLength[]. Namely SparseIndex[k] is a SparseEntry
+ // that stores the blockLength[] index and the offset within that block of the value
+ // with index I(k), where:
+ //
+ // I(k) = k * d->span + d->span / 2 (1)
+
+ // First step is to get the 'k' of the I(k) nearest to our idx, using definition (1)
+ uint32_t k = uint32_t(idx / d->span);
+
+ // Then we read the corresponding SparseIndex[] entry
+ uint32_t block = number<uint32_t, LittleEndian>(&d->sparseIndex[k].block);
+ int offset = number<uint16_t, LittleEndian>(&d->sparseIndex[k].offset);
+
+ // Now compute the difference idx - I(k). From definition of k we know that
+ //
+ // idx = k * d->span + idx % d->span (2)
+ //
+ // So from (1) and (2) we can compute idx - I(K):
+ int diff = idx % d->span - d->span / 2;
+
+ // Sum the above to offset to find the offset corresponding to our idx
+ offset += diff;
+
+ // Move to previous/next block, until we reach the correct block that contains idx,
+ // that is when 0 <= offset <= d->blockLength[block]
+ while (offset < 0)
+ offset += d->blockLength[--block] + 1;
+
+ while (offset > d->blockLength[block])
+ offset -= d->blockLength[block++] + 1;
+
+ // Finally, we find the start address of our block of canonical Huffman symbols
+ uint32_t* ptr = (uint32_t*)(d->data + ((uint64_t)block * d->sizeofBlock));
+
+ // Read the first 64 bits in our block, this is a (truncated) sequence of
+ // unknown number of symbols of unknown length but we know the first one
+ // is at the beginning of this 64 bits sequence.
+ uint64_t buf64 = number<uint64_t, BigEndian>(ptr); ptr += 2;
+ int buf64Size = 64;
+ Sym sym;
+
+ while (true)
+ {
+ int len = 0; // This is the symbol length - d->min_sym_len
+
+ // Now get the symbol length. For any symbol s64 of length l right-padded
+ // to 64 bits we know that d->base64[l-1] >= s64 >= d->base64[l] so we
+ // can find the symbol length iterating through base64[].
+ while (buf64 < d->base64[len])
+ ++len;
+
+ // All the symbols of a given length are consecutive integers (numerical
+ // sequence property), so we can compute the offset of our symbol of
+ // length len, stored at the beginning of buf64.
+ sym = Sym((buf64 - d->base64[len]) >> (64 - len - d->minSymLen));
+
+ // Now add the value of the lowest symbol of length len to get our symbol
+ sym += number<Sym, LittleEndian>(&d->lowestSym[len]);
+
+ // If our offset is within the number of values represented by symbol sym
+ // we are done...
+ if (offset < d->symlen[sym] + 1)
+ break;
+
+ // ...otherwise update the offset and continue to iterate
+ offset -= d->symlen[sym] + 1;
+ len += d->minSymLen; // Get the real length
+ buf64 <<= len; // Consume the just processed symbol
+ buf64Size -= len;
+
+ if (buf64Size <= 32) { // Refill the buffer
+ buf64Size += 32;
+ buf64 |= (uint64_t)number<uint32_t, BigEndian>(ptr++) << (64 - buf64Size);
+ }