From c41f998855416a8619751653f3cedd9bc6f0f4c0 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Wed, 30 Sep 2020 19:44:28 +0200
Subject: [PATCH] Switch trigram lookup from binary search to a hash table.

Binary search was fine when we just wanted simplicity, but for I/O
optimization on rotating media, we want as few seeks as possible. A hash
table with open addressing gives us just that; Robin Hood hashing lets us
guarantee a maximum probe length, so we can just read 256 bytes (plus a
little slop) for each lookup and that's it. This saves ~30 ms or so in the
cold-cache case.

This breaks the on-disk format, so we use the opportunity to add a magic
number and a proper header, which gives us some more flexibility in case
we want to change the builder later.
---
 db.h              |  44 ++++++++++++++
 plocate-build.cpp | 134 +++++++++++++++++++++++++++++++++++++---------
 plocate.cpp       |  93 ++++++++++++--------------------
 3 files changed, 188 insertions(+), 83 deletions(-)
 create mode 100644 db.h

diff --git a/db.h b/db.h
new file mode 100644
index 0000000..d4baf53
--- /dev/null
+++ b/db.h
@@ -0,0 +1,44 @@
+#ifndef DB_H
+#define DB_H 1
+
+#include <stdint.h>
+
+struct Header {
+	char magic[8];  // "\0plocate";
+	uint32_t version;  // 0.
+	uint32_t hashtable_size;
+	uint32_t extra_ht_slots;
+	uint64_t hash_table_offset_bytes;
+	uint64_t filename_index_offset_bytes;
+};
+
+struct Trigram {
+	uint32_t trgm;
+	uint32_t num_docids;
+	uint64_t offset;
+
+	bool operator==(const Trigram &other) const
+	{
+		return trgm == other.trgm;
+	}
+	bool operator<(const Trigram &other) const
+	{
+		return trgm < other.trgm;
+	}
+};
+
+inline uint32_t hash_trigram(uint32_t trgm, uint32_t ht_size)
+{
+	// CRC-like computation.
+	uint32_t crc = trgm;
+	for (int i = 0; i < 32; i++) {
+		bool bit = crc & 0x80000000;
+		crc <<= 1;
+		if (bit) {
+			crc ^= 0x1edc6f41;
+		}
+	}
+	return crc % ht_size;
+}
+
+#endif  // !defined(DB_H)
diff --git a/plocate-build.cpp b/plocate-build.cpp
index 53cefde..40ce2ff 100644
--- a/plocate-build.cpp
+++ b/plocate-build.cpp
@@ -6,6 +6,8 @@
 #include 
 #include 
 #include 
+#include <math.h>
+#include <memory>
 #include 
 #include 
 #include 
@@ -17,6 +19,8 @@
 #include 
 #include 
 
+#include "db.h"
+
 #define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t))
 #define dprintf(...)
 //#define dprintf(...) fprintf(stderr, __VA_ARGS__);
@@ -265,6 +269,63 @@ string zstd_compress(const string &src, string *tempbuf)
 	return string(tempbuf->data(), size);
 }
 
+bool is_prime(uint32_t x)
+{
+	if ((x % 2) == 0 || (x % 3) == 0) {
+		return false;
+	}
+	uint32_t limit = ceil(sqrt(x));
+	for (uint32_t factor = 5; factor <= limit; ++factor) {
+		if ((x % factor) == 0) {
+			return false;
+		}
+	}
+	return true;
+}
+
+uint32_t next_prime(uint32_t x)
+{
+	if ((x % 2) == 0) {
+		++x;
+	}
+	while (!is_prime(x)) {
+		x += 2;
+	}
+	return x;
+}
+
+unique_ptr<Trigram[]> create_hashtable(const Corpus &corpus, const vector<uint32_t> &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots)
+{
+	unique_ptr<Trigram[]> ht(new Trigram[ht_size + num_overflow_slots + 1]);  // 1 for the sentinel element at the end.
+	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+		ht[i].trgm = uint32_t(-1);
+		ht[i].num_docids = 0;
+		ht[i].offset = 0;
+	}
+	for (uint32_t trgm : all_trigrams) {
+		// We don't know offset yet, so set it to zero.
+		Trigram to_insert{trgm, uint32_t(corpus.invindex.find(trgm)->second.num_docids), 0};
+
+		uint32_t bucket = hash_trigram(trgm, ht_size);
+		unsigned distance = 0;
+		while (ht[bucket].num_docids != 0) {
+			// Robin Hood hashing; reduces the longest distance by a lot.
+			unsigned other_distance = bucket - hash_trigram(ht[bucket].trgm, ht_size);
+			if (distance > other_distance) {
+				swap(to_insert, ht[bucket]);
+				distance = other_distance;
+			}
+
+			++bucket, ++distance;
+			if (distance > num_overflow_slots) {
+				return nullptr;
+			}
+		}
+		ht[bucket] = to_insert;
+	}
+	return ht;
+}
+
 void do_build(const char *infile, const char *outfile, int block_size)
 {
 	steady_clock::time_point start __attribute__((unused)) = steady_clock::now();
@@ -303,42 +364,64 @@ void do_build(const char *infile, const char *outfile, int block_size)
 	dprintf("Building posting lists took %.1f ms.\n\n",
 	        1e3 * duration(steady_clock::now() - start).count());
 
+	// Sort the trigrams, mostly to get a consistent result every time
+	// (the hash table will put things in random order anyway).
 	vector<uint32_t> all_trigrams;
 	for (auto &[trigram, pl_builder] : corpus.invindex) {
 		all_trigrams.push_back(trigram);
 	}
 	sort(all_trigrams.begin(), all_trigrams.end());
 
+	unique_ptr<Trigram[]> hashtable;
+	uint32_t ht_size = next_prime(all_trigrams.size());
+	constexpr unsigned num_overflow_slots = 16;
+	for ( ;; ) {
+		hashtable = create_hashtable(corpus, all_trigrams, ht_size, num_overflow_slots);
+		if (hashtable == nullptr) {
+			dprintf("Failed creating hash table of size %u, increasing by 5%% and trying again.\n", ht_size);
+			ht_size = next_prime(ht_size * 1.05);
+		} else {
+			dprintf("Created hash table of size %u.\n\n", ht_size);
+			break;
+		}
+	}
+
 	umask(0027);
 	FILE *outfp = fopen(outfile, "wb");
 
-	// Write the header.
-	uint64_t num_trigrams = corpus.invindex.size();  // 64 to get alignment.
-	fwrite(&num_trigrams, sizeof(num_trigrams), 1, outfp);
-
-	// Find out where the posting lists will end.
-	uint64_t offset = sizeof(uint64_t) * 2 + 16 * all_trigrams.size();
-	uint64_t filename_index_offset = offset;
-	for (auto &[trigram, pl_builder] : corpus.invindex) {
-		filename_index_offset += pl_builder.encoded.size();
-	}
-	fwrite(&filename_index_offset, sizeof(filename_index_offset), 1, outfp);
+	// Find the offsets for each posting list.
+	size_t bytes_for_hashtable = (ht_size + num_overflow_slots + 1) * sizeof(Trigram);
+	uint64_t offset = sizeof(Header) + bytes_for_hashtable;
+	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+		hashtable[i].offset = offset;  // Needs to be there even for empty slots.
+		if (hashtable[i].num_docids == 0) {
+			continue;
+		}
 
-	size_t bytes_for_trigrams = 0, bytes_for_posting_lists = 0, bytes_for_filename_index = 0, bytes_for_filenames = 0;
-	for (uint32_t trgm : all_trigrams) {
-		// 16 bytes: trgm (4), number of docids (4), byte offset (8)
-		fwrite(&trgm, sizeof(trgm), 1, outfp);  // One byte wasted, but OK.
-		const PostingListBuilder &pl_builder = corpus.invindex[trgm];
-		uint32_t num_docids = pl_builder.num_docids;
-		fwrite(&num_docids, sizeof(num_docids), 1, outfp);
-		fwrite(&offset, sizeof(offset), 1, outfp);
+		const PostingListBuilder &pl_builder = corpus.invindex[hashtable[i].trgm];
 		offset += pl_builder.encoded.size();
-		bytes_for_trigrams += 16;
 	}
 
+	// Write the header.
+	Header hdr;
+	memcpy(hdr.magic, "\0plocate", 8);
+	hdr.version = 0;
+	hdr.hashtable_size = ht_size;
+	hdr.extra_ht_slots = num_overflow_slots;
+	hdr.hash_table_offset_bytes = sizeof(hdr);  // This member is just there for flexibility.
+	hdr.filename_index_offset_bytes = offset;
+	fwrite(&hdr, sizeof(hdr), 1, outfp);
+
+	// Write the hash table.
+	fwrite(hashtable.get(), ht_size + num_overflow_slots + 1, sizeof(Trigram), outfp);
+
 	// Write the actual posting lists.
-	for (uint32_t trgm : all_trigrams) {
-		const string &encoded = corpus.invindex[trgm].encoded;
+	size_t bytes_for_posting_lists = 0;
+	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+		if (hashtable[i].num_docids == 0) {
+			continue;
+		}
+		const string &encoded = corpus.invindex[hashtable[i].trgm].encoded;
 		fwrite(encoded.data(), encoded.size(), 1, outfp);
 		bytes_for_posting_lists += encoded.size();
 	}
@@ -347,7 +430,8 @@ void do_build(const char *infile, const char *outfile, int block_size)
 	corpus.filename_blocks.push_back("");
 
 	// Write the offsets to the filenames.
-	offset = filename_index_offset + corpus.filename_blocks.size() * sizeof(offset);
+	size_t bytes_for_filename_index = 0, bytes_for_filenames = 0;
+	offset = hdr.filename_index_offset_bytes + corpus.filename_blocks.size() * sizeof(offset);
 	for (const string &filename : corpus.filename_blocks) {
 		fwrite(&offset, sizeof(offset), 1, outfp);
 		offset += filename.size();
@@ -362,10 +446,10 @@ void do_build(const char *infile, const char *outfile, int block_size)
 
 	fclose(outfp);
 
-	size_t total_bytes __attribute__((unused)) = (bytes_for_trigrams + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+	size_t total_bytes __attribute__((unused)) = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
 
 	dprintf("Block size:     %7d files\n", block_size);
-	dprintf("Trigrams:       %'7.1f MB\n", bytes_for_trigrams / 1048576.0);
+	dprintf("Hash table:     %'7.1f MB\n", bytes_for_hashtable / 1048576.0);
 	dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
 	dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
 	dprintf("Filenames:      %'7.1f MB\n", bytes_for_filenames / 1048576.0);
diff --git a/plocate.cpp b/plocate.cpp
index 1c54bf4..84fcd7a 100644
--- a/plocate.cpp
+++ b/plocate.cpp
@@ -1,3 +1,4 @@
+#include "db.h"
 #include "vp4.h"
 #include "io_uring_engine.h"
 
@@ -95,21 +96,6 @@ bool has_access(const char *filename,
 	return true;
 }
 
-struct Trigram {
-	uint32_t trgm;
-	uint32_t num_docids;
-	uint64_t offset;
-
-	bool operator==(const Trigram &other) const
-	{
-		return trgm == other.trgm;
-	}
-	bool operator<(const Trigram &other) const
-	{
-		return trgm < other.trgm;
-	}
-};
-
 class Corpus {
 public:
 	Corpus(int fd, IOUringEngine *engine);
@@ -118,39 +104,38 @@ public:
 	void get_compressed_filename_block(uint32_t docid, function<void(string_view)> cb) const;
 	size_t get_num_filename_blocks() const;
 	off_t offset_for_block(uint32_t docid) const {
-		return filename_index_offset + docid * sizeof(uint64_t);
+		return hdr.filename_index_offset_bytes + docid * sizeof(uint64_t);
 	}
 
 public:
 	const int fd;
 	IOUringEngine *const engine;
 
-	off_t len;
-	uint64_t filename_index_offset;
-
-	uint64_t num_trigrams;
-	const off_t trigram_offset = sizeof(uint64_t) * 2;
-
-	void binary_search_trigram(uint32_t trgm, uint32_t left, uint32_t right, function<void(const Trigram *trgmptr, size_t len)> cb);
+	Header hdr;
 };
 
 Corpus::Corpus(int fd, IOUringEngine *engine)
 	: fd(fd), engine(engine)
 {
-	len = lseek(fd, 0, SEEK_END);
-	if (len == -1) {
-		perror("lseek");
-		exit(1);
+	// Enable to test cold-cache behavior (except for access()).
+	if (false) {
+		off_t len = lseek(fd, 0, SEEK_END);
+		if (len == -1) {
+			perror("lseek");
+			exit(1);
+		}
+		posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
 	}
-	// Uncomment to test cold-cache behavior (except for access()).
-	// posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
-
-	uint64_t vals[2];
-	complete_pread(fd, vals, sizeof(vals), /*offset=*/0);
-
-	num_trigrams = vals[0];
-	filename_index_offset = vals[1];
+
+	complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0);
+	if (memcmp(hdr.magic, "\0plocate", 8) != 0) {
+		fprintf(stderr, "plocate.db is corrupt or an old version; please rebuild it.\n");
+		exit(1);
+	}
+	if (hdr.version != 0) {
+		fprintf(stderr, "plocate.db has version %u, expected 0; please rebuild it.\n", hdr.version);
+		exit(1);
+	}
 }
 
 Corpus::~Corpus()
@@ -160,26 +145,18 @@ Corpus::~Corpus()
 
 void Corpus::find_trigram(uint32_t trgm, function<void(const Trigram *trgmptr, size_t len)> cb)
 {
-	binary_search_trigram(trgm, 0, num_trigrams - 1, move(cb));
-}
-
-void Corpus::binary_search_trigram(uint32_t trgm, uint32_t left, uint32_t right, function<void(const Trigram *trgmptr, size_t len)> cb)
-{
-	if (left > right) {
-		cb(nullptr, 0);
-		return;
-	}
-	uint32_t mid = (left + right) / 2;
-	engine->submit_read(fd, sizeof(Trigram) * 2, trigram_offset + sizeof(Trigram) * mid, [this, trgm, left, mid, right, cb{ move(cb) }](string s) {
+	uint32_t bucket = hash_trigram(trgm, hdr.hashtable_size);
+	engine->submit_read(fd, sizeof(Trigram) * (hdr.extra_ht_slots + 2), hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket, [this, trgm, bucket, cb{ move(cb) }](string s) {
 		const Trigram *trgmptr = reinterpret_cast<const Trigram *>(s.data());
-		const Trigram *next_trgmptr = trgmptr + 1;
-		if (trgmptr->trgm < trgm) {
-			binary_search_trigram(trgm, mid + 1, right, move(cb));
-		} else if (trgmptr->trgm > trgm) {
-			binary_search_trigram(trgm, left, mid - 1, move(cb));
-		} else {
-			cb(trgmptr, next_trgmptr->offset - trgmptr->offset);
+		for (unsigned i = 0; i < hdr.extra_ht_slots + 1; ++i) {
+			if (trgmptr[i].trgm == trgm) {
+				cb(trgmptr + i, trgmptr[i + 1].offset - trgmptr[i].offset);
+				return;
+			}
 		}
+
+		// Not found.
+		cb(nullptr, 0);
 	});
 }
 
@@ -199,10 +176,10 @@ size_t Corpus::get_num_filename_blocks() const
 {
 	// The beginning of the filename blocks is the end of the filename index blocks.
 	uint64_t end;
-	complete_pread(fd, &end, sizeof(end), filename_index_offset);
+	complete_pread(fd, &end, sizeof(end), hdr.filename_index_offset_bytes);
 
 	// Subtract the sentinel block.
-	return (end - filename_index_offset) / sizeof(uint64_t) - 1;
+	return (end - hdr.filename_index_offset_bytes) / sizeof(uint64_t) - 1;
 }
 
 size_t scan_file_block(const string &needle, string_view compressed,
@@ -306,7 +283,7 @@ void do_search_file(const string &needle, const char *filename)
 	IOUringEngine engine;
 	Corpus corpus(fd, &engine);
-	dprintf("Corpus init took %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count());
+	dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count());
 
 	if (needle.size() < 3) {
 		// Too short for trigram matching. Apply brute force.
@@ -329,7 +306,7 @@ void do_search_file(const string &needle, const char *filename)
 		});
 	}
 	engine.finish();
-	dprintf("Binary search took %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count());
+	dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count());
 
 	sort(trigrams.begin(), trigrams.end());
 	{
@@ -392,7 +369,7 @@ void do_search_file(const string &needle, const char *filename)
 		});
 	}
 	engine.finish();
-	dprintf("Intersection took %.1f ms. Doing final verification and printing:\n",
+	dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
 	        1e3 * duration(steady_clock::now() - start).count());
 
 	size_t matched __attribute__((unused)) = scan_docids(needle, in1, corpus, &engine);
-- 
2.39.2
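
For readers of the patch, here is a minimal, self-contained sketch of the
reader-side probe that the new Corpus::find_trigram() performs, written with a
plain blocking pread() instead of the io_uring engine used in plocate.cpp. The
helper name lookup_trigram and the use of std::optional are illustrative
choices, not part of the patch. The "256 bytes (plus a little slop)" from the
commit message corresponds to sizeof(Trigram) * (extra_ht_slots + 2) =
16 * 18 = 288 bytes: one contiguous read covers the home bucket, all 16
overflow slots an insertion is allowed to use, and one extra entry whose
offset gives the length of the matching posting list.

// Hypothetical helper (not in the patch): same bounded-probe lookup as
// Corpus::find_trigram(), but using a blocking pread() for clarity.
#include <unistd.h>

#include <cstdio>
#include <optional>
#include <utility>
#include <vector>

#include "db.h"  // Header, Trigram and hash_trigram() from this patch.

std::optional<std::pair<Trigram, size_t>>
lookup_trigram(int fd, const Header &hdr, uint32_t trgm)
{
	uint32_t bucket = hash_trigram(trgm, hdr.hashtable_size);

	// Robin Hood insertion in plocate-build.cpp guarantees a probe length of
	// at most extra_ht_slots, so one read of extra_ht_slots + 2 entries
	// (288 bytes with this layout) always covers every slot the trigram can
	// be in, plus the following entry needed for the posting-list length.
	std::vector<Trigram> buf(hdr.extra_ht_slots + 2);
	ssize_t ret = pread(fd, buf.data(), buf.size() * sizeof(Trigram),
	                    hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket);
	if (ret != ssize_t(buf.size() * sizeof(Trigram))) {
		perror("pread");
		return std::nullopt;
	}
	for (unsigned i = 0; i < hdr.extra_ht_slots + 1; ++i) {
		if (buf[i].trgm == trgm) {
			// Posting lists are laid out back-to-back, so the next entry's
			// offset (possibly the sentinel's) marks where this one ends.
			return std::make_pair(buf[i], size_t(buf[i + 1].offset - buf[i].offset));
		}
	}
	return std::nullopt;  // Trigram not in the index.
}

Because the builder rejects any table where an insertion would need more than
num_overflow_slots probes (create_hashtable() returns nullptr and the table is
regrown at the next larger prime size), this single read is always enough; no
second seek is ever needed per trigram.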