]> git.sesse.net Git - plocate/commitdiff
Switch trigram lookup from binary search to a hash table.
authorSteinar H. Gunderson <steinar+git@gunderson.no>
Wed, 30 Sep 2020 17:44:28 +0000 (19:44 +0200)
committerSteinar H. Gunderson <steinar+git@gunderson.no>
Wed, 30 Sep 2020 17:46:53 +0000 (19:46 +0200)
Binary search was fine when we just wanted simplicity, but for I/O
optimization on rotating media, we want as few seeks as possible.
A hash table with open addressing gives us just that; Robin Hood
hashing makes it possible for us to guarantee maximum probe length,
so we can just read 256 bytes (plus a little slop) for each lookup
and that's it. This kills ~30 ms or so cold-cache.

This breaks the format, so we use the chance to add a magic and
a proper header to provide some more flexibility in case we want
to change the builder.

db.h [new file with mode: 0644]
plocate-build.cpp
plocate.cpp

diff --git a/db.h b/db.h
new file mode 100644 (file)
index 0000000..d4baf53
--- /dev/null
+++ b/db.h
@@ -0,0 +1,44 @@
+#ifndef DB_H
+#define DB_H 1
+
+#include <stdint.h>
+
+struct Header {
+       char magic[8]; // "\0plocate";
+       uint32_t version;  // 0.
+       uint32_t hashtable_size;
+       uint32_t extra_ht_slots;
+       uint64_t hash_table_offset_bytes;
+       uint64_t filename_index_offset_bytes;
+}; 
+
+struct Trigram {
+       uint32_t trgm;
+       uint32_t num_docids;
+       uint64_t offset;
+
+       bool operator==(const Trigram &other) const
+       {
+               return trgm == other.trgm;
+       }
+       bool operator<(const Trigram &other) const
+       {
+               return trgm < other.trgm;
+       }
+};
+
+inline uint32_t hash_trigram(uint32_t trgm, uint32_t ht_size)
+{
+       // CRC-like computation.
+       uint32_t crc = trgm;
+       for (int i = 0; i < 32; i++) {
+               bool bit = crc & 0x80000000;
+               crc <<= 1;
+               if (bit) {
+                       crc ^= 0x1edc6f41;
+               }
+       }
+       return crc % ht_size;
+}
+
+#endif  // !defined(DB_H)
index 53cefde64c2149064d43a41fba3639143ec32b2b..40ce2ff893ad455e6816b73c43e567c0aea1ac16 100644 (file)
@@ -6,6 +6,8 @@
 #include <chrono>
 #include <endian.h>
 #include <fcntl.h>
+#include <math.h>
+#include <memory>
 #include <stdio.h>
 #include <string.h>
 #include <string>
@@ -17,6 +19,8 @@
 #include <vector>
 #include <zstd.h>
 
+#include "db.h"
+
 #define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t))
 #define dprintf(...)
 //#define dprintf(...) fprintf(stderr, __VA_ARGS__);
@@ -265,6 +269,63 @@ string zstd_compress(const string &src, string *tempbuf)
        return string(tempbuf->data(), size);
 }
 
+bool is_prime(uint32_t x)
+{
+       if ((x % 2) == 0 || (x % 3) == 0) {
+               return false;
+       }
+       uint32_t limit = ceil(sqrt(x));
+       for (uint32_t factor = 5; factor <= limit; ++factor) {
+               if ((x % factor) == 0) {
+                       return false;
+               }
+       }
+       return true;
+}
+
+uint32_t next_prime(uint32_t x)
+{
+       if ((x % 2) == 0) {
+               ++x;
+       }
+       while (!is_prime(x)) {
+               x += 2;
+       }
+       return x;
+}
+
+unique_ptr<Trigram[]> create_hashtable(const Corpus &corpus, const vector<uint32_t> &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots)
+{
+       unique_ptr<Trigram[]> ht(new Trigram[ht_size + num_overflow_slots + 1]);  // 1 for the sentinel element at the end.
+       for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+               ht[i].trgm = uint32_t(-1);
+               ht[i].num_docids = 0;
+               ht[i].offset = 0;
+       }
+       for (uint32_t trgm : all_trigrams) {
+               // We don't know offset yet, so set it to zero.
+               Trigram to_insert{trgm, uint32_t(corpus.invindex.find(trgm)->second.num_docids), 0};
+
+               uint32_t bucket = hash_trigram(trgm, ht_size);
+               unsigned distance = 0;
+               while (ht[bucket].num_docids != 0) {
+                       // Robin Hood hashing; reduces the longest distance by a lot.
+                       unsigned other_distance = bucket - hash_trigram(ht[bucket].trgm, ht_size);
+                       if (distance > other_distance) {
+                               swap(to_insert, ht[bucket]);
+                               distance = other_distance;
+                       }
+
+                       ++bucket, ++distance;
+                       if (distance > num_overflow_slots) {
+                               return nullptr;
+                       }
+               }
+               ht[bucket] = to_insert;
+       }
+       return ht;
+}
+
 void do_build(const char *infile, const char *outfile, int block_size)
 {
        steady_clock::time_point start __attribute__((unused)) = steady_clock::now();
@@ -303,42 +364,64 @@ void do_build(const char *infile, const char *outfile, int block_size)
 
        dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(steady_clock::now() - start).count());
 
+       // Sort the trigrams, mostly to get a consistent result every time
+       // (the hash table will put things in random order anyway).
        vector<uint32_t> all_trigrams;
        for (auto &[trigram, pl_builder] : corpus.invindex) {
                all_trigrams.push_back(trigram);
        }
        sort(all_trigrams.begin(), all_trigrams.end());
 
+       unique_ptr<Trigram[]> hashtable;
+       uint32_t ht_size = next_prime(all_trigrams.size());
+       constexpr unsigned num_overflow_slots = 16;
+       for ( ;; ) {
+               hashtable = create_hashtable(corpus, all_trigrams, ht_size, num_overflow_slots);
+               if (hashtable == nullptr) {
+                       dprintf("Failed creating hash table of size %u, increasing by 5%% and trying again.\n", ht_size);
+                       ht_size = next_prime(ht_size * 1.05);
+               } else {
+                       dprintf("Created hash table of size %u.\n\n", ht_size);
+                       break;
+               }
+       }
+
        umask(0027);
        FILE *outfp = fopen(outfile, "wb");
 
-       // Write the header.
-       uint64_t num_trigrams = corpus.invindex.size();  // 64 to get alignment.
-       fwrite(&num_trigrams, sizeof(num_trigrams), 1, outfp);
-
-       // Find out where the posting lists will end.
-       uint64_t offset = sizeof(uint64_t) * 2 + 16 * all_trigrams.size();
-       uint64_t filename_index_offset = offset;
-       for (auto &[trigram, pl_builder] : corpus.invindex) {
-               filename_index_offset += pl_builder.encoded.size();
-       }
-       fwrite(&filename_index_offset, sizeof(filename_index_offset), 1, outfp);
+       // Find the offsets for each posting list.
+       size_t bytes_for_hashtable = (ht_size + num_overflow_slots + 1) * sizeof(Trigram);
+       uint64_t offset = sizeof(Header) + bytes_for_hashtable;
+       for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+               hashtable[i].offset = offset;  // Needs to be there even for empty slots.
+               if (hashtable[i].num_docids == 0) {
+                       continue;
+               }
 
-       size_t bytes_for_trigrams = 0, bytes_for_posting_lists = 0, bytes_for_filename_index = 0, bytes_for_filenames = 0;
-       for (uint32_t trgm : all_trigrams) {
-               // 16 bytes: trgm (4), number of docids (4), byte offset (8)
-               fwrite(&trgm, sizeof(trgm), 1, outfp);  // One byte wasted, but OK.
-               const PostingListBuilder &pl_builder = corpus.invindex[trgm];
-               uint32_t num_docids = pl_builder.num_docids;
-               fwrite(&num_docids, sizeof(num_docids), 1, outfp);
-               fwrite(&offset, sizeof(offset), 1, outfp);
+               const PostingListBuilder &pl_builder = corpus.invindex[hashtable[i].trgm];
                offset += pl_builder.encoded.size();
-               bytes_for_trigrams += 16;
        }
 
+       // Write the header.
+       Header hdr;
+       memcpy(hdr.magic, "\0plocate", 8);
+       hdr.version = 0;
+       hdr.hashtable_size = ht_size;
+       hdr.extra_ht_slots = num_overflow_slots;
+       hdr.hash_table_offset_bytes = sizeof(hdr);  // This member is just there for flexibility.
+       hdr.filename_index_offset_bytes = offset;
+       fwrite(&hdr, sizeof(hdr), 1, outfp);
+
+       // Write the hash table.
+       fwrite(hashtable.get(), ht_size + num_overflow_slots + 1, sizeof(Trigram), outfp);
+
        // Write the actual posting lists.
-       for (uint32_t trgm : all_trigrams) {
-               const string &encoded = corpus.invindex[trgm].encoded;
+       size_t bytes_for_posting_lists = 0;
+       for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+               if (hashtable[i].num_docids == 0) {
+                       continue;
+               }
+               const string &encoded = corpus.invindex[hashtable[i].trgm].encoded;
                fwrite(encoded.data(), encoded.size(), 1, outfp);
                bytes_for_posting_lists += encoded.size();
        }
@@ -347,7 +430,8 @@ void do_build(const char *infile, const char *outfile, int block_size)
        corpus.filename_blocks.push_back("");
 
        // Write the offsets to the filenames.
-       offset = filename_index_offset + corpus.filename_blocks.size() * sizeof(offset);
+       size_t bytes_for_filename_index = 0, bytes_for_filenames = 0;
+       offset = hdr.filename_index_offset_bytes + corpus.filename_blocks.size() * sizeof(offset);
        for (const string &filename : corpus.filename_blocks) {
                fwrite(&offset, sizeof(offset), 1, outfp);
                offset += filename.size();
@@ -362,10 +446,10 @@ void do_build(const char *infile, const char *outfile, int block_size)
 
        fclose(outfp);
 
-       size_t total_bytes __attribute__((unused)) = (bytes_for_trigrams + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+       size_t total_bytes __attribute__((unused)) = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
 
        dprintf("Block size:     %7d files\n", block_size);
-       dprintf("Trigrams:       %'7.1f MB\n", bytes_for_trigrams / 1048576.0);
+       dprintf("Hash table:     %'7.1f MB\n", bytes_for_hashtable / 1048576.0);
        dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
        dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
        dprintf("Filenames:      %'7.1f MB\n", bytes_for_filenames / 1048576.0);
index 1c54bf4d157adfdd593073ccfa0a91c84badb5b0..84fcd7a7e47a93e0b85a453e742e50bd444df994 100644 (file)
@@ -1,3 +1,4 @@
+#include "db.h"
 #include "vp4.h"
 #include "io_uring_engine.h"
 
@@ -95,21 +96,6 @@ bool has_access(const char *filename,
        return true;
 }
 
-struct Trigram {
-       uint32_t trgm;
-       uint32_t num_docids;
-       uint64_t offset;
-
-       bool operator==(const Trigram &other) const
-       {
-               return trgm == other.trgm;
-       }
-       bool operator<(const Trigram &other) const
-       {
-               return trgm < other.trgm;
-       }
-};
-
 class Corpus {
 public:
        Corpus(int fd, IOUringEngine *engine);
@@ -118,39 +104,38 @@ public:
        void get_compressed_filename_block(uint32_t docid, function<void(string)> cb) const;
        size_t get_num_filename_blocks() const;
        off_t offset_for_block(uint32_t docid) const {
-               return filename_index_offset + docid * sizeof(uint64_t);
+               return hdr.filename_index_offset_bytes + docid * sizeof(uint64_t);
        }
 
 public:
        const int fd;
        IOUringEngine *const engine;
 
-       off_t len;
-       uint64_t filename_index_offset;
-
-       uint64_t num_trigrams;
-       const off_t trigram_offset = sizeof(uint64_t) * 2;
-
-       void binary_search_trigram(uint32_t trgm, uint32_t left, uint32_t right, function<void(const Trigram *trgmptr, size_t len)> cb);
+       Header hdr;
 };
 
 Corpus::Corpus(int fd, IOUringEngine *engine)
        : fd(fd), engine(engine)
 {
-       len = lseek(fd, 0, SEEK_END);
-       if (len == -1) {
-               perror("lseek");
-               exit(1);
+       // Enable to test cold-cache behavior (except for access()).
+       if (false) {
+               off_t len = lseek(fd, 0, SEEK_END);
+               if (len == -1) {
+                       perror("lseek");
+                       exit(1);
+               }
+               posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
        }
 
-       // Uncomment to test cold-cache behavior (except for access()).
-       // posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
-
-       uint64_t vals[2];
-       complete_pread(fd, vals, sizeof(vals), /*offset=*/0);
-
-       num_trigrams = vals[0];
-       filename_index_offset = vals[1];
+       complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0);
+       if (memcmp(hdr.magic, "\0plocate", 8) != 0) {
+               fprintf(stderr, "plocate.db is corrupt or an old version; please rebuild it.\n");
+               exit(1);
+       }
+       if (hdr.version != 0) {
+               fprintf(stderr, "plocate.db has version %u, expected 0; please rebuild it.\n", hdr.version);
+               exit(1);
+       }
 }
 
 Corpus::~Corpus()
@@ -160,26 +145,18 @@ Corpus::~Corpus()
 
 void Corpus::find_trigram(uint32_t trgm, function<void(const Trigram *trgmptr, size_t len)> cb)
 {
-       binary_search_trigram(trgm, 0, num_trigrams - 1, move(cb));
-}
-
-void Corpus::binary_search_trigram(uint32_t trgm, uint32_t left, uint32_t right, function<void(const Trigram *trgmptr, size_t len)> cb)
-{
-       if (left > right) {
-               cb(nullptr, 0);
-               return;
-       }
-       uint32_t mid = (left + right) / 2;
-       engine->submit_read(fd, sizeof(Trigram) * 2, trigram_offset + sizeof(Trigram) * mid, [this, trgm, left, mid, right, cb{ move(cb) }](string s) {
+       uint32_t bucket = hash_trigram(trgm, hdr.hashtable_size);
+       engine->submit_read(fd, sizeof(Trigram) * (hdr.extra_ht_slots + 2), hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket, [this, trgm, bucket, cb{ move(cb) }](string s) {
                const Trigram *trgmptr = reinterpret_cast<const Trigram *>(s.data());
-               const Trigram *next_trgmptr = trgmptr + 1;
-               if (trgmptr->trgm < trgm) {
-                       binary_search_trigram(trgm, mid + 1, right, move(cb));
-               } else if (trgmptr->trgm > trgm) {
-                       binary_search_trigram(trgm, left, mid - 1, move(cb));
-               } else {
-                       cb(trgmptr, next_trgmptr->offset - trgmptr->offset);
+               for (unsigned i = 0; i < hdr.extra_ht_slots + 1; ++i) {
+                       if (trgmptr[i].trgm == trgm) {
+                               cb(trgmptr + i, trgmptr[i + 1].offset - trgmptr[i].offset);
+                               return;
+                       }
                }
+
+               // Not found.
+               cb(nullptr, 0);
        });
 }
 
@@ -199,10 +176,10 @@ size_t Corpus::get_num_filename_blocks() const
 {
        // The beginning of the filename blocks is the end of the filename index blocks.
        uint64_t end;
-       complete_pread(fd, &end, sizeof(end), filename_index_offset);
+       complete_pread(fd, &end, sizeof(end), hdr.filename_index_offset_bytes);
 
        // Subtract the sentinel block.
-       return (end - filename_index_offset) / sizeof(uint64_t) - 1;
+       return (end - hdr.filename_index_offset_bytes) / sizeof(uint64_t) - 1;
 }
 
 size_t scan_file_block(const string &needle, string_view compressed,
@@ -306,7 +283,7 @@ void do_search_file(const string &needle, const char *filename)
 
        IOUringEngine engine;
        Corpus corpus(fd, &engine);
-       dprintf("Corpus init took %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+       dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
 
        if (needle.size() < 3) {
                // Too short for trigram matching. Apply brute force.
@@ -329,7 +306,7 @@ void do_search_file(const string &needle, const char *filename)
                });
        }
        engine.finish();
-       dprintf("Binary search took %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+       dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
 
        sort(trigrams.begin(), trigrams.end());
        {
@@ -392,7 +369,7 @@ void do_search_file(const string &needle, const char *filename)
                });
        }
        engine.finish();
-       dprintf("Intersection took %.1f ms. Doing final verification and printing:\n",
+       dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
                1e3 * duration<float>(steady_clock::now() - start).count());
 
        size_t matched __attribute__((unused)) = scan_docids(needle, in1, corpus, &engine);