#include <chrono>
#include <endian.h>
#include <fcntl.h>
+#include <math.h>
+#include <memory>
#include <stdio.h>
#include <string.h>
#include <string>
#include <vector>
#include <zstd.h>
+#include "db.h"
+
#define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t))
#define dprintf(...)
//#define dprintf(...) fprintf(stderr, __VA_ARGS__);
return string(tempbuf->data(), size);
}
// Simple trial-division primality test. Only used to size the on-disk
// hash table, so it does not need to be fast for huge inputs.
//
// Fixes over the naive version: 2 and 3 are correctly reported as prime,
// 0 and 1 as composite, and the factor bound is computed in 64-bit
// integer arithmetic instead of ceil(sqrt(x)), which sidesteps
// floating-point rounding for x near UINT32_MAX.
bool is_prime(uint32_t x)
{
	if (x < 2) {
		return false;
	}
	if (x == 2 || x == 3) {
		return true;
	}
	if ((x % 2) == 0 || (x % 3) == 0) {
		return false;
	}
	// Having excluded multiples of 2 and 3, all remaining candidate
	// factors are of the form 6k ± 1; test both and step by six.
	for (uint32_t factor = 5; uint64_t(factor) * factor <= x; factor += 6) {
		if ((x % factor) == 0 || (x % (factor + 2)) == 0) {
			return false;
		}
	}
	return true;
}
+
+uint32_t next_prime(uint32_t x)
+{
+ if ((x % 2) == 0) {
+ ++x;
+ }
+ while (!is_prime(x)) {
+ x += 2;
+ }
+ return x;
+}
+
// Builds the on-disk hash table over all trigrams, using open addressing
// with Robin Hood probing: on collision, the element that is currently
// furthest from its home bucket keeps the slot, which evens out (and thus
// shortens) the longest probe sequence. Overflowing elements spill
// linearly into num_overflow_slots extra slots past the table proper;
// if an element would be displaced further than that, returns nullptr so
// the caller can retry with a larger ht_size.
//
// Each slot's offset is left as zero here; the caller fills in the real
// posting-list byte offsets before writing the table to disk.
unique_ptr<Trigram[]> create_hashtable(const Corpus &corpus, const vector<uint32_t> &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots)
{
	unique_ptr<Trigram[]> ht(new Trigram[ht_size + num_overflow_slots + 1]); // 1 for the sentinel element at the end.
	// Mark every slot (including overflow and sentinel) as empty.
	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
		ht[i].trgm = uint32_t(-1);
		ht[i].num_docids = 0;
		ht[i].offset = 0;
	}
	for (uint32_t trgm : all_trigrams) {
		// We don't know offset yet, so set it to zero.
		Trigram to_insert{trgm, uint32_t(corpus.invindex.find(trgm)->second.num_docids), 0};

		uint32_t bucket = hash_trigram(trgm, ht_size);
		unsigned distance = 0;  // How far to_insert currently is from its home bucket.
		while (ht[bucket].num_docids != 0) {
			// Robin Hood hashing; reduces the longest distance by a lot.
			// If the occupant is closer to its home bucket than we are to
			// ours, it can better afford being displaced: swap it out and
			// continue the probe inserting _it_ instead.
			unsigned other_distance = bucket - hash_trigram(ht[bucket].trgm, ht_size);
			if (distance > other_distance) {
				swap(to_insert, ht[bucket]);
				distance = other_distance;
			}

			++bucket, ++distance;
			if (distance > num_overflow_slots) {
				return nullptr;
			}
		}
		ht[bucket] = to_insert;
	}
	return ht;
}
+
void do_build(const char *infile, const char *outfile, int block_size)
{
steady_clock::time_point start __attribute__((unused)) = steady_clock::now();
dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+ // Sort the trigrams, mostly to get a consistent result every time
+ // (the hash table will put things in random order anyway).
vector<uint32_t> all_trigrams;
for (auto &[trigram, pl_builder] : corpus.invindex) {
all_trigrams.push_back(trigram);
}
sort(all_trigrams.begin(), all_trigrams.end());
+ unique_ptr<Trigram[]> hashtable;
+ uint32_t ht_size = next_prime(all_trigrams.size());
+ constexpr unsigned num_overflow_slots = 16;
+ for ( ;; ) {
+ hashtable = create_hashtable(corpus, all_trigrams, ht_size, num_overflow_slots);
+ if (hashtable == nullptr) {
+ dprintf("Failed creating hash table of size %u, increasing by 5%% and trying again.\n", ht_size);
+ ht_size = next_prime(ht_size * 1.05);
+ } else {
+ dprintf("Created hash table of size %u.\n\n", ht_size);
+ break;
+ }
+ }
+
umask(0027);
FILE *outfp = fopen(outfile, "wb");
- // Write the header.
- uint64_t num_trigrams = corpus.invindex.size(); // 64 to get alignment.
- fwrite(&num_trigrams, sizeof(num_trigrams), 1, outfp);
-
- // Find out where the posting lists will end.
- uint64_t offset = sizeof(uint64_t) * 2 + 16 * all_trigrams.size();
- uint64_t filename_index_offset = offset;
- for (auto &[trigram, pl_builder] : corpus.invindex) {
- filename_index_offset += pl_builder.encoded.size();
- }
- fwrite(&filename_index_offset, sizeof(filename_index_offset), 1, outfp);
+ // Find the offsets for each posting list.
+ size_t bytes_for_hashtable = (ht_size + num_overflow_slots + 1) * sizeof(Trigram);
+ uint64_t offset = sizeof(Header) + bytes_for_hashtable;
+ for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+ hashtable[i].offset = offset; // Needs to be there even for empty slots.
+ if (hashtable[i].num_docids == 0) {
+ continue;
+ }
- size_t bytes_for_trigrams = 0, bytes_for_posting_lists = 0, bytes_for_filename_index = 0, bytes_for_filenames = 0;
- for (uint32_t trgm : all_trigrams) {
- // 16 bytes: trgm (4), number of docids (4), byte offset (8)
- fwrite(&trgm, sizeof(trgm), 1, outfp); // One byte wasted, but OK.
- const PostingListBuilder &pl_builder = corpus.invindex[trgm];
- uint32_t num_docids = pl_builder.num_docids;
- fwrite(&num_docids, sizeof(num_docids), 1, outfp);
- fwrite(&offset, sizeof(offset), 1, outfp);
+ const PostingListBuilder &pl_builder = corpus.invindex[hashtable[i].trgm];
offset += pl_builder.encoded.size();
- bytes_for_trigrams += 16;
}
+ // Write the header.
+ Header hdr;
+ memcpy(hdr.magic, "\0plocate", 8);
+ hdr.version = 0;
+ hdr.hashtable_size = ht_size;
+ hdr.extra_ht_slots = num_overflow_slots;
+ hdr.hash_table_offset_bytes = sizeof(hdr); // This member is just there for flexibility.
+ hdr.filename_index_offset_bytes = offset;
+ fwrite(&hdr, sizeof(hdr), 1, outfp);
+
+ // Write the hash table.
+ fwrite(hashtable.get(), ht_size + num_overflow_slots + 1, sizeof(Trigram), outfp);
+
// Write the actual posting lists.
- for (uint32_t trgm : all_trigrams) {
- const string &encoded = corpus.invindex[trgm].encoded;
+ size_t bytes_for_posting_lists = 0;
+ for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+ if (hashtable[i].num_docids == 0) {
+ continue;
+ }
+ const string &encoded = corpus.invindex[hashtable[i].trgm].encoded;
fwrite(encoded.data(), encoded.size(), 1, outfp);
bytes_for_posting_lists += encoded.size();
}
corpus.filename_blocks.push_back("");
// Write the offsets to the filenames.
- offset = filename_index_offset + corpus.filename_blocks.size() * sizeof(offset);
+ size_t bytes_for_filename_index = 0, bytes_for_filenames = 0;
+ offset = hdr.filename_index_offset_bytes + corpus.filename_blocks.size() * sizeof(offset);
for (const string &filename : corpus.filename_blocks) {
fwrite(&offset, sizeof(offset), 1, outfp);
offset += filename.size();
fclose(outfp);
- size_t total_bytes __attribute__((unused)) = (bytes_for_trigrams + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+ size_t total_bytes __attribute__((unused)) = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
dprintf("Block size: %7d files\n", block_size);
- dprintf("Trigrams: %'7.1f MB\n", bytes_for_trigrams / 1048576.0);
+ dprintf("Hash table: %'7.1f MB\n", bytes_for_hashtable / 1048576.0);
dprintf("Posting lists: %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
dprintf("Filenames: %'7.1f MB\n", bytes_for_filenames / 1048576.0);
+#include "db.h"
#include "vp4.h"
#include "io_uring_engine.h"
return true;
}
-struct Trigram {
- uint32_t trgm;
- uint32_t num_docids;
- uint64_t offset;
-
- bool operator==(const Trigram &other) const
- {
- return trgm == other.trgm;
- }
- bool operator<(const Trigram &other) const
- {
- return trgm < other.trgm;
- }
-};
-
class Corpus {
public:
Corpus(int fd, IOUringEngine *engine);
void get_compressed_filename_block(uint32_t docid, function<void(string)> cb) const;
size_t get_num_filename_blocks() const;
off_t offset_for_block(uint32_t docid) const {
- return filename_index_offset + docid * sizeof(uint64_t);
+ return hdr.filename_index_offset_bytes + docid * sizeof(uint64_t);
}
public:
const int fd;
IOUringEngine *const engine;
- off_t len;
- uint64_t filename_index_offset;
-
- uint64_t num_trigrams;
- const off_t trigram_offset = sizeof(uint64_t) * 2;
-
- void binary_search_trigram(uint32_t trgm, uint32_t left, uint32_t right, function<void(const Trigram *trgmptr, size_t len)> cb);
+ Header hdr;
};
Corpus::Corpus(int fd, IOUringEngine *engine)
: fd(fd), engine(engine)
{
- len = lseek(fd, 0, SEEK_END);
- if (len == -1) {
- perror("lseek");
- exit(1);
+ // Enable to test cold-cache behavior (except for access()).
+ if (false) {
+ off_t len = lseek(fd, 0, SEEK_END);
+ if (len == -1) {
+ perror("lseek");
+ exit(1);
+ }
+ posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
}
- // Uncomment to test cold-cache behavior (except for access()).
- // posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
-
- uint64_t vals[2];
- complete_pread(fd, vals, sizeof(vals), /*offset=*/0);
-
- num_trigrams = vals[0];
- filename_index_offset = vals[1];
+ complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0);
+ if (memcmp(hdr.magic, "\0plocate", 8) != 0) {
+ fprintf(stderr, "plocate.db is corrupt or an old version; please rebuild it.\n");
+ exit(1);
+ }
+ if (hdr.version != 0) {
+ fprintf(stderr, "plocate.db has version %u, expected 0; please rebuild it.\n", hdr.version);
+ exit(1);
+ }
}
Corpus::~Corpus()
void Corpus::find_trigram(uint32_t trgm, function<void(const Trigram *trgmptr, size_t len)> cb)
{
- binary_search_trigram(trgm, 0, num_trigrams - 1, move(cb));
-}
-
-void Corpus::binary_search_trigram(uint32_t trgm, uint32_t left, uint32_t right, function<void(const Trigram *trgmptr, size_t len)> cb)
-{
- if (left > right) {
- cb(nullptr, 0);
- return;
- }
- uint32_t mid = (left + right) / 2;
- engine->submit_read(fd, sizeof(Trigram) * 2, trigram_offset + sizeof(Trigram) * mid, [this, trgm, left, mid, right, cb{ move(cb) }](string s) {
+ uint32_t bucket = hash_trigram(trgm, hdr.hashtable_size);
+ engine->submit_read(fd, sizeof(Trigram) * (hdr.extra_ht_slots + 2), hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket, [this, trgm, bucket, cb{ move(cb) }](string s) {
const Trigram *trgmptr = reinterpret_cast<const Trigram *>(s.data());
- const Trigram *next_trgmptr = trgmptr + 1;
- if (trgmptr->trgm < trgm) {
- binary_search_trigram(trgm, mid + 1, right, move(cb));
- } else if (trgmptr->trgm > trgm) {
- binary_search_trigram(trgm, left, mid - 1, move(cb));
- } else {
- cb(trgmptr, next_trgmptr->offset - trgmptr->offset);
+ for (unsigned i = 0; i < hdr.extra_ht_slots + 1; ++i) {
+ if (trgmptr[i].trgm == trgm) {
+ cb(trgmptr + i, trgmptr[i + 1].offset - trgmptr[i].offset);
+ return;
+ }
}
+
+ // Not found.
+ cb(nullptr, 0);
});
}
{
// The beginning of the filename blocks is the end of the filename index blocks.
uint64_t end;
- complete_pread(fd, &end, sizeof(end), filename_index_offset);
+ complete_pread(fd, &end, sizeof(end), hdr.filename_index_offset_bytes);
// Subtract the sentinel block.
- return (end - filename_index_offset) / sizeof(uint64_t) - 1;
+ return (end - hdr.filename_index_offset_bytes) / sizeof(uint64_t) - 1;
}
size_t scan_file_block(const string &needle, string_view compressed,
IOUringEngine engine;
Corpus corpus(fd, &engine);
- dprintf("Corpus init took %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+ dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
if (needle.size() < 3) {
// Too short for trigram matching. Apply brute force.
});
}
engine.finish();
- dprintf("Binary search took %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+ dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
sort(trigrams.begin(), trigrams.end());
{
});
}
engine.finish();
- dprintf("Intersection took %.1f ms. Doing final verification and printing:\n",
+ dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
1e3 * duration<float>(steady_clock::now() - start).count());
size_t matched __attribute__((unused)) = scan_docids(needle, in1, corpus, &engine);