From c41f998855416a8619751653f3cedd9bc6f0f4c0 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Wed, 30 Sep 2020 19:44:28 +0200
Subject: [PATCH] Switch trigram lookup from binary search to a hash table.

Binary search was fine when we just wanted simplicity, but for I/O
optimization on rotating media, we want as few seeks as possible. A hash
table with open addressing gives us just that; Robin Hood hashing lets us
guarantee a maximum probe length, so we can just read 256 bytes (plus a
little slop) for each lookup and that's it. This saves ~30 ms or so in the
cold-cache case.

This breaks the on-disk format, so we use the opportunity to add a magic
number and a proper header, which gives us some more flexibility in case
we want to change the builder later.
---
 db.h              |  44 ++++++++++++++
 plocate-build.cpp | 134 +++++++++++++++++++++++++++++++++++++---------
 plocate.cpp       |  93 ++++++++++++--------------------
 3 files changed, 188 insertions(+), 83 deletions(-)
 create mode 100644 db.h

diff --git a/db.h b/db.h
new file mode 100644
index 0000000..d4baf53
--- /dev/null
+++ b/db.h
@@ -0,0 +1,44 @@
+#ifndef DB_H
+#define DB_H 1
+
+#include <stdint.h>
+
+struct Header {
+	char magic[8];  // "\0plocate";
+	uint32_t version;  // 0.
+	uint32_t hashtable_size;
+	uint32_t extra_ht_slots;
+	uint64_t hash_table_offset_bytes;
+	uint64_t filename_index_offset_bytes;
+};
+
+struct Trigram {
+	uint32_t trgm;
+	uint32_t num_docids;
+	uint64_t offset;
+
+	bool operator==(const Trigram &other) const
+	{
+		return trgm == other.trgm;
+	}
+	bool operator<(const Trigram &other) const
+	{
+		return trgm < other.trgm;
+	}
+};
+
+inline uint32_t hash_trigram(uint32_t trgm, uint32_t ht_size)
+{
+	// CRC-like computation.
+	uint32_t crc = trgm;
+	for (int i = 0; i < 32; i++) {
+		bool bit = crc & 0x80000000;
+		crc <<= 1;
+		if (bit) {
+			crc ^= 0x1edc6f41;
+		}
+	}
+	return crc % ht_size;
+}
+
+#endif  // !defined(DB_H)
diff --git a/plocate-build.cpp b/plocate-build.cpp
index 53cefde..40ce2ff 100644
--- a/plocate-build.cpp
+++ b/plocate-build.cpp
@@ -6,6 +6,8 @@
 #include 
 #include 
 #include 
+#include <math.h>
+#include <memory>
 #include 
 #include 
 #include 
@@ -17,6 +19,8 @@
 #include 
 #include 
 
+#include "db.h"
+
 #define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t))
 #define dprintf(...)
 //#define dprintf(...) fprintf(stderr, __VA_ARGS__);
@@ -265,6 +269,63 @@ string zstd_compress(const string &src, string *tempbuf)
 	return string(tempbuf->data(), size);
 }
 
+bool is_prime(uint32_t x)
+{
+	if ((x % 2) == 0 || (x % 3) == 0) {
+		return false;
+	}
+	uint32_t limit = ceil(sqrt(x));
+	for (uint32_t factor = 5; factor <= limit; ++factor) {
+		if ((x % factor) == 0) {
+			return false;
+		}
+	}
+	return true;
+}
+
+uint32_t next_prime(uint32_t x)
+{
+	if ((x % 2) == 0) {
+		++x;
+	}
+	while (!is_prime(x)) {
+		x += 2;
+	}
+	return x;
+}
+
+unique_ptr<Trigram[]> create_hashtable(const Corpus &corpus, const vector<uint32_t> &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots)
+{
+	unique_ptr<Trigram[]> ht(new Trigram[ht_size + num_overflow_slots + 1]);  // 1 for the sentinel element at the end.
+	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+		ht[i].trgm = uint32_t(-1);
+		ht[i].num_docids = 0;
+		ht[i].offset = 0;
+	}
+	for (uint32_t trgm : all_trigrams) {
+		// We don't know offset yet, so set it to zero.
+		Trigram to_insert{trgm, uint32_t(corpus.invindex.find(trgm)->second.num_docids), 0};
+
+		uint32_t bucket = hash_trigram(trgm, ht_size);
+		unsigned distance = 0;
+		while (ht[bucket].num_docids != 0) {
+			// Robin Hood hashing; reduces the longest distance by a lot.
+			unsigned other_distance = bucket - hash_trigram(ht[bucket].trgm, ht_size);
+			if (distance > other_distance) {
+				swap(to_insert, ht[bucket]);
+				distance = other_distance;
+			}
+
+			++bucket, ++distance;
+			if (distance > num_overflow_slots) {
+				return nullptr;
+			}
+		}
+		ht[bucket] = to_insert;
+	}
+	return ht;
+}
+
 void do_build(const char *infile, const char *outfile, int block_size)
 {
 	steady_clock::time_point start __attribute__((unused)) = steady_clock::now();
@@ -303,42 +364,64 @@ void do_build(const char *infile, const char *outfile, int block_size)
 	dprintf("Building posting lists took %.1f ms.\n\n",
 	        1e3 * duration(steady_clock::now() - start).count());
 
+	// Sort the trigrams, mostly to get a consistent result every time
+	// (the hash table will put things in random order anyway).
 	vector<uint32_t> all_trigrams;
 	for (auto &[trigram, pl_builder] : corpus.invindex) {
 		all_trigrams.push_back(trigram);
 	}
 	sort(all_trigrams.begin(), all_trigrams.end());
 
+	unique_ptr<Trigram[]> hashtable;
+	uint32_t ht_size = next_prime(all_trigrams.size());
+	constexpr unsigned num_overflow_slots = 16;
+	for ( ;; ) {
+		hashtable = create_hashtable(corpus, all_trigrams, ht_size, num_overflow_slots);
+		if (hashtable == nullptr) {
+			dprintf("Failed creating hash table of size %u, increasing by 5%% and trying again.\n", ht_size);
+			ht_size = next_prime(ht_size * 1.05);
+		} else {
+			dprintf("Created hash table of size %u.\n\n", ht_size);
+			break;
+		}
+	}
+
 	umask(0027);
 	FILE *outfp = fopen(outfile, "wb");
 
-	// Write the header.
-	uint64_t num_trigrams = corpus.invindex.size();  // 64 to get alignment.
-	fwrite(&num_trigrams, sizeof(num_trigrams), 1, outfp);
-
-	// Find out where the posting lists will end.
-	uint64_t offset = sizeof(uint64_t) * 2 + 16 * all_trigrams.size();
-	uint64_t filename_index_offset = offset;
-	for (auto &[trigram, pl_builder] : corpus.invindex) {
-		filename_index_offset += pl_builder.encoded.size();
-	}
-	fwrite(&filename_index_offset, sizeof(filename_index_offset), 1, outfp);
+	// Find the offsets for each posting list.
+	size_t bytes_for_hashtable = (ht_size + num_overflow_slots + 1) * sizeof(Trigram);
+	uint64_t offset = sizeof(Header) + bytes_for_hashtable;
+	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+		hashtable[i].offset = offset;  // Needs to be there even for empty slots.
+		if (hashtable[i].num_docids == 0) {
+			continue;
+		}
 
-	size_t bytes_for_trigrams = 0, bytes_for_posting_lists = 0, bytes_for_filename_index = 0, bytes_for_filenames = 0;
-	for (uint32_t trgm : all_trigrams) {
-		// 16 bytes: trgm (4), number of docids (4), byte offset (8)
-		fwrite(&trgm, sizeof(trgm), 1, outfp);  // One byte wasted, but OK.
-		const PostingListBuilder &pl_builder = corpus.invindex[trgm];
-		uint32_t num_docids = pl_builder.num_docids;
-		fwrite(&num_docids, sizeof(num_docids), 1, outfp);
-		fwrite(&offset, sizeof(offset), 1, outfp);
+		const PostingListBuilder &pl_builder = corpus.invindex[hashtable[i].trgm];
 		offset += pl_builder.encoded.size();
-		bytes_for_trigrams += 16;
 	}
 
+	// Write the header.
+	Header hdr;
+	memcpy(hdr.magic, "\0plocate", 8);
+	hdr.version = 0;
+	hdr.hashtable_size = ht_size;
+	hdr.extra_ht_slots = num_overflow_slots;
+	hdr.hash_table_offset_bytes = sizeof(hdr);  // This member is just there for flexibility.
+	hdr.filename_index_offset_bytes = offset;
+	fwrite(&hdr, sizeof(hdr), 1, outfp);
+
+	// Write the hash table.
+	fwrite(hashtable.get(), ht_size + num_overflow_slots + 1, sizeof(Trigram), outfp);
+
 	// Write the actual posting lists.
-	for (uint32_t trgm : all_trigrams) {
-		const string &encoded = corpus.invindex[trgm].encoded;
+	size_t bytes_for_posting_lists = 0;
+	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+		if (hashtable[i].num_docids == 0) {
+			continue;
+		}
+		const string &encoded = corpus.invindex[hashtable[i].trgm].encoded;
 		fwrite(encoded.data(), encoded.size(), 1, outfp);
 		bytes_for_posting_lists += encoded.size();
 	}
@@ -347,7 +430,8 @@ void do_build(const char *infile, const char *outfile, int block_size)
 	corpus.filename_blocks.push_back("");
 
 	// Write the offsets to the filenames.
-	offset = filename_index_offset + corpus.filename_blocks.size() * sizeof(offset);
+	size_t bytes_for_filename_index = 0, bytes_for_filenames = 0;
+	offset = hdr.filename_index_offset_bytes + corpus.filename_blocks.size() * sizeof(offset);
 	for (const string &filename : corpus.filename_blocks) {
 		fwrite(&offset, sizeof(offset), 1, outfp);
 		offset += filename.size();
@@ -362,10 +446,10 @@ void do_build(const char *infile, const char *outfile, int block_size)
 
 	fclose(outfp);
 
-	size_t total_bytes __attribute__((unused)) = (bytes_for_trigrams + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+	size_t total_bytes __attribute__((unused)) = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
 
 	dprintf("Block size:     %7d files\n", block_size);
-	dprintf("Trigrams:       %'7.1f MB\n", bytes_for_trigrams / 1048576.0);
+	dprintf("Hash table:     %'7.1f MB\n", bytes_for_hashtable / 1048576.0);
 	dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
 	dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
 	dprintf("Filenames:      %'7.1f MB\n", bytes_for_filenames / 1048576.0);
diff --git a/plocate.cpp b/plocate.cpp
index 1c54bf4..84fcd7a 100644
--- a/plocate.cpp
+++ b/plocate.cpp
@@ -1,3 +1,4 @@
+#include "db.h"
 #include "vp4.h"
 #include "io_uring_engine.h"
 
@@ -95,21 +96,6 @@ bool has_access(const char *filename,
 	return true;
 }
 
-struct Trigram {
-	uint32_t trgm;
-	uint32_t num_docids;
-	uint64_t offset;
-
-	bool operator==(const Trigram &other) const
-	{
-		return trgm == other.trgm;
-	}
-	bool operator<(const Trigram &other) const
-	{
-		return trgm < other.trgm;
-	}
-};
-
 class Corpus {
 public:
 	Corpus(int fd, IOUringEngine *engine);
@@ -118,39 +104,38 @@ public:
 	void get_compressed_filename_block(uint32_t docid, function<void(string_view)> cb) const;
 	size_t get_num_filename_blocks() const;
 	off_t offset_for_block(uint32_t docid) const {
-		return filename_index_offset + docid * sizeof(uint64_t);
+		return hdr.filename_index_offset_bytes + docid * sizeof(uint64_t);
 	}
 
 public:
 	const int fd;
 	IOUringEngine *const engine;
 
-	off_t len;
-	uint64_t filename_index_offset;
-
-	uint64_t num_trigrams;
-	const off_t trigram_offset = sizeof(uint64_t) * 2;
-
-	void binary_search_trigram(uint32_t trgm, uint32_t left, uint32_t right, function<void(const Trigram *trgmptr, size_t len)> cb);
+	Header hdr;
 };
 
 Corpus::Corpus(int fd, IOUringEngine *engine)
 	: fd(fd), engine(engine)
 {
-	len = lseek(fd, 0, SEEK_END);
-	if (len == -1) {
-		perror("lseek");
-		exit(1);
+	// Enable to test cold-cache behavior (except for access()).
+	if (false) {
+		off_t len = lseek(fd, 0, SEEK_END);
+		if (len == -1) {
+			perror("lseek");
+			exit(1);
+		}
+		posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
 	}
-	// Uncomment to test cold-cache behavior (except for access()).
-	// posix_fadvise(fd, 0, len, POSIX_FADV_DONTNEED);
-
-	uint64_t vals[2];
-	complete_pread(fd, vals, sizeof(vals), /*offset=*/0);
-
-	num_trigrams = vals[0];
-	filename_index_offset = vals[1];
+
+	complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0);
+	if (memcmp(hdr.magic, "\0plocate", 8) != 0) {
+		fprintf(stderr, "plocate.db is corrupt or an old version; please rebuild it.\n");
+		exit(1);
+	}
+	if (hdr.version != 0) {
+		fprintf(stderr, "plocate.db has version %u, expected 0; please rebuild it.\n", hdr.version);
+		exit(1);
+	}
 }
 
 Corpus::~Corpus()
@@ -160,26 +145,18 @@ Corpus::~Corpus()
 
 void Corpus::find_trigram(uint32_t trgm, function<void(const Trigram *trgmptr, size_t len)> cb)
 {
-	binary_search_trigram(trgm, 0, num_trigrams - 1, move(cb));
-}
-
-void Corpus::binary_search_trigram(uint32_t trgm, uint32_t left, uint32_t right, function<void(const Trigram *trgmptr, size_t len)> cb)
-{
-	if (left > right) {
-		cb(nullptr, 0);
-		return;
-	}
-	uint32_t mid = (left + right) / 2;
-	engine->submit_read(fd, sizeof(Trigram) * 2, trigram_offset + sizeof(Trigram) * mid, [this, trgm, left, mid, right, cb{ move(cb) }](string s) {
+	uint32_t bucket = hash_trigram(trgm, hdr.hashtable_size);
+	engine->submit_read(fd, sizeof(Trigram) * (hdr.extra_ht_slots + 2), hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket, [this, trgm, bucket, cb{ move(cb) }](string s) {
 		const Trigram *trgmptr = reinterpret_cast<const Trigram *>(s.data());
-		const Trigram *next_trgmptr = trgmptr + 1;
-		if (trgmptr->trgm < trgm) {
-			binary_search_trigram(trgm, mid + 1, right, move(cb));
-		} else if (trgmptr->trgm > trgm) {
-			binary_search_trigram(trgm, left, mid - 1, move(cb));
-		} else {
-			cb(trgmptr, next_trgmptr->offset - trgmptr->offset);
+		for (unsigned i = 0; i < hdr.extra_ht_slots + 1; ++i) {
+			if (trgmptr[i].trgm == trgm) {
+				cb(trgmptr + i, trgmptr[i + 1].offset - trgmptr[i].offset);
+				return;
+			}
 		}
+
+		// Not found.
+		cb(nullptr, 0);
 	});
 }
 
@@ -199,10 +176,10 @@ size_t Corpus::get_num_filename_blocks() const
 {
 	// The beginning of the filename blocks is the end of the filename index blocks.
 	uint64_t end;
-	complete_pread(fd, &end, sizeof(end), filename_index_offset);
+	complete_pread(fd, &end, sizeof(end), hdr.filename_index_offset_bytes);
 
 	// Subtract the sentinel block.
-	return (end - filename_index_offset) / sizeof(uint64_t) - 1;
+	return (end - hdr.filename_index_offset_bytes) / sizeof(uint64_t) - 1;
 }
 
 size_t scan_file_block(const string &needle, string_view compressed,
@@ -306,7 +283,7 @@ void do_search_file(const string &needle, const char *filename)
 	IOUringEngine engine;
 	Corpus corpus(fd, &engine);
-	dprintf("Corpus init took %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count());
+	dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count());
 
 	if (needle.size() < 3) {
 		// Too short for trigram matching. Apply brute force.
@@ -329,7 +306,7 @@ void do_search_file(const string &needle, const char *filename)
 		});
 	}
 	engine.finish();
-	dprintf("Binary search took %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count());
+	dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count());
 
 	sort(trigrams.begin(), trigrams.end());
 	{
@@ -392,7 +369,7 @@ void do_search_file(const string &needle, const char *filename)
 		});
 	}
 	engine.finish();
-	dprintf("Intersection took %.1f ms. Doing final verification and printing:\n",
+	dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
 	        1e3 * duration(steady_clock::now() - start).count());
 
 	size_t matched __attribute__((unused)) = scan_docids(needle, in1, corpus, &engine);
-- 
2.39.2
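
For readers of the patch, here is a minimal, self-contained sketch of the
reader-side probe that the new Corpus::find_trigram() performs, written with a
plain blocking pread() instead of the io_uring engine used in plocate.cpp. The
helper name lookup_trigram and the use of std::optional are illustrative
choices, not part of the patch. The "256 bytes (plus a little slop)" from the
commit message corresponds to sizeof(Trigram) * (extra_ht_slots + 2) =
16 * 18 = 288 bytes: one contiguous read covers the home bucket, all 16
overflow slots an insertion is allowed to use, and one extra entry whose
offset gives the length of the matching posting list.

// Hypothetical helper (not in the patch): same bounded-probe lookup as
// Corpus::find_trigram(), but using a blocking pread() for clarity.
#include <unistd.h>

#include <cstdio>
#include <optional>
#include <utility>
#include <vector>

#include "db.h"  // Header, Trigram and hash_trigram() from this patch.

std::optional<std::pair<Trigram, size_t>>
lookup_trigram(int fd, const Header &hdr, uint32_t trgm)
{
	uint32_t bucket = hash_trigram(trgm, hdr.hashtable_size);

	// Robin Hood insertion in plocate-build.cpp guarantees a probe length of
	// at most extra_ht_slots, so one read of extra_ht_slots + 2 entries
	// (288 bytes with this layout) always covers every slot the trigram can
	// be in, plus the following entry needed for the posting-list length.
	std::vector<Trigram> buf(hdr.extra_ht_slots + 2);
	ssize_t ret = pread(fd, buf.data(), buf.size() * sizeof(Trigram),
	                    hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket);
	if (ret != ssize_t(buf.size() * sizeof(Trigram))) {
		perror("pread");
		return std::nullopt;
	}
	for (unsigned i = 0; i < hdr.extra_ht_slots + 1; ++i) {
		if (buf[i].trgm == trgm) {
			// Posting lists are laid out back-to-back, so the next entry's
			// offset (possibly the sentinel's) marks where this one ends.
			return std::make_pair(buf[i], size_t(buf[i + 1].offset - buf[i].offset));
		}
	}
	return std::nullopt;  // Trigram not in the index.
}

Because the builder rejects any table where an insertion would need more than
num_overflow_slots probes (create_hashtable() returns nullptr and the table is
regrown at the next larger prime size), this single read is always enough; no
second seek is ever needed per trigram.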