From: Steinar H. Gunderson <steinar+git@gunderson.no>
Date: Sun, 27 Sep 2020 22:28:49 +0000 (+0200)
Subject: Compress filenames with zstd.
X-Git-Tag: 1.0.0~101
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=184cfc93ecb5a17b3d5360d5953d7bf4126b5f5c;p=plocate

Compress filenames with zstd.

Make blocks of 32 and 32 filenames, and compress then with zstd -6
(the level is fairly arbitrarily chosen). This compresses the repetitive
path information very well, and also allows us to have shorter posting
lists, as they can point into the blocks (allowing dedup).

32 was chosen after eyeballing some compressed sizes, looking for
diminishing returns and then verifying it didn't cost much in terms
of search performance.
---

diff --git a/Makefile b/Makefile
index bbcdccb..ccbaf15 100644
--- a/Makefile
+++ b/Makefile
@@ -8,10 +8,10 @@ PREFIX ?= /usr/local
 all: plocate plocate-build
 
 plocate: plocate.o TurboPFor-Integer-Compression/libic.a
-	$(CXX) -o $@ $^
+	$(CXX) -o $@ $^ -lzstd
 
 plocate-build: plocate-build.o TurboPFor-Integer-Compression/libic.a
-	$(CXX) -o $@ $^
+	$(CXX) -o $@ $^ -lzstd
 
 TurboPFor-Integer-Compression/libic.a:
 	cd TurboPFor-Integer-Compression/ && $(MAKE)
diff --git a/plocate-build.cpp b/plocate-build.cpp
index b64d06f..e6789d0 100644
--- a/plocate-build.cpp
+++ b/plocate-build.cpp
@@ -12,6 +12,7 @@
 #include <endian.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <zstd.h>
 
 #include "vp4.h"
 
@@ -122,7 +123,17 @@ void read_mlocate(const char *filename, vector<string> *files)
 	close(fd);
 }
 
-void do_build(const char *infile, const char *outfile)
+string zstd_compress(const string &src)
+{
+	size_t max_size = ZSTD_compressBound(src.size());
+	string dst;
+	dst.resize(max_size);
+	size_t size = ZSTD_compress(&dst[0], max_size, src.data(), src.size(), /*level=*/6);
+	dst.resize(size);
+	return dst;
+}
+
+void do_build(const char *infile, const char *outfile, int block_size)
 {
 	//steady_clock::time_point start = steady_clock::now();
 
@@ -151,7 +162,7 @@ void do_build(const char *infile, const char *outfile)
 		if (s.size() >= 3) {
 			for (size_t j = 0; j < s.size() - 2; ++j) {
 				uint32_t trgm = read_trigram(s, j);
-				invindex[trgm].push_back(i);
+				invindex[trgm].push_back(i / block_size);
 			}
 		}
 	}
@@ -215,24 +226,51 @@ void do_build(const char *infile, const char *outfile)
 		bytes_for_posting_lists += pl[trgm].size();
 	}
 
+	// Compress the filenames into blocks.
+	vector<string> filename_blocks;
+	string uncompressed_filenames;
+	int num_files_this_block = 0;
+
+	string dst;
+
+	for (string &filename : files) {
+		uncompressed_filenames += filename;
+		filename.clear();
+		if (++num_files_this_block == block_size) {
+			filename_blocks.push_back(zstd_compress(uncompressed_filenames));
+			uncompressed_filenames.clear();
+			num_files_this_block = 0;
+		} else {
+			uncompressed_filenames.push_back('\0');
+		}
+	}
+	if (num_files_this_block > 0) {
+		filename_blocks.push_back(zstd_compress(uncompressed_filenames));
+	}
+	files.clear();
+
+	// Stick an empty block at the end as sentinel.
+	filename_blocks.push_back("");
+
 	// Write the offsets to the filenames.
-	offset = filename_index_offset + files.size() * sizeof(offset);
-	for (const string &filename : files) {
+	offset = filename_index_offset + filename_blocks.size() * sizeof(offset);
+	for (const string &filename : filename_blocks) {
 		fwrite(&offset, sizeof(offset), 1, outfp);
-		offset += filename.size() + 1;
+		offset += filename.size();
 		bytes_for_filename_index += sizeof(offset);
-		bytes_for_filenames += filename.size() + 1;
+		bytes_for_filenames += filename.size();
 	}
 	
 	// Write the actual filenames.
-	for (const string &filename : files) {
-		fwrite(filename.c_str(), filename.size() + 1, 1, outfp);
+	for (const string &filename : filename_blocks) {
+		fwrite(filename.data(), filename.size(), 1, outfp);
 	}
 
 	fclose(outfp);
 
-	size_t total_bytes = (bytes_for_trigrams + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+	//size_t total_bytes = (bytes_for_trigrams + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
 
+	dprintf("Block size:     %7d files\n", block_size);
 	dprintf("Trigrams:       %'7.1f MB\n", bytes_for_trigrams / 1048576.0);
 	dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
 	dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
@@ -243,6 +281,6 @@ void do_build(const char *infile, const char *outfile)
 
 int main(int argc, char **argv)
 {
-	do_build(argv[1], argv[2]);
+	do_build(argv[1], argv[2], 32);
 	exit(EXIT_SUCCESS);
 }
diff --git a/plocate.cpp b/plocate.cpp
index a9f624f..5f5ca4e 100644
--- a/plocate.cpp
+++ b/plocate.cpp
@@ -10,6 +10,7 @@
 #include <sys/mman.h>
 #include <arpa/inet.h>
 #include <endian.h>
+#include <zstd.h>
 
 #include "vp4.h"
 
@@ -176,13 +177,25 @@ void do_search_file(const string &needle, const char *filename)
 	const uint64_t *filename_offsets = (const uint64_t *)(data + filename_index_offset);
 	int matched = 0;
 	for (uint32_t docid : in1) {
-		const char *filename = (const char *)(data + filename_offsets[docid]);
-		if (strstr(filename, needle.c_str()) == nullptr) {
-			continue;
-		}
-		if (has_access(filename, &access_rx_cache)) {
-			++matched;
-			printf("%s\n", filename);
+		const char *compressed = (const char *)(data + filename_offsets[docid]);
+		size_t compressed_size = filename_offsets[docid + 1] - filename_offsets[docid];  // Allowed we have a sentinel block at the end.
+
+		string block;
+		block.resize(ZSTD_getFrameContentSize(compressed, compressed_size) + 1);
+
+		ZSTD_decompress(&block[0], block.size(), compressed, compressed_size);
+		block[block.size() - 1] = '\0';
+
+		for (const char *filename = block.data();
+		     filename != block.data() + block.size();
+		     filename += strlen(filename) + 1) {
+			if (strstr(filename, needle.c_str()) == nullptr) {
+				continue;
+			}
+			if (has_access(filename, &access_rx_cache)) {
+				++matched;
+				printf("%s\n", filename);
+			}
 		}
 	}
 	end = steady_clock::now();