]> git.sesse.net Git - plocate/commitdiff
Compress filenames with zstd.
authorSteinar H. Gunderson <steinar+git@gunderson.no>
Sun, 27 Sep 2020 22:28:49 +0000 (00:28 +0200)
committerSteinar H. Gunderson <steinar+git@gunderson.no>
Sun, 27 Sep 2020 22:28:49 +0000 (00:28 +0200)
Make blocks of 32 and 32 filenames, and compress then with zstd -6
(the level is fairly arbitrarily chosen). This compresses the repetitive
path information very well, and also allows us to have shorter posting
lists, as they can point into the blocks (allowing dedup).

32 was chosen after eyeballing some compressed sizes, looking for
diminishing returns and then verifying it didn't cost much in terms
of search performance.

Makefile
plocate-build.cpp
plocate.cpp

index bbcdccb8ffec3f7a31022c0ee53c9ddfa8cac595..ccbaf1547910ed4a935c9adf37f1063793a0591d 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -8,10 +8,10 @@ PREFIX ?= /usr/local
 all: plocate plocate-build
 
 plocate: plocate.o TurboPFor-Integer-Compression/libic.a
-       $(CXX) -o $@ $^
+       $(CXX) -o $@ $^ -lzstd
 
 plocate-build: plocate-build.o TurboPFor-Integer-Compression/libic.a
-       $(CXX) -o $@ $^
+       $(CXX) -o $@ $^ -lzstd
 
 TurboPFor-Integer-Compression/libic.a:
        cd TurboPFor-Integer-Compression/ && $(MAKE)
index b64d06feffd28c20ec8b700529d6f946010095a5..e6789d0ab8084586166ac3e50bd7e4138731eda7 100644 (file)
@@ -12,6 +12,7 @@
 #include <endian.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <zstd.h>
 
 #include "vp4.h"
 
@@ -122,7 +123,17 @@ void read_mlocate(const char *filename, vector<string> *files)
        close(fd);
 }
 
-void do_build(const char *infile, const char *outfile)
+string zstd_compress(const string &src)
+{
+       size_t max_size = ZSTD_compressBound(src.size());
+       string dst;
+       dst.resize(max_size);
+       size_t size = ZSTD_compress(&dst[0], max_size, src.data(), src.size(), /*level=*/6);
+       dst.resize(size);
+       return dst;
+}
+
+void do_build(const char *infile, const char *outfile, int block_size)
 {
        //steady_clock::time_point start = steady_clock::now();
 
@@ -151,7 +162,7 @@ void do_build(const char *infile, const char *outfile)
                if (s.size() >= 3) {
                        for (size_t j = 0; j < s.size() - 2; ++j) {
                                uint32_t trgm = read_trigram(s, j);
-                               invindex[trgm].push_back(i);
+                               invindex[trgm].push_back(i / block_size);
                        }
                }
        }
@@ -215,24 +226,51 @@ void do_build(const char *infile, const char *outfile)
                bytes_for_posting_lists += pl[trgm].size();
        }
 
+       // Compress the filenames into blocks.
+       vector<string> filename_blocks;
+       string uncompressed_filenames;
+       int num_files_this_block = 0;
+
+       string dst;
+
+       for (string &filename : files) {
+               uncompressed_filenames += filename;
+               filename.clear();
+               if (++num_files_this_block == block_size) {
+                       filename_blocks.push_back(zstd_compress(uncompressed_filenames));
+                       uncompressed_filenames.clear();
+                       num_files_this_block = 0;
+               } else {
+                       uncompressed_filenames.push_back('\0');
+               }
+       }
+       if (num_files_this_block > 0) {
+               filename_blocks.push_back(zstd_compress(uncompressed_filenames));
+       }
+       files.clear();
+
+       // Stick an empty block at the end as sentinel.
+       filename_blocks.push_back("");
+
        // Write the offsets to the filenames.
-       offset = filename_index_offset + files.size() * sizeof(offset);
-       for (const string &filename : files) {
+       offset = filename_index_offset + filename_blocks.size() * sizeof(offset);
+       for (const string &filename : filename_blocks) {
                fwrite(&offset, sizeof(offset), 1, outfp);
-               offset += filename.size() + 1;
+               offset += filename.size();
                bytes_for_filename_index += sizeof(offset);
-               bytes_for_filenames += filename.size() + 1;
+               bytes_for_filenames += filename.size();
        }
        
        // Write the actual filenames.
-       for (const string &filename : files) {
-               fwrite(filename.c_str(), filename.size() + 1, 1, outfp);
+       for (const string &filename : filename_blocks) {
+               fwrite(filename.data(), filename.size(), 1, outfp);
        }
 
        fclose(outfp);
 
-       size_t total_bytes = (bytes_for_trigrams + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+       //size_t total_bytes = (bytes_for_trigrams + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
 
+       dprintf("Block size:     %7d files\n", block_size);
        dprintf("Trigrams:       %'7.1f MB\n", bytes_for_trigrams / 1048576.0);
        dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
        dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
@@ -243,6 +281,6 @@ void do_build(const char *infile, const char *outfile)
 
 int main(int argc, char **argv)
 {
-       do_build(argv[1], argv[2]);
+       do_build(argv[1], argv[2], 32);
        exit(EXIT_SUCCESS);
 }
index a9f624f66466c8a16c19a0f45f8365da6aaf4707..5f5ca4ea4356d628e16aa2f769ec9ea6fb26b470 100644 (file)
@@ -10,6 +10,7 @@
 #include <sys/mman.h>
 #include <arpa/inet.h>
 #include <endian.h>
+#include <zstd.h>
 
 #include "vp4.h"
 
@@ -176,13 +177,25 @@ void do_search_file(const string &needle, const char *filename)
        const uint64_t *filename_offsets = (const uint64_t *)(data + filename_index_offset);
        int matched = 0;
        for (uint32_t docid : in1) {
-               const char *filename = (const char *)(data + filename_offsets[docid]);
-               if (strstr(filename, needle.c_str()) == nullptr) {
-                       continue;
-               }
-               if (has_access(filename, &access_rx_cache)) {
-                       ++matched;
-                       printf("%s\n", filename);
+               const char *compressed = (const char *)(data + filename_offsets[docid]);
+               size_t compressed_size = filename_offsets[docid + 1] - filename_offsets[docid];  // Allowed we have a sentinel block at the end.
+
+               string block;
+               block.resize(ZSTD_getFrameContentSize(compressed, compressed_size) + 1);
+
+               ZSTD_decompress(&block[0], block.size(), compressed, compressed_size);
+               block[block.size() - 1] = '\0';
+
+               for (const char *filename = block.data();
+                    filename != block.data() + block.size();
+                    filename += strlen(filename) + 1) {
+                       if (strstr(filename, needle.c_str()) == nullptr) {
+                               continue;
+                       }
+                       if (has_access(filename, &access_rx_cache)) {
+                               ++matched;
+                               printf("%s\n", filename);
+                       }
                }
        }
        end = steady_clock::now();