#include "db.h"
#include "vp4.h"

#include <algorithm>
#include <arpa/inet.h>
#include <assert.h>
#include <chrono>
#include <endian.h>
#include <fcntl.h>
#include <math.h>
#include <memory>
#include <stdio.h>
#include <string.h>
#include <string>
#include <string_view>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <unordered_map>
#include <utility>
#include <vector>
#include <zstd.h>

#define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t))
#define dprintf(...)
//#define dprintf(...) fprintf(stderr, __VA_ARGS__);

using namespace std;
using namespace std::chrono;

string zstd_compress(const string &src, string *tempbuf);

constexpr unsigned num_overflow_slots = 16;

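// Filenames are indexed by trigrams: three consecutive bytes of a name,
// packed little-endian into the low 24 bits of a uint32_t. read_unigram()
// returns 0 past the end of the string, so read_trigram() never reads out
// of bounds even near the end of a short string.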
static inline uint32_t read_unigram(const string_view s, size_t idx)
{
	if (idx < s.size()) {
		return (unsigned char)s[idx];
	} else {
		return 0;
	}
}

static inline uint32_t read_trigram(const string_view s, size_t start)
{
	return read_unigram(s, start) |
		(read_unigram(s, start + 1) << 8) |
		(read_unigram(s, start + 2) << 16);
}

enum {
	DBE_NORMAL = 0, /* A non-directory file */
	DBE_DIRECTORY = 1, /* A directory */
	DBE_END = 2 /* End of directory contents; contains no name */
};

// From mlocate.
struct db_header {
	uint8_t magic[8];
	uint32_t conf_size;
	uint8_t version;
	uint8_t check_visibility;
	uint8_t pad[2];
};

// From mlocate.
struct db_directory {
	uint64_t time_sec;
	uint32_t time_nsec;
	uint8_t pad[4];
};

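// Collects the docids for one trigram and encodes them incrementally with
// the TurboPFor codecs from vp4.h: the very first docid is written as a
// one-element header, and every subsequent docid is delta-encoded against
// the end of the previous block, 128 docids at a time (p4d1enc128v32).
// Any final partial block is flushed by finish() with the non-interleaved
// scalar variant (p4d1enc32). Docids arrive in nondecreasing order from
// Corpus::flush_block(), so comparing against last_docid is enough to
// deduplicate.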
class PostingListBuilder {
public:
	void add_docid(uint32_t docid);
	void finish();

	string encoded;
	size_t num_docids = 0;

private:
	void write_header(uint32_t docid);
	void append_block();

	vector<uint32_t> pending_docids;

	uint32_t last_block_end, last_docid = -1;
};

void PostingListBuilder::add_docid(uint32_t docid)
{
	// Deduplicate against the last inserted value, if any.
	if (docid == last_docid) {
		return;
	}

	if (num_docids == 0) {
		// Very first docid.
		write_header(docid);
		++num_docids;
		last_block_end = last_docid = docid;
		return;
	}

	last_docid = docid;
	pending_docids.push_back(docid);
	if (pending_docids.size() == 128) {
		append_block();
		pending_docids.clear();
		last_block_end = docid;
	}
	++num_docids;
}

void PostingListBuilder::finish()
{
	if (pending_docids.empty()) {
		return;
	}

	assert(!encoded.empty());  // write_header() should already have run.

	// No interleaving for partial blocks.
	unsigned char buf[P4NENC_BOUND(128)];
	unsigned char *end = p4d1enc32(pending_docids.data(), pending_docids.size(), buf, last_block_end);
	encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
}

void PostingListBuilder::append_block()
{
	unsigned char buf[P4NENC_BOUND(128)];
	assert(pending_docids.size() == 128);
	unsigned char *end = p4d1enc128v32(pending_docids.data(), 128, buf, last_block_end);
	encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
}

void PostingListBuilder::write_header(uint32_t docid)
{
	unsigned char buf[P4NENC_BOUND(1)];
	size_t bytes = p4nd1enc128v32(&docid, 1, buf);
	encoded.append(reinterpret_cast<char *>(buf), bytes);
}

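// Holds the index being built. Filenames are appended to a block of
// <block_size> NUL-separated entries; when the block is full, it is
// zstd-compressed and written straight to the output file, its byte offset
// is remembered in filename_blocks, and its block number is used as the
// docid added to every trigram occurring in it.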
class Corpus {
public:
	Corpus(FILE *outfp, size_t block_size)
		: outfp(outfp), block_size(block_size) {}
	void add_file(string filename);
	void flush_block();

	vector<uint64_t> filename_blocks;
	unordered_map<uint32_t, PostingListBuilder> invindex;
	size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;

private:
	FILE *outfp;
	string current_block;
	string tempbuf;
	const size_t block_size;
};

void Corpus::add_file(string filename)
{
	++num_files;
	if (!current_block.empty()) {
		current_block.push_back('\0');
	}
	current_block += filename;
	if (++num_files_in_block == block_size) {
		flush_block();
	}
}

void Corpus::flush_block()
{
	if (current_block.empty()) {
		return;
	}

	uint32_t docid = num_blocks;

	// Create trigrams.
	const char *ptr = current_block.c_str();
	while (ptr < current_block.c_str() + current_block.size()) {
		string_view s(ptr);
		if (s.size() >= 3) {
			for (size_t j = 0; j < s.size() - 2; ++j) {
				uint32_t trgm = read_trigram(s, j);
				invindex[trgm].add_docid(docid);
			}
		}
		ptr += s.size() + 1;
	}

	// Compress and add the filename block.
	filename_blocks.push_back(ftell(outfp));
	string compressed = zstd_compress(current_block, &tempbuf);
	if (fwrite(compressed.data(), compressed.size(), 1, outfp) != 1) {
		perror("fwrite()");
		exit(1);
	}

	current_block.clear();
	num_files_in_block = 0;
	++num_blocks;
}

string read_cstr(FILE *fp)
{
	string ret;
	for (;;) {
		int ch = getc(fp);
		if (ch == -1) {
			perror("getc");
			exit(1);
		}
		if (ch == 0) {
			return ret;
		}
		ret.push_back(ch);
	}
}

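// Parses one directory record from the mlocate database: a db_directory
// header, the NUL-terminated directory path, then a list of entries
// (DBE_NORMAL or DBE_DIRECTORY, each followed by a NUL-terminated name),
// terminated by DBE_END or end-of-file.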
void handle_directory(FILE *fp, Corpus *corpus)
{
	db_directory dummy;
	if (fread(&dummy, sizeof(dummy), 1, fp) != 1) {
		if (feof(fp)) {
			return;
		} else {
			perror("fread");
			exit(1);
		}
	}

	string dir_path = read_cstr(fp);
	if (dir_path == "/") {
		dir_path = "";
	}

	for (;;) {
		int type = getc(fp);
		if (type == DBE_NORMAL) {
			string filename = read_cstr(fp);
			corpus->add_file(dir_path + "/" + filename);
		} else if (type == DBE_DIRECTORY) {
			string dirname = read_cstr(fp);
			corpus->add_file(dir_path + "/" + dirname);
		} else {
			return;  // Probably end.
		}
	}
}

void read_mlocate(const char *filename, Corpus *corpus)
{
	FILE *fp = fopen(filename, "rb");
	if (fp == nullptr) {
		perror(filename);
		exit(1);
	}

	db_header hdr;
	if (fread(&hdr, sizeof(hdr), 1, fp) != 1) {
		perror("short read");
		exit(1);
	}

	// TODO: Care about the base path.
	string path = read_cstr(fp);
	while (!feof(fp)) {
		handle_directory(fp, corpus);
	}
	fclose(fp);
}

string zstd_compress(const string &src, string *tempbuf)
{
	size_t max_size = ZSTD_compressBound(src.size());
	if (tempbuf->size() < max_size) {
		tempbuf->resize(max_size);
	}
	size_t size = ZSTD_compress(&(*tempbuf)[0], max_size, src.data(), src.size(), /*level=*/6);
	return string(tempbuf->data(), size);
}

bool is_prime(uint32_t x)
{
	if ((x % 2) == 0 || (x % 3) == 0) {
		return false;
	}
	uint32_t limit = ceil(sqrt(x));
	for (uint32_t factor = 5; factor <= limit; ++factor) {
		if ((x % factor) == 0) {
			return false;
		}
	}
	return true;
}

uint32_t next_prime(uint32_t x)
{
	if ((x % 2) == 0) {
		++x;
	}
	while (!is_prime(x)) {
		x += 2;
	}
	return x;
}

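// Lays out all trigrams into an open-addressed hash table using Robin Hood
// insertion: on a collision, whichever entry is further from its home bucket
// keeps the slot, which caps the worst-case probe length. Probes may run into
// the num_overflow_slots extra slots past the nominal table; if an insertion
// would need to go even further, we give up and return nullptr so the caller
// can retry with a larger table. The final +1 element is a sentinel.
// hash_trigram() is expected to come from db.h, shared with the lookup side.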
unique_ptr<Trigram[]> create_hashtable(const Corpus &corpus, const vector<uint32_t> &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots)
{
	unique_ptr<Trigram[]> ht(new Trigram[ht_size + num_overflow_slots + 1]);  // 1 for the sentinel element at the end.
	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
		ht[i].trgm = uint32_t(-1);
		ht[i].num_docids = 0;
		ht[i].offset = 0;
	}
	for (uint32_t trgm : all_trigrams) {
		// We don't know offset yet, so set it to zero.
		Trigram to_insert{ trgm, uint32_t(corpus.invindex.find(trgm)->second.num_docids), 0 };

		uint32_t bucket = hash_trigram(trgm, ht_size);
		unsigned distance = 0;
		while (ht[bucket].num_docids != 0) {
			// Robin Hood hashing; reduces the longest distance by a lot.
			unsigned other_distance = bucket - hash_trigram(ht[bucket].trgm, ht_size);
			if (distance > other_distance) {
				swap(to_insert, ht[bucket]);
				distance = other_distance;
			}

			++bucket, ++distance;
			if (distance > num_overflow_slots) {
				return nullptr;
			}
		}
		ht[bucket] = to_insert;
	}
	return ht;
}

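// Writes the plocate database. Layout, in file order (all offsets are
// recorded in the header, which is first written with version = -1 and
// rewritten last, once everything else is known):
//
//   header
//   zstd-compressed filename blocks
//   filename index (one uint64_t file offset per block, plus a sentinel)
//   trigram hash table
//   posting lists (one per non-empty hash table slot)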
void do_build(const char *infile, const char *outfile, int block_size)
{
	steady_clock::time_point start __attribute__((unused)) = steady_clock::now();

	umask(0027);
	FILE *outfp = fopen(outfile, "wb");
	if (outfp == nullptr) {
		perror(outfile);
		exit(1);
	}

	// Write the header.
	Header hdr;
	memcpy(hdr.magic, "\0plocate", 8);
	hdr.version = -1;  // Mark as broken.
	hdr.hashtable_size = 0;  // Not known yet.
	hdr.extra_ht_slots = num_overflow_slots;
	hdr.hash_table_offset_bytes = -1;  // We don't know these offsets yet.
	hdr.filename_index_offset_bytes = -1;
	fwrite(&hdr, sizeof(hdr), 1, outfp);

	Corpus corpus(outfp, block_size);

	read_mlocate(infile, &corpus);
	if (false) {  // To read a plain text file.
		FILE *fp = fopen(infile, "r");
		while (!feof(fp)) {
			char buf[1024];
			if (fgets(buf, 1024, fp) == nullptr || feof(fp)) {
				break;
			}
			string s(buf);
			if (s.back() == '\n')
				s.pop_back();
			corpus.add_file(move(s));
		}
		fclose(fp);
	}
	corpus.flush_block();
	dprintf("Read %zu files from %s\n", corpus.num_files, infile);
	hdr.num_docids = corpus.filename_blocks.size();

	// Stick an empty block at the end as sentinel.
	corpus.filename_blocks.push_back(ftell(outfp));
	const size_t bytes_for_filenames = corpus.filename_blocks.back() - corpus.filename_blocks.front();

	// Write the offsets to the filenames.
	hdr.filename_index_offset_bytes = ftell(outfp);
	const size_t bytes_for_filename_index = corpus.filename_blocks.size() * sizeof(uint64_t);
	fwrite(corpus.filename_blocks.data(), corpus.filename_blocks.size(), sizeof(uint64_t), outfp);
	corpus.filename_blocks.clear();
	corpus.filename_blocks.shrink_to_fit();

	// Finish up encoding the posting lists.
	size_t trigrams = 0, longest_posting_list = 0;
	size_t bytes_for_posting_lists = 0;
	for (auto &[trigram, pl_builder] : corpus.invindex) {
		pl_builder.finish();
		longest_posting_list = max(longest_posting_list, pl_builder.num_docids);
		trigrams += pl_builder.num_docids;
		bytes_for_posting_lists += pl_builder.encoded.size();
	}
	dprintf("%zu files, %zu different trigrams, %zu entries, avg len %.2f, longest %zu\n",
	        corpus.num_files, corpus.invindex.size(), trigrams, double(trigrams) / corpus.invindex.size(), longest_posting_list);
	dprintf("%zu bytes used for posting lists (%.2f bits/entry)\n", bytes_for_posting_lists, 8 * bytes_for_posting_lists / double(trigrams));

	dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(steady_clock::now() - start).count());

	// Sort the trigrams, mostly to get a consistent result every time
	// (the hash table will put things in random order anyway).
	vector<uint32_t> all_trigrams;
	for (auto &[trigram, pl_builder] : corpus.invindex) {
		all_trigrams.push_back(trigram);
	}
	sort(all_trigrams.begin(), all_trigrams.end());

	// Create the hash table.
	unique_ptr<Trigram[]> hashtable;
	uint32_t ht_size = next_prime(all_trigrams.size());
	for (;;) {
		hashtable = create_hashtable(corpus, all_trigrams, ht_size, num_overflow_slots);
		if (hashtable == nullptr) {
			dprintf("Failed creating hash table of size %u, increasing by 5%% and trying again.\n", ht_size);
			ht_size = next_prime(ht_size * 1.05);
		} else {
			dprintf("Created hash table of size %u.\n\n", ht_size);
			break;
		}
	}

	// Find the offsets for each posting list.
	size_t bytes_for_hashtable = (ht_size + num_overflow_slots + 1) * sizeof(Trigram);
	uint64_t offset = ftell(outfp) + bytes_for_hashtable;
	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
		hashtable[i].offset = offset;  // Needs to be there even for empty slots.
		if (hashtable[i].num_docids == 0) {
			continue;
		}

		const string &encoded = corpus.invindex[hashtable[i].trgm].encoded;
		offset += encoded.size();
	}

	// Write the hash table.
	hdr.hash_table_offset_bytes = ftell(outfp);
	hdr.hashtable_size = ht_size;
	fwrite(hashtable.get(), ht_size + num_overflow_slots + 1, sizeof(Trigram), outfp);

	// Write the actual posting lists.
	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
		if (hashtable[i].num_docids == 0) {
			continue;
		}
		const string &encoded = corpus.invindex[hashtable[i].trgm].encoded;
		fwrite(encoded.data(), encoded.size(), 1, outfp);
	}

	// Rewind, and write the updated header.
	hdr.version = 0;
	fseek(outfp, 0, SEEK_SET);
	fwrite(&hdr, sizeof(hdr), 1, outfp);
	fclose(outfp);

	size_t total_bytes __attribute__((unused)) = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);

	dprintf("Block size:     %7d files\n", block_size);
	dprintf("Hash table:     %'7.1f MB\n", bytes_for_hashtable / 1048576.0);
	dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
	dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
	dprintf("Filenames:      %'7.1f MB\n", bytes_for_filenames / 1048576.0);
	dprintf("Total:          %'7.1f MB\n", total_bytes / 1048576.0);
	dprintf("\n");
}

int main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "Usage: plocate-build MLOCATE_DB OUTPUT_DB\n");
		exit(EXIT_FAILURE);
	}
	do_build(argv[1], argv[2], 32);
	exit(EXIT_SUCCESS);
}