]> git.sesse.net Git - plocate/blob - plocate-build.cpp
Compress filenames with zstd.
[plocate] / plocate-build.cpp
1 #include <stdio.h>
2 #include <string.h>
3 #include <algorithm>
4 #include <unordered_map>
5 #include <string>
6 #include <vector>
7 #include <chrono>
8 #include <unistd.h>
9 #include <fcntl.h>
10 #include <sys/mman.h>
11 #include <arpa/inet.h>
12 #include <endian.h>
13 #include <sys/types.h>
14 #include <sys/stat.h>
15 #include <zstd.h>
16
17 #include "vp4.h"
18
19 #define P4NENC_BOUND(n) ((n+127)/128+(n+32)*sizeof(uint32_t))
20 #define dprintf(...)
21 //#define dprintf(...) fprintf(stderr, __VA_ARGS__);
22
23 using namespace std;
24 using namespace std::chrono;
25         
26 static inline uint32_t read_unigram(const string &s, size_t idx)
27 {
28         if (idx < s.size()) {
29                 return (unsigned char)s[idx];
30         } else {
31                 return 0;
32         }
33 }
34
35 static inline uint32_t read_trigram(const string &s, size_t start)
36 {
37         return read_unigram(s, start) |
38                 (read_unigram(s, start + 1) << 8) |
39                 (read_unigram(s, start + 2) << 16);
40 }
41
42 enum
43 {
44         DBE_NORMAL          = 0,    /* A non-directory file */
45         DBE_DIRECTORY       = 1,    /* A directory */
46         DBE_END             = 2   /* End of directory contents; contains no name */
47 };
48
49 // From mlocate.
50 struct db_header
51 {
52         uint8_t magic[8];
53         uint32_t conf_size;
54         uint8_t version;
55         uint8_t check_visibility;
56         uint8_t pad[2];
57 };
58
59 // From mlocate.
60 struct db_directory
61 {
62         uint64_t time_sec;
63         uint32_t time_nsec;
64         uint8_t pad[4];
65 };
66
67 const char *handle_directory(const char *ptr, vector<string> *files)
68 {
69         ptr += sizeof(db_directory);
70
71         string dir_path = ptr;
72         ptr += dir_path.size() + 1;
73         if (dir_path == "/") {
74                 dir_path = "";
75         }
76                 
77         for ( ;; ) {
78                 uint8_t type = *ptr++;
79                 if (type == DBE_NORMAL) {
80                         string filename = ptr;
81                         files->push_back(dir_path + "/" + filename);
82                         ptr += filename.size() + 1;
83                 } else if (type == DBE_DIRECTORY) {
84                         string dirname = ptr;
85                         files->push_back(dir_path + "/" + dirname);
86                         ptr += dirname.size() + 1;
87                 } else {
88                         return ptr;
89                 }
90         }
91 }
92
93 void read_mlocate(const char *filename, vector<string> *files)
94 {
95         int fd = open(filename, O_RDONLY);
96         if (fd == -1) {
97                 perror(filename);
98                 exit(1);
99         }
100         off_t len = lseek(fd, 0, SEEK_END);
101         if (len == -1) {
102                 perror("lseek");
103                 exit(1);
104         }
105         const char *data = (char *)mmap(nullptr, len, PROT_READ, MAP_SHARED, fd, /*offset=*/0);
106         if (data == MAP_FAILED) {
107                 perror("mmap");
108                 exit(1);
109         }
110
111         const db_header *hdr = (const db_header *)data;
112
113         // TODO: Care about the base path.
114         string path = data + sizeof(db_header);
115         uint64_t offset = sizeof(db_header) + path.size() + 1 + ntohl(hdr->conf_size);
116
117         const char *ptr = data + offset;
118         while (ptr < data + len) {
119                 ptr = handle_directory(ptr, files);
120         }
121
122         munmap((void *)data, len);
123         close(fd);
124 }
125
126 string zstd_compress(const string &src)
127 {
128         size_t max_size = ZSTD_compressBound(src.size());
129         string dst;
130         dst.resize(max_size);
131         size_t size = ZSTD_compress(&dst[0], max_size, src.data(), src.size(), /*level=*/6);
132         dst.resize(size);
133         return dst;
134 }
135
136 void do_build(const char *infile, const char *outfile, int block_size)
137 {
138         //steady_clock::time_point start = steady_clock::now();
139
140         vector<string> files;
141         read_mlocate(infile, &files);
142         if (false) {  // To read a plain text file.
143                 FILE *fp = fopen(infile, "r");
144                 while (!feof(fp)) {
145                         char buf[1024];
146                         if (fgets(buf, 1024, fp) == nullptr || feof(fp)) {
147                                 break;
148                         }
149                         string s(buf);
150                         if (s.back() == '\n') s.pop_back();
151                         files.push_back(move(s));
152                 }
153                 fclose(fp);
154         }
155         dprintf("Read %zu files from %s\n", files.size(), infile);
156         
157         unordered_map<uint32_t, string> pl;
158         size_t trigrams = 0, longest_posting_list = 0;
159         unordered_map<uint32_t, vector<uint32_t>> invindex;
160         for (size_t i = 0; i < files.size(); ++i) {
161                 const string &s = files[i];
162                 if (s.size() >= 3) {
163                         for (size_t j = 0; j < s.size() - 2; ++j) {
164                                 uint32_t trgm = read_trigram(s, j);
165                                 invindex[trgm].push_back(i / block_size);
166                         }
167                 }
168         }
169         string buf;
170         size_t bytes_used = 0;
171         for (auto &[trigram, docids] : invindex) {
172                 auto last = unique(docids.begin(), docids.end());
173                 docids.erase(last, docids.end());
174                 longest_posting_list = max(longest_posting_list, docids.size());
175                 trigrams += docids.size();
176
177                 size_t bytes_needed = P4NENC_BOUND(docids.size());
178                 if (buf.size() < bytes_needed) buf.resize(bytes_needed);
179                 size_t bytes = p4nd1enc128v32(&docids[0], docids.size(), reinterpret_cast<unsigned char *>(&buf[0]));
180                 pl[trigram] = string(buf.data(), bytes);
181                 bytes_used += bytes;
182         }
183         dprintf("%zu files, %zu different trigrams, %zu entries, avg len %.2f, longest %zu\n",
184                 files.size(), invindex.size(), trigrams, double(trigrams) / invindex.size(), longest_posting_list);
185
186         dprintf("%zu bytes used for posting lists (%.2f bits/entry)\n", bytes_used, 8 * bytes_used / double(trigrams));
187         //steady_clock::time_point end = steady_clock::now();
188         dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(end - start).count());
189
190         vector<uint32_t> all_trigrams;
191         for (auto &[trigram, docids] : invindex) {
192                 all_trigrams.push_back(trigram);
193         }
194         sort(all_trigrams.begin(), all_trigrams.end());
195
196         umask(0027);
197         FILE *outfp = fopen(outfile, "wb");
198
199         // Write the header.
200         uint64_t num_trigrams = invindex.size();  // 64 to get alignment.
201         fwrite(&num_trigrams, sizeof(num_trigrams), 1, outfp);
202
203         // Find out where the posting lists will end.
204         uint64_t offset = sizeof(uint64_t) * 2 + 16 * all_trigrams.size();
205         uint64_t filename_index_offset = offset;
206         for (uint32_t trgm : all_trigrams) {
207                 filename_index_offset += pl[trgm].size();
208         }
209         fwrite(&filename_index_offset, sizeof(filename_index_offset), 1, outfp);
210
211         size_t bytes_for_trigrams = 0, bytes_for_posting_lists = 0, bytes_for_filename_index = 0, bytes_for_filenames = 0;
212
213         for (uint32_t trgm : all_trigrams) {
214                 // 16 bytes: trgm (4), number of docids (4), byte offset (8)
215                 fwrite(&trgm, sizeof(trgm), 1, outfp);  // One byte wasted, but OK.
216                 uint32_t num_docids = invindex[trgm].size();
217                 fwrite(&num_docids, sizeof(num_docids), 1, outfp);
218                 fwrite(&offset, sizeof(offset), 1, outfp);
219                 offset += pl[trgm].size();
220                 bytes_for_trigrams += 16;
221         }
222
223         // Write the actual posting lists.
224         for (uint32_t trgm : all_trigrams) {
225                 fwrite(pl[trgm].data(), pl[trgm].size(), 1, outfp);
226                 bytes_for_posting_lists += pl[trgm].size();
227         }
228
229         // Compress the filenames into blocks.
230         vector<string> filename_blocks;
231         string uncompressed_filenames;
232         int num_files_this_block = 0;
233
234         string dst;
235
236         for (string &filename : files) {
237                 uncompressed_filenames += filename;
238                 filename.clear();
239                 if (++num_files_this_block == block_size) {
240                         filename_blocks.push_back(zstd_compress(uncompressed_filenames));
241                         uncompressed_filenames.clear();
242                         num_files_this_block = 0;
243                 } else {
244                         uncompressed_filenames.push_back('\0');
245                 }
246         }
247         if (num_files_this_block > 0) {
248                 filename_blocks.push_back(zstd_compress(uncompressed_filenames));
249         }
250         files.clear();
251
252         // Stick an empty block at the end as sentinel.
253         filename_blocks.push_back("");
254
255         // Write the offsets to the filenames.
256         offset = filename_index_offset + filename_blocks.size() * sizeof(offset);
257         for (const string &filename : filename_blocks) {
258                 fwrite(&offset, sizeof(offset), 1, outfp);
259                 offset += filename.size();
260                 bytes_for_filename_index += sizeof(offset);
261                 bytes_for_filenames += filename.size();
262         }
263         
264         // Write the actual filenames.
265         for (const string &filename : filename_blocks) {
266                 fwrite(filename.data(), filename.size(), 1, outfp);
267         }
268
269         fclose(outfp);
270
271         //size_t total_bytes = (bytes_for_trigrams + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
272
273         dprintf("Block size:     %7d files\n", block_size);
274         dprintf("Trigrams:       %'7.1f MB\n", bytes_for_trigrams / 1048576.0);
275         dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
276         dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
277         dprintf("Filenames:      %'7.1f MB\n", bytes_for_filenames / 1048576.0);
278         dprintf("Total:          %'7.1f MB\n", total_bytes / 1048576.0);
279         dprintf("\n");
280 }
281
282 int main(int argc, char **argv)
283 {
284         do_build(argv[1], argv[2], 32);
285         exit(EXIT_SUCCESS);
286 }