]> git.sesse.net Git - plocate/blob - bench.cpp
Remove dependency on non-POSIX header error.h.
[plocate] / bench.cpp
1 #include <chrono>
2 #include <fcntl.h>
3 #include <memory>
4 #include <stdio.h>
5 #include <string>
6 #include <unistd.h>
7
8 #define dprintf(...)
9 //#define dprintf(...) fprintf(stderr, __VA_ARGS__);
10
11 #include "complete_pread.h"
12 #include "db.h"
13 #include "io_uring_engine.h"
14 #include "turbopfor-encode.h"
15 #include "turbopfor.h"
16 #include "vp4.h"
17
18 using namespace std;
19 using namespace std::chrono;
20
21 bool use_debug = false;
22
23 int main(void)
24 {
25         int fd = open("plocate.db", O_RDONLY);
26         if (fd == -1) {
27                 perror("plocate.db");
28                 exit(1);
29         }
30
31         Header hdr;
32         complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0);
33
34         unique_ptr<Trigram[]> ht(new Trigram[hdr.hashtable_size + hdr.extra_ht_slots + 1]);
35         complete_pread(fd, ht.get(), (hdr.hashtable_size + hdr.extra_ht_slots + 1) * sizeof(Trigram), hdr.hash_table_offset_bytes);
36
37         size_t posting_list_bytes = 0, own_posting_list_bytes = 0, total_elements = 0, most_bytes_pl = 0;
38         uint32_t longest_pl = 0;
39         vector<pair<string, unsigned>> posting_lists;
40         for (unsigned i = 0; i < hdr.hashtable_size + hdr.extra_ht_slots; ++i) {
41                 if (ht[i].num_docids == 0) {
42                         continue;
43                 }
44                 size_t len = ht[i + 1].offset - ht[i].offset;
45                 string str;
46                 str.resize(len);
47                 complete_pread(fd, &str[0], len, ht[i].offset);
48                 posting_lists.emplace_back(move(str), ht[i].num_docids);
49                 longest_pl = std::max(ht[i].num_docids, longest_pl);
50                 most_bytes_pl = std::max(len, most_bytes_pl);
51                 posting_list_bytes += len;
52                 total_elements += ht[i].num_docids;
53         }
54         ht.reset();
55         fprintf(stderr, "Read %zu posting lists.\n", posting_lists.size());
56
57         string encoded_pl;
58         encoded_pl.resize(longest_pl * 2 + 16384);  // Lots of margin.
59
60         size_t num_decode_errors = 0, num_encode_errors = 0;
61         for (auto &[pl, num_docids] : posting_lists) {
62                 //fprintf(stderr, "%zu bytes, %u docids\n", pl.size(), num_docids);
63                 vector<uint32_t> out1, out2;
64                 out1.resize(num_docids + 128);
65                 out2.resize(num_docids + 128);
66                 unsigned char *pldata = reinterpret_cast<unsigned char *>(&pl[0]);
67                 p4nd1dec128v32(pldata, num_docids, &out1[0]);
68                 decode_pfor_delta1_128(pldata, num_docids, /*interleaved=*/true, &out2[0]);
69                 for (unsigned i = 0; i < num_docids; ++i) {
70                         if (out1[i] != out2[i]) {
71                                 if (++num_decode_errors < 10) {
72                                         fprintf(stderr, "Decode error:\n");
73                                         for (unsigned j = 0; j < num_docids; ++j) {
74                                                 fprintf(stderr, "%3u: reference=%u ours=%u  (diff=%d)\n", j, out1[j], out2[j], out1[j] - out2[j]);
75                                         }
76                                 }
77                                 break;
78                         }
79                 }
80
81                 // Test encoding, by encoding with out own implementation
82                 // and checking that decoding with the reference gives
83                 // the same result. We do not measure performance (we're slow).
84                 uint32_t deltas[128];
85                 unsigned char *ptr = reinterpret_cast<unsigned char *>(&encoded_pl[0]);
86                 ptr = write_baseval(out1[0], ptr);
87                 for (unsigned i = 1; i < num_docids; i += 128) {
88                         unsigned num_docids_this_block = std::min(num_docids - i, 128u);
89                         for (unsigned j = 0; j < num_docids_this_block; ++j) {
90                                 deltas[j] = out1[i + j] - out1[i + j - 1] - 1;
91                         }
92                         bool interleaved = (num_docids_this_block == 128);
93                         ptr = encode_pfor_single_block<128>(deltas, num_docids_this_block, interleaved, ptr);
94                 }
95                 own_posting_list_bytes += ptr - reinterpret_cast<unsigned char *>(&encoded_pl[0]);
96
97                 pldata = reinterpret_cast<unsigned char *>(&encoded_pl[0]);
98                 p4nd1dec128v32(pldata, num_docids, &out2[0]);
99                 for (unsigned i = 0; i < num_docids; ++i) {
100                         if (out1[i] != out2[i]) {
101                                 if (++num_encode_errors < 10) {
102                                         fprintf(stderr, "Encode error:\n");
103                                         for (unsigned j = 0; j < num_docids; ++j) {
104                                                 fprintf(stderr, "%3u: reference=%u ours=%u  (diff=%d)\n", j, out1[j], out2[j], out1[j] - out2[j]);
105                                         }
106                                 }
107                                 break;
108                         }
109                 }
110         }
111         fprintf(stderr, "%zu/%zu posting lists had errors in decoding.\n", num_decode_errors, posting_lists.size());
112         fprintf(stderr, "%zu/%zu posting lists had errors in encoding.\n", num_encode_errors, posting_lists.size());
113
114         // Benchmark.
115         vector<uint32_t> dummy;
116         dummy.resize(longest_pl + 128);
117         steady_clock::time_point start = steady_clock::now();
118         for (auto &[pl, num_docids] : posting_lists) {
119                 unsigned char *pldata = reinterpret_cast<unsigned char *>(&pl[0]);
120                 p4nd1dec128v32(pldata, num_docids, &dummy[0]);
121         }
122         steady_clock::time_point end = steady_clock::now();
123         double reference_sec = duration<double>(end - start).count();
124         fprintf(stderr, "Decoding with reference implementation: %.1f ms\n", 1e3 * reference_sec);
125
126         start = steady_clock::now();
127         for (auto &[pl, num_docids] : posting_lists) {
128                 unsigned char *pldata = reinterpret_cast<unsigned char *>(&pl[0]);
129                 decode_pfor_delta1_128(pldata, num_docids, /*interleaved=*/true, &dummy[0]);
130         }
131         end = steady_clock::now();
132         double own_sec = duration<double>(end - start).count();
133         fprintf(stderr, "Decoding with own implementation: %.3f ms (%.2f%% speed)\n", 1e3 * own_sec, 100.0 * reference_sec / own_sec);
134         fprintf(stderr, "Size with own implementation: %.1f MB (%.2f%% of reference, %+d bytes)\n", own_posting_list_bytes / 1048576.0, 100.0 * own_posting_list_bytes / posting_list_bytes, int(own_posting_list_bytes) - int(posting_list_bytes));
135
136         // Three numbers giving rules of thumb for judging our own implementation:
137         //
138         // - Compressed speed is easy to compare to disk I/O, to see the relative importance
139         // - Uncompressed speed is easy to compare to intersection speeds and memory bandwidth
140         //   (also very roughly comparable to the benchmark numbers in the TurboPFor README)
141         // - ns/element gives an absolute measure for plocate (e.g. if we can decompress at
142         //   1 ns/element, a 10k-element posting list goes by in 0.01 ms, which is way beyond
143         //   instantaneous in practice).
144         fprintf(stderr, "%.1f MB/sec (compressed), %.1f MB/sec (uncompressed), %.1f ns/element\n", posting_list_bytes / own_sec / 1048576.0,
145                 (total_elements * sizeof(uint32_t)) / own_sec / 1048576.0, 1e9 * own_sec / total_elements);
146 }