]> git.sesse.net Git - plocate/blob - database-builder.h
Add a native updatedb.
[plocate] / database-builder.h
1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
3
4 #include "db.h"
5
6 #include <chrono>
7 #include <memory>
8 #include <random>
9 #include <stddef.h>
10 #include <string>
11 #include <utility>
12 #include <vector>
13 #include <zstd.h>
14
15 class PostingListBuilder;
16
// A (seconds, nanoseconds) timestamp, as passed along with each filename
// to DatabaseReceiver::add_file().
//
// Two values are reserved as sentinels:
//   {0,0}  means unknown or so current that it should never match.
//   {-1,0} means it's not a directory.
struct dir_time {
        int64_t sec;
        int32_t nsec;

        // Strict "earlier than": seconds are compared first, and nanoseconds
        // only break ties between equal seconds.
        bool operator<(const dir_time &other) const
        {
                if (sec != other.sec)
                        return sec < other.sec;
                return nsec < other.nsec;
        }

        // "Not earlier than": true when *this is equal to or later than other.
        // Must be the exact negation of operator<; negating (other < *this)
        // instead would yield "less than or equal", i.e. the wrong direction.
        bool operator>=(const dir_time &other) const
        {
                return !(*this < other);
        }
};
34 constexpr dir_time unknown_dir_time{ 0, 0 };
35 constexpr dir_time not_a_dir{ -1, 0 };
36
37 class DatabaseReceiver {
38 public:
39         virtual ~DatabaseReceiver() = default;
40         virtual void add_file(std::string filename, dir_time dt) = 0;
41         virtual void flush_block() = 0;
42         virtual void finish() { flush_block(); }
43 };
44
45 class DictionaryBuilder : public DatabaseReceiver {
46 public:
47         DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
48                 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
49         void add_file(std::string filename, dir_time dt) override;
50         void flush_block() override;
51         std::string train(size_t buf_size);
52
53 private:
54         const size_t blocks_to_keep, block_size;
55         std::string current_block;
56         uint64_t block_num = 0;
57         size_t num_files_in_block = 0;
58
59         std::mt19937 reservoir_rand{ 1234 };  // Fixed seed for reproducibility.
60         bool keep_current_block = true;
61         int64_t slot_for_current_block = -1;
62
63         std::vector<std::string> sampled_blocks;
64         std::vector<size_t> lengths;
65 };
66
67 class Corpus : public DatabaseReceiver {
68 public:
69         Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times);
70         ~Corpus();
71
72         void add_file(std::string filename, dir_time dt) override;
73         void flush_block() override;
74         void finish() override;
75
76         std::vector<uint64_t> filename_blocks;
77         size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
78         bool seen_trigram(uint32_t trgm)
79         {
80                 return invindex[trgm] != nullptr;
81         }
82         PostingListBuilder &get_pl_builder(uint32_t trgm);
83         size_t num_trigrams() const;
84         std::string get_compressed_dir_times();
85
86 private:
87         void compress_dir_times(size_t allowed_slop);
88
89         std::unique_ptr<PostingListBuilder *[]> invindex;
90         FILE *outfp;
91         std::string current_block;
92         std::string tempbuf;
93         const size_t block_size;
94         const bool store_dir_times;
95         ZSTD_CDict *cdict;
96
97         ZSTD_CStream *dir_time_ctx = nullptr;
98         std::string dir_times;  // Buffer of still-uncompressed data.
99         std::string dir_times_compressed;
100 };
101
102 class DatabaseBuilder {
103 public:
104         DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary);
105         Corpus *start_corpus(bool store_dir_times);
106         void set_next_dictionary(std::string next_dictionary);
107         void set_conf_block(std::string conf_block);
108         void finish_corpus();
109
110 private:
111         FILE *outfp;
112         std::string outfile;
113         Header hdr;
114         const int block_size;
115         std::chrono::steady_clock::time_point corpus_start;
116         Corpus *corpus = nullptr;
117         ZSTD_CDict *cdict = nullptr;
118         std::string next_dictionary, conf_block;
119 };
120
121 #endif  // !defined(_DATABASE_BUILDER_H)