]> git.sesse.net Git - plocate/blob - database-builder.h
Compile on systems without O_TMPFILE.
[plocate] / database-builder.h
1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
3
4 #include "db.h"
5
6 #include <chrono>
7 #include <fcntl.h>
8 #include <memory>
9 #include <random>
10 #include <stddef.h>
11 #include <string>
12 #include <utility>
13 #include <vector>
14 #include <zstd.h>
15
16 class PostingListBuilder;
17
// A (seconds, nanoseconds) timestamp that is passed along with each filename
// (see DatabaseReceiver::add_file()).
//
// Two values are reserved as sentinels:
//   {0,0} means unknown or so current that it should never match.
//   {-1,0} means it's not a directory.
struct dir_time {
        int64_t sec;
        int32_t nsec;

        // Strict ordering: compares sec first, and nsec only if sec is equal.
        bool operator<(const dir_time &other) const
        {
                if (sec != other.sec)
                        return sec < other.sec;
                return nsec < other.nsec;
        }

        // True if *this is equal to or later than other, i.e. exactly when
        // *this is not strictly less than other. (Negating the comparison
        // with the operands swapped would yield <= instead.)
        bool operator>=(const dir_time &other) const
        {
                return !(*this < other);
        }
};

// Sentinel: unknown, or so current that it should never match.
constexpr dir_time unknown_dir_time{ 0, 0 };

// Sentinel: the entry is not a directory.
constexpr dir_time not_a_dir{ -1, 0 };
37
38 class DatabaseReceiver {
39 public:
40         virtual ~DatabaseReceiver() = default;
41         virtual void add_file(std::string filename, dir_time dt) = 0;
42         virtual void flush_block() = 0;
43         virtual void finish() { flush_block(); }
44 };
45
46 class DictionaryBuilder : public DatabaseReceiver {
47 public:
48         DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
49                 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
50         void add_file(std::string filename, dir_time dt) override;
51         void flush_block() override;
52         std::string train(size_t buf_size);
53
54 private:
55         const size_t blocks_to_keep, block_size;
56         std::string current_block;
57         uint64_t block_num = 0;
58         size_t num_files_in_block = 0;
59
60         std::mt19937 reservoir_rand{ 1234 };  // Fixed seed for reproducibility.
61         bool keep_current_block = true;
62         int64_t slot_for_current_block = -1;
63
64         std::vector<std::string> sampled_blocks;
65         std::vector<size_t> lengths;
66 };
67
68 class Corpus : public DatabaseReceiver {
69 public:
70         Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times);
71         ~Corpus();
72
73         void add_file(std::string filename, dir_time dt) override;
74         void flush_block() override;
75         void finish() override;
76
77         std::vector<uint64_t> filename_blocks;
78         size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
79         bool seen_trigram(uint32_t trgm)
80         {
81                 return invindex[trgm] != nullptr;
82         }
83         PostingListBuilder &get_pl_builder(uint32_t trgm);
84         size_t num_trigrams() const;
85         std::string get_compressed_dir_times();
86
87 private:
88         void compress_dir_times(size_t allowed_slop);
89
90         std::unique_ptr<PostingListBuilder *[]> invindex;
91         FILE *outfp;
92         std::string current_block;
93         std::string tempbuf;
94         const size_t block_size;
95         const bool store_dir_times;
96         ZSTD_CDict *cdict;
97
98         ZSTD_CStream *dir_time_ctx = nullptr;
99         std::string dir_times;  // Buffer of still-uncompressed data.
100         std::string dir_times_compressed;
101 };
102
103 class DatabaseBuilder {
104 public:
105         DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary, bool check_visibility);
106         Corpus *start_corpus(bool store_dir_times);
107         void set_next_dictionary(std::string next_dictionary);
108         void set_conf_block(std::string conf_block);
109         void finish_corpus();
110
111 private:
112         FILE *outfp;
113         std::string outfile;
114 #ifndef O_TMPFILE
115         std::string temp_filename;
116 #endif
117         Header hdr;
118         const int block_size;
119         std::chrono::steady_clock::time_point corpus_start;
120         Corpus *corpus = nullptr;
121         ZSTD_CDict *cdict = nullptr;
122         std::string next_dictionary, conf_block;
123 };
124
125 #endif  // !defined(_DATABASE_BUILDER_H)