]> git.sesse.net Git - plocate/blob - database-builder.h
Remove dependency on non-POSIX header error.h.
[plocate] / database-builder.h
1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
3
4 #include "db.h"
5
6 #include <chrono>
7 #include <fcntl.h>
8 #include <memory>
9 #include <random>
10 #include <stddef.h>
11 #include <string>
12 #include <unistd.h>
13 #include <utility>
14 #include <vector>
15 #include <zstd.h>
16
17 class PostingListBuilder;
18
// A timestamp split into whole seconds and nanoseconds, ordered
// chronologically (seconds first, then nanoseconds).
//
// Two values are reserved as sentinels:
// {0,0} means unknown or so current that it should never match.
// {-1,0} means it's not a directory.
struct dir_time {
        int64_t sec;
        int32_t nsec;

        // Strict ordering: compare seconds, and only fall back to
        // nanoseconds when the seconds are equal.
        bool operator<(const dir_time &other) const
        {
                if (sec != other.sec)
                        return sec < other.sec;
                return nsec < other.nsec;
        }

        // True iff *this is not ordered before other, i.e. the exact
        // negation of operator<. Note that negating the swapped comparison,
        // !(other < *this), would compute "<=" rather than ">=".
        bool operator>=(const dir_time &other) const
        {
                return !(*this < other);
        }
};
constexpr dir_time unknown_dir_time{ 0, 0 };
constexpr dir_time not_a_dir{ -1, 0 };
38
39 class DatabaseReceiver {
40 public:
41         virtual ~DatabaseReceiver() = default;
42         virtual void add_file(std::string filename, dir_time dt) = 0;
43         virtual void flush_block() = 0;
44         virtual void finish() { flush_block(); }
45
46         // EncodingCorpus only.
47         virtual size_t num_files_seen() const { return -1; }
48 };
49
50 class DictionaryBuilder : public DatabaseReceiver {
51 public:
52         DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
53                 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
54         void add_file(std::string filename, dir_time dt) override;
55         void flush_block() override;
56         std::string train(size_t buf_size);
57
58 private:
59         const size_t blocks_to_keep, block_size;
60         std::string current_block;
61         uint64_t block_num = 0;
62         size_t num_files_in_block = 0;
63
64         std::mt19937 reservoir_rand{ 1234 };  // Fixed seed for reproducibility.
65         bool keep_current_block = true;
66         int64_t slot_for_current_block = -1;
67
68         std::vector<std::string> sampled_blocks;
69         std::vector<size_t> lengths;
70 };
71
72 class EncodingCorpus;
73
74 class DatabaseBuilder {
75 public:
76         DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary, bool check_visibility);
77         DatabaseReceiver *start_corpus(bool store_dir_times);
78         void set_next_dictionary(std::string next_dictionary);
79         void set_conf_block(std::string conf_block);
80         void finish_corpus();
81
82 private:
83         FILE *outfp;
84         std::string outfile;
85         std::string temp_filename;
86         Header hdr;
87         const int block_size;
88         std::chrono::steady_clock::time_point corpus_start;
89         EncodingCorpus *corpus = nullptr;
90         ZSTD_CDict *cdict = nullptr;
91         std::string next_dictionary, conf_block;
92 };
93
94 #endif  // !defined(_DATABASE_BUILDER_H)