]> git.sesse.net Git - plocate/blob - database-builder.h
Release plocate 1.1.7.
[plocate] / database-builder.h
1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
3
4 #include "db.h"
5
6 #include <chrono>
7 #include <fcntl.h>
8 #include <memory>
9 #include <random>
10 #include <stddef.h>
11 #include <string>
12 #include <utility>
13 #include <vector>
14 #include <zstd.h>
15
16 class PostingListBuilder;
17
// A timestamp split into whole seconds and nanoseconds.
//
// Two values are reserved as sentinels (see the constants after the struct):
//   {0,0}  means unknown or so current that it should never match.
//   {-1,0} means it's not a directory.
struct dir_time {
        int64_t sec;
        int32_t nsec;

        // Strict ordering: seconds are compared first, nanoseconds break ties.
        bool operator<(const dir_time &other) const
        {
                if (sec != other.sec)
                        return sec < other.sec;
                return nsec < other.nsec;
        }

        // True iff *this is not ordered before other, i.e. *this is equal to
        // or later than other.
        //
        // NOTE(review): this used to be written as !(other < *this), which
        // is the definition of "<=", not ">=". Callers that were written
        // against the old (inverted) result should be re-checked.
        bool operator>=(const dir_time &other) const
        {
                return !(*this < other);
        }
};
constexpr dir_time unknown_dir_time{ 0, 0 };
constexpr dir_time not_a_dir{ -1, 0 };
37
38 class DatabaseReceiver {
39 public:
40         virtual ~DatabaseReceiver() = default;
41         virtual void add_file(std::string filename, dir_time dt) = 0;
42         virtual void flush_block() = 0;
43         virtual void finish() { flush_block(); }
44
45         // EncodingCorpus only.
46         virtual size_t num_files_seen() const { return -1; }
47 };
48
49 class DictionaryBuilder : public DatabaseReceiver {
50 public:
51         DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
52                 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
53         void add_file(std::string filename, dir_time dt) override;
54         void flush_block() override;
55         std::string train(size_t buf_size);
56
57 private:
58         const size_t blocks_to_keep, block_size;
59         std::string current_block;
60         uint64_t block_num = 0;
61         size_t num_files_in_block = 0;
62
63         std::mt19937 reservoir_rand{ 1234 };  // Fixed seed for reproducibility.
64         bool keep_current_block = true;
65         int64_t slot_for_current_block = -1;
66
67         std::vector<std::string> sampled_blocks;
68         std::vector<size_t> lengths;
69 };
70
71 class EncodingCorpus;
72
73 class DatabaseBuilder {
74 public:
75         DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary, bool check_visibility);
76         DatabaseReceiver *start_corpus(bool store_dir_times);
77         void set_next_dictionary(std::string next_dictionary);
78         void set_conf_block(std::string conf_block);
79         void finish_corpus();
80
81 private:
82         FILE *outfp;
83         std::string outfile;
84         std::string temp_filename;
85         Header hdr;
86         const int block_size;
87         std::chrono::steady_clock::time_point corpus_start;
88         EncodingCorpus *corpus = nullptr;
89         ZSTD_CDict *cdict = nullptr;
90         std::string next_dictionary, conf_block;
91 };
92
93 #endif  // !defined(_DATABASE_BUILDER_H)