1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
// Forward declaration only; the definition of PostingListBuilder is not part
// of the portion of this header shown here.
16 class PostingListBuilder;
18 // {0,0} means unknown or so current that it should never match.
19 // {-1,0} means it's not a directory.
// Less-than comparison against another dir_time: compares the `sec` members,
// then the `nsec` members.
// NOTE(review): as shown here the first return is unconditional, which would
// make the `nsec` comparison unreachable. A guard along the lines of
// `if (sec != other.sec)` presumably sits on a line missing from this view;
// confirm against the full file.
24 bool operator<(const dir_time &other) const
27 return sec < other.sec;
28 return nsec < other.nsec;
// Greater-than-or-equal, expressed through operator<: `a >= b` holds exactly
// when `a` is not less than `b`.
// The operand order matters: the swapped form `!(other < *this)` evaluates
// `*this <= other`, which gives the wrong answer whenever the two values
// differ.
30 bool operator>=(const dir_time &other) const
32 return !(*this < other);
// Sentinel {0,0}: the time is unknown, or so current that it should never match.
35 constexpr dir_time unknown_dir_time{ 0, 0 };
// Sentinel {-1,0}: the entry is not a directory.
36 constexpr dir_time not_a_dir{ -1, 0 };
// Abstract interface for objects that are handed files one at a time.
// add_file() and flush_block() are pure virtual and must be provided by
// subclasses; finish() and num_files_seen() come with default implementations.
38 class DatabaseReceiver {
// Virtual destructor so implementations can be destroyed through this interface.
40 virtual ~DatabaseReceiver() = default;
// Receives one file name together with a dir_time value.
// NOTE(review): dt is presumably the directory's timestamp, with not_a_dir
// passed for non-directories; confirm against the callers.
41 virtual void add_file(std::string filename, dir_time dt) = 0;
// Block-boundary hook; what "flushing" involves is up to each implementation.
42 virtual void flush_block() = 0;
// Default end-of-input handling: simply calls flush_block().
43 virtual void finish() { flush_block(); }
45 // EncodingCorpus only.
// The default returns -1 converted to size_t, i.e. the largest size_t value,
// so implementations that do not override this never report a real count.
46 virtual size_t num_files_seen() const { return -1; }
// DatabaseReceiver implementation that overrides add_file() and flush_block()
// and additionally offers train().
// NOTE(review): the member names (reservoir_rand, sampled_blocks,
// blocks_to_keep) suggest it keeps a random sample of at most blocks_to_keep
// blocks for later dictionary training. The implementation is not in this
// header, so confirm against the .cpp file.
49 class DictionaryBuilder : public DatabaseReceiver {
// Stores both arguments in the const members of the same names; they cannot
// change after construction.
51 DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
52 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
53 void add_file(std::string filename, dir_time dt) override;
54 void flush_block() override;
// Returns a std::string.
// NOTE(review): presumably the trained dictionary, with buf_size bounding its
// size; confirm against the implementation.
55 std::string train(size_t buf_size);
58 const size_t blocks_to_keep, block_size;
// NOTE(review): presumably accumulates the contents of the block being built.
59 std::string current_block;
// Both counters start at zero.
60 uint64_t block_num = 0;
61 size_t num_files_in_block = 0;
63 std::mt19937 reservoir_rand{ 1234 }; // Fixed seed for reproducibility.
// Initial state: the current block is kept, and no slot is chosen yet (-1).
// NOTE(review): -1 presumably means "no slot assigned"; confirm.
64 bool keep_current_block = true;
65 int64_t slot_for_current_block = -1;
// NOTE(review): presumably the retained blocks and their lengths; the
// relationship between the two vectors is not visible in this header.
67 std::vector<std::string> sampled_blocks;
68 std::vector<size_t> lengths;
// Builder configured with an output file name, an owner gid, a block size, a
// dictionary string and a visibility-check flag. Only declarations appear
// here; the behaviour lives in the implementation file.
73 class DatabaseBuilder {
75 DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary, bool check_visibility);
// Returns a DatabaseReceiver to feed files into.
// NOTE(review): ownership of the returned pointer is not visible here. It is
// presumably the object held in the `corpus` member; confirm before deleting
// or storing it.
76 DatabaseReceiver *start_corpus(bool store_dir_times);
// Setters for the values held in the `next_dictionary` and `conf_block`
// members declared below (by matching names).
77 void set_next_dictionary(std::string next_dictionary);
78 void set_conf_block(std::string conf_block);
// NOTE(review): presumably the temporary file written before the final
// output file is put in place; confirm.
84 std::string temp_filename;
// Monotonic-clock timestamp (steady_clock).
// NOTE(review): presumably taken when the corpus is started; confirm.
87 std::chrono::steady_clock::time_point corpus_start;
// Raw pointers, both null until set. Who allocates and frees them is not
// visible in this header.
88 EncodingCorpus *corpus = nullptr;
89 ZSTD_CDict *cdict = nullptr;
90 std::string next_dictionary, conf_block;
93 #endif // !defined(_DATABASE_BUILDER_H)