1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
// Forward declaration only; the definition of PostingListBuilder is not part
// of the visible portion of this header.
17 class PostingListBuilder;
19 // {0,0} means the time is unknown, or so current that it should never match.
20 // {-1,0} means the entry is not a directory.
// Less-than comparison between two dir_time values, based on the sec and
// nsec members.
// NOTE(review): the statement guarding the first return is not visible in
// this excerpt; presumably nsec is only compared when the sec fields are
// equal -- confirm against the full header.
25 bool operator<(const dir_time &other) const
28 return sec < other.sec;
29 return nsec < other.nsec;
// Greater-than-or-equal comparison, defined as the negation of operator<:
// a >= b holds exactly when !(a < b). The operand order matters; negating
// (other < *this) instead would compute a <= b.
31 bool operator>=(const dir_time &other) const
33 return !(*this < other);
// Sentinel {0,0}: the time is unknown, or so current that it should never
// match.
36 constexpr dir_time unknown_dir_time{ 0, 0 };
// Sentinel {-1,0}: the entry is not a directory.
37 constexpr dir_time not_a_dir{ -1, 0 };
// Abstract receiver interface: implementations must provide add_file() and
// flush_block(); finish() and num_files_seen() come with default
// implementations that may be overridden.
39 class DatabaseReceiver {
// Virtual destructor, so implementations can be destroyed through a
// DatabaseReceiver pointer.
41 virtual ~DatabaseReceiver() = default;
// Hands one filename, together with a dir_time value, to the receiver.
// NOTE(review): presumably dt is unknown_dir_time / not_a_dir where no real
// directory time applies -- confirm with callers.
42 virtual void add_file(std::string filename, dir_time dt) = 0;
// NOTE(review): presumably ends the block currently being accumulated; the
// exact semantics are defined by each implementation.
43 virtual void flush_block() = 0;
// By default, finish() just calls flush_block().
44 virtual void finish() { flush_block(); }
46 // EncodingCorpus only.
// The default returns -1 converted to size_t, i.e. the largest size_t value.
// NOTE(review): presumably a "not supported" marker for other receivers --
// confirm with callers.
47 virtual size_t num_files_seen() const { return -1; }
// DatabaseReceiver implementation that additionally exposes train().
// NOTE(review): judging only by the member names (reservoir_rand,
// sampled_blocks), this presumably keeps a random sample of blocks and
// trains a compression dictionary from them; the implementation is not in
// this header -- confirm in the corresponding source file.
50 class DictionaryBuilder : public DatabaseReceiver {
// Stores both arguments in the const members of the same name.
52 DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
53 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
54 void add_file(std::string filename, dir_time dt) override;
55 void flush_block() override;
// Returns a std::string.
// NOTE(review): presumably the trained dictionary, with buf_size as its
// size limit -- TODO confirm.
56 std::string train(size_t buf_size);
// Set once by the constructor and const afterwards.
59 const size_t blocks_to_keep, block_size;
// NOTE(review): the next three members presumably track the block currently
// being filled -- confirm in the implementation.
60 std::string current_block;
61 uint64_t block_num = 0;
62 size_t num_files_in_block = 0;
64 std::mt19937 reservoir_rand{ 1234 }; // Fixed seed for reproducibility.
65 bool keep_current_block = true;
// Starts at -1.
// NOTE(review): presumably -1 means "no slot chosen" -- confirm.
66 int64_t slot_for_current_block = -1;
68 std::vector<std::string> sampled_blocks;
69 std::vector<size_t> lengths;
// Declares the database builder; the member functions are only declared in
// the visible portion of this header, not defined.
74 class DatabaseBuilder {
// NOTE(review): parameter meanings are inferred from their names only
// (output file path, owning group, block size, dictionary contents, and a
// visibility-check flag) -- confirm against the implementation.
76 DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary, bool check_visibility);
// Returns a DatabaseReceiver pointer.
// NOTE(review): ownership of the returned pointer is not visible here;
// presumably it refers to the `corpus` member -- confirm before deleting it.
77 DatabaseReceiver *start_corpus(bool store_dir_times);
78 void set_next_dictionary(std::string next_dictionary);
79 void set_conf_block(std::string conf_block);
85 std::string temp_filename;
88 std::chrono::steady_clock::time_point corpus_start;
// Raw pointers, initialised to nullptr.
// NOTE(review): who allocates and releases them is not visible in this
// excerpt.
89 EncodingCorpus *corpus = nullptr;
90 ZSTD_CDict *cdict = nullptr;
91 std::string next_dictionary, conf_block;
94 #endif // !defined(_DATABASE_BUILDER_H)