1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
// Forward declaration: this header only refers to PostingListBuilder through
// pointers and references, so the complete type is not required here.
16 class PostingListBuilder;
18 // {0,0} means unknown or so current that it should never match.
19 // {-1,0} means it's not a directory.
24 bool operator<(const dir_time &other) const
27 return sec < other.sec;
28 return nsec < other.nsec;
30 bool operator>=(const dir_time &other) const
32 return !(other < *this);
35 constexpr dir_time unknown_dir_time{ 0, 0 };
36 constexpr dir_time not_a_dir{ -1, 0 };
// Abstract interface for anything that consumes a stream of
// (filename, dir_time) entries. Implementations must provide add_file()
// and flush_block(); finish() has a default.
38 class DatabaseReceiver {
// Virtual, so implementations can be destroyed through a base pointer.
40 virtual ~DatabaseReceiver() = default;
// Hands one entry to the receiver. The filename is taken by value.
41 virtual void add_file(std::string filename, dir_time dt) = 0;
// Semantics are defined by each implementation.
// NOTE(review): presumably closes the block being accumulated -- confirm.
42 virtual void flush_block() = 0;
// Default end-of-input handling: a single flush_block() call.
// Implementations may override it.
43 virtual void finish() { flush_block(); }
// DatabaseReceiver that keeps a sample of the blocks it is fed and offers
// train() on top of them.
// NOTE(review): the member names (reservoir_rand, sampled_blocks,
// blocks_to_keep) suggest reservoir sampling of blocks used as training
// input for a compression dictionary -- confirm against the implementation.
46 class DictionaryBuilder : public DatabaseReceiver {
// Stores the two limits; every other member uses its in-class initializer.
48 DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
49 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
// DatabaseReceiver interface, defined out of line.
50 void add_file(std::string filename, dir_time dt) override;
51 void flush_block() override;
// Defined out of line.
// NOTE(review): presumably returns a trained dictionary limited to
// buf_size bytes -- confirm.
52 std::string train(size_t buf_size);
// Fixed at construction time.
55 const size_t blocks_to_keep, block_size;
// State of the block currently being filled.
56 std::string current_block;
57 uint64_t block_num = 0;
58 size_t num_files_in_block = 0;
60 std::mt19937 reservoir_rand{ 1234 }; // Fixed seed for reproducibility.
// Per-block sampling decision; -1 is the initial slot value.
// NOTE(review): presumably -1 means "no slot assigned" -- confirm.
61 bool keep_current_block = true;
62 int64_t slot_for_current_block = -1;
// Retained blocks.
// NOTE(review): lengths presumably runs parallel to sampled_blocks -- confirm.
64 std::vector<std::string> sampled_blocks;
65 std::vector<size_t> lengths;
// DatabaseReceiver implementation constructed from an output FILE*, a block
// size, a zstd compression dictionary and a flag saying whether directory
// times are stored. It also maintains a per-trigram table of
// PostingListBuilder pointers (invindex).
68 class Corpus : public DatabaseReceiver {
70 Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times);
// DatabaseReceiver interface, all defined out of line. finish() replaces
// the base-class default of a single flush_block().
73 void add_file(std::string filename, dir_time dt) override;
74 void flush_block() override;
75 void finish() override;
// NOTE(review): presumably one entry per emitted filename block -- confirm
// what the uint64_t values represent.
77 std::vector<uint64_t> filename_blocks;
// Counters, all starting at zero.
78 size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
// True when invindex already holds a non-null builder pointer for trgm.
// Indexes invindex directly; no range check is done here.
// NOTE(review): does not modify the object, so it could be const-qualified.
79 bool seen_trigram(uint32_t trgm)
81 return invindex[trgm] != nullptr;
// Defined out of line.
// NOTE(review): presumably creates the builder for trgm on first use --
// confirm.
83 PostingListBuilder &get_pl_builder(uint32_t trgm);
84 size_t num_trigrams() const;
85 std::string get_compressed_dir_times();
// Defined out of line.
// NOTE(review): allowed_slop presumably bounds how much uncompressed data
// may remain buffered -- confirm.
88 void compress_dir_times(size_t allowed_slop);
// Array of raw PostingListBuilder pointers indexed by trigram. The
// unique_ptr owns the array itself; who deletes the pointed-to builders is
// not visible in this header.
90 std::unique_ptr<PostingListBuilder *[]> invindex;
92 std::string current_block;
// Fixed at construction time.
94 const size_t block_size;
95 const bool store_dir_times;
// zstd streaming context used for directory times; starts out null.
98 ZSTD_CStream *dir_time_ctx = nullptr;
99 std::string dir_times; // Buffer of still-uncompressed data.
100 std::string dir_times_compressed;
// Top-level builder: constructed from an output file name and settings,
// hands out a Corpus via start_corpus() and is completed with
// finish_corpus(). All member functions are defined out of line.
103 class DatabaseBuilder {
// NOTE(review): block_size is an int here, while Corpus and
// DictionaryBuilder take a size_t -- confirm the conversion is intended.
105 DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary, bool check_visibility);
// Returns a raw Corpus pointer.
// NOTE(review): ownership of the returned object is not visible in this
// header -- confirm who deletes it.
106 Corpus *start_corpus(bool store_dir_times);
// Setters taking their string argument by value.
// NOTE(review): presumably they store it into the members of the same
// name -- confirm.
107 void set_next_dictionary(std::string next_dictionary);
108 void set_conf_block(std::string conf_block);
109 void finish_corpus();
115 std::string temp_filename;
// Fixed at construction time.
118 const int block_size;
// Monotonic-clock timestamp.
// NOTE(review): presumably taken in start_corpus() for timing -- confirm.
119 std::chrono::steady_clock::time_point corpus_start;
// Both start out null.
120 Corpus *corpus = nullptr;
121 ZSTD_CDict *cdict = nullptr;
122 std::string next_dictionary, conf_block;
125 #endif // !defined(_DATABASE_BUILDER_H)