1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
15 class PostingListBuilder;
17 // {0,0} means unknown or so current that it should never match.
18 // {-1,0} means it's not a directory.
23 bool operator<(const dir_time &other) const
26 return sec < other.sec;
27 return nsec < other.nsec;
29 bool operator>=(const dir_time &other) const
31 return !(other < *this);
// Sentinel {0, 0}: the directory time is unknown, or so current that it
// should never match.
34 constexpr dir_time unknown_dir_time{ 0, 0 };
// Sentinel {-1, 0}: the entry is not a directory.
35 constexpr dir_time not_a_dir{ -1, 0 };
// Abstract interface for objects that are fed file names one at a time.
// add_file() and flush_block() are pure virtual; finish() has a default
// implementation that derived classes may override.
37 class DatabaseReceiver {
// Virtual destructor so derived receivers can be destroyed through a
// DatabaseReceiver pointer.
39 virtual ~DatabaseReceiver() = default;
// Receives one file name (taken by value) together with its dir_time.
40 virtual void add_file(std::string filename, dir_time dt) = 0;
// NOTE(review): presumably ends the block currently being accumulated;
// the exact contract is defined by the implementations — confirm there.
41 virtual void flush_block() = 0;
// Default end-of-input handling: simply calls flush_block().
42 virtual void finish() { flush_block(); }
// DatabaseReceiver implementation that overrides add_file() and
// flush_block() and additionally offers train().
// NOTE(review): judging by the member names (reservoir_rand, sampled_blocks,
// slot_for_current_block) this presumably keeps a random sample of blocks
// for dictionary training — confirm against the implementation file.
45 class DictionaryBuilder : public DatabaseReceiver {
// Stores both arguments in the const members of the same names.
47 DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
48 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
49 void add_file(std::string filename, dir_time dt) override;
50 void flush_block() override;
// NOTE(review): presumably returns the trained dictionary with buf_size as
// its size limit — defined elsewhere, verify.
51 std::string train(size_t buf_size);
// Fixed at construction time; never modified afterwards (const).
54 const size_t blocks_to_keep, block_size;
55 std::string current_block;
// Counters start at zero.
56 uint64_t block_num = 0;
57 size_t num_files_in_block = 0;
59 std::mt19937 reservoir_rand{ 1234 }; // Fixed seed for reproducibility.
// Initial state: the current block is kept and has no slot assigned (-1).
60 bool keep_current_block = true;
61 int64_t slot_for_current_block = -1;
// NOTE(review): presumably `lengths` holds sizes related to
// `sampled_blocks` — confirm how the two vectors are paired.
63 std::vector<std::string> sampled_blocks;
64 std::vector<size_t> lengths;
// DatabaseReceiver implementation that overrides add_file(), flush_block()
// and finish(), and exposes per-trigram PostingListBuilder access plus
// compressed directory-time retrieval.
67 class Corpus : public DatabaseReceiver {
// NOTE(review): outfp and cdict are raw pointers; ownership is not visible
// from this declaration — confirm who closes/frees them.
69 Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times);
72 void add_file(std::string filename, dir_time dt) override;
73 void flush_block() override;
74 void finish() override;
// NOTE(review): presumably one entry per written filename block — verify.
76 std::vector<uint64_t> filename_blocks;
// All counters start at zero.
77 size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
// Returns true when invindex holds a non-null entry at index `trgm`.
// No bounds check is performed here; assumes invindex is allocated and
// covers every value of trgm that callers pass — TODO confirm.
78 bool seen_trigram(uint32_t trgm)
80 return invindex[trgm] != nullptr;
// Returns a reference to the PostingListBuilder associated with `trgm`.
// NOTE(review): presumably creates it on first use — defined elsewhere.
82 PostingListBuilder &get_pl_builder(uint32_t trgm);
83 size_t num_trigrams() const;
84 std::string get_compressed_dir_times();
87 void compress_dir_times(size_t allowed_slop);
// Array of raw PostingListBuilder pointers indexed by trigram value; the
// unique_ptr owns only the array, not the pointed-to builders.
89 std::unique_ptr<PostingListBuilder *[]> invindex;
91 std::string current_block;
// Fixed at construction time (const).
93 const size_t block_size;
94 const bool store_dir_times;
// Null until set elsewhere; NOTE(review): creation and release of this
// ZSTD stream are not visible here — confirm it is freed.
97 ZSTD_CStream *dir_time_ctx = nullptr;
98 std::string dir_times; // Buffer of still-uncompressed data.
99 std::string dir_times_compressed;
// Top-level builder declaration: constructed with an output file name and
// settings, it hands out a Corpus via start_corpus() and is completed with
// finish_corpus().
102 class DatabaseBuilder {
// NOTE(review): block_size is an int here but a size_t in Corpus and
// DictionaryBuilder; a negative value would convert to a huge size_t if it
// is forwarded unchanged — confirm callers never pass one.
104 DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary, bool check_visibility);
// Returns a raw Corpus pointer. NOTE(review): presumably the same object
// as the `corpus` member and owned by this builder — confirm before
// deleting it from the caller side.
105 Corpus *start_corpus(bool store_dir_times);
106 void set_next_dictionary(std::string next_dictionary);
107 void set_conf_block(std::string conf_block);
108 void finish_corpus();
// Fixed at construction time (const).
114 const int block_size;
// NOTE(review): presumably recorded when start_corpus() is called, for
// timing the build — verify.
115 std::chrono::steady_clock::time_point corpus_start;
// Both pointers are null until assigned elsewhere.
116 Corpus *corpus = nullptr;
117 ZSTD_CDict *cdict = nullptr;
118 std::string next_dictionary, conf_block;
121 #endif // !defined(_DATABASE_BUILDER_H)