1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
14 class PostingListBuilder;
// Abstract interface for objects that are fed filenames one at a time.
//
// Implementations must provide add_file() and flush_block(); finish() has a
// default implementation that simply performs one flush_block() call.
class DatabaseReceiver {
public:
    // Virtual so that implementations can be destroyed through a
    // DatabaseReceiver pointer or reference.
    virtual ~DatabaseReceiver() = default;

    // Hands one filename to the receiver. Taken by value, so the
    // implementation may move from it.
    virtual void add_file(std::string filename) = 0;

    // Flushes whatever the implementation treats as its current block.
    virtual void flush_block() = 0;

    // Called when no more input will arrive. Defaults to a single
    // flush_block(); implementations may override it.
    virtual void finish() { flush_block(); }
};
24 class DictionaryBuilder : public DatabaseReceiver {
26 DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
27 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
28 void add_file(std::string filename) override;
29 void flush_block() override;
30 std::string train(size_t buf_size);
33 const size_t blocks_to_keep, block_size;
34 std::string current_block;
35 uint64_t block_num = 0;
36 size_t num_files_in_block = 0;
38 std::mt19937 reservoir_rand{ 1234 }; // Fixed seed for reproducibility.
39 bool keep_current_block = true;
40 int64_t slot_for_current_block = -1;
42 std::vector<std::string> sampled_blocks;
43 std::vector<size_t> lengths;
46 class Corpus : public DatabaseReceiver {
48 Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict);
51 void add_file(std::string filename) override;
52 void flush_block() override;
53 void finish() override;
55 std::vector<uint64_t> filename_blocks;
56 size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
57 bool seen_trigram(uint32_t trgm)
59 return invindex[trgm] != nullptr;
61 PostingListBuilder &get_pl_builder(uint32_t trgm);
62 size_t num_trigrams() const;
65 std::unique_ptr<PostingListBuilder *[]> invindex;
67 std::string current_block;
69 const size_t block_size;
73 class DatabaseBuilder {
75 DatabaseBuilder(const char *outfile, int block_size, std::string dictionary);
76 Corpus *start_corpus();
84 std::chrono::steady_clock::time_point corpus_start;
85 Corpus *corpus = nullptr;
86 ZSTD_CDict *cdict = nullptr;
89 #endif // !defined(_DATABASE_BUILDER_H)