git.sesse.net Git - plocate/blob - database-builder.h
Make DatabaseBuilder write the file atomically.
[plocate] / database-builder.h
1 #ifndef _DATABASE_BUILDER_H
2 #define _DATABASE_BUILDER_H 1
3
4 #include "db.h"
5
6 #include <chrono>
7 #include <memory>
8 #include <random>
9 #include <stddef.h>
10 #include <string>
11 #include <vector>
12 #include <zstd.h>
13
14 class PostingListBuilder;
15
// Abstract interface for anything that consumes a stream of filenames.
// Implementations must provide add_file() and flush_block(); finish() has
// a default that can be overridden.
class DatabaseReceiver {
public:
	virtual ~DatabaseReceiver() = default;

	// Hands one filename to the receiver.
	virtual void add_file(std::string filename) = 0;

	// Asks the receiver to flush whatever block it is currently holding.
	virtual void flush_block() = 0;

	// Called when no more input will arrive. Unless overridden, this does
	// nothing more than flush the current block.
	virtual void finish()
	{
		flush_block();
	}
};
23
24 class DictionaryBuilder : public DatabaseReceiver {
25 public:
26         DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
27                 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
28         void add_file(std::string filename) override;
29         void flush_block() override;
30         std::string train(size_t buf_size);
31
32 private:
33         const size_t blocks_to_keep, block_size;
34         std::string current_block;
35         uint64_t block_num = 0;
36         size_t num_files_in_block = 0;
37
38         std::mt19937 reservoir_rand{ 1234 };  // Fixed seed for reproducibility.
39         bool keep_current_block = true;
40         int64_t slot_for_current_block = -1;
41
42         std::vector<std::string> sampled_blocks;
43         std::vector<size_t> lengths;
44 };
45
46 class Corpus : public DatabaseReceiver {
47 public:
48         Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict);
49         ~Corpus();
50
51         void add_file(std::string filename) override;
52         void flush_block() override;
53         void finish() override;
54
55         std::vector<uint64_t> filename_blocks;
56         size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
57         bool seen_trigram(uint32_t trgm)
58         {
59                 return invindex[trgm] != nullptr;
60         }
61         PostingListBuilder &get_pl_builder(uint32_t trgm);
62         size_t num_trigrams() const;
63
64 private:
65         std::unique_ptr<PostingListBuilder *[]> invindex;
66         FILE *outfp;
67         std::string current_block;
68         std::string tempbuf;
69         const size_t block_size;
70         ZSTD_CDict *cdict;
71 };
72
73 class DatabaseBuilder {
74 public:
75         DatabaseBuilder(const char *outfile, int block_size, std::string dictionary);
76         Corpus *start_corpus();
77         void finish_corpus();
78
79 private:
80         FILE *outfp;
81         std::string outfile;
82         Header hdr;
83         const int block_size;
84         std::chrono::steady_clock::time_point corpus_start;
85         Corpus *corpus = nullptr;
86         ZSTD_CDict *cdict = nullptr;
87 };
88
89 #endif  // !defined(_DATABASE_BUILDER_H)