]> git.sesse.net Git - plocate/blobdiff - database-builder.h
Split DatabaseBuilder into its own compilation unit.
[plocate] / database-builder.h
diff --git a/database-builder.h b/database-builder.h
new file mode 100644 (file)
index 0000000..77a4555
--- /dev/null
@@ -0,0 +1,89 @@
+#ifndef _DATABASE_BUILDER_H
+#define _DATABASE_BUILDER_H 1
+
+#include "db.h"
+
+#include <chrono>
+#include <memory>
+#include <random>
+#include <stddef.h>
+#include <string>
+#include <vector>
+#include <zstd.h>
+
+class PostingListBuilder;
+
+class DatabaseReceiver {
+public:
+       virtual ~DatabaseReceiver() = default;
+       virtual void add_file(std::string filename) = 0;
+       virtual void flush_block() = 0;
+       virtual void finish() { flush_block(); }
+};
+
+class DictionaryBuilder : public DatabaseReceiver {
+public:
+       DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
+               : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
+       void add_file(std::string filename) override;
+       void flush_block() override;
+       std::string train(size_t buf_size);
+
+private:
+       const size_t blocks_to_keep, block_size;
+       std::string current_block;
+       uint64_t block_num = 0;
+       size_t num_files_in_block = 0;
+
+       std::mt19937 reservoir_rand{ 1234 };  // Fixed seed for reproducibility.
+       bool keep_current_block = true;
+       int64_t slot_for_current_block = -1;
+
+       std::vector<std::string> sampled_blocks;
+       std::vector<size_t> lengths;
+};
+
+class Corpus : public DatabaseReceiver {
+public:
+       Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict);
+       ~Corpus();
+
+       void add_file(std::string filename) override;
+       void flush_block() override;
+       void finish() override;
+
+       std::vector<uint64_t> filename_blocks;
+       size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
+       bool seen_trigram(uint32_t trgm)
+       {
+               return invindex[trgm] != nullptr;
+       }
+       PostingListBuilder &get_pl_builder(uint32_t trgm);
+       size_t num_trigrams() const;
+
+private:
+       std::unique_ptr<PostingListBuilder *[]> invindex;
+       FILE *outfp;
+       std::string current_block;
+       std::string tempbuf;
+       const size_t block_size;
+       ZSTD_CDict *cdict;
+       std::string directory_data;
+};
+
+class DatabaseBuilder {
+public:
+       DatabaseBuilder(const char *outfile, int block_size, std::string dictionary);
+       Corpus *start_corpus();
+       void finish_corpus();
+
+private:
+       FILE *outfp;
+       Header hdr;
+       const int block_size;
+       std::chrono::steady_clock::time_point corpus_start;
+       Corpus *corpus = nullptr;
+       ZSTD_CDict *cdict = nullptr;
+};
+
+#endif  // !defined(_DATABASE_BUILDER_H)