+// Writes a database to an output file.
+//
+// The constructor opens the file and writes a placeholder header (plus the
+// zstd dictionary, if any). start_corpus() then hands out a Corpus that
+// writes to the same file, and finish_corpus() flushes that corpus and
+// writes the filename block index before finishing the posting lists.
+class DatabaseBuilder {
+public:
+ // Opens `outfile` for writing (exits the process if that fails).
+ // `block_size` is forwarded to the Corpus made by start_corpus();
+ // `dictionary` holds raw zstd dictionary bytes, or is empty for none.
+ DatabaseBuilder(const char *outfile, int block_size, string dictionary);
+
+ // Records the start time and creates the Corpus for this database.
+ // The returned pointer is also kept in `corpus`.
+ // NOTE(review): who deletes the Corpus is not visible here -- confirm.
+ Corpus *start_corpus();
+
+ // Flushes the corpus from start_corpus() and writes the index data.
+ // Must only be called after start_corpus(), since it dereferences `corpus`.
+ void finish_corpus();
+
+private:
+ // Output file; opened in the constructor and shared with the Corpus.
+ FILE *outfp;
+
+ // In-memory copy of the file header. Written once as a placeholder
+ // (version -1, i.e. marked broken) and then updated as offsets get known.
+ Header hdr;
+
+ // Block size handed to the Corpus constructor.
+ const int block_size;
+
+ // Time at which start_corpus() was called.
+ steady_clock::time_point corpus_start;
+
+ // Corpus created by start_corpus(); nullptr until then.
+ Corpus *corpus = nullptr;
+
+ // Compression dictionary built from the constructor's `dictionary`
+ // argument; stays nullptr when no dictionary was given.
+ ZSTD_CDict *cdict = nullptr;
+};
+
+// Opens `outfile` for writing and emits a placeholder header, followed by
+// the zstd dictionary if one was given.
+//
+// outfile:    path of the database file to create (truncated if it exists).
+// block_size: stored as-is; start_corpus() passes it to the Corpus.
+// dictionary: raw zstd dictionary bytes; an empty string means "no dictionary".
+//
+// The header written here is deliberately marked as broken (version -1) and
+// carries unknown (-1) offsets; only the in-memory copy in `hdr` is updated
+// afterwards. NOTE(review): presumably the final header is rewritten at the
+// end of finish_corpus() -- that part is not visible from here.
+//
+// Any failure (opening the file, writing to it, or building the compression
+// dictionary) prints a diagnostic to stderr and exits with status 1.
+DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dictionary)
+	: block_size(block_size)
+{
+	// Files created from here on get no group-write and no "other" permissions.
+	umask(0027);
+	outfp = fopen(outfile, "wb");
+	if (outfp == nullptr) {
+		perror(outfile);
+		exit(1);
+	}
+
+	// Write the header.
+	memcpy(hdr.magic, "\0plocate", 8);  // Leading NUL + "plocate" = 8 bytes.
+	hdr.version = -1;  // Mark as broken.
+	hdr.hashtable_size = 0;  // Not known yet.
+	hdr.extra_ht_slots = num_overflow_slots;
+	hdr.num_docids = 0;
+	hdr.hash_table_offset_bytes = -1;  // We don't know these offsets yet.
+	hdr.max_version = 1;
+	hdr.filename_index_offset_bytes = -1;
+	hdr.zstd_dictionary_length_bytes = -1;
+	if (fwrite(&hdr, sizeof(hdr), 1, outfp) != 1) {
+		perror(outfile);
+		exit(1);
+	}
+
+	if (dictionary.empty()) {
+		hdr.zstd_dictionary_offset_bytes = 0;
+		hdr.zstd_dictionary_length_bytes = 0;
+	} else {
+		// The dictionary is stored verbatim right after the header.
+		hdr.zstd_dictionary_offset_bytes = ftell(outfp);
+		if (fwrite(dictionary.data(), dictionary.size(), 1, outfp) != 1) {
+			perror(outfile);
+			exit(1);
+		}
+		hdr.zstd_dictionary_length_bytes = dictionary.size();
+
+		// A null cdict would make the Corpus compress without the dictionary
+		// that the header advertises, so treat failure as fatal.
+		cdict = ZSTD_createCDict(dictionary.data(), dictionary.size(), /*level=*/6);
+		if (cdict == nullptr) {
+			fprintf(stderr, "%s: could not create zstd compression dictionary\n", outfile);
+			exit(1);
+		}
+	}
+}
+
+// Begins a new corpus: notes the current time in `corpus_start`, then
+// allocates a Corpus that writes to this builder's output file using the
+// configured block size and compression dictionary (nullptr if none).
+// The new object is remembered in `corpus` and returned to the caller.
+Corpus *DatabaseBuilder::start_corpus()
+{
+	corpus_start = steady_clock::now();
+	return corpus = new Corpus(outfp, block_size, cdict);
+}
+
+void DatabaseBuilder::finish_corpus()
+{
+ corpus->flush_block();
+ hdr.num_docids = corpus->filename_blocks.size();
+
+ // Stick an empty block at the end as sentinel.
+ corpus->filename_blocks.push_back(ftell(outfp));
+ const size_t bytes_for_filenames = corpus->filename_blocks.back() - corpus->filename_blocks.front();
+
+ // Write the offsets to the filenames.
+ hdr.filename_index_offset_bytes = ftell(outfp);
+ const size_t bytes_for_filename_index = corpus->filename_blocks.size() * sizeof(uint64_t);
+ fwrite(corpus->filename_blocks.data(), corpus->filename_blocks.size(), sizeof(uint64_t), outfp);
+ corpus->filename_blocks.clear();
+ corpus->filename_blocks.shrink_to_fit();
+
+ // Finish up encoding the posting lists.