X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=database-builder.cpp;h=d883451dd669e1b2bae6a444bb2c02987d02e13b;hb=30305b2e29b28107a311850d40db54c6726f726c;hp=6e792be450202767b55eec08c869cae3b242b4a9;hpb=d0f2469aedf852ba2d6949e59bfc1fff565960c9;p=plocate diff --git a/database-builder.cpp b/database-builder.cpp index 6e792be..d883451 100644 --- a/database-builder.cpp +++ b/database-builder.cpp @@ -56,7 +56,7 @@ private: vector pending_deltas; - uint32_t last_block_end, last_docid = -1; + uint32_t last_docid = -1; }; void PostingListBuilder::add_docid(uint32_t docid) @@ -70,7 +70,7 @@ void PostingListBuilder::add_docid(uint32_t docid) // Very first docid. write_header(docid); ++num_docids; - last_block_end = last_docid = docid; + last_docid = docid; return; } @@ -79,7 +79,6 @@ void PostingListBuilder::add_docid(uint32_t docid) if (pending_deltas.size() == 128) { append_block(); pending_deltas.clear(); - last_block_end = docid; } ++num_docids; } @@ -163,7 +162,7 @@ string DictionaryBuilder::train(size_t buf_size) string buf; buf.resize(buf_size); size_t ret = ZDICT_trainFromBuffer(&buf[0], buf_size, dictionary_buf.data(), lengths.data(), lengths.size()); - if (ret == size_t(-1)) { + if (ZDICT_isError(ret)) { return ""; } dprintf("Sampled %zu bytes in %zu blocks, built a dictionary of size %zu\n", dictionary_buf.size(), lengths.size(), ret); @@ -175,7 +174,51 @@ string DictionaryBuilder::train(size_t buf_size) return buf; } -Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times) +class EncodingCorpus : public DatabaseReceiver { +public: + EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times); + ~EncodingCorpus(); + + void add_file(std::string filename, dir_time dt) override; + void flush_block() override; + void finish() override; + + std::vector filename_blocks; + size_t num_files = 0, num_files_in_block = 0, num_blocks = 0; + bool seen_trigram(uint32_t trgm) + { + return invindex[trgm] != nullptr; + } + size_t num_files_seen() const override { return num_files; } + PostingListBuilder &get_pl_builder(uint32_t trgm) + { + if (invindex[trgm] == nullptr) { + invindex[trgm] = new PostingListBuilder; + } + return *invindex[trgm]; + } + + size_t num_trigrams() const; + std::string get_compressed_dir_times(); + +private: + void compress_dir_times(size_t allowed_slop); + + std::unique_ptr invindex; + FILE *outfp; + std::string current_block; + std::string tempbuf; + const size_t block_size; + const bool store_dir_times; + ZSTD_CDict *cdict; + + ZSTD_CStream *dir_time_ctx = nullptr; + std::string dir_times; // Buffer of still-uncompressed data. + std::string dir_times_compressed; +}; + + +EncodingCorpus::EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times) : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict) { fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr); @@ -185,22 +228,14 @@ Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir } } -Corpus::~Corpus() +EncodingCorpus::~EncodingCorpus() { for (unsigned i = 0; i < NUM_TRIGRAMS; ++i) { delete invindex[i]; } } -PostingListBuilder &Corpus::get_pl_builder(uint32_t trgm) -{ - if (invindex[trgm] == nullptr) { - invindex[trgm] = new PostingListBuilder; - } - return *invindex[trgm]; -} - -void Corpus::add_file(string filename, dir_time dt) +void EncodingCorpus::add_file(string filename, dir_time dt) { ++num_files; if (!current_block.empty()) { @@ -224,7 +259,7 @@ void Corpus::add_file(string filename, dir_time dt) } } -void Corpus::compress_dir_times(size_t allowed_slop) +void EncodingCorpus::compress_dir_times(size_t allowed_slop) { while (dir_times.size() >= allowed_slop) { size_t old_size = dir_times_compressed.size(); @@ -256,7 +291,7 @@ void Corpus::compress_dir_times(size_t allowed_slop) } } -void Corpus::flush_block() +void EncodingCorpus::flush_block() { if (current_block.empty()) { return; @@ -290,12 +325,12 @@ void Corpus::flush_block() ++num_blocks; } -void Corpus::finish() +void EncodingCorpus::finish() { flush_block(); } -size_t Corpus::num_trigrams() const +size_t EncodingCorpus::num_trigrams() const { size_t num = 0; for (unsigned trgm = 0; trgm < NUM_TRIGRAMS; ++trgm) { @@ -306,7 +341,7 @@ size_t Corpus::num_trigrams() const return num; } -string Corpus::get_compressed_dir_times() +string EncodingCorpus::get_compressed_dir_times() { if (!store_dir_times) { return ""; @@ -385,7 +420,7 @@ uint32_t next_prime(uint32_t x) return x; } -unique_ptr create_hashtable(Corpus &corpus, const vector &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots) +unique_ptr create_hashtable(EncodingCorpus &corpus, const vector &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots) { unique_ptr ht(new Trigram[ht_size + num_overflow_slots + 1]); // 1 for the sentinel element at the end. for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) { @@ -427,11 +462,24 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_siz if (path.empty()) { path = "."; } +#ifdef O_TMPFILE int fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640); if (fd == -1) { perror(path.c_str()); exit(1); } +#else + temp_filename = string(outfile) + ".XXXXXX"; + int fd = mkstemp(&temp_filename[0]); + if (fd == -1) { + perror(temp_filename.c_str()); + exit(1); + } + if (fchmod(fd, 0640) == -1) { + perror("fchmod"); + exit(1); + } +#endif if (owner != (gid_t)-1) { if (fchown(fd, (uid_t)-1, owner) == -1) { @@ -477,10 +525,10 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_siz hdr.conf_block_offset_bytes = 0; } -Corpus *DatabaseBuilder::start_corpus(bool store_dir_times) +DatabaseReceiver *DatabaseBuilder::start_corpus(bool store_dir_times) { corpus_start = steady_clock::now(); - corpus = new Corpus(outfp, block_size, cdict, store_dir_times); + corpus = new EncodingCorpus(outfp, block_size, cdict, store_dir_times); return corpus; } @@ -608,6 +656,7 @@ void DatabaseBuilder::finish_corpus() fseek(outfp, 0, SEEK_SET); fwrite(&hdr, sizeof(hdr), 1, outfp); +#ifdef O_TMPFILE // Give the file a proper name, making it visible in the file system. // TODO: It would be nice to be able to do this atomically, like with rename. unlink(outfile.c_str()); @@ -617,6 +666,12 @@ void DatabaseBuilder::finish_corpus() perror("linkat"); exit(1); } +#else + if (rename(temp_filename.c_str(), outfile.c_str()) == -1) { + perror("rename"); + exit(1); + } +#endif fclose(outfp);