From: Steinar H. Gunderson Date: Thu, 11 Feb 2021 08:32:16 +0000 (+0100) Subject: Rename Corpus to EncodingCorpus, and make it private. X-Git-Tag: 1.1.4~11 X-Git-Url: https://git.sesse.net/?p=plocate;a=commitdiff_plain;h=72432f2b4b14a144f7d30c2326273f816e0219ae Rename Corpus to EncodingCorpus, and make it private. We don't really want two different classes named Corpus; that's pretty confusing, even though they don't live in the same binary. --- diff --git a/database-builder.cpp b/database-builder.cpp index 89a06d6..42fc186 100644 --- a/database-builder.cpp +++ b/database-builder.cpp @@ -175,7 +175,44 @@ string DictionaryBuilder::train(size_t buf_size) return buf; } -Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times) +class EncodingCorpus : public DatabaseReceiver { +public: + EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times); + ~EncodingCorpus(); + + void add_file(std::string filename, dir_time dt) override; + void flush_block() override; + void finish() override; + + std::vector filename_blocks; + size_t num_files = 0, num_files_in_block = 0, num_blocks = 0; + bool seen_trigram(uint32_t trgm) + { + return invindex[trgm] != nullptr; + } + size_t num_files_seen() const override { return num_files; } + PostingListBuilder &get_pl_builder(uint32_t trgm); + size_t num_trigrams() const; + std::string get_compressed_dir_times(); + +private: + void compress_dir_times(size_t allowed_slop); + + std::unique_ptr<PostingListBuilder *[]> invindex; + FILE *outfp; + std::string current_block; + std::string tempbuf; + const size_t block_size; + const bool store_dir_times; + ZSTD_CDict *cdict; + + ZSTD_CStream *dir_time_ctx = nullptr; + std::string dir_times; // Buffer of still-uncompressed data. 
+ std::string dir_times_compressed; +}; + + +EncodingCorpus::EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times) : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict) { fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr); @@ -185,14 +222,14 @@ Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir } } -Corpus::~Corpus() +EncodingCorpus::~EncodingCorpus() { for (unsigned i = 0; i < NUM_TRIGRAMS; ++i) { delete invindex[i]; } } -PostingListBuilder &Corpus::get_pl_builder(uint32_t trgm) +PostingListBuilder &EncodingCorpus::get_pl_builder(uint32_t trgm) { if (invindex[trgm] == nullptr) { invindex[trgm] = new PostingListBuilder; @@ -200,7 +237,7 @@ PostingListBuilder &Corpus::get_pl_builder(uint32_t trgm) return *invindex[trgm]; } -void Corpus::add_file(string filename, dir_time dt) +void EncodingCorpus::add_file(string filename, dir_time dt) { ++num_files; if (!current_block.empty()) { @@ -224,7 +261,7 @@ void Corpus::add_file(string filename, dir_time dt) } } -void Corpus::compress_dir_times(size_t allowed_slop) +void EncodingCorpus::compress_dir_times(size_t allowed_slop) { while (dir_times.size() >= allowed_slop) { size_t old_size = dir_times_compressed.size(); @@ -256,7 +293,7 @@ void Corpus::compress_dir_times(size_t allowed_slop) } } -void Corpus::flush_block() +void EncodingCorpus::flush_block() { if (current_block.empty()) { return; @@ -290,12 +327,12 @@ void Corpus::flush_block() ++num_blocks; } -void Corpus::finish() +void EncodingCorpus::finish() { flush_block(); } -size_t Corpus::num_trigrams() const +size_t EncodingCorpus::num_trigrams() const { size_t num = 0; for (unsigned trgm = 0; trgm < NUM_TRIGRAMS; ++trgm) { @@ -306,7 +343,7 @@ size_t Corpus::num_trigrams() const return num; } -string Corpus::get_compressed_dir_times() +string EncodingCorpus::get_compressed_dir_times() { if (!store_dir_times) { 
return ""; @@ -385,7 +422,7 @@ uint32_t next_prime(uint32_t x) return x; } -unique_ptr create_hashtable(Corpus &corpus, const vector &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots) +unique_ptr create_hashtable(EncodingCorpus &corpus, const vector &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots) { unique_ptr<Trigram[]> ht(new Trigram[ht_size + num_overflow_slots + 1]); // 1 for the sentinel element at the end. for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) { @@ -490,10 +527,10 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_siz hdr.conf_block_offset_bytes = 0; } -Corpus *DatabaseBuilder::start_corpus(bool store_dir_times) +DatabaseReceiver *DatabaseBuilder::start_corpus(bool store_dir_times) { corpus_start = steady_clock::now(); - corpus = new Corpus(outfp, block_size, cdict, store_dir_times); + corpus = new EncodingCorpus(outfp, block_size, cdict, store_dir_times); return corpus; } diff --git a/database-builder.h b/database-builder.h index 2a56e11..0c25476 100644 --- a/database-builder.h +++ b/database-builder.h @@ -41,6 +41,9 @@ public: virtual void add_file(std::string filename, dir_time dt) = 0; virtual void flush_block() = 0; virtual void finish() { flush_block(); } + + // EncodingCorpus only. 
+ virtual size_t num_files_seen() const { return -1; } }; class DictionaryBuilder : public DatabaseReceiver { @@ -65,45 +68,12 @@ private: std::vector lengths; }; -class Corpus : public DatabaseReceiver { -public: - Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times); - ~Corpus(); - - void add_file(std::string filename, dir_time dt) override; - void flush_block() override; - void finish() override; - - std::vector filename_blocks; - size_t num_files = 0, num_files_in_block = 0, num_blocks = 0; - bool seen_trigram(uint32_t trgm) - { - return invindex[trgm] != nullptr; - } - PostingListBuilder &get_pl_builder(uint32_t trgm); - size_t num_trigrams() const; - std::string get_compressed_dir_times(); - -private: - void compress_dir_times(size_t allowed_slop); - - std::unique_ptr<PostingListBuilder *[]> invindex; - FILE *outfp; - std::string current_block; - std::string tempbuf; - const size_t block_size; - const bool store_dir_times; - ZSTD_CDict *cdict; - - ZSTD_CStream *dir_time_ctx = nullptr; - std::string dir_times; // Buffer of still-uncompressed data. 
- std::string dir_times_compressed; -}; +class EncodingCorpus; class DatabaseBuilder { public: DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary, bool check_visibility); - Corpus *start_corpus(bool store_dir_times); + DatabaseReceiver *start_corpus(bool store_dir_times); void set_next_dictionary(std::string next_dictionary); void set_conf_block(std::string conf_block); void finish_corpus(); @@ -117,7 +87,7 @@ private: Header hdr; const int block_size; std::chrono::steady_clock::time_point corpus_start; - Corpus *corpus = nullptr; + EncodingCorpus *corpus = nullptr; ZSTD_CDict *cdict = nullptr; std::string next_dictionary, conf_block; }; diff --git a/plocate-build.cpp b/plocate-build.cpp index cb765ed..028a1ee 100644 --- a/plocate-build.cpp +++ b/plocate-build.cpp @@ -167,7 +167,7 @@ void do_build(const char *infile, const char *outfile, int block_size, bool plai string dictionary = builder.train(1024); DatabaseBuilder db(outfile, /*owner=*/-1, block_size, dictionary, /*check_visibility=*/true); - Corpus *corpus = db.start_corpus(/*store_dir_times=*/false); + DatabaseReceiver *corpus = db.start_corpus(/*store_dir_times=*/false); if (plaintext) { read_plaintext(infp, corpus); } else { @@ -175,7 +175,7 @@ void do_build(const char *infile, const char *outfile, int block_size, bool plai } fclose(infp); - dprintf("Read %zu files from %s\n", corpus->num_files, infile); + dprintf("Read %zu files from %s\n", corpus->num_files_seen(), infile); db.finish_corpus(); } diff --git a/updatedb.cpp b/updatedb.cpp index 1465b5b..895b071 100644 --- a/updatedb.cpp +++ b/updatedb.cpp @@ -530,7 +530,7 @@ string ExistingDB::read_next_dictionary() const // “parent_dev” must be the device of the parent directory of “path”. // // Takes ownership of fd. 
-int scan(const string &path, int fd, dev_t parent_dev, dir_time modified, dir_time db_modified, ExistingDB *existing_db, Corpus *corpus, DictionaryBuilder *dict_builder) +int scan(const string &path, int fd, dev_t parent_dev, dir_time modified, dir_time db_modified, ExistingDB *existing_db, DatabaseReceiver *corpus, DictionaryBuilder *dict_builder) { if (string_list_contains_dir_path(&conf_prunepaths, &conf_prunepaths_index, path)) { if (conf_debug_pruning) { @@ -782,7 +782,7 @@ int main(int argc, char **argv) DatabaseBuilder db(conf_output.c_str(), owner, conf_block_size, existing_db.read_next_dictionary(), conf_check_visibility); db.set_conf_block(conf_block); - Corpus *corpus = db.start_corpus(/*store_dir_times=*/true); + DatabaseReceiver *corpus = db.start_corpus(/*store_dir_times=*/true); int root_fd = opendir_noatime(AT_FDCWD, conf_scan_root); if (root_fd == -1) {