X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=database-builder.cpp;h=419e012ca94c2625471680de17b67f07716ecec8;hb=fd6198891d6fd9642effc0843fef6f23b991af3e;hp=b95d3bbcdb654c9248f118fa886da6a1a85ef2cc;hpb=3d0c863edc6eb65c0dc3a13d2745cab5ef0a6773;p=plocate

diff --git a/database-builder.cpp b/database-builder.cpp
index b95d3bb..419e012 100644
--- a/database-builder.cpp
+++ b/database-builder.cpp
@@ -5,12 +5,16 @@
 #include 
 #include 
+#ifdef HAS_ENDIAN_H
+#include <endian.h>
+#endif
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -25,29 +29,19 @@ constexpr unsigned num_overflow_slots = 16;
 
 string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf);
 
-static inline uint32_t read_unigram(const string_view s, size_t idx)
-{
-	if (idx < s.size()) {
-		return (unsigned char)s[idx];
-	} else {
-		return 0;
-	}
-}
-
-static inline uint32_t read_trigram(const string_view s, size_t start)
-{
-	return read_unigram(s, start) |
-	       (read_unigram(s, start + 1) << 8) |
-	       (read_unigram(s, start + 2) << 16);
-}
-
 class PostingListBuilder {
 public:
 	inline void add_docid(uint32_t docid);
+	inline void add_first_docid(uint32_t docid);
 	void finish();
 
-	string encoded;
-	size_t num_docids = 0;
+	vector<unsigned char> encoded;
+	size_t get_num_docids() const
+	{
+		// Updated only when we flush, so check that we're finished.
+		assert(pending_deltas.empty());
+		return num_docids;
+	}
 
 private:
 	void write_header(uint32_t docid);
@@ -55,7 +49,8 @@ private:
 	vector<uint32_t> pending_deltas;
 
-	uint32_t last_block_end, last_docid = -1;
+	uint32_t num_docids = 0;  // Should be size_t, except the format only supports 2^32 docids per posting list anyway.
+	uint32_t last_docid = -1;
 };
 
 void PostingListBuilder::add_docid(uint32_t docid)
 {
@@ -65,22 +60,20 @@ void PostingListBuilder::add_docid(uint32_t docid)
 		return;
 	}
 
-	if (num_docids == 0) {
-		// Very first docid.
-		write_header(docid);
-		++num_docids;
-		last_block_end = last_docid = docid;
-		return;
-	}
-
 	pending_deltas.push_back(docid - last_docid - 1);
 	last_docid = docid;
 	if (pending_deltas.size() == 128) {
 		append_block();
 		pending_deltas.clear();
-		last_block_end = docid;
+		num_docids += 128;
 	}
+}
+
+void PostingListBuilder::add_first_docid(uint32_t docid)
+{
+	write_header(docid);
+	++num_docids;
+	last_docid = docid;
 }
 
 void PostingListBuilder::finish()
 {
@@ -94,7 +87,10 @@
 	// No interleaving for partial blocks.
 	unsigned char buf[P4NENC_BOUND(128)];
 	unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, buf);
-	encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
+	encoded.insert(encoded.end(), buf, end);
+
+	num_docids += pending_deltas.size();
+	pending_deltas.clear();
 }
 
 void PostingListBuilder::append_block()
@@ -102,17 +98,17 @@ void PostingListBuilder::append_block()
 	unsigned char buf[P4NENC_BOUND(128)];
 	assert(pending_deltas.size() == 128);
 	unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), 128, /*interleaved=*/true, buf);
-	encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
+	encoded.insert(encoded.end(), buf, end);
 }
 
 void PostingListBuilder::write_header(uint32_t docid)
 {
 	unsigned char buf[P4NENC_BOUND(1)];
 	unsigned char *end = write_baseval(docid, buf);
-	encoded.append(reinterpret_cast<char *>(buf), end - buf);
+	encoded.insert(encoded.end(), buf, end);
 }
 
-void DictionaryBuilder::add_file(string filename)
+void DictionaryBuilder::add_file(string filename, dir_time)
 {
 	if (keep_current_block) {  // Only bother saving the filenames if we're actually keeping the block.
 		if (!current_block.empty()) {
@@ -162,7 +158,7 @@ string DictionaryBuilder::train(size_t buf_size)
 	string buf;
 	buf.resize(buf_size);
 	size_t ret = ZDICT_trainFromBuffer(&buf[0], buf_size, dictionary_buf.data(), lengths.data(), lengths.size());
-	if (ret == size_t(-1)) {
+	if (ZDICT_isError(ret)) {
 		return "";
 	}
 	dprintf("Sampled %zu bytes in %zu blocks, built a dictionary of size %zu\n", dictionary_buf.size(), lengths.size(), ret);
@@ -174,28 +170,75 @@ string DictionaryBuilder::train(size_t buf_size)
 	return buf;
 }
 
-Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict)
-	: invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), cdict(cdict)
+class EncodingCorpus : public DatabaseReceiver {
+public:
+	EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times);
+	~EncodingCorpus();
+
+	void add_file(std::string filename, dir_time dt) override;
+	void flush_block() override;
+	void finish() override;
+
+	std::vector<uint64_t> filename_blocks;
+	size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
+	bool seen_trigram(uint32_t trgm)
+	{
+		return invindex[trgm] != nullptr;
+	}
+	size_t num_files_seen() const override { return num_files; }
+	PostingListBuilder &get_pl_builder(uint32_t trgm)
+	{
+		return *invindex[trgm];
+	}
+
+	void add_docid(uint32_t trgm, uint32_t docid)
+	{
+		if (invindex[trgm] == nullptr) {
+			invindex[trgm] = new PostingListBuilder;
+			invindex[trgm]->add_first_docid(docid);
+		} else {
+			invindex[trgm]->add_docid(docid);
+		}
+	}
+
+	size_t num_trigrams() const;
+	std::string get_compressed_dir_times();
+
+private:
+	void compress_dir_times(size_t allowed_slop);
+
+	std::unique_ptr<PostingListBuilder *[]> invindex;
+	FILE *outfp;
+	off_t outfp_pos;  // Cheaper than calling ftell(outfp) all the time.
+	std::string current_block;
+	std::string tempbuf;
+	const size_t block_size;
+	const bool store_dir_times;
+	ZSTD_CDict *cdict;
+
+	ZSTD_CStream *dir_time_ctx = nullptr;
+	std::string dir_times;  // Buffer of still-uncompressed data.
+	std::string dir_times_compressed;
+};
+
+EncodingCorpus::EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times)
+	: invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), outfp_pos(ftell(outfp)), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
 {
 	fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr);
+	if (store_dir_times) {
+		dir_time_ctx = ZSTD_createCStream();
+		ZSTD_initCStream(dir_time_ctx, /*level=*/6);
+	}
 }
 
-Corpus::~Corpus()
+EncodingCorpus::~EncodingCorpus()
 {
 	for (unsigned i = 0; i < NUM_TRIGRAMS; ++i) {
 		delete invindex[i];
 	}
 }
 
-PostingListBuilder &Corpus::get_pl_builder(uint32_t trgm)
-{
-	if (invindex[trgm] == nullptr) {
-		invindex[trgm] = new PostingListBuilder;
-	}
-	return *invindex[trgm];
-}
-
-void Corpus::add_file(string filename)
+void EncodingCorpus::add_file(string filename, dir_time dt)
 {
 	++num_files;
 	if (!current_block.empty()) {
@@ -205,9 +248,53 @@ void Corpus::add_file(string filename)
 	if (++num_files_in_block == block_size) {
 		flush_block();
 	}
+
+	if (store_dir_times) {
+		if (dt.sec == -1) {
+			// Not a directory.
+			dir_times.push_back('\0');
+		} else {
+			dir_times.push_back('\1');
+			dir_times.append(reinterpret_cast<char *>(&dt.sec), sizeof(dt.sec));
+			dir_times.append(reinterpret_cast<char *>(&dt.nsec), sizeof(dt.nsec));
+		}
+		compress_dir_times(/*allowed_slop=*/4096);
+	}
 }
 
-void Corpus::flush_block()
+void EncodingCorpus::compress_dir_times(size_t allowed_slop)
+{
+	while (dir_times.size() >= allowed_slop) {
+		size_t old_size = dir_times_compressed.size();
+		dir_times_compressed.resize(old_size + 4096);
+
+		ZSTD_outBuffer outbuf;
+		outbuf.dst = dir_times_compressed.data() + old_size;
+		outbuf.size = 4096;
+		outbuf.pos = 0;
+
+		ZSTD_inBuffer inbuf;
+		inbuf.src = dir_times.data();
+		inbuf.size = dir_times.size();
+		inbuf.pos = 0;
+
+		int ret = ZSTD_compressStream(dir_time_ctx, &outbuf, &inbuf);
+		if (ret < 0) {
+			fprintf(stderr, "ZSTD_compressStream() failed\n");
+			exit(1);
+		}
+
+		dir_times_compressed.resize(old_size + outbuf.pos);
+		dir_times.erase(dir_times.begin(), dir_times.begin() + inbuf.pos);
+
+		if (outbuf.pos == 0 && inbuf.pos == 0) {
+			// Nothing happened (not enough data?), try again later.
+			return;
+		}
+	}
+}
+
+void EncodingCorpus::flush_block()
 {
 	if (current_block.empty()) {
 		return;
@@ -217,36 +304,58 @@ void Corpus::flush_block()
 
 	// Create trigrams.
 	const char *ptr = current_block.c_str();
-	while (ptr < current_block.c_str() + current_block.size()) {
-		string_view s(ptr);
-		if (s.size() >= 3) {
-			for (size_t j = 0; j < s.size() - 2; ++j) {
-				uint32_t trgm = read_trigram(s, j);
-				get_pl_builder(trgm).add_docid(docid);
+	const char *end = ptr + current_block.size();
+	while (ptr < end - 3) {  // Must be at least one filename left, that's at least three bytes.
+		if (ptr[0] == '\0') {
+			// This filename is zero bytes, so skip it (and the zero terminator).
+			++ptr;
+			continue;
+		} else if (ptr[1] == '\0') {
+			// This filename is one byte, so skip it (and the zero terminator).
+			ptr += 2;
+			continue;
+		} else if (ptr[2] == '\0') {
+			// This filename is two bytes, so skip it (and the zero terminator).
+			ptr += 3;
+			continue;
+		}
+		for (;;) {
+			// NOTE: Will read one byte past the end of the trigram, but it's OK,
+			// since we always call it from contexts where there's a terminating zero byte.
+			uint32_t trgm;
+			memcpy(&trgm, ptr, sizeof(trgm));
+			++ptr;
+			trgm = le32toh(trgm);
+			add_docid(trgm & 0xffffff, docid);
+			if (trgm <= 0xffffff) {
+				// Terminating zero byte, so we're done with this filename.
+				// Skip the remaining two bytes, and the zero terminator.
+				ptr += 3;
+				break;
 			}
 		}
-		ptr += s.size() + 1;
 	}
 
 	// Compress and add the filename block.
-	filename_blocks.push_back(ftell(outfp));
+	filename_blocks.push_back(outfp_pos);
 	string compressed = zstd_compress(current_block, cdict, &tempbuf);
 	if (fwrite(compressed.data(), compressed.size(), 1, outfp) != 1) {
 		perror("fwrite()");
 		exit(1);
 	}
+	outfp_pos += compressed.size();
 	current_block.clear();
 	num_files_in_block = 0;
 	++num_blocks;
 }
 
-void Corpus::finish()
+void EncodingCorpus::finish()
 {
 	flush_block();
 }
 
-size_t Corpus::num_trigrams() const
+size_t EncodingCorpus::num_trigrams() const
 {
 	size_t num = 0;
 	for (unsigned trgm = 0; trgm < NUM_TRIGRAMS; ++trgm) {
@@ -257,6 +366,40 @@ size_t Corpus::num_trigrams() const
 	return num;
 }
 
+string EncodingCorpus::get_compressed_dir_times()
+{
+	if (!store_dir_times) {
+		return "";
+	}
+	compress_dir_times(/*allowed_slop=*/0);
+	assert(dir_times.empty());
+
+	for (;;) {
+		size_t old_size = dir_times_compressed.size();
+		dir_times_compressed.resize(old_size + 4096);
+
+		ZSTD_outBuffer outbuf;
+		outbuf.dst = dir_times_compressed.data() + old_size;
+		outbuf.size = 4096;
+		outbuf.pos = 0;
+
+		int ret = ZSTD_endStream(dir_time_ctx, &outbuf);
+		if (ret < 0) {
+			fprintf(stderr, "ZSTD_compressStream() failed\n");
+			exit(1);
+		}
+
+		dir_times_compressed.resize(old_size + outbuf.pos);
+
+		if (ret == 0) {
+			// All done.
+			break;
+		}
+	}
+
+	return dir_times_compressed;
+}
+
 string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf)
 {
 	static ZSTD_CCtx *ctx = nullptr;
@@ -302,7 +445,7 @@ uint32_t next_prime(uint32_t x)
 	return x;
 }
 
-unique_ptr<Trigram[]> create_hashtable(Corpus &corpus, const vector<uint32_t> &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots)
+unique_ptr<Trigram[]> create_hashtable(EncodingCorpus &corpus, const vector<uint32_t> &all_trigrams, uint32_t ht_size, uint32_t num_overflow_slots)
 {
 	unique_ptr<Trigram[]> ht(new Trigram[ht_size + num_overflow_slots + 1]);  // 1 for the sentinel element at the end.
 	for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
@@ -312,7 +455,7 @@ unique_ptr<Trigram[]> create_hashtable(Corpus &corpus, const vector<uint32_t> &a
 	}
 
 	for (uint32_t trgm : all_trigrams) {
 		// We don't know offset yet, so set it to zero.
-		Trigram to_insert{ trgm, uint32_t(corpus.get_pl_builder(trgm).num_docids), 0 };
+		Trigram to_insert{ trgm, uint32_t(corpus.get_pl_builder(trgm).get_num_docids()), 0 };
 
 		uint32_t bucket = hash_trigram(trgm, ht_size);
 		unsigned distance = 0;
@@ -334,11 +477,45 @@ unique_ptr<Trigram[]> create_hashtable(Corpus &corpus, const vector<uint32_t> &a
 	return ht;
 }
 
-DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dictionary)
-	: block_size(block_size)
+DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_size, string dictionary, bool check_visibility)
+	: outfile(outfile), block_size(block_size)
 {
 	umask(0027);
-	outfp = fopen(outfile, "wb");
+
+	string path = outfile;
+	path.resize(path.find_last_of('/') + 1);
+	if (path.empty()) {
+		path = ".";
+	}
+	int fd = -1;
+#ifdef O_TMPFILE
+	fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
+	if (fd == -1 && errno != EOPNOTSUPP && errno != EISDIR) {
+		perror(path.c_str());
+		exit(1);
+	}
+#endif
+	if (fd == -1) {
+		temp_filename = string(outfile) + ".XXXXXX";
+		fd = mkstemp(&temp_filename[0]);
+		if (fd == -1) {
+			perror(temp_filename.c_str());
+			exit(1);
+		}
+		if (fchmod(fd, 0640) == -1) {
+			perror("fchmod");
+			exit(1);
+		}
+	}
+
+	if (owner != (gid_t)-1) {
+		if (fchown(fd, (uid_t)-1, owner) == -1) {
+			perror("fchown");
+			exit(1);
+		}
+	}
+
+	outfp = fdopen(fd, "wb");
 	if (outfp == nullptr) {
 		perror(outfile);
 		exit(1);
@@ -351,9 +528,10 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dic
 	hdr.extra_ht_slots = num_overflow_slots;
 	hdr.num_docids = 0;
 	hdr.hash_table_offset_bytes = -1;  // We don't know these offsets yet.
-	hdr.max_version = 1;
+	hdr.max_version = 2;
 	hdr.filename_index_offset_bytes = -1;
 	hdr.zstd_dictionary_length_bytes = -1;
+	hdr.check_visibility = check_visibility;
 	fwrite(&hdr, sizeof(hdr), 1, outfp);
 
 	if (dictionary.empty()) {
@@ -365,15 +543,32 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dic
 		hdr.zstd_dictionary_length_bytes = dictionary.size();
 		cdict = ZSTD_createCDict(dictionary.data(), dictionary.size(), /*level=*/6);
 	}
+
+	hdr.directory_data_length_bytes = 0;
+	hdr.directory_data_offset_bytes = 0;
+	hdr.next_zstd_dictionary_length_bytes = 0;
+	hdr.next_zstd_dictionary_offset_bytes = 0;
+	hdr.conf_block_length_bytes = 0;
+	hdr.conf_block_offset_bytes = 0;
 }
 
-Corpus *DatabaseBuilder::start_corpus()
+DatabaseReceiver *DatabaseBuilder::start_corpus(bool store_dir_times)
 {
 	corpus_start = steady_clock::now();
-	corpus = new Corpus(outfp, block_size, cdict);
+	corpus = new EncodingCorpus(outfp, block_size, cdict, store_dir_times);
 	return corpus;
 }
 
+void DatabaseBuilder::set_next_dictionary(std::string next_dictionary)
+{
+	this->next_dictionary = move(next_dictionary);
+}
+
+void DatabaseBuilder::set_conf_block(std::string conf_block)
+{
+	this->conf_block = move(conf_block);
+}
+
 void DatabaseBuilder::finish_corpus()
 {
 	corpus->finish();
@@ -398,8 +593,8 @@ void DatabaseBuilder::finish_corpus()
 			continue;
 		PostingListBuilder &pl_builder = corpus->get_pl_builder(trgm);
 		pl_builder.finish();
-		longest_posting_list = max(longest_posting_list, pl_builder.num_docids);
-		trigrams += pl_builder.num_docids;
+		longest_posting_list = max(longest_posting_list, pl_builder.get_num_docids());
+		trigrams += pl_builder.get_num_docids();
 		bytes_for_posting_lists += pl_builder.encoded.size();
 	}
 	size_t num_trigrams = corpus->num_trigrams();
@@ -440,7 +635,7 @@ void DatabaseBuilder::finish_corpus()
 			continue;
 		}
 
-		const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
+		const vector<unsigned char> &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
 		offset += encoded.size();
 	}
 
@@ -454,17 +649,62 @@
 		if (hashtable[i].num_docids == 0) {
 			continue;
 		}
-		const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
+		const vector<unsigned char> &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded;
 		fwrite(encoded.data(), encoded.size(), 1, outfp);
 	}
 
+	// Finally, write the directory times (for updatedb).
+	string compressed_dir_times = corpus->get_compressed_dir_times();
+	size_t bytes_for_compressed_dir_times = 0;
+	if (!compressed_dir_times.empty()) {
+		hdr.directory_data_offset_bytes = ftell(outfp);
+		hdr.directory_data_length_bytes = compressed_dir_times.size();
+		fwrite(compressed_dir_times.data(), compressed_dir_times.size(), 1, outfp);
+		bytes_for_compressed_dir_times = compressed_dir_times.size();
+		compressed_dir_times.clear();
+	}
+
+	// Write the recommended dictionary for next update.
+	if (!next_dictionary.empty()) {
+		hdr.next_zstd_dictionary_offset_bytes = ftell(outfp);
+		hdr.next_zstd_dictionary_length_bytes = next_dictionary.size();
+		fwrite(next_dictionary.data(), next_dictionary.size(), 1, outfp);
+	}
+
+	// And the configuration block.
+	if (!conf_block.empty()) {
+		hdr.conf_block_offset_bytes = ftell(outfp);
+		hdr.conf_block_length_bytes = conf_block.size();
+		fwrite(conf_block.data(), conf_block.size(), 1, outfp);
+	}
+
 	// Rewind, and write the updated header.
 	hdr.version = 1;
 	fseek(outfp, 0, SEEK_SET);
 	fwrite(&hdr, sizeof(hdr), 1, outfp);
+
+	if (!temp_filename.empty()) {
+		if (rename(temp_filename.c_str(), outfile.c_str()) == -1) {
+			perror("rename");
+			exit(1);
+		}
+	} else {
+#ifdef O_TMPFILE
+		// Give the file a proper name, making it visible in the file system.
+		// TODO: It would be nice to be able to do this atomically, like with rename.
+		unlink(outfile.c_str());
+		char procpath[256];
+		snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fileno(outfp));
+		if (linkat(AT_FDCWD, procpath, AT_FDCWD, outfile.c_str(), AT_SYMLINK_FOLLOW) == -1) {
+			perror("linkat");
+			exit(1);
+		}
+#endif
+	}
+
 	fclose(outfp);
 
-	size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+	size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames + bytes_for_compressed_dir_times);
 
 	dprintf("Block size:     %7d files\n", block_size);
 	dprintf("Dictionary:     %'7.1f MB\n", hdr.zstd_dictionary_length_bytes / 1048576.0);
@@ -472,6 +712,9 @@
 	dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
 	dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
 	dprintf("Filenames:      %'7.1f MB\n", bytes_for_filenames / 1048576.0);
+	if (bytes_for_compressed_dir_times != 0) {
+		dprintf("Modify times:   %'7.1f MB\n", bytes_for_compressed_dir_times / 1048576.0);
+	}
 	dprintf("Total:          %'7.1f MB\n", total_bytes / 1048576.0);
 	dprintf("\n");
 }