X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=database-builder.cpp;h=419e012ca94c2625471680de17b67f07716ecec8;hb=fd6198891d6fd9642effc0843fef6f23b991af3e;hp=3801e5df582730e25585cb0dd0872ef206476ac0;hpb=fc12afad4fb35b1f45bcd755f6db3713fc7c0572;p=plocate diff --git a/database-builder.cpp b/database-builder.cpp index 3801e5d..419e012 100644 --- a/database-builder.cpp +++ b/database-builder.cpp @@ -5,6 +5,9 @@ #include #include +#ifdef HAS_ENDIAN_H +#include +#endif #include #include #include @@ -26,29 +29,19 @@ constexpr unsigned num_overflow_slots = 16; string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf); -static inline uint32_t read_unigram(const string_view s, size_t idx) -{ - if (idx < s.size()) { - return (unsigned char)s[idx]; - } else { - return 0; - } -} - -static inline uint32_t read_trigram(const string_view s, size_t start) -{ - return read_unigram(s, start) | - (read_unigram(s, start + 1) << 8) | - (read_unigram(s, start + 2) << 16); -} - class PostingListBuilder { public: inline void add_docid(uint32_t docid); + inline void add_first_docid(uint32_t docid); void finish(); - string encoded; - size_t num_docids = 0; + vector encoded; + size_t get_num_docids() const + { + // Updated only when we flush, so check that we're finished. + assert(pending_deltas.empty()); + return num_docids; + } private: void write_header(uint32_t docid); @@ -56,7 +49,8 @@ private: vector pending_deltas; - uint32_t last_block_end, last_docid = -1; + uint32_t num_docids = 0; // Should be size_t, except the format only supports 2^32 docids per posting list anyway. + uint32_t last_docid = -1; }; void PostingListBuilder::add_docid(uint32_t docid) @@ -66,22 +60,20 @@ void PostingListBuilder::add_docid(uint32_t docid) return; } - if (num_docids == 0) { - // Very first docid. - write_header(docid); - ++num_docids; - last_block_end = last_docid = docid; - return; - } - pending_deltas.push_back(docid - last_docid - 1); last_docid = docid; if (pending_deltas.size() == 128) { append_block(); pending_deltas.clear(); - last_block_end = docid; + num_docids += 128; } +} + +void PostingListBuilder::add_first_docid(uint32_t docid) +{ + write_header(docid); ++num_docids; + last_docid = docid; } void PostingListBuilder::finish() @@ -95,7 +87,10 @@ void PostingListBuilder::finish() // No interleaving for partial blocks. unsigned char buf[P4NENC_BOUND(128)]; unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, buf); - encoded.append(reinterpret_cast(buf), reinterpret_cast(end)); + encoded.insert(encoded.end(), buf, end); + + num_docids += pending_deltas.size(); + pending_deltas.clear(); } void PostingListBuilder::append_block() @@ -103,14 +98,14 @@ void PostingListBuilder::append_block() unsigned char buf[P4NENC_BOUND(128)]; assert(pending_deltas.size() == 128); unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), 128, /*interleaved=*/true, buf); - encoded.append(reinterpret_cast(buf), reinterpret_cast(end)); + encoded.insert(encoded.end(), buf, end); } void PostingListBuilder::write_header(uint32_t docid) { unsigned char buf[P4NENC_BOUND(1)]; unsigned char *end = write_baseval(docid, buf); - encoded.append(reinterpret_cast(buf), end - buf); + encoded.insert(encoded.end(), buf, end); } void DictionaryBuilder::add_file(string filename, dir_time) @@ -192,11 +187,18 @@ public: } size_t num_files_seen() const override { return num_files; } PostingListBuilder &get_pl_builder(uint32_t trgm) + { + return *invindex[trgm]; + } + + void add_docid(uint32_t trgm, uint32_t docid) { if (invindex[trgm] == nullptr) { invindex[trgm] = new PostingListBuilder; + invindex[trgm]->add_first_docid(docid); + } else { + invindex[trgm]->add_docid(docid); } - return *invindex[trgm]; } size_t num_trigrams() const; @@ -207,6 +209,7 @@ private: std::unique_ptr invindex; FILE *outfp; + off_t outfp_pos; // Cheaper than calling ftell(outfp) all the time. std::string current_block; std::string tempbuf; const size_t block_size; @@ -218,9 +221,8 @@ private: std::string dir_times_compressed; }; - EncodingCorpus::EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times) - : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict) + : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), outfp_pos(ftell(outfp)), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict) { fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr); if (store_dir_times) { @@ -302,24 +304,46 @@ void EncodingCorpus::flush_block() // Create trigrams. const char *ptr = current_block.c_str(); - while (ptr < current_block.c_str() + current_block.size()) { - string_view s(ptr); - if (s.size() >= 3) { - for (size_t j = 0; j < s.size() - 2; ++j) { - uint32_t trgm = read_trigram(s, j); - get_pl_builder(trgm).add_docid(docid); + const char *end = ptr + current_block.size(); + while (ptr < end - 3) { // Must be at least one filename left, that's at least three bytes. + if (ptr[0] == '\0') { + // This filename is zero bytes, so skip it (and the zero terminator). + ++ptr; + continue; + } else if (ptr[1] == '\0') { + // This filename is one byte, so skip it (and the zero terminator). + ptr += 2; + continue; + } else if (ptr[2] == '\0') { + // This filename is two bytes, so skip it (and the zero terminator). + ptr += 3; + continue; + } + for (;;) { + // NOTE: Will read one byte past the end of the trigram, but it's OK, + // since we always call it from contexts where there's a terminating zero byte. + uint32_t trgm; + memcpy(&trgm, ptr, sizeof(trgm)); + ++ptr; + trgm = le32toh(trgm); + add_docid(trgm & 0xffffff, docid); + if (trgm <= 0xffffff) { + // Terminating zero byte, so we're done with this filename. + // Skip the remaining two bytes, and the zero terminator. + ptr += 3; + break; } } - ptr += s.size() + 1; } // Compress and add the filename block. - filename_blocks.push_back(ftell(outfp)); + filename_blocks.push_back(outfp_pos); string compressed = zstd_compress(current_block, cdict, &tempbuf); if (fwrite(compressed.data(), compressed.size(), 1, outfp) != 1) { perror("fwrite()"); exit(1); } + outfp_pos += compressed.size(); current_block.clear(); num_files_in_block = 0; @@ -431,7 +455,7 @@ unique_ptr create_hashtable(EncodingCorpus &corpus, const vectorget_pl_builder(trgm); pl_builder.finish(); - longest_posting_list = max(longest_posting_list, pl_builder.num_docids); - trigrams += pl_builder.num_docids; + longest_posting_list = max(longest_posting_list, pl_builder.get_num_docids()); + trigrams += pl_builder.get_num_docids(); bytes_for_posting_lists += pl_builder.encoded.size(); } size_t num_trigrams = corpus->num_trigrams(); @@ -609,7 +635,7 @@ void DatabaseBuilder::finish_corpus() continue; } - const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded; + const vector &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded; offset += encoded.size(); } @@ -623,7 +649,7 @@ void DatabaseBuilder::finish_corpus() if (hashtable[i].num_docids == 0) { continue; } - const string &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded; + const vector &encoded = corpus->get_pl_builder(hashtable[i].trgm).encoded; fwrite(encoded.data(), encoded.size(), 1, outfp); } @@ -657,22 +683,24 @@ void DatabaseBuilder::finish_corpus() fseek(outfp, 0, SEEK_SET); fwrite(&hdr, sizeof(hdr), 1, outfp); + if (!temp_filename.empty()) { + if (rename(temp_filename.c_str(), outfile.c_str()) == -1) { + perror("rename"); + exit(1); + } + } else { #ifdef O_TMPFILE - // Give the file a proper name, making it visible in the file system. - // TODO: It would be nice to be able to do this atomically, like with rename. - unlink(outfile.c_str()); - char procpath[256]; - snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fileno(outfp)); - if (linkat(AT_FDCWD, procpath, AT_FDCWD, outfile.c_str(), AT_SYMLINK_FOLLOW) == -1) { - perror("linkat"); - exit(1); - } -#else - if (rename(temp_filename.c_str(), outfile.c_str()) == -1) { - perror("rename"); - exit(1); - } + // Give the file a proper name, making it visible in the file system. + // TODO: It would be nice to be able to do this atomically, like with rename. + unlink(outfile.c_str()); + char procpath[256]; + snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fileno(outfp)); + if (linkat(AT_FDCWD, procpath, AT_FDCWD, outfile.c_str(), AT_SYMLINK_FOLLOW) == -1) { + perror("linkat"); + exit(1); + } #endif + } fclose(outfp);