From: Steinar H. Gunderson Date: Mon, 28 Sep 2020 07:34:31 +0000 (+0200) Subject: Deduplicate docids as we go. X-Git-Tag: 1.0.0~100 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=1bb5df65bf76c5a2dff0c6e69f38cdcb9d884d69;p=plocate Deduplicate docids as we go. This saves ~50% RAM in the build step, now that we have blocking (there's a lot of deduplication going on), and seemingly also ~15% execution time, possibly because of less memory allocation (I haven't checked thoroughly). --- diff --git a/plocate-build.cpp b/plocate-build.cpp index e6789d0..42ff96d 100644 --- a/plocate-build.cpp +++ b/plocate-build.cpp @@ -158,19 +158,21 @@ void do_build(const char *infile, const char *outfile, int block_size) size_t trigrams = 0, longest_posting_list = 0; unordered_map<uint32_t, vector<uint32_t>> invindex; for (size_t i = 0; i < files.size(); ++i) { + uint32_t docid = i / block_size; const string &s = files[i]; if (s.size() >= 3) { for (size_t j = 0; j < s.size() - 2; ++j) { uint32_t trgm = read_trigram(s, j); - invindex[trgm].push_back(i / block_size); + vector<uint32_t> &docids = invindex[trgm]; + if (docids.empty() || docids.back() != docid) { + docids.push_back(docid); + } } } } string buf; size_t bytes_used = 0; for (auto &[trigram, docids] : invindex) { - auto last = unique(docids.begin(), docids.end()); - docids.erase(last, docids.end()); longest_posting_list = max(longest_posting_list, docids.size()); trigrams += docids.size();