From 82e93f155db843b49c6fe1789ae56816ec6dce07 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Thu, 1 Oct 2020 18:09:47 +0200 Subject: [PATCH] Do early reject of trigrams we can say up-front will be too large; saves loading their posting lists from disk in extreme cases. --- plocate.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/plocate.cpp b/plocate.cpp index 09cc834..d935c88 100644 --- a/plocate.cpp +++ b/plocate.cpp @@ -321,17 +321,26 @@ void do_search_file(const vector &needles, const char *filename) dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count()); vector> trigrams; + uint64_t shortest_so_far = numeric_limits::max(); for (const string &needle : needles) { if (needle.size() < 3) continue; for (size_t i = 0; i < needle.size() - 2; ++i) { uint32_t trgm = read_trigram(needle, i); - corpus.find_trigram(trgm, [trgm, &trigrams](const Trigram *trgmptr, size_t len) { + corpus.find_trigram(trgm, [trgm, &trigrams, &shortest_so_far](const Trigram *trgmptr, size_t len) { if (trgmptr == nullptr) { - dprintf("trigram %06x isn't found, we abort the search\n", trgm); + dprintf("trigram '%c%c%c' isn't found, we abort the search\n", + trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff); exit(0); } - trigrams.emplace_back(*trgmptr, len); + if (trgmptr->num_docids > shortest_so_far * 100) { + dprintf("not loading trigram '%c%c%c' with %u docids, it would be ignored later anyway\n", + trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff, + trgmptr->num_docids); + } else { + trigrams.emplace_back(*trgmptr, len); + shortest_so_far = std::min(shortest_so_far, trgmptr->num_docids); + } }); } } -- 2.39.2