From 1263477bf305a675ea45dfe0440cb1968256cae7 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Tue, 29 Sep 2020 00:27:06 +0200 Subject: [PATCH] Support patterns shorter than 3 bytes. This isn't very efficient, but it appears to still be slightly better than mlocate. --- plocate.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/plocate.cpp b/plocate.cpp index 138eb36..3d30adb 100644 --- a/plocate.cpp +++ b/plocate.cpp @@ -74,6 +74,7 @@ public: const unsigned char * get_compressed_posting_list(const Trigram *trigram) const; string_view get_compressed_filename_block(uint32_t docid) const; + size_t get_num_filename_blocks() const; private: const int fd; @@ -137,6 +138,15 @@ string_view Corpus::get_compressed_filename_block(uint32_t docid) const return { compressed, compressed_size }; } +size_t Corpus::get_num_filename_blocks() const +{ + // The beginning of the filename blocks is the end of the filename index blocks. + const uint64_t *filename_offsets_end = (const uint64_t *)(data + filename_offsets[0]); + + // Subtract the sentinel block. + return filename_offsets_end - filename_offsets - 1; +} + size_t scan_docid(const string &needle, uint32_t docid, const Corpus &corpus, unordered_map *access_rx_cache) { @@ -187,6 +197,19 @@ void do_search_file(const string &needle, const char *filename) Corpus corpus(fd); + if (needle.size() < 3) { + // Too short for trigram matching. Apply brute force. + // (We could have searched through all trigrams that matched + // the pattern and done a union of them, but that's a lot of + // work for fairly unclear gain.) + unordered_map access_rx_cache; + uint32_t num_blocks = corpus.get_num_filename_blocks(); + for (uint32_t docid = 0; docid < num_blocks; ++docid) { + scan_docid(needle, docid, corpus, &access_rx_cache); + } + return; + } + vector trigrams; for (size_t i = 0; i < needle.size() - 2; ++i) { uint32_t trgm = read_trigram(needle, i); -- 2.39.2