From: Steinar H. Gunderson Date: Mon, 28 Sep 2020 22:27:06 +0000 (+0200) Subject: Support patterns shorter than 3 bytes. X-Git-Tag: 1.0.0~90 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=1263477bf305a675ea45dfe0440cb1968256cae7;p=plocate Support patterns shorter than 3 bytes. This isn't very efficient, but it appears to still be slightly better than mlocate. --- diff --git a/plocate.cpp b/plocate.cpp index 138eb36..3d30adb 100644 --- a/plocate.cpp +++ b/plocate.cpp @@ -74,6 +74,7 @@ public: const unsigned char * get_compressed_posting_list(const Trigram *trigram) const; string_view get_compressed_filename_block(uint32_t docid) const; + size_t get_num_filename_blocks() const; private: const int fd; @@ -137,6 +138,15 @@ string_view Corpus::get_compressed_filename_block(uint32_t docid) const return { compressed, compressed_size }; } +size_t Corpus::get_num_filename_blocks() const +{ + // The beginning of the filename blocks is the end of the filename index blocks. + const uint64_t *filename_offsets_end = (const uint64_t *)(data + filename_offsets[0]); + + // Subtract the sentinel block. + return filename_offsets_end - filename_offsets - 1; +} + size_t scan_docid(const string &needle, uint32_t docid, const Corpus &corpus, unordered_map *access_rx_cache) { @@ -187,6 +197,19 @@ void do_search_file(const string &needle, const char *filename) Corpus corpus(fd); + if (needle.size() < 3) { + // Too short for trigram matching. Apply brute force. + // (We could have searched through all trigrams that matched + // the pattern and done a union of them, but that's a lot of + // work for fairly unclear gain.) + unordered_map access_rx_cache; + uint32_t num_blocks = corpus.get_num_filename_blocks(); + for (uint32_t docid = 0; docid < num_blocks; ++docid) { + scan_docid(needle, docid, corpus, &access_rx_cache); + } + return; + } + vector trigrams; for (size_t i = 0; i < needle.size() - 2; ++i) { uint32_t trgm = read_trigram(needle, i);