From: Steinar H. Gunderson Date: Sat, 10 Oct 2020 17:18:52 +0000 (+0200) Subject: Use globs if there are wildcards in the pattern. X-Git-Tag: 1.0.0~16 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=567546a0c6a930f240f0b76d21c9573d948f4b73;p=plocate Use globs if there are wildcards in the pattern. This matches mlocate behavior; even the sort-of strange behavior of having them non-anchored. Case-insensitive matching has also been changed away from regex, since fnmatch() is seemingly slightly faster. --- diff --git a/parse_trigrams.cpp b/parse_trigrams.cpp index 03fbc6b..678f520 100644 --- a/parse_trigrams.cpp +++ b/parse_trigrams.cpp @@ -2,6 +2,7 @@ #include "unique_sort.h" +#include #include #include @@ -79,19 +80,68 @@ string print_trigram(uint32_t trgm) return str; } -uint32_t read_unigram(const string &s, size_t idx) +pair read_unigram(const string &s, size_t start) { - if (idx < s.size()) { - return (unsigned char)s[idx]; - } else { - return 0; + if (start >= s.size()) { + return { PREMATURE_END_UNIGRAM, 0 }; + } + if (s[start] == '\\') { + // Escaped character. + if (start + 1 >= s.size()) { + return { PREMATURE_END_UNIGRAM, 1 }; + } else { + return { (unsigned char)s[start + 1], 2 }; + } + } + if (s[start] == '*' || s[start] == '?') { + // Wildcard. + return { WILDCARD_UNIGRAM, 1 }; + } + if (s[start] == '[') { + // Character class; search to find the end. + size_t len = 1; + if (start + len >= s.size()) { + return { PREMATURE_END_UNIGRAM, len }; + } + if (s[start + len] == '!') { + ++len; + } + if (start + len >= s.size()) { + return { PREMATURE_END_UNIGRAM, len }; + } + if (s[start + len] == ']') { + ++len; + } + for (;;) { + if (start + len >= s.size()) { + return { PREMATURE_END_UNIGRAM, len }; + } + if (s[start + len] == ']') { + return { WILDCARD_UNIGRAM, len + 1 }; + } + ++len; + } } + + // Regular letter. 
+ return { (unsigned char)s[start], 1 }; } uint32_t read_trigram(const string &s, size_t start) { - return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) | - (read_unigram(s, start + 2) << 16); + pair u1 = read_unigram(s, start); + if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) { + return u1.first; + } + pair u2 = read_unigram(s, start + u1.second); + if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) { + return u2.first; + } + pair u3 = read_unigram(s, start + u1.second + u2.second); + if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) { + return u3.first; + } + return u1.first | (u2.first << 8) | (u3.first << 16); } struct TrigramState { @@ -171,7 +221,7 @@ void parse_trigrams_ignore_case(const string &needle, vector need_another_pass = false; vector new_states; for (const TrigramState &state : states) { - if (state.buffered.size() >= 3) { + if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) { // No need to extend this further. new_states.push_back(state); continue; @@ -183,7 +233,7 @@ void parse_trigrams_ignore_case(const string &needle, vector } for (const string &rune : alternatives_for_cp[state.next_codepoint]) { TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 }; - if (new_state.buffered.size() < 3) { + if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) { need_another_pass = true; } new_states.push_back(move(new_state)); @@ -197,19 +247,29 @@ void parse_trigrams_ignore_case(const string &needle, vector // and the destination filename must contain at least one of them. // Output those trigrams, cut out the first byte and then deduplicate // the states before we continue. 
+ bool any_wildcard = false; vector trigram_alternatives; for (TrigramState &state : states) { trigram_alternatives.push_back(read_trigram(state.buffered, 0)); - state.buffered.erase(0, 1); + state.buffered.erase(0, read_unigram(state.buffered, 0).second); + assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM); + if (trigram_alternatives.back() == WILDCARD_UNIGRAM) { + // If any of the candidates are wildcards, we need to drop the entire OR group. + // (Most likely, all of them would be anyway.) We need to keep stripping out + // the first unigram from each state. + any_wildcard = true; + } } unique_sort(&trigram_alternatives); // Could have duplicates, although it's rare. unique_sort(&states); - TrigramDisjunction new_pt; - new_pt.remaining_trigrams_to_read = trigram_alternatives.size(); - new_pt.trigram_alternatives = move(trigram_alternatives); - new_pt.max_num_docids = 0; - trigram_groups->push_back(move(new_pt)); + if (!any_wildcard) { + TrigramDisjunction new_pt; + new_pt.remaining_trigrams_to_read = trigram_alternatives.size(); + new_pt.trigram_alternatives = move(trigram_alternatives); + new_pt.max_num_docids = 0; + trigram_groups->push_back(move(new_pt)); + } if (states.size() > 100) { // A completely crazy pattern with lots of those special characters. @@ -229,14 +289,17 @@ void parse_trigrams(const string &needle, bool ignore_case, vector= 3) { - for (size_t i = 0; i < needle.size() - 2; ++i) { - uint32_t trgm = read_trigram(needle, i); - TrigramDisjunction new_pt; - new_pt.remaining_trigrams_to_read = 1; - new_pt.trigram_alternatives.push_back(trgm); - new_pt.max_num_docids = 0; - trigram_groups->push_back(move(new_pt)); + for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) { + uint32_t trgm = read_trigram(needle, i); + if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) { + // Invalid trigram, so skip. 
+ continue; } + + TrigramDisjunction new_pt; + new_pt.remaining_trigrams_to_read = 1; + new_pt.trigram_alternatives.push_back(trgm); + new_pt.max_num_docids = 0; + trigram_groups->push_back(move(new_pt)); } } diff --git a/parse_trigrams.h b/parse_trigrams.h index 810d005..9c8cdf2 100644 --- a/parse_trigrams.h +++ b/parse_trigrams.h @@ -53,6 +53,19 @@ struct TrigramDisjunction { // getting their own trigram). void parse_trigrams(const std::string &needle, bool ignore_case, std::vector *trigram_groups); +static constexpr uint32_t WILDCARD_UNIGRAM = 0xFF000000; +static constexpr uint32_t PREMATURE_END_UNIGRAM = 0xFF000001; + +// Reads a unigram, taking into account escaping (\x becomes x). +// Returns WILDCARD_UNIGRAM if there's an invalid unigram, i.e., we found +// a glob character (?, * or a [] group). Returns PREMATURE_END_UNIGRAM if we went +// past the end of the string, e.g., a string that ends in a backslash. +// The second element is always the length. +std::pair read_unigram(const std::string &s, size_t start); + +// Reads a trigram, i.e., three calls to read_unigram(). Needs to start on a valid unigram. +// Returns WILDCARD_UNIGRAM or PREMATURE_END_UNIGRAM if either of those occurred +// during reading of the string. uint32_t read_trigram(const std::string &s, size_t start); // For debugging. diff --git a/plocate.cpp b/plocate.cpp index 7a0067b..a7dd78a 100644 --- a/plocate.cpp +++ b/plocate.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -89,7 +90,8 @@ void Serializer::release_current() struct Needle { enum { STRSTR, - REGEX } type; + REGEX, // Not currently used. + GLOB } type; string str; // Filled in no matter what. regex_t re; // For REGEX. }; @@ -98,6 +100,9 @@ bool matches(const Needle &needle, const char *haystack) { if (needle.type == Needle::STRSTR) { return strstr(haystack, needle.str.c_str()) != nullptr; + } else if (needle.type == Needle::GLOB) { + int flags = ignore_case ? 
FNM_CASEFOLD : 0; + return fnmatch(needle.str.c_str(), haystack, flags) == 0; } else { assert(needle.type == Needle::REGEX); return regexec(&needle.re, haystack, /*nmatch=*/0, /*pmatch=*/nullptr, /*flags=*/0) == 0; @@ -547,44 +552,19 @@ void do_search_file(const vector &needles, const char *filename) } } -regex_t needle_to_regex(const string &needle) +string unescape_glob_to_plain_string(const string &needle) { - string escaped_needle; - for (char ch : needle) { - switch (ch) { - // Directly from what regex(7) considers an “atom”. - case '^': - case '.': - case '[': - case '$': - case '(': - case ')': - case '|': - case '*': - case '+': - case '?': - case '{': - case '\\': - escaped_needle.push_back('\\'); - // Fall through. - default: - escaped_needle.push_back(ch); + string unescaped; + for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) { + uint32_t ch = read_unigram(needle, i).first; + assert(ch != WILDCARD_UNIGRAM); + if (ch == PREMATURE_END_UNIGRAM) { + fprintf(stderr, "Pattern '%s' ended prematurely\n", needle.c_str()); + exit(1); } + unescaped.push_back(ch); } - regex_t re; - int err; - if (ignore_case) { - err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB | REG_ICASE); - } else { - err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB); - } - if (err != 0) { - char errbuf[256]; - regerror(err, &re, errbuf, sizeof(errbuf)); - fprintf(stderr, "Error when compiling regex for '%s': %s\n", needle.c_str(), errbuf); - exit(1); - } - return re; + return unescaped; } void usage() @@ -650,16 +630,28 @@ int main(int argc, char **argv) for (int i = optind; i < argc; ++i) { Needle needle; needle.str = argv[i]; - if (ignore_case) { + + // See if there are any wildcard characters, which indicates we should treat it + // as an (anchored) glob. 
+ bool any_wildcard = false; + for (size_t i = 0; i < needle.str.size(); i += read_unigram(needle.str, i).second) { + if (read_unigram(needle.str, i).first == WILDCARD_UNIGRAM) { + any_wildcard = true; + break; + } + } + + if (any_wildcard) { + needle.type = Needle::GLOB; + } else if (ignore_case) { // strcasestr() doesn't handle locales correctly (even though LSB - // claims it should), but somehow, the glibc regex engine does. - // It's much slower than strstr() for non-case-sensitive searches, though - // (even though it really ought to be faster, since it can precompile), - // so only use it for that. - needle.type = Needle::REGEX; - needle.re = needle_to_regex(argv[i]); + // claims it should), but somehow, fnmatch() does, and it's about + // the same speed as using a regex. + needle.type = Needle::GLOB; + needle.str = "*" + needle.str + "*"; } else { needle.type = Needle::STRSTR; + needle.str = unescape_glob_to_plain_string(needle.str); } needles.push_back(move(needle)); }