Use globs if there are wildcards in the pattern.

author Steinar H. Gunderson <steinar+git@gunderson.no>

Sat, 10 Oct 2020 17:18:52 +0000 (19:18 +0200)

committer Steinar H. Gunderson <steinar+git@gunderson.no>

Sat, 10 Oct 2020 17:20:51 +0000 (19:20 +0200)
author Steinar H. Gunderson <steinar+git@gunderson.no>
Sat, 10 Oct 2020 17:18:52 +0000 (19:18 +0200)
committer Steinar H. Gunderson <steinar+git@gunderson.no>
Sat, 10 Oct 2020 17:20:51 +0000 (19:20 +0200)
diff --git a/parse_trigrams.cpp b/parse_trigrams.cpp

index 03fbc6b7bd3120873afa427f038848a64da143ae..678f520fd03cc7c11c63b3cae7f77e712196c56f 100644 (file)
--- a/parse_trigrams.cpp
+++ b/parse_trigrams.cpp
@@ -2,6 +2,7 @@
  
  #include "unique_sort.h"
  
+#include <assert.h>
  #include <string.h>
  #include <wctype.h>
  
@@ -79,19 +80,68 @@ string print_trigram(uint32_t trgm)
         return str;
  }
  
-uint32_t read_unigram(const string &s, size_t idx)
+pair<uint32_t, size_t> read_unigram(const string &s, size_t start)
  {
-       if (idx < s.size()) {
-               return (unsigned char)s[idx];
-       } else {
-               return 0;
+       if (start >= s.size()) {
+               return { PREMATURE_END_UNIGRAM, 0 };
+       }
+       if (s[start] == '\\') {
+               // Escaped character.
+               if (start + 1 >= s.size()) {
+                       return { PREMATURE_END_UNIGRAM, 1 };
+               } else {
+                       return { (unsigned char)s[start + 1], 2 };
+               }
+       }
+       if (s[start] == '*' || s[start] == '?') {
+               // Wildcard.
+               return { WILDCARD_UNIGRAM, 1 };
+       }
+       if (s[start] == '[') {
+               // Character class; search to find the end.
+               size_t len = 1;
+               if (start + len >= s.size()) {
+                       return { PREMATURE_END_UNIGRAM, len };
+               }
+               if (s[start + len] == '!') {
+                       ++len;
+               }
+               if (start + len >= s.size()) {
+                       return { PREMATURE_END_UNIGRAM, len };
+               }
+               if (s[start + len] == ']') {
+                       ++len;
+               }
+               for (;;) {
+                       if (start + len >= s.size()) {
+                               return { PREMATURE_END_UNIGRAM, len };
+                       }
+                       if (s[start + len] == ']') {
+                               return { WILDCARD_UNIGRAM, len + 1 };
+                       }
+                       ++len;
+               }
         }
+
+       // Regular letter.
+       return { (unsigned char)s[start], 1 };
  }
  
  uint32_t read_trigram(const string &s, size_t start)
  {
-       return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
-               (read_unigram(s, start + 2) << 16);
+       pair<uint32_t, size_t> u1 = read_unigram(s, start);
+       if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) {
+               return u1.first;
+       }
+       pair<uint32_t, size_t> u2 = read_unigram(s, start + u1.second);
+       if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) {
+               return u2.first;
+       }
+       pair<uint32_t, size_t> u3 = read_unigram(s, start + u1.second + u2.second);
+       if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) {
+               return u3.first;
+       }
+       return u1.first | (u2.first << 8) | (u3.first << 16);
  }
  
  struct TrigramState {
@@ -171,7 +221,7 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
                         need_another_pass = false;
                         vector<TrigramState> new_states;
                         for (const TrigramState &state : states) {
-                               if (state.buffered.size() >= 3) {
+                               if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) {
                                         // No need to extend this further.
                                         new_states.push_back(state);
                                         continue;
@@ -183,7 +233,7 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
                                 }
                                 for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
                                         TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
-                                       if (new_state.buffered.size() < 3) {
+                                       if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) {
                                                 need_another_pass = true;
                                         }
                                         new_states.push_back(move(new_state));
@@ -197,19 +247,29 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
                 // and the destination filename must contain at least one of them.
                 // Output those trigrams, cut out the first byte and then deduplicate
                 // the states before we continue.
+               bool any_wildcard = false;
                 vector<uint32_t> trigram_alternatives;
                 for (TrigramState &state : states) {
                         trigram_alternatives.push_back(read_trigram(state.buffered, 0));
-                       state.buffered.erase(0, 1);
+                       state.buffered.erase(0, read_unigram(state.buffered, 0).second);
+                       assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM);
+                       if (trigram_alternatives.back() == WILDCARD_UNIGRAM) {
+                               // If any of the candidates are wildcards, we need to drop the entire OR group.
+                               // (Most likely, all of them would be anyway.) We need to keep stripping out
+                               // the first unigram from each state.
+                               any_wildcard = true;
+                       }
                 }
                 unique_sort(&trigram_alternatives);  // Could have duplicates, although it's rare.
                 unique_sort(&states);
  
-               TrigramDisjunction new_pt;
-               new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
-               new_pt.trigram_alternatives = move(trigram_alternatives);
-               new_pt.max_num_docids = 0;
-               trigram_groups->push_back(move(new_pt));
+               if (!any_wildcard) {
+                       TrigramDisjunction new_pt;
+                       new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
+                       new_pt.trigram_alternatives = move(trigram_alternatives);
+                       new_pt.max_num_docids = 0;
+                       trigram_groups->push_back(move(new_pt));
+               }
  
                 if (states.size() > 100) {
                         // A completely crazy pattern with lots of those special characters.
@@ -229,14 +289,17 @@ void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjun
         }
  
         // The case-sensitive case is straightforward.
-       if (needle.size() >= 3) {
-               for (size_t i = 0; i < needle.size() - 2; ++i) {
-                       uint32_t trgm = read_trigram(needle, i);
-                       TrigramDisjunction new_pt;
-                       new_pt.remaining_trigrams_to_read = 1;
-                       new_pt.trigram_alternatives.push_back(trgm);
-                       new_pt.max_num_docids = 0;
-                       trigram_groups->push_back(move(new_pt));
+       for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
+               uint32_t trgm = read_trigram(needle, i);
+               if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) {
+                       // Invalid trigram, so skip.
+                       continue;
                 }
+
+               TrigramDisjunction new_pt;
+               new_pt.remaining_trigrams_to_read = 1;
+               new_pt.trigram_alternatives.push_back(trgm);
+               new_pt.max_num_docids = 0;
+               trigram_groups->push_back(move(new_pt));
         }
  }
diff --git a/parse_trigrams.h b/parse_trigrams.h

index 810d005653fcf7ab3906f4ddc006670f3e99a4ec..9c8cdf2b62a7b80b565a5e3447fb7531260de3b2 100644 (file)
--- a/parse_trigrams.h
+++ b/parse_trigrams.h
@@ -53,6 +53,19 @@ struct TrigramDisjunction {
  // getting their own trigram).
  void parse_trigrams(const std::string &needle, bool ignore_case, std::vector<TrigramDisjunction> *trigram_groups);
  
+static constexpr uint32_t WILDCARD_UNIGRAM = 0xFF000000;
+static constexpr uint32_t PREMATURE_END_UNIGRAM = 0xFF000001;
+
+// Reads a unigram, taking into account escaping (\<foo> becomes <foo>).
+// Returns WILDCARD_UNIGRAM if there's an invalid unigram, ie., we found
+// a glob character (?, * or a [] group). Returns EOS_UNIGRAM if we went
+// past the end of the string, e.g., a string that ends in a backslash.
+// The second element is always the length.
+std::pair<uint32_t, size_t> read_unigram(const std::string &s, size_t start);
+
+// Reads a trigram, ie., three calls to read_unigram(). Needs to start on a valid unigram.
+// Returns WILDCARD_UNIGRAM or PREMATURE_END_UNIGRAM of either of those occurred
+// during reading of the string.
  uint32_t read_trigram(const std::string &s, size_t start);
  
  // For debugging.
diff --git a/plocate.cpp b/plocate.cpp

index 7a0067be5dfb0d36b936ca01456a6d7a88fb309f..a7dd78ab87614752264a81ece6e3b719401b696f 100644 (file)
--- a/plocate.cpp
+++ b/plocate.cpp
@@ -7,6 +7,7 @@
  #include <assert.h>
  #include <chrono>
  #include <fcntl.h>
+#include <fnmatch.h>
  #include <functional>
  #include <getopt.h>
  #include <iosfwd>
@@ -89,7 +90,8 @@ void Serializer::release_current()
  
  struct Needle {
         enum { STRSTR,
-              REGEX } type;
+              REGEX,  // Not currently used.
+              GLOB } type;
         string str;  // Filled in no matter what.
         regex_t re;  // For REGEX.
  };
@@ -98,6 +100,9 @@ bool matches(const Needle &needle, const char *haystack)
  {
         if (needle.type == Needle::STRSTR) {
                 return strstr(haystack, needle.str.c_str()) != nullptr;
+       } else if (needle.type == Needle::GLOB) {
+               int flags = ignore_case ? FNM_CASEFOLD : 0;
+               return fnmatch(needle.str.c_str(), haystack, flags) == 0;
         } else {
                 assert(needle.type == Needle::REGEX);
                 return regexec(&needle.re, haystack, /*nmatch=*/0, /*pmatch=*/nullptr, /*flags=*/0) == 0;
@@ -547,44 +552,19 @@ void do_search_file(const vector<Needle> &needles, const char *filename)
         }
  }
  
-regex_t needle_to_regex(const string &needle)
+string unescape_glob_to_plain_string(const string &needle)
  {
-       string escaped_needle;
-       for (char ch : needle) {
-               switch (ch) {
-               // Directly from what regex(7) considers an “atom”.
-               case '^':
-               case '.':
-               case '[':
-               case '$':
-               case '(':
-               case ')':
-               case '|':
-               case '*':
-               case '+':
-               case '?':
-               case '{':
-               case '\\':
-                       escaped_needle.push_back('\\');
-                       // Fall through.
-               default:
-                       escaped_needle.push_back(ch);
+       string unescaped;
+       for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
+               uint32_t ch = read_unigram(needle, i).first;
+               assert(ch != WILDCARD_UNIGRAM);
+               if (ch == PREMATURE_END_UNIGRAM) {
+                       fprintf(stderr, "Pattern '%s' ended prematurely\n", needle.c_str());
+                       exit(1);
                 }
+               unescaped.push_back(ch);
         }
-       regex_t re;
-       int err;
-       if (ignore_case) {
-               err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB | REG_ICASE);
-       } else {
-               err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB);
-       }
-       if (err != 0) {
-               char errbuf[256];
-               regerror(err, &re, errbuf, sizeof(errbuf));
-               fprintf(stderr, "Error when compiling regex for '%s': %s\n", needle.c_str(), errbuf);
-               exit(1);
-       }
-       return re;
+       return unescaped;
  }
  
  void usage()
@@ -650,16 +630,28 @@ int main(int argc, char **argv)
         for (int i = optind; i < argc; ++i) {
                 Needle needle;
                 needle.str = argv[i];
-               if (ignore_case) {
+
+               // See if there are any wildcard characters, which indicates we should treat it
+               // as an (anchored) glob.
+               bool any_wildcard = false;
+               for (size_t i = 0; i < needle.str.size(); i += read_unigram(needle.str, i).second) {
+                       if (read_unigram(needle.str, i).first == WILDCARD_UNIGRAM) {
+                               any_wildcard = true;
+                               break;
+                       }
+               }
+
+               if (any_wildcard) {
+                       needle.type = Needle::GLOB;
+               } else if (ignore_case) {
                         // strcasestr() doesn't handle locales correctly (even though LSB
-                       // claims it should), but somehow, the glibc regex engine does.
-                       // It's much slower than strstr() for non-case-sensitive searches, though
-                       // (even though it really ought to be faster, since it can precompile),
-                       // so only use it for that.
-                       needle.type = Needle::REGEX;
-                       needle.re = needle_to_regex(argv[i]);
+                       // claims it should), but somehow, fnmatch() does, and it's about
+                       // the same speed as using a regex.
+                       needle.type = Needle::GLOB;
+                       needle.str = "*" + needle.str + "*";
                 } else {
                         needle.type = Needle::STRSTR;
+                       needle.str = unescape_glob_to_plain_string(needle.str);
                 }
                 needles.push_back(move(needle));
         }
author	Steinar H. Gunderson <steinar+git@gunderson.no>
	Sat, 10 Oct 2020 17:18:52 +0000 (19:18 +0200)
committer	Steinar H. Gunderson <steinar+git@gunderson.no>
	Sat, 10 Oct 2020 17:20:51 +0000 (19:20 +0200)
parse_trigrams.cpp		patch \| blob \| history
parse_trigrams.h		patch \| blob \| history
plocate.cpp		patch \| blob \| history