From: Steinar H. Gunderson <steinar+git@gunderson.no>
Date: Sat, 10 Oct 2020 17:18:52 +0000 (+0200)
Subject: Use globs if there are wildcards in the pattern.
X-Git-Tag: 1.0.0~16
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=567546a0c6a930f240f0b76d21c9573d948f4b73;p=plocate

Use globs if there are wildcards in the pattern.

This matches mlocate behavior; even the sort-of strange behavior
of having them non-anchored. Case-insensitive matching has also
been changed away from regex, since fnmatch() is seemingly slightly
faster.
---

diff --git a/parse_trigrams.cpp b/parse_trigrams.cpp
index 03fbc6b..678f520 100644
--- a/parse_trigrams.cpp
+++ b/parse_trigrams.cpp
@@ -2,6 +2,7 @@
 
 #include "unique_sort.h"
 
+#include <assert.h>
 #include <string.h>
 #include <wctype.h>
 
@@ -79,19 +80,68 @@ string print_trigram(uint32_t trgm)
 	return str;
 }
 
-uint32_t read_unigram(const string &s, size_t idx)
+pair<uint32_t, size_t> read_unigram(const string &s, size_t start)
 {
-	if (idx < s.size()) {
-		return (unsigned char)s[idx];
-	} else {
-		return 0;
+	if (start >= s.size()) {
+		return { PREMATURE_END_UNIGRAM, 0 };
+	}
+	if (s[start] == '\\') {
+		// Escaped character.
+		if (start + 1 >= s.size()) {
+			return { PREMATURE_END_UNIGRAM, 1 };
+		} else {
+			return { (unsigned char)s[start + 1], 2 };
+		}
+	}
+	if (s[start] == '*' || s[start] == '?') {
+		// Wildcard.
+		return { WILDCARD_UNIGRAM, 1 };
+	}
+	if (s[start] == '[') {
+		// Character class; search to find the end.
+		size_t len = 1;
+		if (start + len >= s.size()) {
+			return { PREMATURE_END_UNIGRAM, len };
+		}
+		if (s[start + len] == '!') {
+			++len;
+		}
+		if (start + len >= s.size()) {
+			return { PREMATURE_END_UNIGRAM, len };
+		}
+		if (s[start + len] == ']') {
+			++len;
+		}
+		for (;;) {
+			if (start + len >= s.size()) {
+				return { PREMATURE_END_UNIGRAM, len };
+			}
+			if (s[start + len] == ']') {
+				return { WILDCARD_UNIGRAM, len + 1 };
+			}
+			++len;
+		}
 	}
+
+	// Regular letter.
+	return { (unsigned char)s[start], 1 };
 }
 
 uint32_t read_trigram(const string &s, size_t start)
 {
-	return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
-		(read_unigram(s, start + 2) << 16);
+	pair<uint32_t, size_t> u1 = read_unigram(s, start);
+	if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) {
+		return u1.first;
+	}
+	pair<uint32_t, size_t> u2 = read_unigram(s, start + u1.second);
+	if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) {
+		return u2.first;
+	}
+	pair<uint32_t, size_t> u3 = read_unigram(s, start + u1.second + u2.second);
+	if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) {
+		return u3.first;
+	}
+	return u1.first | (u2.first << 8) | (u3.first << 16);
 }
 
 struct TrigramState {
@@ -171,7 +221,7 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
 			need_another_pass = false;
 			vector<TrigramState> new_states;
 			for (const TrigramState &state : states) {
-				if (state.buffered.size() >= 3) {
+				if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) {
 					// No need to extend this further.
 					new_states.push_back(state);
 					continue;
@@ -183,7 +233,7 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
 				}
 				for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
 					TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
-					if (new_state.buffered.size() < 3) {
+					if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) {
 						need_another_pass = true;
 					}
 					new_states.push_back(move(new_state));
@@ -197,19 +247,29 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
 		// and the destination filename must contain at least one of them.
 		// Output those trigrams, cut out the first byte and then deduplicate
 		// the states before we continue.
+		bool any_wildcard = false;
 		vector<uint32_t> trigram_alternatives;
 		for (TrigramState &state : states) {
 			trigram_alternatives.push_back(read_trigram(state.buffered, 0));
-			state.buffered.erase(0, 1);
+			state.buffered.erase(0, read_unigram(state.buffered, 0).second);
+			assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM);
+			if (trigram_alternatives.back() == WILDCARD_UNIGRAM) {
+				// If any of the candidates are wildcards, we need to drop the entire OR group.
+				// (Most likely, all of them would be anyway.) We need to keep stripping out
+				// the first unigram from each state.
+				any_wildcard = true;
+			}
 		}
 		unique_sort(&trigram_alternatives);  // Could have duplicates, although it's rare.
 		unique_sort(&states);
 
-		TrigramDisjunction new_pt;
-		new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
-		new_pt.trigram_alternatives = move(trigram_alternatives);
-		new_pt.max_num_docids = 0;
-		trigram_groups->push_back(move(new_pt));
+		if (!any_wildcard) {
+			TrigramDisjunction new_pt;
+			new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
+			new_pt.trigram_alternatives = move(trigram_alternatives);
+			new_pt.max_num_docids = 0;
+			trigram_groups->push_back(move(new_pt));
+		}
 
 		if (states.size() > 100) {
 			// A completely crazy pattern with lots of those special characters.
@@ -229,14 +289,17 @@ void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjun
 	}
 
 	// The case-sensitive case is straightforward.
-	if (needle.size() >= 3) {
-		for (size_t i = 0; i < needle.size() - 2; ++i) {
-			uint32_t trgm = read_trigram(needle, i);
-			TrigramDisjunction new_pt;
-			new_pt.remaining_trigrams_to_read = 1;
-			new_pt.trigram_alternatives.push_back(trgm);
-			new_pt.max_num_docids = 0;
-			trigram_groups->push_back(move(new_pt));
+	for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
+		uint32_t trgm = read_trigram(needle, i);
+		if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) {
+			// Invalid trigram, so skip.
+			continue;
 		}
+
+		TrigramDisjunction new_pt;
+		new_pt.remaining_trigrams_to_read = 1;
+		new_pt.trigram_alternatives.push_back(trgm);
+		new_pt.max_num_docids = 0;
+		trigram_groups->push_back(move(new_pt));
 	}
 }
diff --git a/parse_trigrams.h b/parse_trigrams.h
index 810d005..9c8cdf2 100644
--- a/parse_trigrams.h
+++ b/parse_trigrams.h
@@ -53,6 +53,19 @@ struct TrigramDisjunction {
 // getting their own trigram).
 void parse_trigrams(const std::string &needle, bool ignore_case, std::vector<TrigramDisjunction> *trigram_groups);
 
+static constexpr uint32_t WILDCARD_UNIGRAM = 0xFF000000;
+static constexpr uint32_t PREMATURE_END_UNIGRAM = 0xFF000001;
+
+// Reads a unigram, taking into account escaping (\<foo> becomes <foo>).
+// Returns WILDCARD_UNIGRAM if there's an invalid unigram, ie., we found
+// a glob character (?, * or a [] group). Returns EOS_UNIGRAM if we went
+// past the end of the string, e.g., a string that ends in a backslash.
+// The second element is always the length.
+std::pair<uint32_t, size_t> read_unigram(const std::string &s, size_t start);
+
+// Reads a trigram, ie., three calls to read_unigram(). Needs to start on a valid unigram.
+// Returns WILDCARD_UNIGRAM or PREMATURE_END_UNIGRAM of either of those occurred
+// during reading of the string.
 uint32_t read_trigram(const std::string &s, size_t start);
 
 // For debugging.
diff --git a/plocate.cpp b/plocate.cpp
index 7a0067b..a7dd78a 100644
--- a/plocate.cpp
+++ b/plocate.cpp
@@ -7,6 +7,7 @@
 #include <assert.h>
 #include <chrono>
 #include <fcntl.h>
+#include <fnmatch.h>
 #include <functional>
 #include <getopt.h>
 #include <iosfwd>
@@ -89,7 +90,8 @@ void Serializer::release_current()
 
 struct Needle {
 	enum { STRSTR,
-	       REGEX } type;
+	       REGEX,  // Not currently used.
+	       GLOB } type;
 	string str;  // Filled in no matter what.
 	regex_t re;  // For REGEX.
 };
@@ -98,6 +100,9 @@ bool matches(const Needle &needle, const char *haystack)
 {
 	if (needle.type == Needle::STRSTR) {
 		return strstr(haystack, needle.str.c_str()) != nullptr;
+	} else if (needle.type == Needle::GLOB) {
+		int flags = ignore_case ? FNM_CASEFOLD : 0;
+		return fnmatch(needle.str.c_str(), haystack, flags) == 0;
 	} else {
 		assert(needle.type == Needle::REGEX);
 		return regexec(&needle.re, haystack, /*nmatch=*/0, /*pmatch=*/nullptr, /*flags=*/0) == 0;
@@ -547,44 +552,19 @@ void do_search_file(const vector<Needle> &needles, const char *filename)
 	}
 }
 
-regex_t needle_to_regex(const string &needle)
+string unescape_glob_to_plain_string(const string &needle)
 {
-	string escaped_needle;
-	for (char ch : needle) {
-		switch (ch) {
-		// Directly from what regex(7) considers an âatomâ.
-		case '^':
-		case '.':
-		case '[':
-		case '$':
-		case '(':
-		case ')':
-		case '|':
-		case '*':
-		case '+':
-		case '?':
-		case '{':
-		case '\\':
-			escaped_needle.push_back('\\');
-			// Fall through.
-		default:
-			escaped_needle.push_back(ch);
+	string unescaped;
+	for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
+		uint32_t ch = read_unigram(needle, i).first;
+		assert(ch != WILDCARD_UNIGRAM);
+		if (ch == PREMATURE_END_UNIGRAM) {
+			fprintf(stderr, "Pattern '%s' ended prematurely\n", needle.c_str());
+			exit(1);
 		}
+		unescaped.push_back(ch);
 	}
-	regex_t re;
-	int err;
-	if (ignore_case) {
-		err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB | REG_ICASE);
-	} else {
-		err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB);
-	}
-	if (err != 0) {
-		char errbuf[256];
-		regerror(err, &re, errbuf, sizeof(errbuf));
-		fprintf(stderr, "Error when compiling regex for '%s': %s\n", needle.c_str(), errbuf);
-		exit(1);
-	}
-	return re;
+	return unescaped;
 }
 
 void usage()
@@ -650,16 +630,28 @@ int main(int argc, char **argv)
 	for (int i = optind; i < argc; ++i) {
 		Needle needle;
 		needle.str = argv[i];
-		if (ignore_case) {
+
+		// See if there are any wildcard characters, which indicates we should treat it
+		// as an (anchored) glob.
+		bool any_wildcard = false;
+		for (size_t i = 0; i < needle.str.size(); i += read_unigram(needle.str, i).second) {
+			if (read_unigram(needle.str, i).first == WILDCARD_UNIGRAM) {
+				any_wildcard = true;
+				break;
+			}
+		}
+
+		if (any_wildcard) {
+			needle.type = Needle::GLOB;
+		} else if (ignore_case) {
 			// strcasestr() doesn't handle locales correctly (even though LSB
-			// claims it should), but somehow, the glibc regex engine does.
-			// It's much slower than strstr() for non-case-sensitive searches, though
-			// (even though it really ought to be faster, since it can precompile),
-			// so only use it for that.
-			needle.type = Needle::REGEX;
-			needle.re = needle_to_regex(argv[i]);
+			// claims it should), but somehow, fnmatch() does, and it's about
+			// the same speed as using a regex.
+			needle.type = Needle::GLOB;
+			needle.str = "*" + needle.str + "*";
 		} else {
 			needle.type = Needle::STRSTR;
+			needle.str = unescape_glob_to_plain_string(needle.str);
 		}
 		needles.push_back(move(needle));
 	}