]> git.sesse.net Git - plocate/blobdiff - parse_trigrams.cpp
Release plocate 1.1.22.
[plocate] / parse_trigrams.cpp
index 03fbc6b7bd3120873afa427f038848a64da143ae..9e72aac9bcfbb71f085b3553d81ef80278c058d3 100644 (file)
@@ -2,6 +2,8 @@
 
 #include "unique_sort.h"
 
+#include <assert.h>
+#include <memory>
 #include <string.h>
 #include <wctype.h>
 
@@ -79,19 +81,68 @@ string print_trigram(uint32_t trgm)
        return str;
 }
 
-uint32_t read_unigram(const string &s, size_t idx)
+pair<uint32_t, size_t> read_unigram(const string &s, size_t start)
 {
-       if (idx < s.size()) {
-               return (unsigned char)s[idx];
-       } else {
-               return 0;
+       if (start >= s.size()) {
+               return { PREMATURE_END_UNIGRAM, 0 };
+       }
+       if (s[start] == '\\') {
+               // Escaped character.
+               if (start + 1 >= s.size()) {
+                       return { PREMATURE_END_UNIGRAM, 1 };
+               } else {
+                       return { (unsigned char)s[start + 1], 2 };
+               }
+       }
+       if (s[start] == '*' || s[start] == '?') {
+               // Wildcard.
+               return { WILDCARD_UNIGRAM, 1 };
+       }
+       if (s[start] == '[') {
+               // Character class; search to find the end.
+               size_t len = 1;
+               if (start + len >= s.size()) {
+                       return { PREMATURE_END_UNIGRAM, len };
+               }
+               if (s[start + len] == '!') {
+                       ++len;
+               }
+               if (start + len >= s.size()) {
+                       return { PREMATURE_END_UNIGRAM, len };
+               }
+               if (s[start + len] == ']') {
+                       ++len;
+               }
+               for (;;) {
+                       if (start + len >= s.size()) {
+                               return { PREMATURE_END_UNIGRAM, len };
+                       }
+                       if (s[start + len] == ']') {
+                               return { WILDCARD_UNIGRAM, len + 1 };
+                       }
+                       ++len;
+               }
        }
+
+       // Regular letter.
+       return { (unsigned char)s[start], 1 };
 }
 
 uint32_t read_trigram(const string &s, size_t start)
 {
-       return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
-               (read_unigram(s, start + 2) << 16);
+       pair<uint32_t, size_t> u1 = read_unigram(s, start);
+       if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) {
+               return u1.first;
+       }
+       pair<uint32_t, size_t> u2 = read_unigram(s, start + u1.second);
+       if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) {
+               return u2.first;
+       }
+       pair<uint32_t, size_t> u3 = read_unigram(s, start + u1.second + u2.second);
+       if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) {
+               return u3.first;
+       }
+       return u1.first | (u2.first << 8) | (u3.first << 16);
 }
 
 struct TrigramState {
@@ -121,6 +172,7 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
        // involving ICU or the likes.
        mbtowc(nullptr, 0, 0);
        const char *ptr = needle.c_str();
+       unique_ptr<char[]> buf(new char[MB_CUR_MAX]);
        while (*ptr != '\0') {
                wchar_t ch;
                int ret = mbtowc(&ch, ptr, strlen(ptr));
@@ -129,17 +181,16 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
                        exit(1);
                }
 
-               char buf[MB_CUR_MAX];
                vector<string> alt;
                alt.push_back(string(ptr, ret));
                ptr += ret;
                if (towlower(ch) != wint_t(ch)) {
-                       ret = wctomb(buf, towlower(ch));
-                       alt.push_back(string(buf, ret));
+                       ret = wctomb(buf.get(), towlower(ch));
+                       alt.push_back(string(buf.get(), ret));
                }
                if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
-                       ret = wctomb(buf, towupper(ch));
-                       alt.push_back(string(buf, ret));
+                       ret = wctomb(buf.get(), towupper(ch));
+                       alt.push_back(string(buf.get(), ret));
                }
                alternatives_for_cp.push_back(move(alt));
        }
@@ -171,7 +222,7 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
                        need_another_pass = false;
                        vector<TrigramState> new_states;
                        for (const TrigramState &state : states) {
-                               if (state.buffered.size() >= 3) {
+                               if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) {
                                        // No need to extend this further.
                                        new_states.push_back(state);
                                        continue;
@@ -183,7 +234,7 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
                                }
                                for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
                                        TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
-                                       if (new_state.buffered.size() < 3) {
+                                       if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) {
                                                need_another_pass = true;
                                        }
                                        new_states.push_back(move(new_state));
@@ -197,19 +248,29 @@ void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction>
                // and the destination filename must contain at least one of them.
                // Output those trigrams, cut out the first byte and then deduplicate
                // the states before we continue.
+               bool any_wildcard = false;
                vector<uint32_t> trigram_alternatives;
                for (TrigramState &state : states) {
                        trigram_alternatives.push_back(read_trigram(state.buffered, 0));
-                       state.buffered.erase(0, 1);
+                       state.buffered.erase(0, read_unigram(state.buffered, 0).second);
+                       assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM);
+                       if (trigram_alternatives.back() == WILDCARD_UNIGRAM) {
+                               // If any of the candidates are wildcards, we need to drop the entire OR group.
+                               // (Most likely, all of them would be anyway.) We need to keep stripping out
+                               // the first unigram from each state.
+                               any_wildcard = true;
+                       }
                }
                unique_sort(&trigram_alternatives);  // Could have duplicates, although it's rare.
                unique_sort(&states);
 
-               TrigramDisjunction new_pt;
-               new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
-               new_pt.trigram_alternatives = move(trigram_alternatives);
-               new_pt.max_num_docids = 0;
-               trigram_groups->push_back(move(new_pt));
+               if (!any_wildcard) {
+                       TrigramDisjunction new_pt;
+                       new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
+                       new_pt.trigram_alternatives = move(trigram_alternatives);
+                       new_pt.max_num_docids = 0;
+                       trigram_groups->push_back(move(new_pt));
+               }
 
                if (states.size() > 100) {
                        // A completely crazy pattern with lots of those special characters.
@@ -229,14 +290,17 @@ void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjun
        }
 
        // The case-sensitive case is straightforward.
-       if (needle.size() >= 3) {
-               for (size_t i = 0; i < needle.size() - 2; ++i) {
-                       uint32_t trgm = read_trigram(needle, i);
-                       TrigramDisjunction new_pt;
-                       new_pt.remaining_trigrams_to_read = 1;
-                       new_pt.trigram_alternatives.push_back(trgm);
-                       new_pt.max_num_docids = 0;
-                       trigram_groups->push_back(move(new_pt));
+       for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
+               uint32_t trgm = read_trigram(needle, i);
+               if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) {
+                       // Invalid trigram, so skip.
+                       continue;
                }
+
+               TrigramDisjunction new_pt;
+               new_pt.remaining_trigrams_to_read = 1;
+               new_pt.trigram_alternatives.push_back(trgm);
+               new_pt.max_num_docids = 0;
+               trigram_groups->push_back(move(new_pt));
        }
 }