X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=parse_trigrams.cpp;h=9e72aac9bcfbb71f085b3553d81ef80278c058d3;hb=fd6198891d6fd9642effc0843fef6f23b991af3e;hp=03fbc6b7bd3120873afa427f038848a64da143ae;hpb=efd7545c8ee2177aa13cf3ec8423d0e725c6a16d;p=plocate diff --git a/parse_trigrams.cpp b/parse_trigrams.cpp index 03fbc6b..9e72aac 100644 --- a/parse_trigrams.cpp +++ b/parse_trigrams.cpp @@ -2,6 +2,8 @@ #include "unique_sort.h" +#include +#include #include #include @@ -79,19 +81,68 @@ string print_trigram(uint32_t trgm) return str; } -uint32_t read_unigram(const string &s, size_t idx) +pair read_unigram(const string &s, size_t start) { - if (idx < s.size()) { - return (unsigned char)s[idx]; - } else { - return 0; + if (start >= s.size()) { + return { PREMATURE_END_UNIGRAM, 0 }; + } + if (s[start] == '\\') { + // Escaped character. + if (start + 1 >= s.size()) { + return { PREMATURE_END_UNIGRAM, 1 }; + } else { + return { (unsigned char)s[start + 1], 2 }; + } + } + if (s[start] == '*' || s[start] == '?') { + // Wildcard. + return { WILDCARD_UNIGRAM, 1 }; + } + if (s[start] == '[') { + // Character class; search to find the end. + size_t len = 1; + if (start + len >= s.size()) { + return { PREMATURE_END_UNIGRAM, len }; + } + if (s[start + len] == '!') { + ++len; + } + if (start + len >= s.size()) { + return { PREMATURE_END_UNIGRAM, len }; + } + if (s[start + len] == ']') { + ++len; + } + for (;;) { + if (start + len >= s.size()) { + return { PREMATURE_END_UNIGRAM, len }; + } + if (s[start + len] == ']') { + return { WILDCARD_UNIGRAM, len + 1 }; + } + ++len; + } } + + // Regular letter. + return { (unsigned char)s[start], 1 }; } uint32_t read_trigram(const string &s, size_t start) { - return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) | - (read_unigram(s, start + 2) << 16); + pair u1 = read_unigram(s, start); + if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) { + return u1.first; + } + pair u2 = read_unigram(s, start + u1.second); + if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) { + return u2.first; + } + pair u3 = read_unigram(s, start + u1.second + u2.second); + if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) { + return u3.first; + } + return u1.first | (u2.first << 8) | (u3.first << 16); } struct TrigramState { @@ -121,6 +172,7 @@ void parse_trigrams_ignore_case(const string &needle, vector // involving ICU or the likes. mbtowc(nullptr, 0, 0); const char *ptr = needle.c_str(); + unique_ptr buf(new char[MB_CUR_MAX]); while (*ptr != '\0') { wchar_t ch; int ret = mbtowc(&ch, ptr, strlen(ptr)); @@ -129,17 +181,16 @@ void parse_trigrams_ignore_case(const string &needle, vector exit(1); } - char buf[MB_CUR_MAX]; vector alt; alt.push_back(string(ptr, ret)); ptr += ret; if (towlower(ch) != wint_t(ch)) { - ret = wctomb(buf, towlower(ch)); - alt.push_back(string(buf, ret)); + ret = wctomb(buf.get(), towlower(ch)); + alt.push_back(string(buf.get(), ret)); } if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) { - ret = wctomb(buf, towupper(ch)); - alt.push_back(string(buf, ret)); + ret = wctomb(buf.get(), towupper(ch)); + alt.push_back(string(buf.get(), ret)); } alternatives_for_cp.push_back(move(alt)); } @@ -171,7 +222,7 @@ void parse_trigrams_ignore_case(const string &needle, vector need_another_pass = false; vector new_states; for (const TrigramState &state : states) { - if (state.buffered.size() >= 3) { + if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) { // No need to extend this further. new_states.push_back(state); continue; @@ -183,7 +234,7 @@ void parse_trigrams_ignore_case(const string &needle, vector } for (const string &rune : alternatives_for_cp[state.next_codepoint]) { TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 }; - if (new_state.buffered.size() < 3) { + if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) { need_another_pass = true; } new_states.push_back(move(new_state)); @@ -197,19 +248,29 @@ void parse_trigrams_ignore_case(const string &needle, vector // and the destination filename must contain at least one of them. // Output those trigrams, cut out the first byte and then deduplicate // the states before we continue. + bool any_wildcard = false; vector trigram_alternatives; for (TrigramState &state : states) { trigram_alternatives.push_back(read_trigram(state.buffered, 0)); - state.buffered.erase(0, 1); + state.buffered.erase(0, read_unigram(state.buffered, 0).second); + assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM); + if (trigram_alternatives.back() == WILDCARD_UNIGRAM) { + // If any of the candidates are wildcards, we need to drop the entire OR group. + // (Most likely, all of them would be anyway.) We need to keep stripping out + // the first unigram from each state. + any_wildcard = true; + } } unique_sort(&trigram_alternatives); // Could have duplicates, although it's rare. unique_sort(&states); - TrigramDisjunction new_pt; - new_pt.remaining_trigrams_to_read = trigram_alternatives.size(); - new_pt.trigram_alternatives = move(trigram_alternatives); - new_pt.max_num_docids = 0; - trigram_groups->push_back(move(new_pt)); + if (!any_wildcard) { + TrigramDisjunction new_pt; + new_pt.remaining_trigrams_to_read = trigram_alternatives.size(); + new_pt.trigram_alternatives = move(trigram_alternatives); + new_pt.max_num_docids = 0; + trigram_groups->push_back(move(new_pt)); + } if (states.size() > 100) { // A completely crazy pattern with lots of those special characters. @@ -229,14 +290,17 @@ void parse_trigrams(const string &needle, bool ignore_case, vector= 3) { - for (size_t i = 0; i < needle.size() - 2; ++i) { - uint32_t trgm = read_trigram(needle, i); - TrigramDisjunction new_pt; - new_pt.remaining_trigrams_to_read = 1; - new_pt.trigram_alternatives.push_back(trgm); - new_pt.max_num_docids = 0; - trigram_groups->push_back(move(new_pt)); + for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) { + uint32_t trgm = read_trigram(needle, i); + if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) { + // Invalid trigram, so skip. + continue; } + + TrigramDisjunction new_pt; + new_pt.remaining_trigrams_to_read = 1; + new_pt.trigram_alternatives.push_back(trgm); + new_pt.max_num_docids = 0; + trigram_groups->push_back(move(new_pt)); } }