#include "unique_sort.h"
+#include <assert.h>
#include <string.h>
#include <wctype.h>
return str;
}
-uint32_t read_unigram(const string &s, size_t idx)
+pair<uint32_t, size_t> read_unigram(const string &s, size_t start)
{
- if (idx < s.size()) {
- return (unsigned char)s[idx];
- } else {
- return 0;
+ if (start >= s.size()) {
+ return { PREMATURE_END_UNIGRAM, 0 };
+ }
+ if (s[start] == '\\') {
+ // Escaped character.
+ if (start + 1 >= s.size()) {
+ return { PREMATURE_END_UNIGRAM, 1 };
+ } else {
+ return { (unsigned char)s[start + 1], 2 };
+ }
+ }
+ if (s[start] == '*' || s[start] == '?') {
+ // Wildcard.
+ return { WILDCARD_UNIGRAM, 1 };
+ }
+ if (s[start] == '[') {
+ // Character class; search to find the end.
+ size_t len = 1;
+ if (start + len >= s.size()) {
+ return { PREMATURE_END_UNIGRAM, len };
+ }
+ if (s[start + len] == '!') {
+ ++len;
+ }
+ if (start + len >= s.size()) {
+ return { PREMATURE_END_UNIGRAM, len };
+ }
+ if (s[start + len] == ']') {
+ ++len;
+ }
+ for (;;) {
+ if (start + len >= s.size()) {
+ return { PREMATURE_END_UNIGRAM, len };
+ }
+ if (s[start + len] == ']') {
+ return { WILDCARD_UNIGRAM, len + 1 };
+ }
+ ++len;
+ }
}
+
+ // Regular letter.
+ return { (unsigned char)s[start], 1 };
}
uint32_t read_trigram(const string &s, size_t start)
{
- return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
- (read_unigram(s, start + 2) << 16);
+ pair<uint32_t, size_t> u1 = read_unigram(s, start);
+ if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) {
+ return u1.first;
+ }
+ pair<uint32_t, size_t> u2 = read_unigram(s, start + u1.second);
+ if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) {
+ return u2.first;
+ }
+ pair<uint32_t, size_t> u3 = read_unigram(s, start + u1.second + u2.second);
+ if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) {
+ return u3.first;
+ }
+ return u1.first | (u2.first << 8) | (u3.first << 16);
}
struct TrigramState {
need_another_pass = false;
vector<TrigramState> new_states;
for (const TrigramState &state : states) {
- if (state.buffered.size() >= 3) {
+ if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) {
// No need to extend this further.
new_states.push_back(state);
continue;
}
for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
- if (new_state.buffered.size() < 3) {
+ if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) {
need_another_pass = true;
}
new_states.push_back(move(new_state));
// and the destination filename must contain at least one of them.
// Output those trigrams, cut out the first byte and then deduplicate
// the states before we continue.
+ bool any_wildcard = false;
vector<uint32_t> trigram_alternatives;
for (TrigramState &state : states) {
trigram_alternatives.push_back(read_trigram(state.buffered, 0));
- state.buffered.erase(0, 1);
+ state.buffered.erase(0, read_unigram(state.buffered, 0).second);
+ assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM);
+ if (trigram_alternatives.back() == WILDCARD_UNIGRAM) {
+ // If any of the candidates are wildcards, we need to drop the entire OR group.
+ // (Most likely, all of them would be anyway.) We need to keep stripping out
+ // the first unigram from each state.
+ any_wildcard = true;
+ }
}
unique_sort(&trigram_alternatives); // Could have duplicates, although it's rare.
unique_sort(&states);
- TrigramDisjunction new_pt;
- new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
- new_pt.trigram_alternatives = move(trigram_alternatives);
- new_pt.max_num_docids = 0;
- trigram_groups->push_back(move(new_pt));
+ if (!any_wildcard) {
+ TrigramDisjunction new_pt;
+ new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
+ new_pt.trigram_alternatives = move(trigram_alternatives);
+ new_pt.max_num_docids = 0;
+ trigram_groups->push_back(move(new_pt));
+ }
if (states.size() > 100) {
// A completely crazy pattern with lots of those special characters.
}
// The case-sensitive case is straightforward.
- if (needle.size() >= 3) {
- for (size_t i = 0; i < needle.size() - 2; ++i) {
- uint32_t trgm = read_trigram(needle, i);
- TrigramDisjunction new_pt;
- new_pt.remaining_trigrams_to_read = 1;
- new_pt.trigram_alternatives.push_back(trgm);
- new_pt.max_num_docids = 0;
- trigram_groups->push_back(move(new_pt));
+ for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
+ uint32_t trgm = read_trigram(needle, i);
+ if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) {
+ // Invalid trigram, so skip.
+ continue;
}
+
+ TrigramDisjunction new_pt;
+ new_pt.remaining_trigrams_to_read = 1;
+ new_pt.trigram_alternatives.push_back(trgm);
+ new_pt.max_num_docids = 0;
+ trigram_groups->push_back(move(new_pt));
}
}