1 #include "parse_trigrams.h"
3 #include "unique_sort.h"
10 string print_td(const TrigramDisjunction &td)
12 if (td.read_trigrams.size() == 0) {
13 // Before we've done hash lookups (or none matched), so print all alternatives.
14 if (td.trigram_alternatives.size() == 1) {
15 return print_trigram(td.trigram_alternatives[0]);
20 for (uint32_t trgm : td.trigram_alternatives) {
23 ret += print_trigram(trgm);
29 // Print only those that we actually have in the index.
30 if (td.read_trigrams.size() == 1) {
31 return print_trigram(td.read_trigrams[0].first.trgm);
36 for (auto &[trgmptr, len] : td.read_trigrams) {
39 ret += print_trigram(trgmptr.trgm);
47 string print_trigram(uint32_t trgm)
50 char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
54 for (unsigned i = 0; i < 3;) {
59 } else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) { // Holds no matter whether char is signed or unsigned.
63 // See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
64 mbtowc(nullptr, 0, 0);
66 int ret = mbtowc(&pwc, ch + i, 3 - i);
67 if (ret >= 1 && pwc >= 32) {
68 str.append(ch + i, ret);
72 snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
82 uint32_t read_unigram(const string &s, size_t idx)
85 return (unsigned char)s[idx];
91 uint32_t read_trigram(const string &s, size_t start)
93 return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
94 (read_unigram(s, start + 2) << 16);
99 unsigned next_codepoint;
101 bool operator<(const TrigramState &other) const
103 if (next_codepoint != other.next_codepoint)
104 return next_codepoint < other.next_codepoint;
105 return buffered < other.buffered;
107 bool operator==(const TrigramState &other) const
109 return next_codepoint == other.next_codepoint &&
110 buffered == other.buffered;
114 void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction> *trigram_groups)
116 vector<vector<string>> alternatives_for_cp;
118 // Parse the needle into Unicode code points, and do inverse case folding
119 // on each to find legal alternatives. This is far from perfect (e.g. ß
120 // will not become ss), but it's generally the best we can do without
121 // involving ICU or the likes.
122 mbtowc(nullptr, 0, 0);
123 const char *ptr = needle.c_str();
124 while (*ptr != '\0') {
126 int ret = mbtowc(&ch, ptr, strlen(ptr));
132 char buf[MB_CUR_MAX];
134 alt.push_back(string(ptr, ret));
136 if (towlower(ch) != wint_t(ch)) {
137 ret = wctomb(buf, towlower(ch));
138 alt.push_back(string(buf, ret));
140 if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
141 ret = wctomb(buf, towupper(ch));
142 alt.push_back(string(buf, ret));
144 alternatives_for_cp.push_back(move(alt));
147 // Now generate all possible byte strings from those code points in order;
148 // e.g., from abc, we'd create a and A, then extend those to ab aB Ab AB,
149 // then abc abC aBc aBC and so on. Since we don't want to have 2^n
150 // (or even 3^n) strings, we only extend them far enough to cover at
151 // least three bytes; this will give us a set of candidate trigrams
152 // (the filename must have at least one of those), and then we can
153 // chop off the first byte, deduplicate states and continue extending
154 // and generating trigram sets.
156 // There are a few special cases, notably the dotted i (İ), where the
157 // UTF-8 versions of upper and lower case have different number of bytes.
158 // If this happens, we can have combinatorial explosion and get many more
159 // than the normal 8 states. We detect this and simply bomb out; it will
160 // never really happen in real strings, and stopping trigram generation
161 // really only means our pruning of candidates will be less effective.
162 vector<TrigramState> states;
163 states.push_back(TrigramState{ "", 0 });
166 // Extend every state so that it has buffered at least three bytes.
167 // If this isn't possible, we are done with the string (can generate
168 // no more trigrams).
169 bool need_another_pass;
171 need_another_pass = false;
172 vector<TrigramState> new_states;
173 for (const TrigramState &state : states) {
174 if (state.buffered.size() >= 3) {
175 // No need to extend this further.
176 new_states.push_back(state);
179 if (state.next_codepoint == alternatives_for_cp.size()) {
180 // We can't form a complete trigram from this alternative,
184 for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
185 TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
186 if (new_state.buffered.size() < 3) {
187 need_another_pass = true;
189 new_states.push_back(move(new_state));
192 states = move(new_states);
193 } while (need_another_pass);
195 // OK, so now we have a bunch of states, and all of them are at least
196 // three bytes long. This means we have a complete set of trigrams,
197 // and the destination filename must contain at least one of them.
198 // Output those trigrams, cut out the first byte and then deduplicate
199 // the states before we continue.
200 vector<uint32_t> trigram_alternatives;
201 for (TrigramState &state : states) {
202 trigram_alternatives.push_back(read_trigram(state.buffered, 0));
203 state.buffered.erase(0, 1);
205 unique_sort(&trigram_alternatives); // Could have duplicates, although it's rare.
206 unique_sort(&states);
208 TrigramDisjunction new_pt;
209 new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
210 new_pt.trigram_alternatives = move(trigram_alternatives);
211 new_pt.max_num_docids = 0;
212 trigram_groups->push_back(move(new_pt));
214 if (states.size() > 100) {
215 // A completely crazy pattern with lots of those special characters.
216 // We just give up; this isn't a realistic scenario anyway.
217 // We already have lots of trigrams that should reduce the amount of
224 void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjunction> *trigram_groups)
227 parse_trigrams_ignore_case(needle, trigram_groups);
231 // The case-sensitive case is straightforward.
232 if (needle.size() >= 3) {
233 for (size_t i = 0; i < needle.size() - 2; ++i) {
234 uint32_t trgm = read_trigram(needle, i);
235 TrigramDisjunction new_pt;
236 new_pt.remaining_trigrams_to_read = 1;
237 new_pt.trigram_alternatives.push_back(trgm);
238 new_pt.max_num_docids = 0;
239 trigram_groups->push_back(move(new_pt));