]> git.sesse.net Git - plocate/blob - parse_trigrams.cpp
Release plocate 1.1.22.
[plocate] / parse_trigrams.cpp
1 #include "parse_trigrams.h"
2
3 #include "unique_sort.h"
4
5 #include <assert.h>
6 #include <string.h>
7 #include <wctype.h>
8
9 using namespace std;
10
11 string print_td(const TrigramDisjunction &td)
12 {
13         if (td.read_trigrams.size() == 0) {
14                 // Before we've done hash lookups (or none matched), so print all alternatives.
15                 if (td.trigram_alternatives.size() == 1) {
16                         return print_trigram(td.trigram_alternatives[0]);
17                 } else {
18                         string ret;
19                         ret = "(";
20                         bool first = true;
21                         for (uint32_t trgm : td.trigram_alternatives) {
22                                 if (!first)
23                                         ret += " OR ";
24                                 ret += print_trigram(trgm);
25                                 first = false;
26                         }
27                         return ret + ")";
28                 }
29         } else {
30                 // Print only those that we actually have in the index.
31                 if (td.read_trigrams.size() == 1) {
32                         return print_trigram(td.read_trigrams[0].first.trgm);
33                 } else {
34                         string ret;
35                         ret = "(";
36                         bool first = true;
37                         for (auto &[trgmptr, len] : td.read_trigrams) {
38                                 if (!first)
39                                         ret += " OR ";
40                                 ret += print_trigram(trgmptr.trgm);
41                                 first = false;
42                         }
43                         return ret + ")";
44                 }
45         }
46 }
47
48 string print_trigram(uint32_t trgm)
49 {
50         char ch[3] = {
51                 char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
52         };
53
54         string str = "'";
55         for (unsigned i = 0; i < 3;) {
56                 if (ch[i] == '\\') {
57                         str.push_back('\\');
58                         str.push_back(ch[i]);
59                         ++i;
60                 } else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) {  // Holds no matter whether char is signed or unsigned.
61                         str.push_back(ch[i]);
62                         ++i;
63                 } else {
64                         // See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
65                         mbtowc(nullptr, 0, 0);
66                         wchar_t pwc;
67                         int ret = mbtowc(&pwc, ch + i, 3 - i);
68                         if (ret >= 1 && pwc >= 32) {
69                                 str.append(ch + i, ret);
70                                 i += ret;
71                         } else {
72                                 char buf[16];
73                                 snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
74                                 str += buf;
75                                 ++i;
76                         }
77                 }
78         }
79         str += "'";
80         return str;
81 }
82
83 pair<uint32_t, size_t> read_unigram(const string &s, size_t start)
84 {
85         if (start >= s.size()) {
86                 return { PREMATURE_END_UNIGRAM, 0 };
87         }
88         if (s[start] == '\\') {
89                 // Escaped character.
90                 if (start + 1 >= s.size()) {
91                         return { PREMATURE_END_UNIGRAM, 1 };
92                 } else {
93                         return { (unsigned char)s[start + 1], 2 };
94                 }
95         }
96         if (s[start] == '*' || s[start] == '?') {
97                 // Wildcard.
98                 return { WILDCARD_UNIGRAM, 1 };
99         }
100         if (s[start] == '[') {
101                 // Character class; search to find the end.
102                 size_t len = 1;
103                 if (start + len >= s.size()) {
104                         return { PREMATURE_END_UNIGRAM, len };
105                 }
106                 if (s[start + len] == '!') {
107                         ++len;
108                 }
109                 if (start + len >= s.size()) {
110                         return { PREMATURE_END_UNIGRAM, len };
111                 }
112                 if (s[start + len] == ']') {
113                         ++len;
114                 }
115                 for (;;) {
116                         if (start + len >= s.size()) {
117                                 return { PREMATURE_END_UNIGRAM, len };
118                         }
119                         if (s[start + len] == ']') {
120                                 return { WILDCARD_UNIGRAM, len + 1 };
121                         }
122                         ++len;
123                 }
124         }
125
126         // Regular letter.
127         return { (unsigned char)s[start], 1 };
128 }
129
130 uint32_t read_trigram(const string &s, size_t start)
131 {
132         pair<uint32_t, size_t> u1 = read_unigram(s, start);
133         if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) {
134                 return u1.first;
135         }
136         pair<uint32_t, size_t> u2 = read_unigram(s, start + u1.second);
137         if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) {
138                 return u2.first;
139         }
140         pair<uint32_t, size_t> u3 = read_unigram(s, start + u1.second + u2.second);
141         if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) {
142                 return u3.first;
143         }
144         return u1.first | (u2.first << 8) | (u3.first << 16);
145 }
146
147 struct TrigramState {
148         string buffered;
149         unsigned next_codepoint;
150
151         bool operator<(const TrigramState &other) const
152         {
153                 if (next_codepoint != other.next_codepoint)
154                         return next_codepoint < other.next_codepoint;
155                 return buffered < other.buffered;
156         }
157         bool operator==(const TrigramState &other) const
158         {
159                 return next_codepoint == other.next_codepoint &&
160                         buffered == other.buffered;
161         }
162 };
163
164 void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction> *trigram_groups)
165 {
166         vector<vector<string>> alternatives_for_cp;
167
168         // Parse the needle into Unicode code points, and do inverse case folding
169         // on each to find legal alternatives. This is far from perfect (e.g. ß
170         // will not become ss), but it's generally the best we can do without
171         // involving ICU or the likes.
172         mbtowc(nullptr, 0, 0);
173         const char *ptr = needle.c_str();
174         while (*ptr != '\0') {
175                 wchar_t ch;
176                 int ret = mbtowc(&ch, ptr, strlen(ptr));
177                 if (ret == -1) {
178                         perror(ptr);
179                         exit(1);
180                 }
181
182                 char buf[MB_CUR_MAX];
183                 vector<string> alt;
184                 alt.push_back(string(ptr, ret));
185                 ptr += ret;
186                 if (towlower(ch) != wint_t(ch)) {
187                         ret = wctomb(buf, towlower(ch));
188                         alt.push_back(string(buf, ret));
189                 }
190                 if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
191                         ret = wctomb(buf, towupper(ch));
192                         alt.push_back(string(buf, ret));
193                 }
194                 alternatives_for_cp.push_back(move(alt));
195         }
196
197         // Now generate all possible byte strings from those code points in order;
198         // e.g., from abc, we'd create a and A, then extend those to ab aB Ab AB,
199         // then abc abC aBc aBC and so on. Since we don't want to have 2^n
200         // (or even 3^n) strings, we only extend them far enough to cover at
201         // least three bytes; this will give us a set of candidate trigrams
202         // (the filename must have at least one of those), and then we can
203         // chop off the first byte, deduplicate states and continue extending
204         // and generating trigram sets.
205         //
206         // There are a few special cases, notably the dotted i (İ), where the
207         // UTF-8 versions of upper and lower case have different number of bytes.
208         // If this happens, we can have combinatorial explosion and get many more
209         // than the normal 8 states. We detect this and simply bomb out; it will
210         // never really happen in real strings, and stopping trigram generation
211         // really only means our pruning of candidates will be less effective.
212         vector<TrigramState> states;
213         states.push_back(TrigramState{ "", 0 });
214
215         for (;;) {
216                 // Extend every state so that it has buffered at least three bytes.
217                 // If this isn't possible, we are done with the string (can generate
218                 // no more trigrams).
219                 bool need_another_pass;
220                 do {
221                         need_another_pass = false;
222                         vector<TrigramState> new_states;
223                         for (const TrigramState &state : states) {
224                                 if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) {
225                                         // No need to extend this further.
226                                         new_states.push_back(state);
227                                         continue;
228                                 }
229                                 if (state.next_codepoint == alternatives_for_cp.size()) {
230                                         // We can't form a complete trigram from this alternative,
231                                         // so we're done.
232                                         return;
233                                 }
234                                 for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
235                                         TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
236                                         if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) {
237                                                 need_another_pass = true;
238                                         }
239                                         new_states.push_back(move(new_state));
240                                 }
241                         }
242                         states = move(new_states);
243                 } while (need_another_pass);
244
245                 // OK, so now we have a bunch of states, and all of them are at least
246                 // three bytes long. This means we have a complete set of trigrams,
247                 // and the destination filename must contain at least one of them.
248                 // Output those trigrams, cut out the first byte and then deduplicate
249                 // the states before we continue.
250                 bool any_wildcard = false;
251                 vector<uint32_t> trigram_alternatives;
252                 for (TrigramState &state : states) {
253                         trigram_alternatives.push_back(read_trigram(state.buffered, 0));
254                         state.buffered.erase(0, read_unigram(state.buffered, 0).second);
255                         assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM);
256                         if (trigram_alternatives.back() == WILDCARD_UNIGRAM) {
257                                 // If any of the candidates are wildcards, we need to drop the entire OR group.
258                                 // (Most likely, all of them would be anyway.) We need to keep stripping out
259                                 // the first unigram from each state.
260                                 any_wildcard = true;
261                         }
262                 }
263                 unique_sort(&trigram_alternatives);  // Could have duplicates, although it's rare.
264                 unique_sort(&states);
265
266                 if (!any_wildcard) {
267                         TrigramDisjunction new_pt;
268                         new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
269                         new_pt.trigram_alternatives = move(trigram_alternatives);
270                         new_pt.max_num_docids = 0;
271                         trigram_groups->push_back(move(new_pt));
272                 }
273
274                 if (states.size() > 100) {
275                         // A completely crazy pattern with lots of those special characters.
276                         // We just give up; this isn't a realistic scenario anyway.
277                         // We already have lots of trigrams that should reduce the amount of
278                         // candidates.
279                         return;
280                 }
281         }
282 }
283
284 void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjunction> *trigram_groups)
285 {
286         if (ignore_case) {
287                 parse_trigrams_ignore_case(needle, trigram_groups);
288                 return;
289         }
290
291         // The case-sensitive case is straightforward.
292         for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
293                 uint32_t trgm = read_trigram(needle, i);
294                 if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) {
295                         // Invalid trigram, so skip.
296                         continue;
297                 }
298
299                 TrigramDisjunction new_pt;
300                 new_pt.remaining_trigrams_to_read = 1;
301                 new_pt.trigram_alternatives.push_back(trgm);
302                 new_pt.max_num_docids = 0;
303                 trigram_groups->push_back(move(new_pt));
304         }
305 }