]> git.sesse.net Git - plocate/blob - parse_trigrams.cpp
Remove dependency on non-POSIX header error.h.
[plocate] / parse_trigrams.cpp
1 #include "parse_trigrams.h"
2
3 #include "unique_sort.h"
4
5 #include <assert.h>
6 #include <memory>
7 #include <string.h>
8 #include <wctype.h>
9
10 using namespace std;
11
12 string print_td(const TrigramDisjunction &td)
13 {
14         if (td.read_trigrams.size() == 0) {
15                 // Before we've done hash lookups (or none matched), so print all alternatives.
16                 if (td.trigram_alternatives.size() == 1) {
17                         return print_trigram(td.trigram_alternatives[0]);
18                 } else {
19                         string ret;
20                         ret = "(";
21                         bool first = true;
22                         for (uint32_t trgm : td.trigram_alternatives) {
23                                 if (!first)
24                                         ret += " OR ";
25                                 ret += print_trigram(trgm);
26                                 first = false;
27                         }
28                         return ret + ")";
29                 }
30         } else {
31                 // Print only those that we actually have in the index.
32                 if (td.read_trigrams.size() == 1) {
33                         return print_trigram(td.read_trigrams[0].first.trgm);
34                 } else {
35                         string ret;
36                         ret = "(";
37                         bool first = true;
38                         for (auto &[trgmptr, len] : td.read_trigrams) {
39                                 if (!first)
40                                         ret += " OR ";
41                                 ret += print_trigram(trgmptr.trgm);
42                                 first = false;
43                         }
44                         return ret + ")";
45                 }
46         }
47 }
48
49 string print_trigram(uint32_t trgm)
50 {
51         char ch[3] = {
52                 char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
53         };
54
55         string str = "'";
56         for (unsigned i = 0; i < 3;) {
57                 if (ch[i] == '\\') {
58                         str.push_back('\\');
59                         str.push_back(ch[i]);
60                         ++i;
61                 } else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) {  // Holds no matter whether char is signed or unsigned.
62                         str.push_back(ch[i]);
63                         ++i;
64                 } else {
65                         // See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
66                         mbtowc(nullptr, 0, 0);
67                         wchar_t pwc;
68                         int ret = mbtowc(&pwc, ch + i, 3 - i);
69                         if (ret >= 1 && pwc >= 32) {
70                                 str.append(ch + i, ret);
71                                 i += ret;
72                         } else {
73                                 char buf[16];
74                                 snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
75                                 str += buf;
76                                 ++i;
77                         }
78                 }
79         }
80         str += "'";
81         return str;
82 }
83
84 pair<uint32_t, size_t> read_unigram(const string &s, size_t start)
85 {
86         if (start >= s.size()) {
87                 return { PREMATURE_END_UNIGRAM, 0 };
88         }
89         if (s[start] == '\\') {
90                 // Escaped character.
91                 if (start + 1 >= s.size()) {
92                         return { PREMATURE_END_UNIGRAM, 1 };
93                 } else {
94                         return { (unsigned char)s[start + 1], 2 };
95                 }
96         }
97         if (s[start] == '*' || s[start] == '?') {
98                 // Wildcard.
99                 return { WILDCARD_UNIGRAM, 1 };
100         }
101         if (s[start] == '[') {
102                 // Character class; search to find the end.
103                 size_t len = 1;
104                 if (start + len >= s.size()) {
105                         return { PREMATURE_END_UNIGRAM, len };
106                 }
107                 if (s[start + len] == '!') {
108                         ++len;
109                 }
110                 if (start + len >= s.size()) {
111                         return { PREMATURE_END_UNIGRAM, len };
112                 }
113                 if (s[start + len] == ']') {
114                         ++len;
115                 }
116                 for (;;) {
117                         if (start + len >= s.size()) {
118                                 return { PREMATURE_END_UNIGRAM, len };
119                         }
120                         if (s[start + len] == ']') {
121                                 return { WILDCARD_UNIGRAM, len + 1 };
122                         }
123                         ++len;
124                 }
125         }
126
127         // Regular letter.
128         return { (unsigned char)s[start], 1 };
129 }
130
131 uint32_t read_trigram(const string &s, size_t start)
132 {
133         pair<uint32_t, size_t> u1 = read_unigram(s, start);
134         if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) {
135                 return u1.first;
136         }
137         pair<uint32_t, size_t> u2 = read_unigram(s, start + u1.second);
138         if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) {
139                 return u2.first;
140         }
141         pair<uint32_t, size_t> u3 = read_unigram(s, start + u1.second + u2.second);
142         if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) {
143                 return u3.first;
144         }
145         return u1.first | (u2.first << 8) | (u3.first << 16);
146 }
147
148 struct TrigramState {
149         string buffered;
150         unsigned next_codepoint;
151
152         bool operator<(const TrigramState &other) const
153         {
154                 if (next_codepoint != other.next_codepoint)
155                         return next_codepoint < other.next_codepoint;
156                 return buffered < other.buffered;
157         }
158         bool operator==(const TrigramState &other) const
159         {
160                 return next_codepoint == other.next_codepoint &&
161                         buffered == other.buffered;
162         }
163 };
164
165 void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction> *trigram_groups)
166 {
167         vector<vector<string>> alternatives_for_cp;
168
169         // Parse the needle into Unicode code points, and do inverse case folding
170         // on each to find legal alternatives. This is far from perfect (e.g. ß
171         // will not become ss), but it's generally the best we can do without
172         // involving ICU or the likes.
173         mbtowc(nullptr, 0, 0);
174         const char *ptr = needle.c_str();
175         unique_ptr<char[]> buf(new char[MB_CUR_MAX]);
176         while (*ptr != '\0') {
177                 wchar_t ch;
178                 int ret = mbtowc(&ch, ptr, strlen(ptr));
179                 if (ret == -1) {
180                         perror(ptr);
181                         exit(1);
182                 }
183
184                 vector<string> alt;
185                 alt.push_back(string(ptr, ret));
186                 ptr += ret;
187                 if (towlower(ch) != wint_t(ch)) {
188                         ret = wctomb(buf.get(), towlower(ch));
189                         alt.push_back(string(buf.get(), ret));
190                 }
191                 if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
192                         ret = wctomb(buf.get(), towupper(ch));
193                         alt.push_back(string(buf.get(), ret));
194                 }
195                 alternatives_for_cp.push_back(move(alt));
196         }
197
198         // Now generate all possible byte strings from those code points in order;
199         // e.g., from abc, we'd create a and A, then extend those to ab aB Ab AB,
200         // then abc abC aBc aBC and so on. Since we don't want to have 2^n
201         // (or even 3^n) strings, we only extend them far enough to cover at
202         // least three bytes; this will give us a set of candidate trigrams
203         // (the filename must have at least one of those), and then we can
204         // chop off the first byte, deduplicate states and continue extending
205         // and generating trigram sets.
206         //
207         // There are a few special cases, notably the dotted i (İ), where the
208         // UTF-8 versions of upper and lower case have different number of bytes.
209         // If this happens, we can have combinatorial explosion and get many more
210         // than the normal 8 states. We detect this and simply bomb out; it will
211         // never really happen in real strings, and stopping trigram generation
212         // really only means our pruning of candidates will be less effective.
213         vector<TrigramState> states;
214         states.push_back(TrigramState{ "", 0 });
215
216         for (;;) {
217                 // Extend every state so that it has buffered at least three bytes.
218                 // If this isn't possible, we are done with the string (can generate
219                 // no more trigrams).
220                 bool need_another_pass;
221                 do {
222                         need_another_pass = false;
223                         vector<TrigramState> new_states;
224                         for (const TrigramState &state : states) {
225                                 if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) {
226                                         // No need to extend this further.
227                                         new_states.push_back(state);
228                                         continue;
229                                 }
230                                 if (state.next_codepoint == alternatives_for_cp.size()) {
231                                         // We can't form a complete trigram from this alternative,
232                                         // so we're done.
233                                         return;
234                                 }
235                                 for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
236                                         TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
237                                         if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) {
238                                                 need_another_pass = true;
239                                         }
240                                         new_states.push_back(move(new_state));
241                                 }
242                         }
243                         states = move(new_states);
244                 } while (need_another_pass);
245
246                 // OK, so now we have a bunch of states, and all of them are at least
247                 // three bytes long. This means we have a complete set of trigrams,
248                 // and the destination filename must contain at least one of them.
249                 // Output those trigrams, cut out the first byte and then deduplicate
250                 // the states before we continue.
251                 bool any_wildcard = false;
252                 vector<uint32_t> trigram_alternatives;
253                 for (TrigramState &state : states) {
254                         trigram_alternatives.push_back(read_trigram(state.buffered, 0));
255                         state.buffered.erase(0, read_unigram(state.buffered, 0).second);
256                         assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM);
257                         if (trigram_alternatives.back() == WILDCARD_UNIGRAM) {
258                                 // If any of the candidates are wildcards, we need to drop the entire OR group.
259                                 // (Most likely, all of them would be anyway.) We need to keep stripping out
260                                 // the first unigram from each state.
261                                 any_wildcard = true;
262                         }
263                 }
264                 unique_sort(&trigram_alternatives);  // Could have duplicates, although it's rare.
265                 unique_sort(&states);
266
267                 if (!any_wildcard) {
268                         TrigramDisjunction new_pt;
269                         new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
270                         new_pt.trigram_alternatives = move(trigram_alternatives);
271                         new_pt.max_num_docids = 0;
272                         trigram_groups->push_back(move(new_pt));
273                 }
274
275                 if (states.size() > 100) {
276                         // A completely crazy pattern with lots of those special characters.
277                         // We just give up; this isn't a realistic scenario anyway.
278                         // We already have lots of trigrams that should reduce the amount of
279                         // candidates.
280                         return;
281                 }
282         }
283 }
284
285 void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjunction> *trigram_groups)
286 {
287         if (ignore_case) {
288                 parse_trigrams_ignore_case(needle, trigram_groups);
289                 return;
290         }
291
292         // The case-sensitive case is straightforward.
293         for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
294                 uint32_t trgm = read_trigram(needle, i);
295                 if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) {
296                         // Invalid trigram, so skip.
297                         continue;
298                 }
299
300                 TrigramDisjunction new_pt;
301                 new_pt.remaining_trigrams_to_read = 1;
302                 new_pt.trigram_alternatives.push_back(trgm);
303                 new_pt.max_num_docids = 0;
304                 trigram_groups->push_back(move(new_pt));
305         }
306 }