git.sesse.net Git - plocate/blob - parse_trigrams.cpp

   1 #include "parse_trigrams.h"
   2
   3 #include "unique_sort.h"
   4
   5 #include <assert.h>
   6 #include <string.h>
   7 #include <wctype.h>
   8
   9 using namespace std;
  10
  11 string print_td(const TrigramDisjunction &td)
  12 {
  13         if (td.read_trigrams.size() == 0) {
  14                 // Before we've done hash lookups (or none matched), so print all alternatives.
  15                 if (td.trigram_alternatives.size() == 1) {
  16                         return print_trigram(td.trigram_alternatives[0]);
  17                 } else {
  18                         string ret;
  19                         ret = "(";
  20                         bool first = true;
  21                         for (uint32_t trgm : td.trigram_alternatives) {
  22                                 if (!first)
  23                                         ret += " OR ";
  24                                 ret += print_trigram(trgm);
  25                                 first = false;
  26                         }
  27                         return ret + ")";
  28                 }
  29         } else {
  30                 // Print only those that we actually have in the index.
  31                 if (td.read_trigrams.size() == 1) {
  32                         return print_trigram(td.read_trigrams[0].first.trgm);
  33                 } else {
  34                         string ret;
  35                         ret = "(";
  36                         bool first = true;
  37                         for (auto &[trgmptr, len] : td.read_trigrams) {
  38                                 if (!first)
  39                                         ret += " OR ";
  40                                 ret += print_trigram(trgmptr.trgm);
  41                                 first = false;
  42                         }
  43                         return ret + ")";
  44                 }
  45         }
  46 }
  47
  48 string print_trigram(uint32_t trgm)
  49 {
  50         char ch[3] = {
  51                 char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
  52         };
  53
  54         string str = "'";
  55         for (unsigned i = 0; i < 3;) {
  56                 if (ch[i] == '\\') {
  57                         str.push_back('\\');
  58                         str.push_back(ch[i]);
  59                         ++i;
  60                 } else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) {  // Holds no matter whether char is signed or unsigned.
  61                         str.push_back(ch[i]);
  62                         ++i;
  63                 } else {
  64                         // See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
  65                         mbtowc(nullptr, 0, 0);
  66                         wchar_t pwc;
  67                         int ret = mbtowc(&pwc, ch + i, 3 - i);
  68                         if (ret >= 1 && pwc >= 32) {
  69                                 str.append(ch + i, ret);
  70                                 i += ret;
  71                         } else {
  72                                 char buf[16];
  73                                 snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
  74                                 str += buf;
  75                                 ++i;
  76                         }
  77                 }
  78         }
  79         str += "'";
  80         return str;
  81 }
  82
  83 pair<uint32_t, size_t> read_unigram(const string &s, size_t start)
  84 {
  85         if (start >= s.size()) {
  86                 return { PREMATURE_END_UNIGRAM, 0 };
  87         }
  88         if (s[start] == '\\') {
  89                 // Escaped character.
  90                 if (start + 1 >= s.size()) {
  91                         return { PREMATURE_END_UNIGRAM, 1 };
  92                 } else {
  93                         return { (unsigned char)s[start + 1], 2 };
  94                 }
  95         }
  96         if (s[start] == '*' || s[start] == '?') {
  97                 // Wildcard.
  98                 return { WILDCARD_UNIGRAM, 1 };
  99         }
 100         if (s[start] == '[') {
 101                 // Character class; search to find the end.
 102                 size_t len = 1;
 103                 if (start + len >= s.size()) {
 104                         return { PREMATURE_END_UNIGRAM, len };
 105                 }
 106                 if (s[start + len] == '!') {
 107                         ++len;
 108                 }
 109                 if (start + len >= s.size()) {
 110                         return { PREMATURE_END_UNIGRAM, len };
 111                 }
 112                 if (s[start + len] == ']') {
 113                         ++len;
 114                 }
 115                 for (;;) {
 116                         if (start + len >= s.size()) {
 117                                 return { PREMATURE_END_UNIGRAM, len };
 118                         }
 119                         if (s[start + len] == ']') {
 120                                 return { WILDCARD_UNIGRAM, len + 1 };
 121                         }
 122                         ++len;
 123                 }
 124         }
 125
 126         // Regular letter.
 127         return { (unsigned char)s[start], 1 };
 128 }
 129
 130 uint32_t read_trigram(const string &s, size_t start)
 131 {
 132         pair<uint32_t, size_t> u1 = read_unigram(s, start);
 133         if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) {
 134                 return u1.first;
 135         }
 136         pair<uint32_t, size_t> u2 = read_unigram(s, start + u1.second);
 137         if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) {
 138                 return u2.first;
 139         }
 140         pair<uint32_t, size_t> u3 = read_unigram(s, start + u1.second + u2.second);
 141         if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) {
 142                 return u3.first;
 143         }
 144         return u1.first | (u2.first << 8) | (u3.first << 16);
 145 }
 146
 147 struct TrigramState {
 148         string buffered;
 149         unsigned next_codepoint;
 150
 151         bool operator<(const TrigramState &other) const
 152         {
 153                 if (next_codepoint != other.next_codepoint)
 154                         return next_codepoint < other.next_codepoint;
 155                 return buffered < other.buffered;
 156         }
 157         bool operator==(const TrigramState &other) const
 158         {
 159                 return next_codepoint == other.next_codepoint &&
 160                         buffered == other.buffered;
 161         }
 162 };
 163
 164 void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction> *trigram_groups)
 165 {
 166         vector<vector<string>> alternatives_for_cp;
 167
 168         // Parse the needle into Unicode code points, and do inverse case folding
 169         // on each to find legal alternatives. This is far from perfect (e.g. ß
 170         // will not become ss), but it's generally the best we can do without
 171         // involving ICU or the likes.
 172         mbtowc(nullptr, 0, 0);
 173         const char *ptr = needle.c_str();
 174         while (*ptr != '\0') {
 175                 wchar_t ch;
 176                 int ret = mbtowc(&ch, ptr, strlen(ptr));
 177                 if (ret == -1) {
 178                         perror(ptr);
 179                         exit(1);
 180                 }
 181
 182                 char buf[MB_CUR_MAX];
 183                 vector<string> alt;
 184                 alt.push_back(string(ptr, ret));
 185                 ptr += ret;
 186                 if (towlower(ch) != wint_t(ch)) {
 187                         ret = wctomb(buf, towlower(ch));
 188                         alt.push_back(string(buf, ret));
 189                 }
 190                 if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
 191                         ret = wctomb(buf, towupper(ch));
 192                         alt.push_back(string(buf, ret));
 193                 }
 194                 alternatives_for_cp.push_back(move(alt));
 195         }
 196
 197         // Now generate all possible byte strings from those code points in order;
 198         // e.g., from abc, we'd create a and A, then extend those to ab aB Ab AB,
 199         // then abc abC aBc aBC and so on. Since we don't want to have 2^n
 200         // (or even 3^n) strings, we only extend them far enough to cover at
 201         // least three bytes; this will give us a set of candidate trigrams
 202         // (the filename must have at least one of those), and then we can
 203         // chop off the first byte, deduplicate states and continue extending
 204         // and generating trigram sets.
 205         //
 206         // There are a few special cases, notably the dotted i (İ), where the
 207         // UTF-8 versions of upper and lower case have different number of bytes.
 208         // If this happens, we can have combinatorial explosion and get many more
 209         // than the normal 8 states. We detect this and simply bomb out; it will
 210         // never really happen in real strings, and stopping trigram generation
 211         // really only means our pruning of candidates will be less effective.
 212         vector<TrigramState> states;
 213         states.push_back(TrigramState{ "", 0 });
 214
 215         for (;;) {
 216                 // Extend every state so that it has buffered at least three bytes.
 217                 // If this isn't possible, we are done with the string (can generate
 218                 // no more trigrams).
 219                 bool need_another_pass;
 220                 do {
 221                         need_another_pass = false;
 222                         vector<TrigramState> new_states;
 223                         for (const TrigramState &state : states) {
 224                                 if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) {
 225                                         // No need to extend this further.
 226                                         new_states.push_back(state);
 227                                         continue;
 228                                 }
 229                                 if (state.next_codepoint == alternatives_for_cp.size()) {
 230                                         // We can't form a complete trigram from this alternative,
 231                                         // so we're done.
 232                                         return;
 233                                 }
 234                                 for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
 235                                         TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
 236                                         if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) {
 237                                                 need_another_pass = true;
 238                                         }
 239                                         new_states.push_back(move(new_state));
 240                                 }
 241                         }
 242                         states = move(new_states);
 243                 } while (need_another_pass);
 244
 245                 // OK, so now we have a bunch of states, and all of them are at least
 246                 // three bytes long. This means we have a complete set of trigrams,
 247                 // and the destination filename must contain at least one of them.
 248                 // Output those trigrams, cut out the first byte and then deduplicate
 249                 // the states before we continue.
 250                 bool any_wildcard = false;
 251                 vector<uint32_t> trigram_alternatives;
 252                 for (TrigramState &state : states) {
 253                         trigram_alternatives.push_back(read_trigram(state.buffered, 0));
 254                         state.buffered.erase(0, read_unigram(state.buffered, 0).second);
 255                         assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM);
 256                         if (trigram_alternatives.back() == WILDCARD_UNIGRAM) {
 257                                 // If any of the candidates are wildcards, we need to drop the entire OR group.
 258                                 // (Most likely, all of them would be anyway.) We need to keep stripping out
 259                                 // the first unigram from each state.
 260                                 any_wildcard = true;
 261                         }
 262                 }
 263                 unique_sort(&trigram_alternatives);  // Could have duplicates, although it's rare.
 264                 unique_sort(&states);
 265
 266                 if (!any_wildcard) {
 267                         TrigramDisjunction new_pt;
 268                         new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
 269                         new_pt.trigram_alternatives = move(trigram_alternatives);
 270                         new_pt.max_num_docids = 0;
 271                         trigram_groups->push_back(move(new_pt));
 272                 }
 273
 274                 if (states.size() > 100) {
 275                         // A completely crazy pattern with lots of those special characters.
 276                         // We just give up; this isn't a realistic scenario anyway.
 277                         // We already have lots of trigrams that should reduce the amount of
 278                         // candidates.
 279                         return;
 280                 }
 281         }
 282 }
 283
 284 void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjunction> *trigram_groups)
 285 {
 286         if (ignore_case) {
 287                 parse_trigrams_ignore_case(needle, trigram_groups);
 288                 return;
 289         }
 290
 291         // The case-sensitive case is straightforward.
 292         for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
 293                 uint32_t trgm = read_trigram(needle, i);
 294                 if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) {
 295                         // Invalid trigram, so skip.
 296                         continue;
 297                 }
 298
 299                 TrigramDisjunction new_pt;
 300                 new_pt.remaining_trigrams_to_read = 1;
 301                 new_pt.trigram_alternatives.push_back(trgm);
 302                 new_pt.max_num_docids = 0;
 303                 trigram_groups->push_back(move(new_pt));
 304         }
 305 }