git.sesse.net Git - plocate/blob - parse_trigrams.cpp

   1 #include "parse_trigrams.h"
   2
   3 #include "unique_sort.h"
   4
   5 #include <assert.h>
   6 #include <memory>
   7 #include <string.h>
   8 #include <wctype.h>
   9
  10 using namespace std;
  11
  12 string print_td(const TrigramDisjunction &td)
  13 {
  14         if (td.read_trigrams.size() == 0) {
  15                 // Before we've done hash lookups (or none matched), so print all alternatives.
  16                 if (td.trigram_alternatives.size() == 1) {
  17                         return print_trigram(td.trigram_alternatives[0]);
  18                 } else {
  19                         string ret;
  20                         ret = "(";
  21                         bool first = true;
  22                         for (uint32_t trgm : td.trigram_alternatives) {
  23                                 if (!first)
  24                                         ret += " OR ";
  25                                 ret += print_trigram(trgm);
  26                                 first = false;
  27                         }
  28                         return ret + ")";
  29                 }
  30         } else {
  31                 // Print only those that we actually have in the index.
  32                 if (td.read_trigrams.size() == 1) {
  33                         return print_trigram(td.read_trigrams[0].first.trgm);
  34                 } else {
  35                         string ret;
  36                         ret = "(";
  37                         bool first = true;
  38                         for (auto &[trgmptr, len] : td.read_trigrams) {
  39                                 if (!first)
  40                                         ret += " OR ";
  41                                 ret += print_trigram(trgmptr.trgm);
  42                                 first = false;
  43                         }
  44                         return ret + ")";
  45                 }
  46         }
  47 }
  48
  49 string print_trigram(uint32_t trgm)
  50 {
  51         char ch[3] = {
  52                 char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
  53         };
  54
  55         string str = "'";
  56         for (unsigned i = 0; i < 3;) {
  57                 if (ch[i] == '\\') {
  58                         str.push_back('\\');
  59                         str.push_back(ch[i]);
  60                         ++i;
  61                 } else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) {  // Holds no matter whether char is signed or unsigned.
  62                         str.push_back(ch[i]);
  63                         ++i;
  64                 } else {
  65                         // See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
  66                         mbtowc(nullptr, 0, 0);
  67                         wchar_t pwc;
  68                         int ret = mbtowc(&pwc, ch + i, 3 - i);
  69                         if (ret >= 1 && pwc >= 32) {
  70                                 str.append(ch + i, ret);
  71                                 i += ret;
  72                         } else {
  73                                 char buf[16];
  74                                 snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
  75                                 str += buf;
  76                                 ++i;
  77                         }
  78                 }
  79         }
  80         str += "'";
  81         return str;
  82 }
  83
  84 pair<uint32_t, size_t> read_unigram(const string &s, size_t start)
  85 {
  86         if (start >= s.size()) {
  87                 return { PREMATURE_END_UNIGRAM, 0 };
  88         }
  89         if (s[start] == '\\') {
  90                 // Escaped character.
  91                 if (start + 1 >= s.size()) {
  92                         return { PREMATURE_END_UNIGRAM, 1 };
  93                 } else {
  94                         return { (unsigned char)s[start + 1], 2 };
  95                 }
  96         }
  97         if (s[start] == '*' || s[start] == '?') {
  98                 // Wildcard.
  99                 return { WILDCARD_UNIGRAM, 1 };
 100         }
 101         if (s[start] == '[') {
 102                 // Character class; search to find the end.
 103                 size_t len = 1;
 104                 if (start + len >= s.size()) {
 105                         return { PREMATURE_END_UNIGRAM, len };
 106                 }
 107                 if (s[start + len] == '!') {
 108                         ++len;
 109                 }
 110                 if (start + len >= s.size()) {
 111                         return { PREMATURE_END_UNIGRAM, len };
 112                 }
 113                 if (s[start + len] == ']') {
 114                         ++len;
 115                 }
 116                 for (;;) {
 117                         if (start + len >= s.size()) {
 118                                 return { PREMATURE_END_UNIGRAM, len };
 119                         }
 120                         if (s[start + len] == ']') {
 121                                 return { WILDCARD_UNIGRAM, len + 1 };
 122                         }
 123                         ++len;
 124                 }
 125         }
 126
 127         // Regular letter.
 128         return { (unsigned char)s[start], 1 };
 129 }
 130
 131 uint32_t read_trigram(const string &s, size_t start)
 132 {
 133         pair<uint32_t, size_t> u1 = read_unigram(s, start);
 134         if (u1.first == WILDCARD_UNIGRAM || u1.first == PREMATURE_END_UNIGRAM) {
 135                 return u1.first;
 136         }
 137         pair<uint32_t, size_t> u2 = read_unigram(s, start + u1.second);
 138         if (u2.first == WILDCARD_UNIGRAM || u2.first == PREMATURE_END_UNIGRAM) {
 139                 return u2.first;
 140         }
 141         pair<uint32_t, size_t> u3 = read_unigram(s, start + u1.second + u2.second);
 142         if (u3.first == WILDCARD_UNIGRAM || u3.first == PREMATURE_END_UNIGRAM) {
 143                 return u3.first;
 144         }
 145         return u1.first | (u2.first << 8) | (u3.first << 16);
 146 }
 147
 148 struct TrigramState {
 149         string buffered;
 150         unsigned next_codepoint;
 151
 152         bool operator<(const TrigramState &other) const
 153         {
 154                 if (next_codepoint != other.next_codepoint)
 155                         return next_codepoint < other.next_codepoint;
 156                 return buffered < other.buffered;
 157         }
 158         bool operator==(const TrigramState &other) const
 159         {
 160                 return next_codepoint == other.next_codepoint &&
 161                         buffered == other.buffered;
 162         }
 163 };
 164
 165 void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction> *trigram_groups)
 166 {
 167         vector<vector<string>> alternatives_for_cp;
 168
 169         // Parse the needle into Unicode code points, and do inverse case folding
 170         // on each to find legal alternatives. This is far from perfect (e.g. ß
 171         // will not become ss), but it's generally the best we can do without
 172         // involving ICU or the likes.
 173         mbtowc(nullptr, 0, 0);
 174         const char *ptr = needle.c_str();
 175         unique_ptr<char[]> buf(new char[MB_CUR_MAX]);
 176         while (*ptr != '\0') {
 177                 wchar_t ch;
 178                 int ret = mbtowc(&ch, ptr, strlen(ptr));
 179                 if (ret == -1) {
 180                         perror(ptr);
 181                         exit(1);
 182                 }
 183
 184                 vector<string> alt;
 185                 alt.push_back(string(ptr, ret));
 186                 ptr += ret;
 187                 if (towlower(ch) != wint_t(ch)) {
 188                         ret = wctomb(buf.get(), towlower(ch));
 189                         alt.push_back(string(buf.get(), ret));
 190                 }
 191                 if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
 192                         ret = wctomb(buf.get(), towupper(ch));
 193                         alt.push_back(string(buf.get(), ret));
 194                 }
 195                 alternatives_for_cp.push_back(move(alt));
 196         }
 197
 198         // Now generate all possible byte strings from those code points in order;
 199         // e.g., from abc, we'd create a and A, then extend those to ab aB Ab AB,
 200         // then abc abC aBc aBC and so on. Since we don't want to have 2^n
 201         // (or even 3^n) strings, we only extend them far enough to cover at
 202         // least three bytes; this will give us a set of candidate trigrams
 203         // (the filename must have at least one of those), and then we can
 204         // chop off the first byte, deduplicate states and continue extending
 205         // and generating trigram sets.
 206         //
 207         // There are a few special cases, notably the dotted i (İ), where the
 208         // UTF-8 versions of upper and lower case have different number of bytes.
 209         // If this happens, we can have combinatorial explosion and get many more
 210         // than the normal 8 states. We detect this and simply bomb out; it will
 211         // never really happen in real strings, and stopping trigram generation
 212         // really only means our pruning of candidates will be less effective.
 213         vector<TrigramState> states;
 214         states.push_back(TrigramState{ "", 0 });
 215
 216         for (;;) {
 217                 // Extend every state so that it has buffered at least three bytes.
 218                 // If this isn't possible, we are done with the string (can generate
 219                 // no more trigrams).
 220                 bool need_another_pass;
 221                 do {
 222                         need_another_pass = false;
 223                         vector<TrigramState> new_states;
 224                         for (const TrigramState &state : states) {
 225                                 if (read_trigram(state.buffered, 0) != PREMATURE_END_UNIGRAM) {
 226                                         // No need to extend this further.
 227                                         new_states.push_back(state);
 228                                         continue;
 229                                 }
 230                                 if (state.next_codepoint == alternatives_for_cp.size()) {
 231                                         // We can't form a complete trigram from this alternative,
 232                                         // so we're done.
 233                                         return;
 234                                 }
 235                                 for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
 236                                         TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
 237                                         if (read_trigram(state.buffered, 0) == PREMATURE_END_UNIGRAM) {
 238                                                 need_another_pass = true;
 239                                         }
 240                                         new_states.push_back(move(new_state));
 241                                 }
 242                         }
 243                         states = move(new_states);
 244                 } while (need_another_pass);
 245
 246                 // OK, so now we have a bunch of states, and all of them are at least
 247                 // three bytes long. This means we have a complete set of trigrams,
 248                 // and the destination filename must contain at least one of them.
 249                 // Output those trigrams, cut out the first byte and then deduplicate
 250                 // the states before we continue.
 251                 bool any_wildcard = false;
 252                 vector<uint32_t> trigram_alternatives;
 253                 for (TrigramState &state : states) {
 254                         trigram_alternatives.push_back(read_trigram(state.buffered, 0));
 255                         state.buffered.erase(0, read_unigram(state.buffered, 0).second);
 256                         assert(trigram_alternatives.back() != PREMATURE_END_UNIGRAM);
 257                         if (trigram_alternatives.back() == WILDCARD_UNIGRAM) {
 258                                 // If any of the candidates are wildcards, we need to drop the entire OR group.
 259                                 // (Most likely, all of them would be anyway.) We need to keep stripping out
 260                                 // the first unigram from each state.
 261                                 any_wildcard = true;
 262                         }
 263                 }
 264                 unique_sort(&trigram_alternatives);  // Could have duplicates, although it's rare.
 265                 unique_sort(&states);
 266
 267                 if (!any_wildcard) {
 268                         TrigramDisjunction new_pt;
 269                         new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
 270                         new_pt.trigram_alternatives = move(trigram_alternatives);
 271                         new_pt.max_num_docids = 0;
 272                         trigram_groups->push_back(move(new_pt));
 273                 }
 274
 275                 if (states.size() > 100) {
 276                         // A completely crazy pattern with lots of those special characters.
 277                         // We just give up; this isn't a realistic scenario anyway.
 278                         // We already have lots of trigrams that should reduce the amount of
 279                         // candidates.
 280                         return;
 281                 }
 282         }
 283 }
 284
 285 void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjunction> *trigram_groups)
 286 {
 287         if (ignore_case) {
 288                 parse_trigrams_ignore_case(needle, trigram_groups);
 289                 return;
 290         }
 291
 292         // The case-sensitive case is straightforward.
 293         for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) {
 294                 uint32_t trgm = read_trigram(needle, i);
 295                 if (trgm == WILDCARD_UNIGRAM || trgm == PREMATURE_END_UNIGRAM) {
 296                         // Invalid trigram, so skip.
 297                         continue;
 298                 }
 299
 300                 TrigramDisjunction new_pt;
 301                 new_pt.remaining_trigrams_to_read = 1;
 302                 new_pt.trigram_alternatives.push_back(trgm);
 303                 new_pt.max_num_docids = 0;
 304                 trigram_groups->push_back(move(new_pt));
 305         }
 306 }