Support case-insensitive searches.

author Steinar H. Gunderson <steinar+git@gunderson.no>

Sat, 10 Oct 2020 09:36:17 +0000 (11:36 +0200)

committer Steinar H. Gunderson <steinar+git@gunderson.no>

Sat, 10 Oct 2020 17:20:48 +0000 (19:20 +0200)
author Steinar H. Gunderson <steinar+git@gunderson.no>
Sat, 10 Oct 2020 09:36:17 +0000 (11:36 +0200)
committer Steinar H. Gunderson <steinar+git@gunderson.no>
Sat, 10 Oct 2020 17:20:48 +0000 (19:20 +0200)
diff --git a/meson.build b/meson.build

index eca029769cc3a5de48cce87d5d3b54307234454d..070a804fb89cea93e02a793e53893b928edbc422 100644 (file)
--- a/meson.build
+++ b/meson.build
@@ -8,7 +8,7 @@ if not uringdep.found()
         add_project_arguments('-DWITHOUT_URING', language: 'cpp')
  endif
  
-executable('plocate', ['plocate.cpp', 'io_uring_engine.cpp', 'turbopfor.cpp'],
+executable('plocate', ['plocate.cpp', 'io_uring_engine.cpp', 'turbopfor.cpp', 'parse_trigrams.cpp'],
         dependencies: [uringdep, zstddep],
         install: true,
         install_mode: ['rwxr-sr-x', 'root', 'mlocate'])
diff --git a/parse_trigrams.cpp b/parse_trigrams.cpp

new file mode 100644 (file)

index 0000000..03fbc6b
--- /dev/null
+++ b/parse_trigrams.cpp
@@ -0,0 +1,242 @@
+#include "parse_trigrams.h"
+
+#include "unique_sort.h"
+
+#include <string.h>
+#include <wctype.h>
+
+using namespace std;
+
+string print_td(const TrigramDisjunction &td)
+{
+       if (td.read_trigrams.size() == 0) {
+               // Before we've done hash lookups (or none matched), so print all alternatives.
+               if (td.trigram_alternatives.size() == 1) {
+                       return print_trigram(td.trigram_alternatives[0]);
+               } else {
+                       string ret;
+                       ret = "(";
+                       bool first = true;
+                       for (uint32_t trgm : td.trigram_alternatives) {
+                               if (!first)
+                                       ret += " OR ";
+                               ret += print_trigram(trgm);
+                               first = false;
+                       }
+                       return ret + ")";
+               }
+       } else {
+               // Print only those that we actually have in the index.
+               if (td.read_trigrams.size() == 1) {
+                       return print_trigram(td.read_trigrams[0].first.trgm);
+               } else {
+                       string ret;
+                       ret = "(";
+                       bool first = true;
+                       for (auto &[trgmptr, len] : td.read_trigrams) {
+                               if (!first)
+                                       ret += " OR ";
+                               ret += print_trigram(trgmptr.trgm);
+                               first = false;
+                       }
+                       return ret + ")";
+               }
+       }
+}
+
+string print_trigram(uint32_t trgm)
+{
+       char ch[3] = {
+               char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
+       };
+
+       string str = "'";
+       for (unsigned i = 0; i < 3;) {
+               if (ch[i] == '\\') {
+                       str.push_back('\\');
+                       str.push_back(ch[i]);
+                       ++i;
+               } else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) {  // Holds no matter whether char is signed or unsigned.
+                       str.push_back(ch[i]);
+                       ++i;
+               } else {
+                       // See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
+                       mbtowc(nullptr, 0, 0);
+                       wchar_t pwc;
+                       int ret = mbtowc(&pwc, ch + i, 3 - i);
+                       if (ret >= 1 && pwc >= 32) {
+                               str.append(ch + i, ret);
+                               i += ret;
+                       } else {
+                               char buf[16];
+                               snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
+                               str += buf;
+                               ++i;
+                       }
+               }
+       }
+       str += "'";
+       return str;
+}
+
+uint32_t read_unigram(const string &s, size_t idx)
+{
+       if (idx < s.size()) {
+               return (unsigned char)s[idx];
+       } else {
+               return 0;
+       }
+}
+
+uint32_t read_trigram(const string &s, size_t start)
+{
+       return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
+               (read_unigram(s, start + 2) << 16);
+}
+
+struct TrigramState {
+       string buffered;
+       unsigned next_codepoint;
+
+       bool operator<(const TrigramState &other) const
+       {
+               if (next_codepoint != other.next_codepoint)
+                       return next_codepoint < other.next_codepoint;
+               return buffered < other.buffered;
+       }
+       bool operator==(const TrigramState &other) const
+       {
+               return next_codepoint == other.next_codepoint &&
+                       buffered == other.buffered;
+       }
+};
+
+void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction> *trigram_groups)
+{
+       vector<vector<string>> alternatives_for_cp;
+
+       // Parse the needle into Unicode code points, and do inverse case folding
+       // on each to find legal alternatives. This is far from perfect (e.g. ß
+       // will not become ss), but it's generally the best we can do without
+       // involving ICU or the likes.
+       mbtowc(nullptr, 0, 0);
+       const char *ptr = needle.c_str();
+       while (*ptr != '\0') {
+               wchar_t ch;
+               int ret = mbtowc(&ch, ptr, strlen(ptr));
+               if (ret == -1) {
+                       perror(ptr);
+                       exit(1);
+               }
+
+               char buf[MB_CUR_MAX];
+               vector<string> alt;
+               alt.push_back(string(ptr, ret));
+               ptr += ret;
+               if (towlower(ch) != wint_t(ch)) {
+                       ret = wctomb(buf, towlower(ch));
+                       alt.push_back(string(buf, ret));
+               }
+               if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
+                       ret = wctomb(buf, towupper(ch));
+                       alt.push_back(string(buf, ret));
+               }
+               alternatives_for_cp.push_back(move(alt));
+       }
+
+       // Now generate all possible byte strings from those code points in order;
+       // e.g., from abc, we'd create a and A, then extend those to ab aB Ab AB,
+       // then abc abC aBc aBC and so on. Since we don't want to have 2^n
+       // (or even 3^n) strings, we only extend them far enough to cover at
+       // least three bytes; this will give us a set of candidate trigrams
+       // (the filename must have at least one of those), and then we can
+       // chop off the first byte, deduplicate states and continue extending
+       // and generating trigram sets.
+       //
+       // There are a few special cases, notably the dotted i (İ), where the
+       // UTF-8 versions of upper and lower case have different number of bytes.
+       // If this happens, we can have combinatorial explosion and get many more
+       // than the normal 8 states. We detect this and simply bomb out; it will
+       // never really happen in real strings, and stopping trigram generation
+       // really only means our pruning of candidates will be less effective.
+       vector<TrigramState> states;
+       states.push_back(TrigramState{ "", 0 });
+
+       for (;;) {
+               // Extend every state so that it has buffered at least three bytes.
+               // If this isn't possible, we are done with the string (can generate
+               // no more trigrams).
+               bool need_another_pass;
+               do {
+                       need_another_pass = false;
+                       vector<TrigramState> new_states;
+                       for (const TrigramState &state : states) {
+                               if (state.buffered.size() >= 3) {
+                                       // No need to extend this further.
+                                       new_states.push_back(state);
+                                       continue;
+                               }
+                               if (state.next_codepoint == alternatives_for_cp.size()) {
+                                       // We can't form a complete trigram from this alternative,
+                                       // so we're done.
+                                       return;
+                               }
+                               for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
+                                       TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
+                                       if (new_state.buffered.size() < 3) {
+                                               need_another_pass = true;
+                                       }
+                                       new_states.push_back(move(new_state));
+                               }
+                       }
+                       states = move(new_states);
+               } while (need_another_pass);
+
+               // OK, so now we have a bunch of states, and all of them are at least
+               // three bytes long. This means we have a complete set of trigrams,
+               // and the destination filename must contain at least one of them.
+               // Output those trigrams, cut out the first byte and then deduplicate
+               // the states before we continue.
+               vector<uint32_t> trigram_alternatives;
+               for (TrigramState &state : states) {
+                       trigram_alternatives.push_back(read_trigram(state.buffered, 0));
+                       state.buffered.erase(0, 1);
+               }
+               unique_sort(&trigram_alternatives);  // Could have duplicates, although it's rare.
+               unique_sort(&states);
+
+               TrigramDisjunction new_pt;
+               new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
+               new_pt.trigram_alternatives = move(trigram_alternatives);
+               new_pt.max_num_docids = 0;
+               trigram_groups->push_back(move(new_pt));
+
+               if (states.size() > 100) {
+                       // A completely crazy pattern with lots of those special characters.
+                       // We just give up; this isn't a realistic scenario anyway.
+                       // We already have lots of trigrams that should reduce the amount of
+                       // candidates.
+                       return;
+               }
+       }
+}
+
+void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjunction> *trigram_groups)
+{
+       if (ignore_case) {
+               parse_trigrams_ignore_case(needle, trigram_groups);
+               return;
+       }
+
+       // The case-sensitive case is straightforward.
+       if (needle.size() >= 3) {
+               for (size_t i = 0; i < needle.size() - 2; ++i) {
+                       uint32_t trgm = read_trigram(needle, i);
+                       TrigramDisjunction new_pt;
+                       new_pt.remaining_trigrams_to_read = 1;
+                       new_pt.trigram_alternatives.push_back(trgm);
+                       new_pt.max_num_docids = 0;
+                       trigram_groups->push_back(move(new_pt));
+               }
+       }
+}
diff --git a/parse_trigrams.h b/parse_trigrams.h

new file mode 100644 (file)

index 0000000..810d005
--- /dev/null
+++ b/parse_trigrams.h
@@ -0,0 +1,62 @@
+#ifndef _PARSE_TRIGRAMS_H
+#define _PARSE_TRIGRAMS_H 1
+
+#include "db.h"
+
+#include <stdint.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+// One or more trigrams, with an implicit OR between them. For case-sensitive searches,
+// this is just e.g. “abc”, but for case-insensitive, it would be “abc OR abC or aBc ...” etc.
+struct TrigramDisjunction {
+       unsigned index;  // For debugging only.
+
+       // The alternatives as determined by parse_trigrams().
+       std::vector<uint32_t> trigram_alternatives;
+
+       // Like trigram_alternatives, but only the ones we've actually read from the
+       // hash table (the non-existent ones are filtered out). The second member is
+       // the length in bytes. Incomplete if remaining_trigrams_to_read > 0.
+       std::vector<std::pair<Trigram, size_t>> read_trigrams;
+
+       // Sum of num_docids in all trigrams. This is usually a fairly good indicator
+       // of the real number of docids, since there are few files that would have e.g.
+       // both abc and abC in them (but of course, with multiple files in the same
+       // docid block, it is far from unheard of).
+       uint32_t max_num_docids;
+
+       // While reading posting lists: Holds the union of the posting lists read
+       // so far. Once remaining_trigrams_to_read == 0 (all are read), will be taken
+       // out and used for intersections against the other disjunctions.
+       std::vector<uint32_t> docids;
+
+       // While looking up in the hash table (filling out read_trigrams): Number of
+       // lookups in the hash table remaining. While reading actual posting lists
+       // (filling out docids): Number of posting lists left to read.
+       unsigned remaining_trigrams_to_read;
+};
+
+// Take the given needle (search string) and break it down into a set of trigrams
+// (or trigram alternatives; see TrigramDisjunction) that must be present for the
+// string to match. (Note: They are not _sufficient_ for the string to match;
+// false positives might very well occur and must be weeded out later.)
+//
+// For the case-sensitive case, this is straightforward; just take every trigram
+// present in the needle and add them (e.g. abcd -> abc AND bcd).
+// For case-insensitivity, it's trickier; see the comments in the function.
+//
+// Note that our trigrams are on the basis of bytes, not Unicode code points.
+// This both simplifies table structure (everything is the same length), and
+// guards us against trigram explosion (imagine every combination of CJK characters
+// getting their own trigram).
+void parse_trigrams(const std::string &needle, bool ignore_case, std::vector<TrigramDisjunction> *trigram_groups);
+
+uint32_t read_trigram(const std::string &s, size_t start);
+
+// For debugging.
+std::string print_td(const TrigramDisjunction &td);
+std::string print_trigram(uint32_t trgm);
+
+#endif  // !defined(_PARSE_TRIGRAMS_H)
diff --git a/plocate.cpp b/plocate.cpp

index d53c41c5ede0fb9f30f01dda384e9fbbdca8b585..f721b91264f9f2f6d642d0b628d2905a2c7aee21 100644 (file)
--- a/plocate.cpp
+++ b/plocate.cpp
@@ -1,8 +1,10 @@
  #include "db.h"
  #include "io_uring_engine.h"
+#include "parse_trigrams.h"
  #include "unique_sort.h"
  
  #include <algorithm>
+#include <assert.h>
  #include <chrono>
  #include <fcntl.h>
  #include <functional>
@@ -12,6 +14,7 @@
  #include <limits>
  #include <memory>
  #include <queue>
+#include <regex.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <stdlib.h>
@@ -20,6 +23,7 @@
  #include <string_view>
  #include <unistd.h>
  #include <unordered_map>
+#include <unordered_set>
  #include <utility>
  #include <vector>
  #include <zstd.h>
@@ -33,6 +37,7 @@ using namespace std::chrono;
  #include "turbopfor.h"
  
  const char *dbpath = "/var/lib/mlocate/plocate.db";
+bool ignore_case = false;
  bool only_count = false;
  bool print_nul = false;
  int64_t limit_matches = numeric_limits<int64_t>::max();
@@ -82,21 +87,23 @@ void Serializer::release_current()
         }
  }
  
-static inline uint32_t read_unigram(const string &s, size_t idx)
+struct Needle {
+       enum { STRSTR,
+              REGEX } type;
+       string str;  // Filled in no matter what.
+       regex_t re;  // For REGEX.
+};
+
+bool matches(const Needle &needle, const char *haystack)
  {
-       if (idx < s.size()) {
-               return (unsigned char)s[idx];
+       if (needle.type == Needle::STRSTR) {
+               return strstr(haystack, needle.str.c_str()) != nullptr;
         } else {
-               return 0;
+               assert(needle.type == Needle::REGEX);
+               return regexec(&needle.re, haystack, /*nmatch=*/0, /*pmatch=*/nullptr, /*flags=*/0) == 0;
         }
  }
  
-static inline uint32_t read_trigram(const string &s, size_t start)
-{
-       return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
-               (read_unigram(s, start + 2) << 16);
-}
-
  bool has_access(const char *filename,
                  unordered_map<string, bool> *access_rx_cache)
  {
@@ -202,7 +209,7 @@ size_t Corpus::get_num_filename_blocks() const
         return hdr.num_docids;
  }
  
-uint64_t scan_file_block(const vector<string> &needles, string_view compressed,
+uint64_t scan_file_block(const vector<Needle> &needles, string_view compressed,
                           unordered_map<string, bool> *access_rx_cache, int seq,
                           Serializer *serializer)
  {
@@ -232,8 +239,8 @@ uint64_t scan_file_block(const vector<string> &needles, string_view compressed,
              filename != block.data() + block.size();
              filename += strlen(filename) + 1) {
                 bool found = true;
-               for (const string &needle : needles) {
-                       if (strstr(filename, needle.c_str()) == nullptr) {
+               for (const Needle &needle : needles) {
+                       if (!matches(needle, filename)) {
                                 found = false;
                                 break;
                         }
@@ -265,7 +272,7 @@ uint64_t scan_file_block(const vector<string> &needles, string_view compressed,
         return matched;
  }
  
-size_t scan_docids(const vector<string> &needles, const vector<uint32_t> &docids, const Corpus &corpus, IOUringEngine *engine)
+size_t scan_docids(const vector<Needle> &needles, const vector<uint32_t> &docids, const Corpus &corpus, IOUringEngine *engine)
  {
         Serializer docids_in_order;
         unordered_map<string, bool> access_rx_cache;
@@ -283,7 +290,7 @@ size_t scan_docids(const vector<string> &needles, const vector<uint32_t> &docids
  // We do this sequentially, as it's faster than scattering
  // a lot of I/O through io_uring and hoping the kernel will
  // coalesce it plus readahead for us.
-uint64_t scan_all_docids(const vector<string> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
+uint64_t scan_all_docids(const vector<Needle> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
  {
         unordered_map<string, bool> access_rx_cache;
         uint32_t num_blocks = corpus.get_num_filename_blocks();
@@ -310,43 +317,54 @@ uint64_t scan_all_docids(const vector<string> &needles, int fd, const Corpus &co
         return matched;
  }
  
-// For debugging.
-string print_trigram(uint32_t trgm)
+// Takes the given posting list, unions it into the parts of the trigram disjunction
+// already read; if the list is complete, intersects with “cur_candidates”.
+//
+// Returns true if the search should be aborted (we are done).
+bool new_posting_list_read(TrigramDisjunction *td, vector<uint32_t> decoded, vector<uint32_t> *cur_candidates, vector<uint32_t> *tmp)
  {
-       char ch[3] = {
-               char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
-       };
-
-       string str = "'";
-       for (unsigned i = 0; i < 3;) {
-               if (ch[i] == '\\') {
-                       str.push_back('\\');
-                       str.push_back(ch[i]);
-                       ++i;
-               } else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) {  // Holds no matter whether char is signed or unsigned.
-                       str.push_back(ch[i]);
-                       ++i;
-               } else {
-                       // See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
-                       mbtowc(nullptr, 0, 0);
-                       wchar_t pwc;
-                       int ret = mbtowc(&pwc, ch + i, 3 - i);
-                       if (ret >= 1 && pwc >= 32) {
-                               str.append(ch + i, ret);
-                               i += ret;
+       if (td->docids.empty()) {
+               td->docids = move(decoded);
+       } else {
+               tmp->clear();
+               set_union(decoded.begin(), decoded.end(), td->docids.begin(), td->docids.end(), back_inserter(*tmp));
+               swap(*tmp, td->docids);
+       }
+       if (--td->remaining_trigrams_to_read > 0) {
+               // Need to wait for more.
+               if (ignore_case) {
+                       dprintf("  ... %u reads left in OR group %u (%zu docids in list)\n",
+                               td->remaining_trigrams_to_read, td->index, td->docids.size());
+               }
+               return false;
+       }
+       if (cur_candidates->empty()) {
+               if (ignore_case) {
+                       dprintf("  ... all reads done for OR group %u (%zu docids)\n",
+                               td->index, td->docids.size());
+               }
+               *cur_candidates = move(td->docids);
+       } else {
+               tmp->clear();
+               set_intersection(cur_candidates->begin(), cur_candidates->end(),
+                                td->docids.begin(), td->docids.end(),
+                                back_inserter(*tmp));
+               swap(*cur_candidates, *tmp);
+               if (ignore_case) {
+                       if (cur_candidates->empty()) {
+                               dprintf("  ... all reads done for OR group %u (%zu docids), intersected (none left, search is done)\n",
+                                       td->index, td->docids.size());
+                               return true;
                         } else {
-                               char buf[16];
-                               snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
-                               str += buf;
-                               ++i;
+                               dprintf("  ... all reads done for OR group %u (%zu docids), intersected (%zu left)\n",
+                                       td->index, td->docids.size(), cur_candidates->size());
                         }
                 }
         }
-       str += "'";
-       return str;
+       return false;
  }
  
-void do_search_file(const vector<string> &needles, const char *filename)
+void do_search_file(const vector<Needle> &needles, const char *filename)
  {
         int fd = open(filename, O_RDONLY);
         if (fd == -1) {
@@ -370,28 +388,31 @@ void do_search_file(const vector<string> &needles, const char *filename)
         Corpus corpus(fd, &engine);
         dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
  
-       vector<pair<Trigram, size_t>> trigrams;
-       for (const string &needle : needles) {
-               if (needle.size() < 3)
+       vector<TrigramDisjunction> trigram_groups;
+       for (const Needle &needle : needles) {
+               if (needle.str.size() < 3)
                         continue;
-               for (size_t i = 0; i < needle.size() - 2; ++i) {
-                       uint32_t trgm = read_trigram(needle, i);
-                       corpus.find_trigram(trgm, [trgm, &trigrams](const Trigram *trgmptr, size_t len) {
-                               if (trgmptr == nullptr) {
-                                       dprintf("trigram %s isn't found, we abort the search\n", print_trigram(trgm).c_str());
-                                       if (only_count) {
-                                               printf("0\n");
-                                       }
-                                       exit(0);
-                               }
-                               trigrams.emplace_back(*trgmptr, len);
-                       });
-               }
+               parse_trigrams(needle.str, ignore_case, &trigram_groups);
+       }
+       unique_sort(
+               &trigram_groups,
+               [](const TrigramDisjunction &a, const TrigramDisjunction &b) { return a.trigram_alternatives < b.trigram_alternatives; },
+               [](const TrigramDisjunction &a, const TrigramDisjunction &b) { return a.trigram_alternatives == b.trigram_alternatives; });
+
+       // Give them names for debugging.
+       unsigned td_index = 0;
+       for (TrigramDisjunction &td : trigram_groups) {
+               td.index = td_index++;
         }
-       engine.finish();
-       dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
  
-       if (trigrams.empty()) {
+       // Collect which trigrams we need to look up in the hash table.
+       unordered_map<uint32_t, vector<TrigramDisjunction *>> trigrams_to_lookup;
+       for (TrigramDisjunction &td : trigram_groups) {
+               for (uint32_t trgm : td.trigram_alternatives) {
+                       trigrams_to_lookup[trgm].push_back(&td);
+               }
+       }
+       if (trigrams_to_lookup.empty()) {
                 // Too short for trigram matching. Apply brute force.
                 // (We could have searched through all trigrams that matched
                 // the pattern and done a union of them, but that's a lot of
@@ -402,61 +423,113 @@ void do_search_file(const vector<string> &needles, const char *filename)
                 }
                 return;
         }
-       unique_sort(&trigrams);
-       sort(trigrams.begin(), trigrams.end(),
-            [&](const pair<Trigram, size_t> &a, const pair<Trigram, size_t> &b) {
-                    return a.first.num_docids < b.first.num_docids;
+
+       // Look them all up on disk.
+       for (auto &[trgm, trigram_groups] : trigrams_to_lookup) {
+               corpus.find_trigram(trgm, [trgm{ trgm }, trigram_groups{ &trigram_groups }](const Trigram *trgmptr, size_t len) {
+                       if (trgmptr == nullptr) {
+                               dprintf("trigram %s isn't found\n", print_trigram(trgm).c_str());
+                               for (TrigramDisjunction *td : *trigram_groups) {
+                                       --td->remaining_trigrams_to_read;
+                                       if (td->remaining_trigrams_to_read == 0 && td->read_trigrams.empty()) {
+                                               dprintf("zero matches in %s, so we are done\n", print_td(*td).c_str());
+                                               if (only_count) {
+                                                       printf("0\n");
+                                               }
+                                               exit(0);
+                                       }
+                               }
+                               return;
+                       }
+                       for (TrigramDisjunction *td : *trigram_groups) {
+                               --td->remaining_trigrams_to_read;
+                               td->max_num_docids += trgmptr->num_docids;
+                               td->read_trigrams.emplace_back(*trgmptr, len);
+                       }
+               });
+       }
+       engine.finish();
+       dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+
+       for (TrigramDisjunction &td : trigram_groups) {
+               // Reset for reads.
+               td.remaining_trigrams_to_read = td.read_trigrams.size();
+
+               if (ignore_case) {  // If case-sensitive, they'll all be pretty obvious single-entry groups.
+                       dprintf("OR group %u (max_num_docids=%u): %s\n", td.index, td.max_num_docids, print_td(td).c_str());
+               }
+       }
+
+       // TODO: For case-insensitive (ie. more than one alternative in each),
+       // prioritize the ones with fewer seeks?
+       sort(trigram_groups.begin(), trigram_groups.end(),
+            [&](const TrigramDisjunction &a, const TrigramDisjunction &b) {
+                    return a.max_num_docids < b.max_num_docids;
              });
  
-       vector<uint32_t> in1, in2, out;
+       unordered_map<uint32_t, vector<TrigramDisjunction *>> uses_trigram;
+       for (TrigramDisjunction &td : trigram_groups) {
+               for (uint32_t trgm : td.trigram_alternatives) {
+                       uses_trigram[trgm].push_back(&td);
+               }
+       }
+
+       unordered_set<uint32_t> trigrams_submitted_read;
+       vector<uint32_t> cur_candidates, tmp, decoded;
         bool done = false;
-       for (auto [trgmptr, len] : trigrams) {
-               if (!in1.empty() && trgmptr.num_docids > in1.size() * 100) {
-                       uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
-                       dprintf("trigram %s (%zu bytes) has %u entries, ignoring the rest (will "
+       for (TrigramDisjunction &td : trigram_groups) {
+               if (!cur_candidates.empty() && td.max_num_docids > cur_candidates.size() * 100) {
+                       dprintf("%s has up to %u entries, ignoring the rest (will "
                                 "weed out false positives later)\n",
-                               print_trigram(trgm).c_str(), len, trgmptr.num_docids);
+                               print_td(td).c_str(), td.max_num_docids);
                         break;
                 }
  
-               // Only stay a certain amount ahead, so that we don't spend I/O
-               // on reading the latter, large posting lists. We are unlikely
-               // to need them anyway, even if they should come in first.
-               if (engine.get_waiting_reads() >= 5) {
-                       engine.finish();
-                       if (done)
-                               break;
-               }
-               engine.submit_read(fd, len, trgmptr.offset, [trgmptr{ trgmptr }, len{ len }, &done, &in1, &in2, &out](string_view s) {
-                       if (done)
-                               return;
-                       uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
-                       size_t num = trgmptr.num_docids;
-                       const unsigned char *pldata = reinterpret_cast<const unsigned char *>(s.data());
-                       if (in1.empty()) {
-                               in1.resize(num + 128);
-                               decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &in1[0]);
-                               in1.resize(num);
-                               dprintf("trigram %s (%zu bytes) decoded to %zu entries\n",
-                                       print_trigram(trgm).c_str(), len, num);
-                       } else {
-                               if (in2.size() < num + 128) {
-                                       in2.resize(num + 128);
+               for (auto &[trgmptr, len] : td.read_trigrams) {
+                       if (trigrams_submitted_read.count(trgmptr.trgm) != 0) {
+                               continue;
+                       }
+                       trigrams_submitted_read.insert(trgmptr.trgm);
+                       // Only stay a certain amount ahead, so that we don't spend I/O
+                       // on reading the latter, large posting lists. We are unlikely
+                       // to need them anyway, even if they should come in first.
+                       if (engine.get_waiting_reads() >= 5) {
+                               engine.finish();
+                               if (done)
+                                       break;
+                       }
+                       engine.submit_read(fd, len, trgmptr.offset, [trgmptr{ trgmptr }, len{ len }, &done, &cur_candidates, &tmp, &decoded, &uses_trigram](string_view s) {
+                               if (done)
+                                       return;
+
+                               uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
+                               const unsigned char *pldata = reinterpret_cast<const unsigned char *>(s.data());
+                               size_t num = trgmptr.num_docids;
+                               decoded.resize(num);
+                               decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &decoded[0]);
+
+                               assert(uses_trigram.count(trgm) != 0);
+                               bool was_empty = cur_candidates.empty();
+                               if (ignore_case) {
+                                       dprintf("trigram %s (%zu bytes) decoded to %zu entries\n", print_trigram(trgm).c_str(), len, num);
                                 }
-                               decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &in2[0]);
-
-                               out.clear();
-                               set_intersection(in1.begin(), in1.end(), in2.begin(), in2.begin() + num,
-                                                back_inserter(out));
-                               swap(in1, out);
-                               dprintf("trigram %s (%zu bytes) decoded to %zu entries, %zu left\n",
-                                       print_trigram(trgm).c_str(), len, num, in1.size());
-                               if (in1.empty()) {
-                                       dprintf("no matches (intersection list is empty)\n");
-                                       done = true;
+
+                               for (TrigramDisjunction *td : uses_trigram[trgm]) {
+                                       done |= new_posting_list_read(td, decoded, &cur_candidates, &tmp);
+                                       if (done)
+                                               break;
                                 }
-                       }
-               });
+                               if (!ignore_case) {
+                                       if (was_empty) {
+                                               dprintf("trigram %s (%zu bytes) decoded to %zu entries\n", print_trigram(trgm).c_str(), len, num);
+                                       } else if (cur_candidates.empty()) {
+                                               dprintf("trigram %s (%zu bytes) decoded to %zu entries (none left, search is done)\n", print_trigram(trgm).c_str(), len, num);
+                                       } else {
+                                               dprintf("trigram %s (%zu bytes) decoded to %zu entries (%zu left)\n", print_trigram(trgm).c_str(), len, num, cur_candidates.size());
+                                       }
+                               }
+                       });
+               }
         }
         engine.finish();
         if (done) {
@@ -465,7 +538,7 @@ void do_search_file(const vector<string> &needles, const char *filename)
         dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
                 1e3 * duration<float>(steady_clock::now() - start).count());
  
-       uint64_t matched = scan_docids(needles, in1, corpus, &engine);
+       uint64_t matched = scan_docids(needles, cur_candidates, corpus, &engine);
         dprintf("Done in %.1f ms, found %zu matches.\n",
                 1e3 * duration<float>(steady_clock::now() - start).count(), matched);
  
@@ -474,6 +547,46 @@ void do_search_file(const vector<string> &needles, const char *filename)
         }
  }
  
+regex_t needle_to_regex(const string &needle)
+{
+       string escaped_needle;
+       for (char ch : needle) {
+               switch (ch) {
+               // Directly from what regex(7) considers an “atom”.
+               case '^':
+               case '.':
+               case '[':
+               case '$':
+               case '(':
+               case ')':
+               case '|':
+               case '*':
+               case '+':
+               case '?':
+               case '{':
+               case '\\':
+                       escaped_needle.push_back('\\');
+                       // Fall through.
+               default:
+                       escaped_needle.push_back(ch);
+               }
+       }
+       regex_t re;
+       int err;
+       if (ignore_case) {
+               err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB | REG_ICASE);
+       } else {
+               err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB);
+       }
+       if (err != 0) {
+               char errbuf[256];
+               regerror(err, &re, errbuf, sizeof(errbuf));
+               fprintf(stderr, "Error when compiling regex for '%s': %s\n", needle.c_str(), errbuf);
+               exit(1);
+       }
+       return re;
+}
+
  void usage()
  {
         // The help text comes from mlocate.
@@ -483,6 +596,7 @@ void usage()
         printf("  -d, --database DBPATH  use DBPATH instead of default database (which is\n");
         printf("                         %s)\n", dbpath);
         printf("  -h, --help             print this help\n");
+       printf("  -i, --ignore-case      ignore case distinctions when matching patterns\n");
         printf("  -l, --limit, -n LIMIT  limit output (or counting) to LIMIT entries\n");
         printf("  -0, --null             separate entries with NUL on output\n");
  }
@@ -493,6 +607,7 @@ int main(int argc, char **argv)
                 { "help", no_argument, 0, 'h' },
                 { "count", no_argument, 0, 'c' },
                 { "database", required_argument, 0, 'd' },
+               { "ignore-case", no_argument, 0, 'i' },
                 { "limit", required_argument, 0, 'l' },
                 { nullptr, required_argument, 0, 'n' },
                 { "null", no_argument, 0, '0' },
@@ -502,7 +617,7 @@ int main(int argc, char **argv)
         setlocale(LC_ALL, "");
         for (;;) {
                 int option_index = 0;
-               int c = getopt_long(argc, argv, "cd:hl:n:0", long_options, &option_index);
+               int c = getopt_long(argc, argv, "cd:hil:n:0", long_options, &option_index);
                 if (c == -1) {
                         break;
                 }
@@ -516,6 +631,9 @@ int main(int argc, char **argv)
                 case 'h':
                         usage();
                         exit(0);
+               case 'i':
+                       ignore_case = true;
+                       break;
                 case 'l':
                 case 'n':
                         limit_matches = atoll(optarg);
@@ -528,9 +646,22 @@ int main(int argc, char **argv)
                 }
         }
  
-       vector<string> needles;
+       vector<Needle> needles;
         for (int i = optind; i < argc; ++i) {
-               needles.push_back(argv[i]);
+               Needle needle;
+               needle.str = argv[i];
+               if (ignore_case) {
+                       // strcasestr() doesn't handle locales correctly (even though LSB
+                       // claims it should), but somehow, the glibc regex engine does.
+                       // It's much slower than strstr() for non-case-sensitive searches, though
+                       // (even though it really ought to be faster, since it can precompile),
+                       // so only use it for that.
+                       needle.type = Needle::REGEX;
+                       needle.re = needle_to_regex(argv[i]);
+               } else {
+                       needle.type = Needle::STRSTR;
+               }
+               needles.push_back(move(needle));
         }
         if (needles.empty()) {
                 fprintf(stderr, "plocate: no pattern to search for specified\n");
author	Steinar H. Gunderson <steinar+git@gunderson.no>
	Sat, 10 Oct 2020 09:36:17 +0000 (11:36 +0200)
committer	Steinar H. Gunderson <steinar+git@gunderson.no>
	Sat, 10 Oct 2020 17:20:48 +0000 (19:20 +0200)
meson.build		patch \| blob \| history
parse_trigrams.cpp	[new file with mode: 0644]	patch \| blob
parse_trigrams.h	[new file with mode: 0644]	patch \| blob
plocate.cpp		patch \| blob \| history