From efd7545c8ee2177aa13cf3ec8423d0e725c6a16d Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <steinar+git@gunderson.no>
Date: Sat, 10 Oct 2020 11:36:17 +0200
Subject: [PATCH] Support case-insensitive searches.

Without changing the database format, this causes a bunch of extra
lookups. But somehow, it appears to go fairly well in practice.
Of course, case-sensitive will always be faster.
---
 meson.build        |   2 +-
 parse_trigrams.cpp | 242 ++++++++++++++++++++++++++++++
 parse_trigrams.h   |  62 ++++++++
 plocate.cpp        | 359 +++++++++++++++++++++++++++++++--------------
 4 files changed, 550 insertions(+), 115 deletions(-)
 create mode 100644 parse_trigrams.cpp
 create mode 100644 parse_trigrams.h

diff --git a/meson.build b/meson.build
index eca0297..070a804 100644
--- a/meson.build
+++ b/meson.build
@@ -8,7 +8,7 @@ if not uringdep.found()
 	add_project_arguments('-DWITHOUT_URING', language: 'cpp')
 endif
 
-executable('plocate', ['plocate.cpp', 'io_uring_engine.cpp', 'turbopfor.cpp'],
+executable('plocate', ['plocate.cpp', 'io_uring_engine.cpp', 'turbopfor.cpp', 'parse_trigrams.cpp'],
 	dependencies: [uringdep, zstddep],
 	install: true,
 	install_mode: ['rwxr-sr-x', 'root', 'mlocate'])
diff --git a/parse_trigrams.cpp b/parse_trigrams.cpp
new file mode 100644
index 0000000..03fbc6b
--- /dev/null
+++ b/parse_trigrams.cpp
@@ -0,0 +1,242 @@
+#include "parse_trigrams.h"
+
+#include "unique_sort.h"
+
+#include <string.h>
+#include <wctype.h>
+
+using namespace std;
+
+string print_td(const TrigramDisjunction &td)
+{
+	if (td.read_trigrams.size() == 0) {
+		// Before we've done hash lookups (or none matched), so print all alternatives.
+		if (td.trigram_alternatives.size() == 1) {
+			return print_trigram(td.trigram_alternatives[0]);
+		} else {
+			string ret;
+			ret = "(";
+			bool first = true;
+			for (uint32_t trgm : td.trigram_alternatives) {
+				if (!first)
+					ret += " OR ";
+				ret += print_trigram(trgm);
+				first = false;
+			}
+			return ret + ")";
+		}
+	} else {
+		// Print only those that we actually have in the index.
+		if (td.read_trigrams.size() == 1) {
+			return print_trigram(td.read_trigrams[0].first.trgm);
+		} else {
+			string ret;
+			ret = "(";
+			bool first = true;
+			for (auto &[trgmptr, len] : td.read_trigrams) {
+				if (!first)
+					ret += " OR ";
+				ret += print_trigram(trgmptr.trgm);
+				first = false;
+			}
+			return ret + ")";
+		}
+	}
+}
+
+string print_trigram(uint32_t trgm)
+{
+	char ch[3] = {
+		char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
+	};
+
+	string str = "'";
+	for (unsigned i = 0; i < 3;) {
+		if (ch[i] == '\\') {
+			str.push_back('\\');
+			str.push_back(ch[i]);
+			++i;
+		} else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) {  // Holds no matter whether char is signed or unsigned.
+			str.push_back(ch[i]);
+			++i;
+		} else {
+			// See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
+			mbtowc(nullptr, 0, 0);
+			wchar_t pwc;
+			int ret = mbtowc(&pwc, ch + i, 3 - i);
+			if (ret >= 1 && pwc >= 32) {
+				str.append(ch + i, ret);
+				i += ret;
+			} else {
+				char buf[16];
+				snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
+				str += buf;
+				++i;
+			}
+		}
+	}
+	str += "'";
+	return str;
+}
+
+uint32_t read_unigram(const string &s, size_t idx)
+{
+	if (idx < s.size()) {
+		return (unsigned char)s[idx];
+	} else {
+		return 0;
+	}
+}
+
+uint32_t read_trigram(const string &s, size_t start)
+{
+	return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
+		(read_unigram(s, start + 2) << 16);
+}
+
+struct TrigramState {
+	string buffered;
+	unsigned next_codepoint;
+
+	bool operator<(const TrigramState &other) const
+	{
+		if (next_codepoint != other.next_codepoint)
+			return next_codepoint < other.next_codepoint;
+		return buffered < other.buffered;
+	}
+	bool operator==(const TrigramState &other) const
+	{
+		return next_codepoint == other.next_codepoint &&
+			buffered == other.buffered;
+	}
+};
+
+void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction> *trigram_groups)
+{
+	vector<vector<string>> alternatives_for_cp;
+
+	// Parse the needle into Unicode code points, and do inverse case folding
+	// on each to find legal alternatives. This is far from perfect (e.g. Ã
+	// will not become ss), but it's generally the best we can do without
+	// involving ICU or the likes.
+	mbtowc(nullptr, 0, 0);
+	const char *ptr = needle.c_str();
+	while (*ptr != '\0') {
+		wchar_t ch;
+		int ret = mbtowc(&ch, ptr, strlen(ptr));
+		if (ret == -1) {
+			perror(ptr);
+			exit(1);
+		}
+
+		char buf[MB_CUR_MAX];
+		vector<string> alt;
+		alt.push_back(string(ptr, ret));
+		ptr += ret;
+		if (towlower(ch) != wint_t(ch)) {
+			ret = wctomb(buf, towlower(ch));
+			alt.push_back(string(buf, ret));
+		}
+		if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
+			ret = wctomb(buf, towupper(ch));
+			alt.push_back(string(buf, ret));
+		}
+		alternatives_for_cp.push_back(move(alt));
+	}
+
+	// Now generate all possible byte strings from those code points in order;
+	// e.g., from abc, we'd create a and A, then extend those to ab aB Ab AB,
+	// then abc abC aBc aBC and so on. Since we don't want to have 2^n
+	// (or even 3^n) strings, we only extend them far enough to cover at
+	// least three bytes; this will give us a set of candidate trigrams
+	// (the filename must have at least one of those), and then we can
+	// chop off the first byte, deduplicate states and continue extending
+	// and generating trigram sets.
+	//
+	// There are a few special cases, notably the dotted i (Ä°), where the
+	// UTF-8 versions of upper and lower case have different number of bytes.
+	// If this happens, we can have combinatorial explosion and get many more
+	// than the normal 8 states. We detect this and simply bomb out; it will
+	// never really happen in real strings, and stopping trigram generation
+	// really only means our pruning of candidates will be less effective.
+	vector<TrigramState> states;
+	states.push_back(TrigramState{ "", 0 });
+
+	for (;;) {
+		// Extend every state so that it has buffered at least three bytes.
+		// If this isn't possible, we are done with the string (can generate
+		// no more trigrams).
+		bool need_another_pass;
+		do {
+			need_another_pass = false;
+			vector<TrigramState> new_states;
+			for (const TrigramState &state : states) {
+				if (state.buffered.size() >= 3) {
+					// No need to extend this further.
+					new_states.push_back(state);
+					continue;
+				}
+				if (state.next_codepoint == alternatives_for_cp.size()) {
+					// We can't form a complete trigram from this alternative,
+					// so we're done.
+					return;
+				}
+				for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
+					TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
+					if (new_state.buffered.size() < 3) {
+						need_another_pass = true;
+					}
+					new_states.push_back(move(new_state));
+				}
+			}
+			states = move(new_states);
+		} while (need_another_pass);
+
+		// OK, so now we have a bunch of states, and all of them are at least
+		// three bytes long. This means we have a complete set of trigrams,
+		// and the destination filename must contain at least one of them.
+		// Output those trigrams, cut out the first byte and then deduplicate
+		// the states before we continue.
+		vector<uint32_t> trigram_alternatives;
+		for (TrigramState &state : states) {
+			trigram_alternatives.push_back(read_trigram(state.buffered, 0));
+			state.buffered.erase(0, 1);
+		}
+		unique_sort(&trigram_alternatives);  // Could have duplicates, although it's rare.
+		unique_sort(&states);
+
+		TrigramDisjunction new_pt;
+		new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
+		new_pt.trigram_alternatives = move(trigram_alternatives);
+		new_pt.max_num_docids = 0;
+		trigram_groups->push_back(move(new_pt));
+
+		if (states.size() > 100) {
+			// A completely crazy pattern with lots of those special characters.
+			// We just give up; this isn't a realistic scenario anyway.
+			// We already have lots of trigrams that should reduce the amount of
+			// candidates.
+			return;
+		}
+	}
+}
+
+void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjunction> *trigram_groups)
+{
+	if (ignore_case) {
+		parse_trigrams_ignore_case(needle, trigram_groups);
+		return;
+	}
+
+	// The case-sensitive case is straightforward.
+	if (needle.size() >= 3) {
+		for (size_t i = 0; i < needle.size() - 2; ++i) {
+			uint32_t trgm = read_trigram(needle, i);
+			TrigramDisjunction new_pt;
+			new_pt.remaining_trigrams_to_read = 1;
+			new_pt.trigram_alternatives.push_back(trgm);
+			new_pt.max_num_docids = 0;
+			trigram_groups->push_back(move(new_pt));
+		}
+	}
+}
diff --git a/parse_trigrams.h b/parse_trigrams.h
new file mode 100644
index 0000000..810d005
--- /dev/null
+++ b/parse_trigrams.h
@@ -0,0 +1,62 @@
+#ifndef _PARSE_TRIGRAMS_H
+#define _PARSE_TRIGRAMS_H 1
+
+#include "db.h"
+
+#include <stdint.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+// One or more trigrams, with an implicit OR between them. For case-sensitive searches,
+// this is just e.g. âabcâ, but for case-insensitive, it would be âabc OR abC or aBc ...â etc.
+struct TrigramDisjunction {
+	unsigned index;  // For debugging only.
+
+	// The alternatives as determined by parse_trigrams().
+	std::vector<uint32_t> trigram_alternatives;
+
+	// Like trigram_alternatives, but only the ones we've actually read from the
+	// hash table (the non-existent ones are filtered out). The second member is
+	// the length in bytes. Incomplete if remaining_trigrams_to_read > 0.
+	std::vector<std::pair<Trigram, size_t>> read_trigrams;
+
+	// Sum of num_docids in all trigrams. This is usually a fairly good indicator
+	// of the real number of docids, since there are few files that would have e.g.
+	// both abc and abC in them (but of course, with multiple files in the same
+	// docid block, it is far from unheard of).
+	uint32_t max_num_docids;
+
+	// While reading posting lists: Holds the union of the posting lists read
+	// so far. Once remaining_trigrams_to_read == 0 (all are read), will be taken
+	// out and used for intersections against the other disjunctions.
+	std::vector<uint32_t> docids;
+
+	// While looking up in the hash table (filling out read_trigrams): Number of
+	// lookups in the hash table remaining. While reading actual posting lists
+	// (filling out docids): Number of posting lists left to read.
+	unsigned remaining_trigrams_to_read;
+};
+
+// Take the given needle (search string) and break it down into a set of trigrams
+// (or trigram alternatives; see TrigramDisjunction) that must be present for the
+// string to match. (Note: They are not _sufficient_ for the string to match;
+// false positives might very well occur and must be weeded out later.)
+//
+// For the case-sensitive case, this is straightforward; just take every trigram
+// present in the needle and add them (e.g. abcd -> abc AND bcd).
+// For case-insensitivity, it's trickier; see the comments in the function.
+//
+// Note that our trigrams are on the basis of bytes, not Unicode code points.
+// This both simplifies table structure (everything is the same length), and
+// guards us against trigram explosion (imagine every combination of CJK characters
+// getting their own trigram).
+void parse_trigrams(const std::string &needle, bool ignore_case, std::vector<TrigramDisjunction> *trigram_groups);
+
+uint32_t read_trigram(const std::string &s, size_t start);
+
+// For debugging.
+std::string print_td(const TrigramDisjunction &td);
+std::string print_trigram(uint32_t trgm);
+
+#endif  // !defined(_PARSE_TRIGRAMS_H)
diff --git a/plocate.cpp b/plocate.cpp
index d53c41c..f721b91 100644
--- a/plocate.cpp
+++ b/plocate.cpp
@@ -1,8 +1,10 @@
 #include "db.h"
 #include "io_uring_engine.h"
+#include "parse_trigrams.h"
 #include "unique_sort.h"
 
 #include <algorithm>
+#include <assert.h>
 #include <chrono>
 #include <fcntl.h>
 #include <functional>
@@ -12,6 +14,7 @@
 #include <limits>
 #include <memory>
 #include <queue>
+#include <regex.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -20,6 +23,7 @@
 #include <string_view>
 #include <unistd.h>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 #include <zstd.h>
@@ -33,6 +37,7 @@ using namespace std::chrono;
 #include "turbopfor.h"
 
 const char *dbpath = "/var/lib/mlocate/plocate.db";
+bool ignore_case = false;
 bool only_count = false;
 bool print_nul = false;
 int64_t limit_matches = numeric_limits<int64_t>::max();
@@ -82,21 +87,23 @@ void Serializer::release_current()
 	}
 }
 
-static inline uint32_t read_unigram(const string &s, size_t idx)
+struct Needle {
+	enum { STRSTR,
+	       REGEX } type;
+	string str;  // Filled in no matter what.
+	regex_t re;  // For REGEX.
+};
+
+bool matches(const Needle &needle, const char *haystack)
 {
-	if (idx < s.size()) {
-		return (unsigned char)s[idx];
+	if (needle.type == Needle::STRSTR) {
+		return strstr(haystack, needle.str.c_str()) != nullptr;
 	} else {
-		return 0;
+		assert(needle.type == Needle::REGEX);
+		return regexec(&needle.re, haystack, /*nmatch=*/0, /*pmatch=*/nullptr, /*flags=*/0) == 0;
 	}
 }
 
-static inline uint32_t read_trigram(const string &s, size_t start)
-{
-	return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
-		(read_unigram(s, start + 2) << 16);
-}
-
 bool has_access(const char *filename,
                 unordered_map<string, bool> *access_rx_cache)
 {
@@ -202,7 +209,7 @@ size_t Corpus::get_num_filename_blocks() const
 	return hdr.num_docids;
 }
 
-uint64_t scan_file_block(const vector<string> &needles, string_view compressed,
+uint64_t scan_file_block(const vector<Needle> &needles, string_view compressed,
                          unordered_map<string, bool> *access_rx_cache, int seq,
                          Serializer *serializer)
 {
@@ -232,8 +239,8 @@ uint64_t scan_file_block(const vector<string> &needles, string_view compressed,
 	     filename != block.data() + block.size();
 	     filename += strlen(filename) + 1) {
 		bool found = true;
-		for (const string &needle : needles) {
-			if (strstr(filename, needle.c_str()) == nullptr) {
+		for (const Needle &needle : needles) {
+			if (!matches(needle, filename)) {
 				found = false;
 				break;
 			}
@@ -265,7 +272,7 @@ uint64_t scan_file_block(const vector<string> &needles, string_view compressed,
 	return matched;
 }
 
-size_t scan_docids(const vector<string> &needles, const vector<uint32_t> &docids, const Corpus &corpus, IOUringEngine *engine)
+size_t scan_docids(const vector<Needle> &needles, const vector<uint32_t> &docids, const Corpus &corpus, IOUringEngine *engine)
 {
 	Serializer docids_in_order;
 	unordered_map<string, bool> access_rx_cache;
@@ -283,7 +290,7 @@ size_t scan_docids(const vector<string> &needles, const vector<uint32_t> &docids
 // We do this sequentially, as it's faster than scattering
 // a lot of I/O through io_uring and hoping the kernel will
 // coalesce it plus readahead for us.
-uint64_t scan_all_docids(const vector<string> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
+uint64_t scan_all_docids(const vector<Needle> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
 {
 	unordered_map<string, bool> access_rx_cache;
 	uint32_t num_blocks = corpus.get_num_filename_blocks();
@@ -310,43 +317,54 @@ uint64_t scan_all_docids(const vector<string> &needles, int fd, const Corpus &co
 	return matched;
 }
 
-// For debugging.
-string print_trigram(uint32_t trgm)
+// Takes the given posting list, unions it into the parts of the trigram disjunction
+// already read; if the list is complete, intersects with âcur_candidatesâ.
+//
+// Returns true if the search should be aborted (we are done).
+bool new_posting_list_read(TrigramDisjunction *td, vector<uint32_t> decoded, vector<uint32_t> *cur_candidates, vector<uint32_t> *tmp)
 {
-	char ch[3] = {
-		char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
-	};
-
-	string str = "'";
-	for (unsigned i = 0; i < 3;) {
-		if (ch[i] == '\\') {
-			str.push_back('\\');
-			str.push_back(ch[i]);
-			++i;
-		} else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) {  // Holds no matter whether char is signed or unsigned.
-			str.push_back(ch[i]);
-			++i;
-		} else {
-			// See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
-			mbtowc(nullptr, 0, 0);
-			wchar_t pwc;
-			int ret = mbtowc(&pwc, ch + i, 3 - i);
-			if (ret >= 1 && pwc >= 32) {
-				str.append(ch + i, ret);
-				i += ret;
+	if (td->docids.empty()) {
+		td->docids = move(decoded);
+	} else {
+		tmp->clear();
+		set_union(decoded.begin(), decoded.end(), td->docids.begin(), td->docids.end(), back_inserter(*tmp));
+		swap(*tmp, td->docids);
+	}
+	if (--td->remaining_trigrams_to_read > 0) {
+		// Need to wait for more.
+		if (ignore_case) {
+			dprintf("  ... %u reads left in OR group %u (%zu docids in list)\n",
+				td->remaining_trigrams_to_read, td->index, td->docids.size());
+		}
+		return false;
+	}
+	if (cur_candidates->empty()) {
+		if (ignore_case) {
+			dprintf("  ... all reads done for OR group %u (%zu docids)\n",
+				td->index, td->docids.size());
+		}
+		*cur_candidates = move(td->docids);
+	} else {
+		tmp->clear();
+		set_intersection(cur_candidates->begin(), cur_candidates->end(),
+		                 td->docids.begin(), td->docids.end(),
+		                 back_inserter(*tmp));
+		swap(*cur_candidates, *tmp);
+		if (ignore_case) {
+			if (cur_candidates->empty()) {
+				dprintf("  ... all reads done for OR group %u (%zu docids), intersected (none left, search is done)\n",
+					td->index, td->docids.size());
+				return true;
 			} else {
-				char buf[16];
-				snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
-				str += buf;
-				++i;
+				dprintf("  ... all reads done for OR group %u (%zu docids), intersected (%zu left)\n",
+					td->index, td->docids.size(), cur_candidates->size());
 			}
 		}
 	}
-	str += "'";
-	return str;
+	return false;
 }
 
-void do_search_file(const vector<string> &needles, const char *filename)
+void do_search_file(const vector<Needle> &needles, const char *filename)
 {
 	int fd = open(filename, O_RDONLY);
 	if (fd == -1) {
@@ -370,28 +388,31 @@ void do_search_file(const vector<string> &needles, const char *filename)
 	Corpus corpus(fd, &engine);
 	dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
 
-	vector<pair<Trigram, size_t>> trigrams;
-	for (const string &needle : needles) {
-		if (needle.size() < 3)
+	vector<TrigramDisjunction> trigram_groups;
+	for (const Needle &needle : needles) {
+		if (needle.str.size() < 3)
 			continue;
-		for (size_t i = 0; i < needle.size() - 2; ++i) {
-			uint32_t trgm = read_trigram(needle, i);
-			corpus.find_trigram(trgm, [trgm, &trigrams](const Trigram *trgmptr, size_t len) {
-				if (trgmptr == nullptr) {
-					dprintf("trigram %s isn't found, we abort the search\n", print_trigram(trgm).c_str());
-					if (only_count) {
-						printf("0\n");
-					}
-					exit(0);
-				}
-				trigrams.emplace_back(*trgmptr, len);
-			});
-		}
+		parse_trigrams(needle.str, ignore_case, &trigram_groups);
+	}
+	unique_sort(
+		&trigram_groups,
+		[](const TrigramDisjunction &a, const TrigramDisjunction &b) { return a.trigram_alternatives < b.trigram_alternatives; },
+		[](const TrigramDisjunction &a, const TrigramDisjunction &b) { return a.trigram_alternatives == b.trigram_alternatives; });
+
+	// Give them names for debugging.
+	unsigned td_index = 0;
+	for (TrigramDisjunction &td : trigram_groups) {
+		td.index = td_index++;
 	}
-	engine.finish();
-	dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
 
-	if (trigrams.empty()) {
+	// Collect which trigrams we need to look up in the hash table.
+	unordered_map<uint32_t, vector<TrigramDisjunction *>> trigrams_to_lookup;
+	for (TrigramDisjunction &td : trigram_groups) {
+		for (uint32_t trgm : td.trigram_alternatives) {
+			trigrams_to_lookup[trgm].push_back(&td);
+		}
+	}
+	if (trigrams_to_lookup.empty()) {
 		// Too short for trigram matching. Apply brute force.
 		// (We could have searched through all trigrams that matched
 		// the pattern and done a union of them, but that's a lot of
@@ -402,61 +423,113 @@ void do_search_file(const vector<string> &needles, const char *filename)
 		}
 		return;
 	}
-	unique_sort(&trigrams);
-	sort(trigrams.begin(), trigrams.end(),
-	     [&](const pair<Trigram, size_t> &a, const pair<Trigram, size_t> &b) {
-		     return a.first.num_docids < b.first.num_docids;
+
+	// Look them all up on disk.
+	for (auto &[trgm, trigram_groups] : trigrams_to_lookup) {
+		corpus.find_trigram(trgm, [trgm{ trgm }, trigram_groups{ &trigram_groups }](const Trigram *trgmptr, size_t len) {
+			if (trgmptr == nullptr) {
+				dprintf("trigram %s isn't found\n", print_trigram(trgm).c_str());
+				for (TrigramDisjunction *td : *trigram_groups) {
+					--td->remaining_trigrams_to_read;
+					if (td->remaining_trigrams_to_read == 0 && td->read_trigrams.empty()) {
+						dprintf("zero matches in %s, so we are done\n", print_td(*td).c_str());
+						if (only_count) {
+							printf("0\n");
+						}
+						exit(0);
+					}
+				}
+				return;
+			}
+			for (TrigramDisjunction *td : *trigram_groups) {
+				--td->remaining_trigrams_to_read;
+				td->max_num_docids += trgmptr->num_docids;
+				td->read_trigrams.emplace_back(*trgmptr, len);
+			}
+		});
+	}
+	engine.finish();
+	dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+
+	for (TrigramDisjunction &td : trigram_groups) {
+		// Reset for reads.
+		td.remaining_trigrams_to_read = td.read_trigrams.size();
+
+		if (ignore_case) {  // If case-sensitive, they'll all be pretty obvious single-entry groups.
+			dprintf("OR group %u (max_num_docids=%u): %s\n", td.index, td.max_num_docids, print_td(td).c_str());
+		}
+	}
+
+	// TODO: For case-insensitive (ie. more than one alternative in each),
+	// prioritize the ones with fewer seeks?
+	sort(trigram_groups.begin(), trigram_groups.end(),
+	     [&](const TrigramDisjunction &a, const TrigramDisjunction &b) {
+		     return a.max_num_docids < b.max_num_docids;
 	     });
 
-	vector<uint32_t> in1, in2, out;
+	unordered_map<uint32_t, vector<TrigramDisjunction *>> uses_trigram;
+	for (TrigramDisjunction &td : trigram_groups) {
+		for (uint32_t trgm : td.trigram_alternatives) {
+			uses_trigram[trgm].push_back(&td);
+		}
+	}
+
+	unordered_set<uint32_t> trigrams_submitted_read;
+	vector<uint32_t> cur_candidates, tmp, decoded;
 	bool done = false;
-	for (auto [trgmptr, len] : trigrams) {
-		if (!in1.empty() && trgmptr.num_docids > in1.size() * 100) {
-			uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
-			dprintf("trigram %s (%zu bytes) has %u entries, ignoring the rest (will "
+	for (TrigramDisjunction &td : trigram_groups) {
+		if (!cur_candidates.empty() && td.max_num_docids > cur_candidates.size() * 100) {
+			dprintf("%s has up to %u entries, ignoring the rest (will "
 			        "weed out false positives later)\n",
-			        print_trigram(trgm).c_str(), len, trgmptr.num_docids);
+			        print_td(td).c_str(), td.max_num_docids);
 			break;
 		}
 
-		// Only stay a certain amount ahead, so that we don't spend I/O
-		// on reading the latter, large posting lists. We are unlikely
-		// to need them anyway, even if they should come in first.
-		if (engine.get_waiting_reads() >= 5) {
-			engine.finish();
-			if (done)
-				break;
-		}
-		engine.submit_read(fd, len, trgmptr.offset, [trgmptr{ trgmptr }, len{ len }, &done, &in1, &in2, &out](string_view s) {
-			if (done)
-				return;
-			uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
-			size_t num = trgmptr.num_docids;
-			const unsigned char *pldata = reinterpret_cast<const unsigned char *>(s.data());
-			if (in1.empty()) {
-				in1.resize(num + 128);
-				decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &in1[0]);
-				in1.resize(num);
-				dprintf("trigram %s (%zu bytes) decoded to %zu entries\n",
-				        print_trigram(trgm).c_str(), len, num);
-			} else {
-				if (in2.size() < num + 128) {
-					in2.resize(num + 128);
+		for (auto &[trgmptr, len] : td.read_trigrams) {
+			if (trigrams_submitted_read.count(trgmptr.trgm) != 0) {
+				continue;
+			}
+			trigrams_submitted_read.insert(trgmptr.trgm);
+			// Only stay a certain amount ahead, so that we don't spend I/O
+			// on reading the latter, large posting lists. We are unlikely
+			// to need them anyway, even if they should come in first.
+			if (engine.get_waiting_reads() >= 5) {
+				engine.finish();
+				if (done)
+					break;
+			}
+			engine.submit_read(fd, len, trgmptr.offset, [trgmptr{ trgmptr }, len{ len }, &done, &cur_candidates, &tmp, &decoded, &uses_trigram](string_view s) {
+				if (done)
+					return;
+
+				uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
+				const unsigned char *pldata = reinterpret_cast<const unsigned char *>(s.data());
+				size_t num = trgmptr.num_docids;
+				decoded.resize(num);
+				decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &decoded[0]);
+
+				assert(uses_trigram.count(trgm) != 0);
+				bool was_empty = cur_candidates.empty();
+				if (ignore_case) {
+					dprintf("trigram %s (%zu bytes) decoded to %zu entries\n", print_trigram(trgm).c_str(), len, num);
 				}
-				decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &in2[0]);
-
-				out.clear();
-				set_intersection(in1.begin(), in1.end(), in2.begin(), in2.begin() + num,
-				                 back_inserter(out));
-				swap(in1, out);
-				dprintf("trigram %s (%zu bytes) decoded to %zu entries, %zu left\n",
-				        print_trigram(trgm).c_str(), len, num, in1.size());
-				if (in1.empty()) {
-					dprintf("no matches (intersection list is empty)\n");
-					done = true;
+
+				for (TrigramDisjunction *td : uses_trigram[trgm]) {
+					done |= new_posting_list_read(td, decoded, &cur_candidates, &tmp);
+					if (done)
+						break;
 				}
-			}
-		});
+				if (!ignore_case) {
+					if (was_empty) {
+						dprintf("trigram %s (%zu bytes) decoded to %zu entries\n", print_trigram(trgm).c_str(), len, num);
+					} else if (cur_candidates.empty()) {
+						dprintf("trigram %s (%zu bytes) decoded to %zu entries (none left, search is done)\n", print_trigram(trgm).c_str(), len, num);
+					} else {
+						dprintf("trigram %s (%zu bytes) decoded to %zu entries (%zu left)\n", print_trigram(trgm).c_str(), len, num, cur_candidates.size());
+					}
+				}
+			});
+		}
 	}
 	engine.finish();
 	if (done) {
@@ -465,7 +538,7 @@ void do_search_file(const vector<string> &needles, const char *filename)
 	dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
 	        1e3 * duration<float>(steady_clock::now() - start).count());
 
-	uint64_t matched = scan_docids(needles, in1, corpus, &engine);
+	uint64_t matched = scan_docids(needles, cur_candidates, corpus, &engine);
 	dprintf("Done in %.1f ms, found %zu matches.\n",
 	        1e3 * duration<float>(steady_clock::now() - start).count(), matched);
 
@@ -474,6 +547,46 @@ void do_search_file(const vector<string> &needles, const char *filename)
 	}
 }
 
+regex_t needle_to_regex(const string &needle)
+{
+	string escaped_needle;
+	for (char ch : needle) {
+		switch (ch) {
+		// Directly from what regex(7) considers an âatomâ.
+		case '^':
+		case '.':
+		case '[':
+		case '$':
+		case '(':
+		case ')':
+		case '|':
+		case '*':
+		case '+':
+		case '?':
+		case '{':
+		case '\\':
+			escaped_needle.push_back('\\');
+			// Fall through.
+		default:
+			escaped_needle.push_back(ch);
+		}
+	}
+	regex_t re;
+	int err;
+	if (ignore_case) {
+		err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB | REG_ICASE);
+	} else {
+		err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB);
+	}
+	if (err != 0) {
+		char errbuf[256];
+		regerror(err, &re, errbuf, sizeof(errbuf));
+		fprintf(stderr, "Error when compiling regex for '%s': %s\n", needle.c_str(), errbuf);
+		exit(1);
+	}
+	return re;
+}
+
 void usage()
 {
 	// The help text comes from mlocate.
@@ -483,6 +596,7 @@ void usage()
 	printf("  -d, --database DBPATH  use DBPATH instead of default database (which is\n");
 	printf("                         %s)\n", dbpath);
 	printf("  -h, --help             print this help\n");
+	printf("  -i, --ignore-case      ignore case distinctions when matching patterns\n");
 	printf("  -l, --limit, -n LIMIT  limit output (or counting) to LIMIT entries\n");
 	printf("  -0, --null             separate entries with NUL on output\n");
 }
@@ -493,6 +607,7 @@ int main(int argc, char **argv)
 		{ "help", no_argument, 0, 'h' },
 		{ "count", no_argument, 0, 'c' },
 		{ "database", required_argument, 0, 'd' },
+		{ "ignore-case", no_argument, 0, 'i' },
 		{ "limit", required_argument, 0, 'l' },
 		{ nullptr, required_argument, 0, 'n' },
 		{ "null", no_argument, 0, '0' },
@@ -502,7 +617,7 @@ int main(int argc, char **argv)
 	setlocale(LC_ALL, "");
 	for (;;) {
 		int option_index = 0;
-		int c = getopt_long(argc, argv, "cd:hl:n:0", long_options, &option_index);
+		int c = getopt_long(argc, argv, "cd:hil:n:0", long_options, &option_index);
 		if (c == -1) {
 			break;
 		}
@@ -516,6 +631,9 @@ int main(int argc, char **argv)
 		case 'h':
 			usage();
 			exit(0);
+		case 'i':
+			ignore_case = true;
+			break;
 		case 'l':
 		case 'n':
 			limit_matches = atoll(optarg);
@@ -528,9 +646,22 @@ int main(int argc, char **argv)
 		}
 	}
 
-	vector<string> needles;
+	vector<Needle> needles;
 	for (int i = optind; i < argc; ++i) {
-		needles.push_back(argv[i]);
+		Needle needle;
+		needle.str = argv[i];
+		if (ignore_case) {
+			// strcasestr() doesn't handle locales correctly (even though LSB
+			// claims it should), but somehow, the glibc regex engine does.
+			// It's much slower than strstr() for non-case-sensitive searches, though
+			// (even though it really ought to be faster, since it can precompile),
+			// so only use it for that.
+			needle.type = Needle::REGEX;
+			needle.re = needle_to_regex(argv[i]);
+		} else {
+			needle.type = Needle::STRSTR;
+		}
+		needles.push_back(move(needle));
 	}
 	if (needles.empty()) {
 		fprintf(stderr, "plocate: no pattern to search for specified\n");
-- 
2.39.2