--- /dev/null
+#include "parse_trigrams.h"
+
+#include "unique_sort.h"
+
+#include <string.h>
+#include <wctype.h>
+
+using namespace std;
+
+string print_td(const TrigramDisjunction &td)
+{
+ if (td.read_trigrams.size() == 0) {
+ // Before we've done hash lookups (or none matched), so print all alternatives.
+ if (td.trigram_alternatives.size() == 1) {
+ return print_trigram(td.trigram_alternatives[0]);
+ } else {
+ string ret;
+ ret = "(";
+ bool first = true;
+ for (uint32_t trgm : td.trigram_alternatives) {
+ if (!first)
+ ret += " OR ";
+ ret += print_trigram(trgm);
+ first = false;
+ }
+ return ret + ")";
+ }
+ } else {
+ // Print only those that we actually have in the index.
+ if (td.read_trigrams.size() == 1) {
+ return print_trigram(td.read_trigrams[0].first.trgm);
+ } else {
+ string ret;
+ ret = "(";
+ bool first = true;
+ for (auto &[trgmptr, len] : td.read_trigrams) {
+ if (!first)
+ ret += " OR ";
+ ret += print_trigram(trgmptr.trgm);
+ first = false;
+ }
+ return ret + ")";
+ }
+ }
+}
+
+string print_trigram(uint32_t trgm)
+{
+ char ch[3] = {
+ char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
+ };
+
+ string str = "'";
+ for (unsigned i = 0; i < 3;) {
+ if (ch[i] == '\\') {
+ str.push_back('\\');
+ str.push_back(ch[i]);
+ ++i;
+ } else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) { // Holds no matter whether char is signed or unsigned.
+ str.push_back(ch[i]);
+ ++i;
+ } else {
+ // See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
+ mbtowc(nullptr, 0, 0);
+ wchar_t pwc;
+ int ret = mbtowc(&pwc, ch + i, 3 - i);
+ if (ret >= 1 && pwc >= 32) {
+ str.append(ch + i, ret);
+ i += ret;
+ } else {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
+ str += buf;
+ ++i;
+ }
+ }
+ }
+ str += "'";
+ return str;
+}
+
+uint32_t read_unigram(const string &s, size_t idx)
+{
+ if (idx < s.size()) {
+ return (unsigned char)s[idx];
+ } else {
+ return 0;
+ }
+}
+
+uint32_t read_trigram(const string &s, size_t start)
+{
+ return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
+ (read_unigram(s, start + 2) << 16);
+}
+
+struct TrigramState {
+ string buffered;
+ unsigned next_codepoint;
+
+ bool operator<(const TrigramState &other) const
+ {
+ if (next_codepoint != other.next_codepoint)
+ return next_codepoint < other.next_codepoint;
+ return buffered < other.buffered;
+ }
+ bool operator==(const TrigramState &other) const
+ {
+ return next_codepoint == other.next_codepoint &&
+ buffered == other.buffered;
+ }
+};
+
+void parse_trigrams_ignore_case(const string &needle, vector<TrigramDisjunction> *trigram_groups)
+{
+ vector<vector<string>> alternatives_for_cp;
+
+ // Parse the needle into Unicode code points, and do inverse case folding
+ // on each to find legal alternatives. This is far from perfect (e.g. ß
+ // will not become ss), but it's generally the best we can do without
+ // involving ICU or the likes.
+ mbtowc(nullptr, 0, 0);
+ const char *ptr = needle.c_str();
+ while (*ptr != '\0') {
+ wchar_t ch;
+ int ret = mbtowc(&ch, ptr, strlen(ptr));
+ if (ret == -1) {
+ perror(ptr);
+ exit(1);
+ }
+
+ char buf[MB_CUR_MAX];
+ vector<string> alt;
+ alt.push_back(string(ptr, ret));
+ ptr += ret;
+ if (towlower(ch) != wint_t(ch)) {
+ ret = wctomb(buf, towlower(ch));
+ alt.push_back(string(buf, ret));
+ }
+ if (towupper(ch) != wint_t(ch) && towupper(ch) != towlower(ch)) {
+ ret = wctomb(buf, towupper(ch));
+ alt.push_back(string(buf, ret));
+ }
+ alternatives_for_cp.push_back(move(alt));
+ }
+
+ // Now generate all possible byte strings from those code points in order;
+ // e.g., from abc, we'd create a and A, then extend those to ab aB Ab AB,
+ // then abc abC aBc aBC and so on. Since we don't want to have 2^n
+ // (or even 3^n) strings, we only extend them far enough to cover at
+ // least three bytes; this will give us a set of candidate trigrams
+ // (the filename must have at least one of those), and then we can
+ // chop off the first byte, deduplicate states and continue extending
+ // and generating trigram sets.
+ //
+ // There are a few special cases, notably the dotted i (İ), where the
+ // UTF-8 versions of upper and lower case have different number of bytes.
+ // If this happens, we can have combinatorial explosion and get many more
+ // than the normal 8 states. We detect this and simply bomb out; it will
+ // never really happen in real strings, and stopping trigram generation
+ // really only means our pruning of candidates will be less effective.
+ vector<TrigramState> states;
+ states.push_back(TrigramState{ "", 0 });
+
+ for (;;) {
+ // Extend every state so that it has buffered at least three bytes.
+ // If this isn't possible, we are done with the string (can generate
+ // no more trigrams).
+ bool need_another_pass;
+ do {
+ need_another_pass = false;
+ vector<TrigramState> new_states;
+ for (const TrigramState &state : states) {
+ if (state.buffered.size() >= 3) {
+ // No need to extend this further.
+ new_states.push_back(state);
+ continue;
+ }
+ if (state.next_codepoint == alternatives_for_cp.size()) {
+ // We can't form a complete trigram from this alternative,
+ // so we're done.
+ return;
+ }
+ for (const string &rune : alternatives_for_cp[state.next_codepoint]) {
+ TrigramState new_state{ state.buffered + rune, state.next_codepoint + 1 };
+ if (new_state.buffered.size() < 3) {
+ need_another_pass = true;
+ }
+ new_states.push_back(move(new_state));
+ }
+ }
+ states = move(new_states);
+ } while (need_another_pass);
+
+ // OK, so now we have a bunch of states, and all of them are at least
+ // three bytes long. This means we have a complete set of trigrams,
+ // and the destination filename must contain at least one of them.
+ // Output those trigrams, cut out the first byte and then deduplicate
+ // the states before we continue.
+ vector<uint32_t> trigram_alternatives;
+ for (TrigramState &state : states) {
+ trigram_alternatives.push_back(read_trigram(state.buffered, 0));
+ state.buffered.erase(0, 1);
+ }
+ unique_sort(&trigram_alternatives); // Could have duplicates, although it's rare.
+ unique_sort(&states);
+
+ TrigramDisjunction new_pt;
+ new_pt.remaining_trigrams_to_read = trigram_alternatives.size();
+ new_pt.trigram_alternatives = move(trigram_alternatives);
+ new_pt.max_num_docids = 0;
+ trigram_groups->push_back(move(new_pt));
+
+ if (states.size() > 100) {
+ // A completely crazy pattern with lots of those special characters.
+ // We just give up; this isn't a realistic scenario anyway.
+ // We already have lots of trigrams that should reduce the amount of
+ // candidates.
+ return;
+ }
+ }
+}
+
+void parse_trigrams(const string &needle, bool ignore_case, vector<TrigramDisjunction> *trigram_groups)
+{
+ if (ignore_case) {
+ parse_trigrams_ignore_case(needle, trigram_groups);
+ return;
+ }
+
+ // The case-sensitive case is straightforward.
+ if (needle.size() >= 3) {
+ for (size_t i = 0; i < needle.size() - 2; ++i) {
+ uint32_t trgm = read_trigram(needle, i);
+ TrigramDisjunction new_pt;
+ new_pt.remaining_trigrams_to_read = 1;
+ new_pt.trigram_alternatives.push_back(trgm);
+ new_pt.max_num_docids = 0;
+ trigram_groups->push_back(move(new_pt));
+ }
+ }
+}
#include "db.h"
#include "io_uring_engine.h"
+#include "parse_trigrams.h"
#include "unique_sort.h"
#include <algorithm>
+#include <assert.h>
#include <chrono>
#include <fcntl.h>
#include <functional>
#include <limits>
#include <memory>
#include <queue>
+#include <regex.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string_view>
#include <unistd.h>
#include <unordered_map>
+#include <unordered_set>
#include <utility>
#include <vector>
#include <zstd.h>
#include "turbopfor.h"
const char *dbpath = "/var/lib/mlocate/plocate.db";
+bool ignore_case = false;
bool only_count = false;
bool print_nul = false;
int64_t limit_matches = numeric_limits<int64_t>::max();
}
}
-static inline uint32_t read_unigram(const string &s, size_t idx)
+struct Needle {
+ enum { STRSTR,
+ REGEX } type;
+ string str; // Filled in no matter what.
+ regex_t re; // For REGEX.
+};
+
+bool matches(const Needle &needle, const char *haystack)
{
- if (idx < s.size()) {
- return (unsigned char)s[idx];
+ if (needle.type == Needle::STRSTR) {
+ return strstr(haystack, needle.str.c_str()) != nullptr;
} else {
- return 0;
+ assert(needle.type == Needle::REGEX);
+ return regexec(&needle.re, haystack, /*nmatch=*/0, /*pmatch=*/nullptr, /*flags=*/0) == 0;
}
}
-static inline uint32_t read_trigram(const string &s, size_t start)
-{
- return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) |
- (read_unigram(s, start + 2) << 16);
-}
-
bool has_access(const char *filename,
unordered_map<string, bool> *access_rx_cache)
{
return hdr.num_docids;
}
-uint64_t scan_file_block(const vector<string> &needles, string_view compressed,
+uint64_t scan_file_block(const vector<Needle> &needles, string_view compressed,
unordered_map<string, bool> *access_rx_cache, int seq,
Serializer *serializer)
{
filename != block.data() + block.size();
filename += strlen(filename) + 1) {
bool found = true;
- for (const string &needle : needles) {
- if (strstr(filename, needle.c_str()) == nullptr) {
+ for (const Needle &needle : needles) {
+ if (!matches(needle, filename)) {
found = false;
break;
}
return matched;
}
-size_t scan_docids(const vector<string> &needles, const vector<uint32_t> &docids, const Corpus &corpus, IOUringEngine *engine)
+size_t scan_docids(const vector<Needle> &needles, const vector<uint32_t> &docids, const Corpus &corpus, IOUringEngine *engine)
{
Serializer docids_in_order;
unordered_map<string, bool> access_rx_cache;
// We do this sequentially, as it's faster than scattering
// a lot of I/O through io_uring and hoping the kernel will
// coalesce it plus readahead for us.
-uint64_t scan_all_docids(const vector<string> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
+uint64_t scan_all_docids(const vector<Needle> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
{
unordered_map<string, bool> access_rx_cache;
uint32_t num_blocks = corpus.get_num_filename_blocks();
return matched;
}
-// For debugging.
-string print_trigram(uint32_t trgm)
+// Takes the given posting list, unions it into the parts of the trigram disjunction
+// already read; if the list is complete, intersects with “cur_candidates”.
+//
+// Returns true if the search should be aborted (we are done).
+bool new_posting_list_read(TrigramDisjunction *td, vector<uint32_t> decoded, vector<uint32_t> *cur_candidates, vector<uint32_t> *tmp)
{
- char ch[3] = {
- char(trgm & 0xff), char((trgm >> 8) & 0xff), char((trgm >> 16) & 0xff)
- };
-
- string str = "'";
- for (unsigned i = 0; i < 3;) {
- if (ch[i] == '\\') {
- str.push_back('\\');
- str.push_back(ch[i]);
- ++i;
- } else if (int(ch[i]) >= 32 && int(ch[i]) <= 127) { // Holds no matter whether char is signed or unsigned.
- str.push_back(ch[i]);
- ++i;
- } else {
- // See if we have an entire UTF-8 codepoint, and that it's reasonably printable.
- mbtowc(nullptr, 0, 0);
- wchar_t pwc;
- int ret = mbtowc(&pwc, ch + i, 3 - i);
- if (ret >= 1 && pwc >= 32) {
- str.append(ch + i, ret);
- i += ret;
+ if (td->docids.empty()) {
+ td->docids = move(decoded);
+ } else {
+ tmp->clear();
+ set_union(decoded.begin(), decoded.end(), td->docids.begin(), td->docids.end(), back_inserter(*tmp));
+ swap(*tmp, td->docids);
+ }
+ if (--td->remaining_trigrams_to_read > 0) {
+ // Need to wait for more.
+ if (ignore_case) {
+ dprintf(" ... %u reads left in OR group %u (%zu docids in list)\n",
+ td->remaining_trigrams_to_read, td->index, td->docids.size());
+ }
+ return false;
+ }
+ if (cur_candidates->empty()) {
+ if (ignore_case) {
+ dprintf(" ... all reads done for OR group %u (%zu docids)\n",
+ td->index, td->docids.size());
+ }
+ *cur_candidates = move(td->docids);
+ } else {
+ tmp->clear();
+ set_intersection(cur_candidates->begin(), cur_candidates->end(),
+ td->docids.begin(), td->docids.end(),
+ back_inserter(*tmp));
+ swap(*cur_candidates, *tmp);
+ if (ignore_case) {
+ if (cur_candidates->empty()) {
+ dprintf(" ... all reads done for OR group %u (%zu docids), intersected (none left, search is done)\n",
+ td->index, td->docids.size());
+ return true;
} else {
- char buf[16];
- snprintf(buf, sizeof(buf), "\\x{%02x}", (unsigned char)ch[i]);
- str += buf;
- ++i;
+ dprintf(" ... all reads done for OR group %u (%zu docids), intersected (%zu left)\n",
+ td->index, td->docids.size(), cur_candidates->size());
}
}
}
- str += "'";
- return str;
+ return false;
}
-void do_search_file(const vector<string> &needles, const char *filename)
+void do_search_file(const vector<Needle> &needles, const char *filename)
{
int fd = open(filename, O_RDONLY);
if (fd == -1) {
Corpus corpus(fd, &engine);
dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
- vector<pair<Trigram, size_t>> trigrams;
- for (const string &needle : needles) {
- if (needle.size() < 3)
+ vector<TrigramDisjunction> trigram_groups;
+ for (const Needle &needle : needles) {
+ if (needle.str.size() < 3)
continue;
- for (size_t i = 0; i < needle.size() - 2; ++i) {
- uint32_t trgm = read_trigram(needle, i);
- corpus.find_trigram(trgm, [trgm, &trigrams](const Trigram *trgmptr, size_t len) {
- if (trgmptr == nullptr) {
- dprintf("trigram %s isn't found, we abort the search\n", print_trigram(trgm).c_str());
- if (only_count) {
- printf("0\n");
- }
- exit(0);
- }
- trigrams.emplace_back(*trgmptr, len);
- });
- }
+ parse_trigrams(needle.str, ignore_case, &trigram_groups);
+ }
+ unique_sort(
+ &trigram_groups,
+ [](const TrigramDisjunction &a, const TrigramDisjunction &b) { return a.trigram_alternatives < b.trigram_alternatives; },
+ [](const TrigramDisjunction &a, const TrigramDisjunction &b) { return a.trigram_alternatives == b.trigram_alternatives; });
+
+ // Give them names for debugging.
+ unsigned td_index = 0;
+ for (TrigramDisjunction &td : trigram_groups) {
+ td.index = td_index++;
}
- engine.finish();
- dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
- if (trigrams.empty()) {
+ // Collect which trigrams we need to look up in the hash table.
+ unordered_map<uint32_t, vector<TrigramDisjunction *>> trigrams_to_lookup;
+ for (TrigramDisjunction &td : trigram_groups) {
+ for (uint32_t trgm : td.trigram_alternatives) {
+ trigrams_to_lookup[trgm].push_back(&td);
+ }
+ }
+ if (trigrams_to_lookup.empty()) {
// Too short for trigram matching. Apply brute force.
// (We could have searched through all trigrams that matched
// the pattern and done a union of them, but that's a lot of
}
return;
}
- unique_sort(&trigrams);
- sort(trigrams.begin(), trigrams.end(),
- [&](const pair<Trigram, size_t> &a, const pair<Trigram, size_t> &b) {
- return a.first.num_docids < b.first.num_docids;
+
+ // Look them all up on disk.
+ for (auto &[trgm, trigram_groups] : trigrams_to_lookup) {
+ corpus.find_trigram(trgm, [trgm{ trgm }, trigram_groups{ &trigram_groups }](const Trigram *trgmptr, size_t len) {
+ if (trgmptr == nullptr) {
+ dprintf("trigram %s isn't found\n", print_trigram(trgm).c_str());
+ for (TrigramDisjunction *td : *trigram_groups) {
+ --td->remaining_trigrams_to_read;
+ if (td->remaining_trigrams_to_read == 0 && td->read_trigrams.empty()) {
+ dprintf("zero matches in %s, so we are done\n", print_td(*td).c_str());
+ if (only_count) {
+ printf("0\n");
+ }
+ exit(0);
+ }
+ }
+ return;
+ }
+ for (TrigramDisjunction *td : *trigram_groups) {
+ --td->remaining_trigrams_to_read;
+ td->max_num_docids += trgmptr->num_docids;
+ td->read_trigrams.emplace_back(*trgmptr, len);
+ }
+ });
+ }
+ engine.finish();
+ dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+
+ for (TrigramDisjunction &td : trigram_groups) {
+ // Reset for reads.
+ td.remaining_trigrams_to_read = td.read_trigrams.size();
+
+ if (ignore_case) { // If case-sensitive, they'll all be pretty obvious single-entry groups.
+ dprintf("OR group %u (max_num_docids=%u): %s\n", td.index, td.max_num_docids, print_td(td).c_str());
+ }
+ }
+
+ // TODO: For case-insensitive (ie. more than one alternative in each),
+ // prioritize the ones with fewer seeks?
+ sort(trigram_groups.begin(), trigram_groups.end(),
+ [&](const TrigramDisjunction &a, const TrigramDisjunction &b) {
+ return a.max_num_docids < b.max_num_docids;
});
- vector<uint32_t> in1, in2, out;
+ unordered_map<uint32_t, vector<TrigramDisjunction *>> uses_trigram;
+ for (TrigramDisjunction &td : trigram_groups) {
+ for (uint32_t trgm : td.trigram_alternatives) {
+ uses_trigram[trgm].push_back(&td);
+ }
+ }
+
+ unordered_set<uint32_t> trigrams_submitted_read;
+ vector<uint32_t> cur_candidates, tmp, decoded;
bool done = false;
- for (auto [trgmptr, len] : trigrams) {
- if (!in1.empty() && trgmptr.num_docids > in1.size() * 100) {
- uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
- dprintf("trigram %s (%zu bytes) has %u entries, ignoring the rest (will "
+ for (TrigramDisjunction &td : trigram_groups) {
+ if (!cur_candidates.empty() && td.max_num_docids > cur_candidates.size() * 100) {
+ dprintf("%s has up to %u entries, ignoring the rest (will "
"weed out false positives later)\n",
- print_trigram(trgm).c_str(), len, trgmptr.num_docids);
+ print_td(td).c_str(), td.max_num_docids);
break;
}
- // Only stay a certain amount ahead, so that we don't spend I/O
- // on reading the latter, large posting lists. We are unlikely
- // to need them anyway, even if they should come in first.
- if (engine.get_waiting_reads() >= 5) {
- engine.finish();
- if (done)
- break;
- }
- engine.submit_read(fd, len, trgmptr.offset, [trgmptr{ trgmptr }, len{ len }, &done, &in1, &in2, &out](string_view s) {
- if (done)
- return;
- uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
- size_t num = trgmptr.num_docids;
- const unsigned char *pldata = reinterpret_cast<const unsigned char *>(s.data());
- if (in1.empty()) {
- in1.resize(num + 128);
- decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &in1[0]);
- in1.resize(num);
- dprintf("trigram %s (%zu bytes) decoded to %zu entries\n",
- print_trigram(trgm).c_str(), len, num);
- } else {
- if (in2.size() < num + 128) {
- in2.resize(num + 128);
+ for (auto &[trgmptr, len] : td.read_trigrams) {
+ if (trigrams_submitted_read.count(trgmptr.trgm) != 0) {
+ continue;
+ }
+ trigrams_submitted_read.insert(trgmptr.trgm);
+ // Only stay a certain amount ahead, so that we don't spend I/O
+ // on reading the latter, large posting lists. We are unlikely
+ // to need them anyway, even if they should come in first.
+ if (engine.get_waiting_reads() >= 5) {
+ engine.finish();
+ if (done)
+ break;
+ }
+ engine.submit_read(fd, len, trgmptr.offset, [trgmptr{ trgmptr }, len{ len }, &done, &cur_candidates, &tmp, &decoded, &uses_trigram](string_view s) {
+ if (done)
+ return;
+
+ uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
+ const unsigned char *pldata = reinterpret_cast<const unsigned char *>(s.data());
+ size_t num = trgmptr.num_docids;
+ decoded.resize(num);
+ decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &decoded[0]);
+
+ assert(uses_trigram.count(trgm) != 0);
+ bool was_empty = cur_candidates.empty();
+ if (ignore_case) {
+ dprintf("trigram %s (%zu bytes) decoded to %zu entries\n", print_trigram(trgm).c_str(), len, num);
}
- decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &in2[0]);
-
- out.clear();
- set_intersection(in1.begin(), in1.end(), in2.begin(), in2.begin() + num,
- back_inserter(out));
- swap(in1, out);
- dprintf("trigram %s (%zu bytes) decoded to %zu entries, %zu left\n",
- print_trigram(trgm).c_str(), len, num, in1.size());
- if (in1.empty()) {
- dprintf("no matches (intersection list is empty)\n");
- done = true;
+
+ for (TrigramDisjunction *td : uses_trigram[trgm]) {
+ done |= new_posting_list_read(td, decoded, &cur_candidates, &tmp);
+ if (done)
+ break;
}
- }
- });
+ if (!ignore_case) {
+ if (was_empty) {
+ dprintf("trigram %s (%zu bytes) decoded to %zu entries\n", print_trigram(trgm).c_str(), len, num);
+ } else if (cur_candidates.empty()) {
+ dprintf("trigram %s (%zu bytes) decoded to %zu entries (none left, search is done)\n", print_trigram(trgm).c_str(), len, num);
+ } else {
+ dprintf("trigram %s (%zu bytes) decoded to %zu entries (%zu left)\n", print_trigram(trgm).c_str(), len, num, cur_candidates.size());
+ }
+ }
+ });
+ }
}
engine.finish();
if (done) {
dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
1e3 * duration<float>(steady_clock::now() - start).count());
- uint64_t matched = scan_docids(needles, in1, corpus, &engine);
+ uint64_t matched = scan_docids(needles, cur_candidates, corpus, &engine);
dprintf("Done in %.1f ms, found %zu matches.\n",
1e3 * duration<float>(steady_clock::now() - start).count(), matched);
}
}
+regex_t needle_to_regex(const string &needle)
+{
+ string escaped_needle;
+ for (char ch : needle) {
+ switch (ch) {
+ // Directly from what regex(7) considers an “atom”.
+ case '^':
+ case '.':
+ case '[':
+ case '$':
+ case '(':
+ case ')':
+ case '|':
+ case '*':
+ case '+':
+ case '?':
+ case '{':
+ case '\\':
+ escaped_needle.push_back('\\');
+ // Fall through.
+ default:
+ escaped_needle.push_back(ch);
+ }
+ }
+ regex_t re;
+ int err;
+ if (ignore_case) {
+ err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB | REG_ICASE);
+ } else {
+ err = regcomp(&re, escaped_needle.c_str(), REG_NOSUB);
+ }
+ if (err != 0) {
+ char errbuf[256];
+ regerror(err, &re, errbuf, sizeof(errbuf));
+ fprintf(stderr, "Error when compiling regex for '%s': %s\n", needle.c_str(), errbuf);
+ exit(1);
+ }
+ return re;
+}
+
void usage()
{
// The help text comes from mlocate.
printf(" -d, --database DBPATH use DBPATH instead of default database (which is\n");
printf(" %s)\n", dbpath);
printf(" -h, --help print this help\n");
+ printf(" -i, --ignore-case ignore case distinctions when matching patterns\n");
printf(" -l, --limit, -n LIMIT limit output (or counting) to LIMIT entries\n");
printf(" -0, --null separate entries with NUL on output\n");
}
{ "help", no_argument, 0, 'h' },
{ "count", no_argument, 0, 'c' },
{ "database", required_argument, 0, 'd' },
+ { "ignore-case", no_argument, 0, 'i' },
{ "limit", required_argument, 0, 'l' },
{ nullptr, required_argument, 0, 'n' },
{ "null", no_argument, 0, '0' },
setlocale(LC_ALL, "");
for (;;) {
int option_index = 0;
- int c = getopt_long(argc, argv, "cd:hl:n:0", long_options, &option_index);
+ int c = getopt_long(argc, argv, "cd:hil:n:0", long_options, &option_index);
if (c == -1) {
break;
}
case 'h':
usage();
exit(0);
+ case 'i':
+ ignore_case = true;
+ break;
case 'l':
case 'n':
limit_matches = atoll(optarg);
}
}
- vector<string> needles;
+ vector<Needle> needles;
for (int i = optind; i < argc; ++i) {
- needles.push_back(argv[i]);
+ Needle needle;
+ needle.str = argv[i];
+ if (ignore_case) {
+ // strcasestr() doesn't handle locales correctly (even though LSB
+ // claims it should), but somehow, the glibc regex engine does.
+ // It's much slower than strstr() for non-case-sensitive searches, though
+ // (even though it really ought to be faster, since it can precompile),
+ // so only use it for that.
+ needle.type = Needle::REGEX;
+ needle.re = needle_to_regex(argv[i]);
+ } else {
+ needle.type = Needle::STRSTR;
+ }
+ needles.push_back(move(needle));
}
if (needles.empty()) {
fprintf(stderr, "plocate: no pattern to search for specified\n");