X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=plocate.cpp;h=a76c288a2bca86f0a11ebfec822d34721fa38335;hb=c427ecd63267946d66cf15808ed507d4f94c3566;hp=8ba5aa8aeea1fb660798d06f10f3d6d160c3facf;hpb=6f7838012a445183eeaba4dde44d4b2b1617aa6b;p=plocate diff --git a/plocate.cpp b/plocate.cpp index 8ba5aa8..a76c288 100644 --- a/plocate.cpp +++ b/plocate.cpp @@ -1,32 +1,50 @@ #include "db.h" +#include "dprintf.h" #include "io_uring_engine.h" -#include "vp4.h" +#include "parse_trigrams.h" +#include "turbopfor.h" +#include "unique_sort.h" #include -#include #include #include -#include #include +#include #include #include +#include +#include +#include +#include #include +#include +#include +#include #include +#include #include #include +#include #include #include +#include +#include #include #include using namespace std; using namespace std::chrono; -#define dprintf(...) -//#define dprintf(...) fprintf(stderr, __VA_ARGS__); +#define DEFAULT_DBPATH "/var/lib/mlocate/plocate.db" -const char *dbpath = "/var/lib/mlocate/plocate.db"; +const char *dbpath = DEFAULT_DBPATH; +bool ignore_case = false; +bool only_count = false; bool print_nul = false; +bool use_debug = false; +bool patterns_are_regex = false; +bool use_extended_regex = false; +int64_t limit_matches = numeric_limits::max(); class Serializer { public: @@ -50,7 +68,7 @@ private: void Serializer::print_delayed(int seq, const vector msg) { - pending.push(Element{seq, move(msg)}); + pending.push(Element{ seq, move(msg) }); } void Serializer::release_current() @@ -59,6 +77,8 @@ void Serializer::release_current() // See if any delayed prints can now be dealt with. while (!pending.empty() && pending.top().seq == next_seq) { + if (limit_matches-- <= 0) + return; for (const string &msg : pending.top().msg) { if (print_nul) { printf("%s%c", msg.c_str(), 0); @@ -71,21 +91,27 @@ void Serializer::release_current() } } -static inline uint32_t read_unigram(const string &s, size_t idx) +struct Needle { + enum { STRSTR, + REGEX, + GLOB } type; + string str; // Filled in no matter what. + regex_t re; // For REGEX. +}; + +bool matches(const Needle &needle, const char *haystack) { - if (idx < s.size()) { - return (unsigned char)s[idx]; + if (needle.type == Needle::STRSTR) { + return strstr(haystack, needle.str.c_str()) != nullptr; + } else if (needle.type == Needle::GLOB) { + int flags = ignore_case ? FNM_CASEFOLD : 0; + return fnmatch(needle.str.c_str(), haystack, flags) == 0; } else { - return 0; + assert(needle.type == Needle::REGEX); + return regexec(&needle.re, haystack, /*nmatch=*/0, /*pmatch=*/nullptr, /*flags=*/0) == 0; } } -static inline uint32_t read_trigram(const string &s, size_t start) -{ - return read_unigram(s, start) | (read_unigram(s, start + 1) << 8) | - (read_unigram(s, start + 2) << 16); -} - bool has_access(const char *filename, unordered_map *access_rx_cache) { @@ -114,7 +140,7 @@ public: Corpus(int fd, IOUringEngine *engine); ~Corpus(); void find_trigram(uint32_t trgm, function cb); - void get_compressed_filename_block(uint32_t docid, function cb) const; + void get_compressed_filename_block(uint32_t docid, function cb) const; size_t get_num_filename_blocks() const; off_t offset_for_block(uint32_t docid) const { @@ -160,7 +186,7 @@ Corpus::~Corpus() void Corpus::find_trigram(uint32_t trgm, function cb) { uint32_t bucket = hash_trigram(trgm, hdr.hashtable_size); - engine->submit_read(fd, sizeof(Trigram) * (hdr.extra_ht_slots + 2), hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket, [this, trgm, bucket, cb{ move(cb) }](string s) { + engine->submit_read(fd, sizeof(Trigram) * (hdr.extra_ht_slots + 2), hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket, [this, trgm, cb{ move(cb) }](string_view s) { const Trigram *trgmptr = reinterpret_cast(s.data()); for (unsigned i = 0; i < hdr.extra_ht_slots + 1; ++i) { if (trgmptr[i].trgm == trgm) { @@ -174,11 +200,11 @@ void Corpus::find_trigram(uint32_t trgm, function cb) const +void Corpus::get_compressed_filename_block(uint32_t docid, function cb) const { // Read the file offset from this docid and the next one. // This is always allowed, since we have a sentinel block at the end. - engine->submit_read(fd, sizeof(uint64_t) * 2, offset_for_block(docid), [this, cb{ move(cb) }](string s) { + engine->submit_read(fd, sizeof(uint64_t) * 2, offset_for_block(docid), [this, cb{ move(cb) }](string_view s) { const uint64_t *ptr = reinterpret_cast(s.data()); off_t offset = ptr[0]; size_t len = ptr[1] - ptr[0]; @@ -188,19 +214,14 @@ void Corpus::get_compressed_filename_block(uint32_t docid, function &needles, string_view compressed, - unordered_map *access_rx_cache, int seq, - Serializer *serializer) +uint64_t scan_file_block(const vector &needles, string_view compressed, + unordered_map *access_rx_cache, int seq, + Serializer *serializer) { - size_t matched = 0; + uint64_t matched = 0; unsigned long long uncompressed_len = ZSTD_getFrameContentSize(compressed.data(), compressed.size()); if (uncompressed_len == ZSTD_CONTENTSIZE_UNKNOWN || uncompressed_len == ZSTD_CONTENTSIZE_ERROR) { @@ -226,14 +247,18 @@ size_t scan_file_block(const vector &needles, string_view compressed, filename != block.data() + block.size(); filename += strlen(filename) + 1) { bool found = true; - for (const string &needle : needles) { - if (strstr(filename, needle.c_str()) == nullptr) { + for (const Needle &needle : needles) { + if (!matches(needle, filename)) { found = false; break; } } if (found && has_access(filename, access_rx_cache)) { + if (limit_matches-- <= 0) + break; ++matched; + if (only_count) + continue; if (immediate_print) { if (print_nul) { printf("%s%c", filename, 0); @@ -245,7 +270,7 @@ size_t scan_file_block(const vector &needles, string_view compressed, } } } - if (serializer != nullptr) { + if (serializer != nullptr && !only_count) { if (immediate_print) { serializer->release_current(); } else { @@ -255,14 +280,14 @@ size_t scan_file_block(const vector &needles, string_view compressed, return matched; } -size_t scan_docids(const vector &needles, const vector &docids, const Corpus &corpus, IOUringEngine *engine) +size_t scan_docids(const vector &needles, const vector &docids, const Corpus &corpus, IOUringEngine *engine) { Serializer docids_in_order; unordered_map access_rx_cache; - size_t matched = 0; + uint64_t matched = 0; for (size_t i = 0; i < docids.size(); ++i) { uint32_t docid = docids[i]; - corpus.get_compressed_filename_block(docid, [i, &matched, &needles, &access_rx_cache, &docids_in_order](string compressed) { + corpus.get_compressed_filename_block(docid, [i, &matched, &needles, &access_rx_cache, &docids_in_order](string_view compressed) { matched += scan_file_block(needles, compressed, &access_rx_cache, i, &docids_in_order); }); } @@ -273,13 +298,14 @@ size_t scan_docids(const vector &needles, const vector &docids // We do this sequentially, as it's faster than scattering // a lot of I/O through io_uring and hoping the kernel will // coalesce it plus readahead for us. -void scan_all_docids(const vector &needles, int fd, const Corpus &corpus, IOUringEngine *engine) +uint64_t scan_all_docids(const vector &needles, int fd, const Corpus &corpus, IOUringEngine *engine) { unordered_map access_rx_cache; uint32_t num_blocks = corpus.get_num_filename_blocks(); unique_ptr offsets(new uint64_t[num_blocks + 1]); complete_pread(fd, offsets.get(), (num_blocks + 1) * sizeof(uint64_t), corpus.offset_for_block(0)); string compressed; + uint64_t matched = 0; for (uint32_t io_docid = 0; io_docid < num_blocks; io_docid += 32) { uint32_t last_docid = std::min(io_docid + 32, num_blocks); size_t io_len = offsets[last_docid] - offsets[io_docid]; @@ -291,12 +317,62 @@ void scan_all_docids(const vector &needles, int fd, const Corpus &corpus for (uint32_t docid = io_docid; docid < last_docid; ++docid) { size_t relative_offset = offsets[docid] - offsets[io_docid]; size_t len = offsets[docid + 1] - offsets[docid]; - scan_file_block(needles, { &compressed[relative_offset], len }, &access_rx_cache, 0, nullptr); + matched += scan_file_block(needles, { &compressed[relative_offset], len }, &access_rx_cache, 0, nullptr); + if (limit_matches <= 0) + return matched; } } + return matched; } -void do_search_file(const vector &needles, const char *filename) +// Takes the given posting list, unions it into the parts of the trigram disjunction +// already read; if the list is complete, intersects with “cur_candidates”. +// +// Returns true if the search should be aborted (we are done). +bool new_posting_list_read(TrigramDisjunction *td, vector decoded, vector *cur_candidates, vector *tmp) +{ + if (td->docids.empty()) { + td->docids = move(decoded); + } else { + tmp->clear(); + set_union(decoded.begin(), decoded.end(), td->docids.begin(), td->docids.end(), back_inserter(*tmp)); + swap(*tmp, td->docids); + } + if (--td->remaining_trigrams_to_read > 0) { + // Need to wait for more. + if (ignore_case) { + dprintf(" ... %u reads left in OR group %u (%zu docids in list)\n", + td->remaining_trigrams_to_read, td->index, td->docids.size()); + } + return false; + } + if (cur_candidates->empty()) { + if (ignore_case) { + dprintf(" ... all reads done for OR group %u (%zu docids)\n", + td->index, td->docids.size()); + } + *cur_candidates = move(td->docids); + } else { + tmp->clear(); + set_intersection(cur_candidates->begin(), cur_candidates->end(), + td->docids.begin(), td->docids.end(), + back_inserter(*tmp)); + swap(*cur_candidates, *tmp); + if (ignore_case) { + if (cur_candidates->empty()) { + dprintf(" ... all reads done for OR group %u (%zu docids), intersected (none left, search is done)\n", + td->index, td->docids.size()); + return true; + } else { + dprintf(" ... all reads done for OR group %u (%zu docids), intersected (%zu left)\n", + td->index, td->docids.size(), cur_candidates->size()); + } + } + } + return false; +} + +void do_search_file(const vector &needles, const char *filename) { int fd = open(filename, O_RDONLY); if (fd == -1) { @@ -316,96 +392,158 @@ void do_search_file(const vector &needles, const char *filename) return; } - IOUringEngine engine; + IOUringEngine engine(/*slop_bytes=*/16); // 16 slop bytes as described in turbopfor.h. Corpus corpus(fd, &engine); dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count()); - vector> trigrams; - for (const string &needle : needles) { - if (needle.size() < 3) continue; - for (size_t i = 0; i < needle.size() - 2; ++i) { - uint32_t trgm = read_trigram(needle, i); - corpus.find_trigram(trgm, [trgm, &trigrams](const Trigram *trgmptr, size_t len) { - if (trgmptr == nullptr) { - dprintf("trigram %06x isn't found, we abort the search\n", trgm); - return; - } - trigrams.emplace_back(*trgmptr, len); - }); + vector trigram_groups; + if (patterns_are_regex) { + // We could parse the regex to find trigrams that have to be there + // (there are actually known algorithms to deal with disjunctions + // and such, too), but for now, we just go brute force. + // Using locate with regexes is pretty niche. + } else { + for (const Needle &needle : needles) { + parse_trigrams(needle.str, ignore_case, &trigram_groups); } } - engine.finish(); - dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count()); - if (trigrams.empty()) { + unique_sort( + &trigram_groups, + [](const TrigramDisjunction &a, const TrigramDisjunction &b) { return a.trigram_alternatives < b.trigram_alternatives; }, + [](const TrigramDisjunction &a, const TrigramDisjunction &b) { return a.trigram_alternatives == b.trigram_alternatives; }); + + // Give them names for debugging. + unsigned td_index = 0; + for (TrigramDisjunction &td : trigram_groups) { + td.index = td_index++; + } + + // Collect which trigrams we need to look up in the hash table. + unordered_map> trigrams_to_lookup; + for (TrigramDisjunction &td : trigram_groups) { + for (uint32_t trgm : td.trigram_alternatives) { + trigrams_to_lookup[trgm].push_back(&td); + } + } + if (trigrams_to_lookup.empty()) { // Too short for trigram matching. Apply brute force. // (We could have searched through all trigrams that matched // the pattern and done a union of them, but that's a lot of // work for fairly unclear gain.) - scan_all_docids(needles, fd, corpus, &engine); + uint64_t matched = scan_all_docids(needles, fd, corpus, &engine); + if (only_count) { + printf("%" PRId64 "\n", matched); + } return; } - sort(trigrams.begin(), trigrams.end()); - { - auto last = unique(trigrams.begin(), trigrams.end()); - trigrams.erase(last, trigrams.end()); + + // Look them all up on disk. + for (auto &[trgm, trigram_groups] : trigrams_to_lookup) { + corpus.find_trigram(trgm, [trgm{ trgm }, trigram_groups{ &trigram_groups }](const Trigram *trgmptr, size_t len) { + if (trgmptr == nullptr) { + dprintf("trigram %s isn't found\n", print_trigram(trgm).c_str()); + for (TrigramDisjunction *td : *trigram_groups) { + --td->remaining_trigrams_to_read; + if (td->remaining_trigrams_to_read == 0 && td->read_trigrams.empty()) { + dprintf("zero matches in %s, so we are done\n", print_td(*td).c_str()); + if (only_count) { + printf("0\n"); + } + exit(0); + } + } + return; + } + for (TrigramDisjunction *td : *trigram_groups) { + --td->remaining_trigrams_to_read; + td->max_num_docids += trgmptr->num_docids; + td->read_trigrams.emplace_back(*trgmptr, len); + } + }); + } + engine.finish(); + dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count()); + + for (TrigramDisjunction &td : trigram_groups) { + // Reset for reads. + td.remaining_trigrams_to_read = td.read_trigrams.size(); + + if (ignore_case) { // If case-sensitive, they'll all be pretty obvious single-entry groups. + dprintf("OR group %u (max_num_docids=%u): %s\n", td.index, td.max_num_docids, print_td(td).c_str()); + } } - sort(trigrams.begin(), trigrams.end(), - [&](const pair &a, const pair &b) { - return a.first.num_docids < b.first.num_docids; + + // TODO: For case-insensitive (ie. more than one alternative in each), + // prioritize the ones with fewer seeks? + sort(trigram_groups.begin(), trigram_groups.end(), + [&](const TrigramDisjunction &a, const TrigramDisjunction &b) { + return a.max_num_docids < b.max_num_docids; }); - vector in1, in2, out; + unordered_map> uses_trigram; + for (TrigramDisjunction &td : trigram_groups) { + for (uint32_t trgm : td.trigram_alternatives) { + uses_trigram[trgm].push_back(&td); + } + } + + unordered_set trigrams_submitted_read; + vector cur_candidates, tmp, decoded; bool done = false; - for (auto [trgmptr, len] : trigrams) { - if (!in1.empty() && trgmptr.num_docids > in1.size() * 100) { - uint32_t trgm __attribute__((unused)) = trgmptr.trgm; - dprintf("trigram '%c%c%c' (%zu bytes) has %u entries, ignoring the rest (will " + for (TrigramDisjunction &td : trigram_groups) { + if (!cur_candidates.empty() && td.max_num_docids > cur_candidates.size() * 100) { + dprintf("%s has up to %u entries, ignoring the rest (will " "weed out false positives later)\n", - trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff, - len, trgmptr.num_docids); + print_td(td).c_str(), td.max_num_docids); break; } - // Only stay a certain amount ahead, so that we don't spend I/O - // on reading the latter, large posting lists. We are unlikely - // to need them anyway, even if they should come in first. - if (engine.get_waiting_reads() >= 5) { - engine.finish(); - if (done) - break; - } - engine.submit_read(fd, len, trgmptr.offset, [trgmptr, len, &done, &in1, &in2, &out](string s) { - if (done) - return; - uint32_t trgm __attribute__((unused)) = trgmptr.trgm; - size_t num = trgmptr.num_docids; - unsigned char *pldata = reinterpret_cast(s.data()); - if (in1.empty()) { - in1.resize(num + 128); - p4nd1dec128v32(pldata, num, &in1[0]); - in1.resize(num); - dprintf("trigram '%c%c%c' (%zu bytes) decoded to %zu entries\n", trgm & 0xff, - (trgm >> 8) & 0xff, (trgm >> 16) & 0xff, len, num); - } else { - if (in2.size() < num + 128) { - in2.resize(num + 128); + for (auto &[trgmptr, len] : td.read_trigrams) { + if (trigrams_submitted_read.count(trgmptr.trgm) != 0) { + continue; + } + trigrams_submitted_read.insert(trgmptr.trgm); + // Only stay a certain amount ahead, so that we don't spend I/O + // on reading the latter, large posting lists. We are unlikely + // to need them anyway, even if they should come in first. + if (engine.get_waiting_reads() >= 5) { + engine.finish(); + if (done) + break; + } + engine.submit_read(fd, len, trgmptr.offset, [trgmptr{ trgmptr }, len{ len }, &done, &cur_candidates, &tmp, &decoded, &uses_trigram](string_view s) { + if (done) + return; + + uint32_t trgm __attribute__((unused)) = trgmptr.trgm; + const unsigned char *pldata = reinterpret_cast(s.data()); + size_t num = trgmptr.num_docids; + decoded.resize(num); + decode_pfor_delta1_128(pldata, num, /*interleaved=*/true, &decoded[0]); + + assert(uses_trigram.count(trgm) != 0); + bool was_empty = cur_candidates.empty(); + if (ignore_case) { + dprintf("trigram %s (%zu bytes) decoded to %zu entries\n", print_trigram(trgm).c_str(), len, num); + } + + for (TrigramDisjunction *td : uses_trigram[trgm]) { + done |= new_posting_list_read(td, decoded, &cur_candidates, &tmp); + if (done) + break; } - p4nd1dec128v32(pldata, num, &in2[0]); - - out.clear(); - set_intersection(in1.begin(), in1.end(), in2.begin(), in2.begin() + num, - back_inserter(out)); - swap(in1, out); - dprintf("trigram '%c%c%c' (%zu bytes) decoded to %zu entries, %zu left\n", - trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff, - len, num, in1.size()); - if (in1.empty()) { - dprintf("no matches (intersection list is empty)\n"); - done = true; + if (!ignore_case) { + if (was_empty) { + dprintf("trigram %s (%zu bytes) decoded to %zu entries\n", print_trigram(trgm).c_str(), len, num); + } else if (cur_candidates.empty()) { + dprintf("trigram %s (%zu bytes) decoded to %zu entries (none left, search is done)\n", print_trigram(trgm).c_str(), len, num); + } else { + dprintf("trigram %s (%zu bytes) decoded to %zu entries (%zu left)\n", print_trigram(trgm).c_str(), len, num, cur_candidates.size()); + } } - } - }); + }); + } } engine.finish(); if (done) { @@ -414,54 +552,184 @@ void do_search_file(const vector &needles, const char *filename) dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n", 1e3 * duration(steady_clock::now() - start).count()); - size_t matched __attribute__((unused)) = scan_docids(needles, in1, corpus, &engine); - dprintf("Done in %.1f ms, found %zu matches.\n", + uint64_t matched = scan_docids(needles, cur_candidates, corpus, &engine); + dprintf("Done in %.1f ms, found %" PRId64 " matches.\n", 1e3 * duration(steady_clock::now() - start).count(), matched); + + if (only_count) { + printf("%" PRId64 "\n", matched); + } +} + +string unescape_glob_to_plain_string(const string &needle) +{ + string unescaped; + for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) { + uint32_t ch = read_unigram(needle, i).first; + assert(ch != WILDCARD_UNIGRAM); + if (ch == PREMATURE_END_UNIGRAM) { + fprintf(stderr, "Pattern '%s' ended prematurely\n", needle.c_str()); + exit(1); + } + unescaped.push_back(ch); + } + return unescaped; +} + +regex_t compile_regex(const string &needle) +{ + regex_t re; + int flags = REG_NOSUB; + if (ignore_case) { + flags |= REG_ICASE; + } + if (use_extended_regex) { + flags |= REG_EXTENDED; + } + int err = regcomp(&re, needle.c_str(), flags); + if (err != 0) { + char errbuf[256]; + regerror(err, &re, errbuf, sizeof(errbuf)); + fprintf(stderr, "Error when compiling regex '%s': %s\n", needle.c_str(), errbuf); + exit(1); + } + return re; } void usage() { - // The help text comes from mlocate. - printf("Usage: plocate [OPTION]... PATTERN...\n"); - printf(" -d, --database DBPATH use DBPATH instead of default database (which is\n"); - printf(" %s)\n", dbpath); - printf(" -h, --help print this help\n"); - printf(" -0, --null separate entries with NUL on output\n"); + printf( + "Usage: plocate [OPTION]... PATTERN...\n" + "\n" + " -c, --count print number of matches instead of the matches\n" + " -d, --database DBPATH search for files in DBPATH\n" + " (default is " DEFAULT_DBPATH ")\n" + " -i, --ignore-case search case-insensitively\n" + " -l, --limit LIMIT stop after LIMIT matches\n" + " -0, --null delimit matches by NUL instead of newline\n" + " -r, --regexp interpret patterns as basic regexps (slow)\n" + " --regex interpret patterns as extended regexps (slow)\n" + " --help print this help\n" + " --version print version information\n"); +} + +void version() +{ + printf("plocate %s\n", PLOCATE_VERSION); + printf("Copyright 2020 Steinar H. Gunderson\n"); + printf("License GPLv2+: GNU GPL version 2 or later .\n"); + printf("This is free software: you are free to change and redistribute it.\n"); + printf("There is NO WARRANTY, to the extent permitted by law.\n"); + exit(0); } int main(int argc, char **argv) { + constexpr int EXTENDED_REGEX = 1000; static const struct option long_options[] = { { "help", no_argument, 0, 'h' }, + { "count", no_argument, 0, 'c' }, { "database", required_argument, 0, 'd' }, + { "ignore-case", no_argument, 0, 'i' }, + { "limit", required_argument, 0, 'l' }, { "null", no_argument, 0, '0' }, + { "version", no_argument, 0, 'V' }, + { "regexp", no_argument, 0, 'r' }, + { "regex", no_argument, 0, EXTENDED_REGEX }, + { "debug", no_argument, 0, 'D' }, // Not documented. { 0, 0, 0, 0 } }; + setlocale(LC_ALL, ""); for (;;) { int option_index = 0; - int c = getopt_long(argc, argv, "d:h0", long_options, &option_index); + int c = getopt_long(argc, argv, "cd:hil:n:0VD", long_options, &option_index); if (c == -1) { break; } switch (c) { + case 'c': + only_count = true; + break; case 'd': dbpath = strdup(optarg); break; case 'h': usage(); exit(0); + case 'i': + ignore_case = true; + break; + case 'l': + case 'n': + limit_matches = atoll(optarg); + if (limit_matches <= 0) { + fprintf(stderr, "Error: limit must be a strictly positive number.\n"); + exit(1); + } + break; case '0': print_nul = true; break; + case 'r': + patterns_are_regex = true; + break; + case EXTENDED_REGEX: + patterns_are_regex = true; + use_extended_regex = true; + break; + case 'D': + use_debug = true; + break; + case 'V': + version(); + break; default: exit(1); } } - vector needles; + if (use_debug) { + // Debug information would leak information about which files exist, + // so drop setgid before we open the file; one would either need to run + // as root, or use a locally-built file. + if (setgid(getgid()) != 0) { + perror("setgid"); + exit(EXIT_FAILURE); + } + } + + vector needles; for (int i = optind; i < argc; ++i) { - needles.push_back(argv[i]); + Needle needle; + needle.str = argv[i]; + + // See if there are any wildcard characters, which indicates we should treat it + // as an (anchored) glob. + bool any_wildcard = false; + for (size_t i = 0; i < needle.str.size(); i += read_unigram(needle.str, i).second) { + if (read_unigram(needle.str, i).first == WILDCARD_UNIGRAM) { + any_wildcard = true; + break; + } + } + + if (patterns_are_regex) { + needle.type = Needle::REGEX; + needle.re = compile_regex(needle.str); + } else if (any_wildcard) { + needle.type = Needle::GLOB; + } else if (ignore_case) { + // strcasestr() doesn't handle locales correctly (even though LSB + // claims it should), but somehow, fnmatch() does, and it's about + // the same speed as using a regex. + needle.type = Needle::GLOB; + needle.str = "*" + needle.str + "*"; + } else { + needle.type = Needle::STRSTR; + needle.str = unescape_glob_to_plain_string(needle.str); + } + needles.push_back(move(needle)); } if (needles.empty()) { fprintf(stderr, "plocate: no pattern to search for specified\n");