X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=plocate.cpp;h=a1cd97a4a4645c08880b014d4799f9d4c33182f6;hb=fd6198891d6fd9642effc0843fef6f23b991af3e;hp=0d59076e8fc99efb4829384da1894e99c4399867;hpb=d5ba26d705460a7e37213eeb4954b2efed8bebf0;p=plocate diff --git a/plocate.cpp b/plocate.cpp index 0d59076..a1cd97a 100644 --- a/plocate.cpp +++ b/plocate.cpp @@ -1,7 +1,9 @@ #include "access_rx_cache.h" +#include "complete_pread.h" #include "db.h" #include "dprintf.h" #include "io_uring_engine.h" +#include "needle.h" #include "parse_trigrams.h" #include "serializer.h" #include "turbopfor.h" @@ -14,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -30,6 +31,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -42,47 +46,27 @@ using namespace std; using namespace std::chrono; -#define DEFAULT_DBPATH "/var/lib/mlocate/plocate.db" - -const char *dbpath = DEFAULT_DBPATH; bool ignore_case = false; bool only_count = false; bool print_nul = false; bool use_debug = false; +bool flush_cache = false; bool patterns_are_regex = false; bool use_extended_regex = false; +bool match_basename = false; +bool check_existence = false; int64_t limit_matches = numeric_limits::max(); int64_t limit_left = numeric_limits::max(); +bool stdout_is_tty = false; +bool literal_printing = false; +static bool in_forked_child = false; steady_clock::time_point start; ZSTD_DDict *ddict = nullptr; -regex_t compile_regex(const string &needle); - -struct Needle { - enum { STRSTR, - REGEX, - GLOB } type; - string str; // Filled in no matter what. - regex_t re; // For REGEX. -}; - -bool matches(const Needle &needle, const char *haystack) -{ - if (needle.type == Needle::STRSTR) { - return strstr(haystack, needle.str.c_str()) != nullptr; - } else if (needle.type == Needle::GLOB) { - int flags = ignore_case ? FNM_CASEFOLD : 0; - return fnmatch(needle.str.c_str(), haystack, flags) == 0; - } else { - assert(needle.type == Needle::REGEX); - return regexec(&needle.re, haystack, /*nmatch=*/0, /*pmatch=*/nullptr, /*flags=*/0) == 0; - } -} - class Corpus { public: - Corpus(int fd, IOUringEngine *engine); + Corpus(int fd, const char *filename_for_errors, IOUringEngine *engine); ~Corpus(); void find_trigram(uint32_t trgm, function cb); void get_compressed_filename_block(uint32_t docid, function cb) const; @@ -100,11 +84,10 @@ public: Header hdr; }; -Corpus::Corpus(int fd, IOUringEngine *engine) +Corpus::Corpus(int fd, const char *filename_for_errors, IOUringEngine *engine) : fd(fd), engine(engine) { - // Enable to test cold-cache behavior (except for access()). - if (false) { + if (flush_cache) { off_t len = lseek(fd, 0, SEEK_END); if (len == -1) { perror("lseek"); @@ -115,11 +98,11 @@ Corpus::Corpus(int fd, IOUringEngine *engine) complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0); if (memcmp(hdr.magic, "\0plocate", 8) != 0) { - fprintf(stderr, "plocate.db is corrupt or an old version; please rebuild it.\n"); + fprintf(stderr, "%s: database is corrupt or not a plocate database; please rebuild it.\n", filename_for_errors); exit(1); } if (hdr.version != 0 && hdr.version != 1) { - fprintf(stderr, "plocate.db has version %u, expected 0 or 1; please rebuild it.\n", hdr.version); + fprintf(stderr, "%s: has version %u, expected 0 or 1; please rebuild it.\n", filename_for_errors, hdr.version); exit(1); } if (hdr.version == 0) { @@ -127,6 +110,10 @@ Corpus::Corpus(int fd, IOUringEngine *engine) hdr.zstd_dictionary_offset_bytes = 0; hdr.zstd_dictionary_length_bytes = 0; } + if (hdr.max_version < 2) { + // This too. (We ignore the other max_version 2 fields.) + hdr.check_visibility = true; + } } Corpus::~Corpus() @@ -168,8 +155,24 @@ size_t Corpus::get_num_filename_blocks() const return hdr.num_docids; } +template +void stat_if_needed(const char *filename, bool access_ok, IOUringEngine *engine, T cb) +{ + if (!access_ok || !check_existence) { + // Doesn't have access or doesn't care about existence, so no need to stat. + cb(access_ok); + } else if (engine == nullptr || !engine->get_supports_stat()) { + // Do a synchronous stat. + struct stat buf; + bool ok = lstat(filename, &buf) == 0; + cb(ok); + } else { + engine->submit_stat(filename, cb); + } +} + void scan_file_block(const vector &needles, string_view compressed, - AccessRXCache *access_rx_cache, uint64_t seq, ResultReceiver *serializer, + IOUringEngine *engine, AccessRXCache *access_rx_cache, uint64_t seq, ResultReceiver *serializer, atomic *matched) { unsigned long long uncompressed_len = ZSTD_getFrameContentSize(compressed.data(), compressed.size()); @@ -198,14 +201,16 @@ void scan_file_block(const vector &needles, string_view compressed, block[block.size() - 1] = '\0'; auto test_candidate = [&](const char *filename, uint64_t local_seq, uint64_t next_seq) { - access_rx_cache->check_access(filename, /*allow_async=*/true, [matched, serializer, local_seq, next_seq, filename{ strdup(filename) }](bool ok) { - if (ok) { - ++*matched; - serializer->print(local_seq, next_seq - local_seq, filename); - } else { - serializer->print(local_seq, next_seq - local_seq, ""); - } - free(filename); + access_rx_cache->check_access(filename, /*allow_async=*/true, [matched, engine, serializer, local_seq, next_seq, filename{ strdup(filename) }](bool ok) { + stat_if_needed(filename, ok, engine, [matched, serializer, local_seq, next_seq, filename](bool ok) { + if (ok) { + ++*matched; + serializer->print(local_seq, next_seq - local_seq, filename); + } else { + serializer->print(local_seq, next_seq - local_seq, ""); + } + free(filename); + }); }); }; @@ -217,9 +222,19 @@ void scan_file_block(const vector &needles, string_view compressed, for (const char *filename = block.data(); filename != block.data() + block.size(); filename += strlen(filename) + 1) { + const char *haystack = filename; + if (match_basename) { + haystack = strrchr(filename, '/'); + if (haystack == nullptr) { + haystack = filename; + } else { + ++haystack; + } + } + bool found = true; for (const Needle &needle : needles) { - if (!matches(needle, filename)) { + if (!matches(needle, haystack)) { found = false; break; } @@ -242,12 +257,12 @@ void scan_file_block(const vector &needles, string_view compressed, size_t scan_docids(const vector &needles, const vector &docids, const Corpus &corpus, IOUringEngine *engine) { Serializer docids_in_order; - AccessRXCache access_rx_cache(engine); + AccessRXCache access_rx_cache(engine, corpus.get_hdr().check_visibility); atomic matched{ 0 }; for (size_t i = 0; i < docids.size(); ++i) { uint32_t docid = docids[i]; - corpus.get_compressed_filename_block(docid, [i, &matched, &needles, &access_rx_cache, &docids_in_order](string_view compressed) { - scan_file_block(needles, compressed, &access_rx_cache, i, &docids_in_order, &matched); + corpus.get_compressed_filename_block(docid, [i, &matched, &needles, &access_rx_cache, engine, &docids_in_order](string_view compressed) { + scan_file_block(needles, compressed, engine, &access_rx_cache, i, &docids_in_order, &matched); }); } engine->finish(); @@ -318,7 +333,7 @@ uint64_t scan_all_docids(const vector &needles, int fd, const Corpus &co } } - AccessRXCache access_rx_cache(nullptr); + AccessRXCache access_rx_cache(nullptr, corpus.get_hdr().check_visibility); Serializer serializer; uint32_t num_blocks = corpus.get_num_filename_blocks(); unique_ptr offsets(new uint64_t[num_blocks + 1]); @@ -365,7 +380,8 @@ uint64_t scan_all_docids(const vector &needles, int fd, const Corpus &co for (uint32_t docid = io_docid; docid < last_docid; ++docid) { size_t relative_offset = offsets[docid] - offsets[io_docid]; size_t len = offsets[docid + 1] - offsets[docid]; - scan_file_block(*use_needles, { &compressed[relative_offset], len }, &access_rx_cache, docid, &receiver, &matched); + // IOUringEngine isn't thread-safe, so we do any needed stat()s synchronously (nullptr engine). + scan_file_block(*use_needles, { &compressed[relative_offset], len }, /*engine=*/nullptr, &access_rx_cache, docid, &receiver, &matched); } } }); @@ -452,11 +468,11 @@ bool new_posting_list_read(TrigramDisjunction *td, vector decoded, vec return false; } -void do_search_file(const vector &needles, const char *filename) +uint64_t do_search_file(const vector &needles, const std::string &filename) { - int fd = open(filename, O_RDONLY); + int fd = open(filename.c_str(), O_RDONLY); if (fd == -1) { - perror(filename); + perror(filename.c_str()); exit(1); } @@ -469,11 +485,11 @@ void do_search_file(const vector &needles, const char *filename) start = steady_clock::now(); if (access("/", R_OK | X_OK)) { // We can't find anything, no need to bother... - return; + return 0; } IOUringEngine engine(/*slop_bytes=*/16); // 16 slop bytes as described in turbopfor.h. - Corpus corpus(fd, &engine); + Corpus corpus(fd, filename.c_str(), &engine); dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count()); vector trigram_groups; @@ -512,10 +528,9 @@ void do_search_file(const vector &needles, const char *filename) // the pattern and done a union of them, but that's a lot of // work for fairly unclear gain.) uint64_t matched = scan_all_docids(needles, fd, corpus); - if (only_count) { - printf("%" PRId64 "\n", matched); - } - return; + dprintf("Done in %.1f ms, found %" PRId64 " matches.\n", + 1e3 * duration(steady_clock::now() - start).count(), matched); + return matched; } // Sneak in fetching the dictionary, if present. It's not necessarily clear @@ -534,18 +549,30 @@ void do_search_file(const vector &needles, const char *filename) } // Look them all up on disk. + bool should_early_exit = false; for (auto &[trgm, trigram_groups] : trigrams_to_lookup) { - corpus.find_trigram(trgm, [trgm{ trgm }, trigram_groups{ &trigram_groups }](const Trigram *trgmptr, size_t len) { + corpus.find_trigram(trgm, [trgm{ trgm }, trigram_groups{ &trigram_groups }, &should_early_exit](const Trigram *trgmptr, size_t len) { if (trgmptr == nullptr) { dprintf("trigram %s isn't found\n", print_trigram(trgm).c_str()); for (TrigramDisjunction *td : *trigram_groups) { --td->remaining_trigrams_to_read; + + // If we now know this trigram group doesn't match anything at all, + // we can do early exit; however, if we're in a forked child, + // that would confuse the parent process (since we don't write + // our count to the pipe), so we wait until we're back in to the + // regular (non-async) context. This is a fairly rare case anyway, + // and the gains from dropping the remaining trigram reads are limited. if (td->remaining_trigrams_to_read == 0 && td->read_trigrams.empty()) { - dprintf("zero matches in %s, so we are done\n", print_td(*td).c_str()); - if (only_count) { - printf("0\n"); + if (in_forked_child) { + should_early_exit = true; + } else { + dprintf("zero matches in %s, so we are done\n", print_td(*td).c_str()); + if (only_count) { + printf("0\n"); + } + exit(0); } - exit(0); } } return; @@ -560,6 +587,10 @@ void do_search_file(const vector &needles, const char *filename) engine.finish(); dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration(steady_clock::now() - start).count()); + if (should_early_exit) { + return 0; + } + for (TrigramDisjunction &td : trigram_groups) { // Reset for reads. td.remaining_trigrams_to_read = td.read_trigrams.size(); @@ -611,7 +642,7 @@ void do_search_file(const vector &needles, const char *filename) if (done) return; - uint32_t trgm __attribute__((unused)) = trgmptr.trgm; + uint32_t trgm = trgmptr.trgm; const unsigned char *pldata = reinterpret_cast(s.data()); size_t num = trgmptr.num_docids; decoded.resize(num); @@ -642,7 +673,7 @@ void do_search_file(const vector &needles, const char *filename) } engine.finish(); if (done) { - return; + return 0; } dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n", 1e3 * duration(steady_clock::now() - start).count()); @@ -650,45 +681,121 @@ void do_search_file(const vector &needles, const char *filename) uint64_t matched = scan_docids(needles, cur_candidates, corpus, &engine); dprintf("Done in %.1f ms, found %" PRId64 " matches.\n", 1e3 * duration(steady_clock::now() - start).count(), matched); - - if (only_count) { - printf("%" PRId64 "\n", matched); - } + return matched; } -string unescape_glob_to_plain_string(const string &needle) +// Run do_search_file() in a child process. +// +// The reason for this is that we're not robust against malicious input, so we need +// to drop privileges after opening the file. (Otherwise, we could fall prey to an attack +// where a user does locate -d badfile.db:/var/lib/plocate/plocate.db, badfile.db contains +// a buffer overflow that takes over the process, and then uses the elevated privileges +// to print out otherwise inaccessible paths.) We solve this by forking and treating the +// child process as untrusted after it has dropped its privileges (which it does before +// reading any data from the file); it returns a single 64-bit number over a pipe, +// and that's it. The parent keeps its privileges, and can then fork out new children +// without fear of being taken over. (The child keeps stdout for outputting results.) +// +// The count is returned over the pipe, because it's needed both for --limit and --count. +uint64_t do_search_file_in_child(const vector &needles, const std::string &filename) { - string unescaped; - for (size_t i = 0; i < needle.size(); i += read_unigram(needle, i).second) { - uint32_t ch = read_unigram(needle, i).first; - assert(ch != WILDCARD_UNIGRAM); - if (ch == PREMATURE_END_UNIGRAM) { - fprintf(stderr, "Pattern '%s' ended prematurely\n", needle.c_str()); - exit(1); + int pipefd[2]; + if (pipe(pipefd) == -1) { + perror("pipe"); + exit(EXIT_FAILURE); + } + + pid_t child_pid = fork(); + switch (child_pid) { + case 0: { + // Child. + close(pipefd[0]); + in_forked_child = true; + uint64_t matched = do_search_file(needles, filename); + int ret; + do { + ret = write(pipefd[1], &matched, sizeof(matched)); + } while (ret == -1 && errno == EINTR); + if (ret != sizeof(matched)) { + perror("write"); + _exit(EXIT_FAILURE); } - unescaped.push_back(ch); + fflush(stdout); + _exit(EXIT_SUCCESS); + } + case -1: + // Error. + perror("fork"); + exit(EXIT_FAILURE); + default: + // Parent. + close(pipefd[1]); + break; + } + + // Wait for the child to finish. + int wstatus; + pid_t err; + do { + err = waitpid(child_pid, &wstatus, 0); + } while (err == -1 && errno == EINTR); + if (err == -1) { + perror("waitpid"); + exit(EXIT_FAILURE); + } + if (WIFEXITED(wstatus)) { + if (WEXITSTATUS(wstatus) != 0) { + // The child has probably already printed out its error, so just propagate the exit status. + exit(WEXITSTATUS(wstatus)); + } + // Success! + } else if (!WIFEXITED(wstatus)) { + fprintf(stderr, "FATAL: Child died unexpectedly while processing %s\n", filename.c_str()); + exit(1); + } + + // Now get the number of matches from the child. + uint64_t matched; + int ret; + do { + ret = read(pipefd[0], &matched, sizeof(matched)); + } while (ret == -1 && errno == EINTR); + if (ret == -1) { + perror("read"); + exit(EXIT_FAILURE); + } else if (ret != sizeof(matched)) { + fprintf(stderr, "FATAL: Short read through pipe (got %d bytes)\n", ret); + exit(EXIT_FAILURE); } - return unescaped; + close(pipefd[0]); + return matched; } -regex_t compile_regex(const string &needle) +// Parses a colon-separated list of strings and appends them onto the given vector. +// Backslash escapes whatever comes after it. +void parse_dbpaths(const char *ptr, vector *output) { - regex_t re; - int flags = REG_NOSUB; - if (ignore_case) { - flags |= REG_ICASE; - } - if (use_extended_regex) { - flags |= REG_EXTENDED; - } - int err = regcomp(&re, needle.c_str(), flags); - if (err != 0) { - char errbuf[256]; - regerror(err, &re, errbuf, sizeof(errbuf)); - fprintf(stderr, "Error when compiling regex '%s': %s\n", needle.c_str(), errbuf); - exit(1); + string str; + while (*ptr != '\0') { + if (*ptr == '\\') { + if (ptr[1] == '\0') { + fprintf(stderr, "ERROR: Escape character at the end of string\n"); + exit(EXIT_FAILURE); + } + // Escape. + str.push_back(ptr[1]); + ptr += 2; + continue; + } + if (*ptr == ':') { + // Separator. + output->push_back(move(str)); + ++ptr; + continue; + } + str.push_back(*ptr++); } - return re; + output->push_back(move(str)); } void usage() @@ -696,21 +803,24 @@ void usage() printf( "Usage: plocate [OPTION]... PATTERN...\n" "\n" + " -b, --basename search only the file name portion of path names\n" " -c, --count print number of matches instead of the matches\n" " -d, --database DBPATH search for files in DBPATH\n" - " (default is " DEFAULT_DBPATH ")\n" + " (default is " DBFILE ")\n" " -i, --ignore-case search case-insensitively\n" " -l, --limit LIMIT stop after LIMIT matches\n" " -0, --null delimit matches by NUL instead of newline\n" + " -N, --literal do not quote filenames, even if printing to a tty\n" " -r, --regexp interpret patterns as basic regexps (slow)\n" " --regex interpret patterns as extended regexps (slow)\n" + " -w, --wholename search the entire path name (default; see -b)\n" " --help print this help\n" " --version print version information\n"); } void version() { - printf("plocate %s\n", PLOCATE_VERSION); + printf("%s %s\n", PACKAGE_NAME, PACKAGE_VERSION); printf("Copyright 2020 Steinar H. Gunderson\n"); printf("License GPLv2+: GNU GPL version 2 or later .\n"); printf("This is free software: you are free to change and redistribute it.\n"); @@ -720,34 +830,53 @@ void version() int main(int argc, char **argv) { + vector dbpaths; + constexpr int EXTENDED_REGEX = 1000; + constexpr int FLUSH_CACHE = 1001; static const struct option long_options[] = { { "help", no_argument, 0, 'h' }, { "count", no_argument, 0, 'c' }, + { "all", no_argument, 0, 'A' }, + { "basename", no_argument, 0, 'b' }, { "database", required_argument, 0, 'd' }, + { "existing", no_argument, 0, 'e' }, { "ignore-case", no_argument, 0, 'i' }, { "limit", required_argument, 0, 'l' }, + { "literal", no_argument, 0, 'N' }, { "null", no_argument, 0, '0' }, { "version", no_argument, 0, 'V' }, { "regexp", no_argument, 0, 'r' }, { "regex", no_argument, 0, EXTENDED_REGEX }, + { "wholename", no_argument, 0, 'w' }, { "debug", no_argument, 0, 'D' }, // Not documented. + // Enable to test cold-cache behavior (except for access()). Not documented. + { "flush-cache", no_argument, 0, FLUSH_CACHE }, { 0, 0, 0, 0 } }; setlocale(LC_ALL, ""); for (;;) { int option_index = 0; - int c = getopt_long(argc, argv, "cd:hil:n:0VD", long_options, &option_index); + int c = getopt_long(argc, argv, "Abcd:ehil:n:N0rwVD", long_options, &option_index); if (c == -1) { break; } switch (c) { + case 'A': + // Ignored. + break; + case 'b': + match_basename = true; + break; case 'c': only_count = true; break; case 'd': - dbpath = strdup(optarg); + parse_dbpaths(optarg, &dbpaths); + break; + case 'e': + check_existence = true; break; case 'h': usage(); @@ -763,6 +892,9 @@ int main(int argc, char **argv) exit(1); } break; + case 'N': + literal_printing = true; + break; case '0': print_nul = true; break; @@ -773,9 +905,15 @@ int main(int argc, char **argv) patterns_are_regex = true; use_extended_regex = true; break; + case 'w': + match_basename = false; // No-op unless -b is given first. + break; case 'D': use_debug = true; break; + case FLUSH_CACHE: + flush_cache = true; + break; case 'V': version(); break; @@ -784,16 +922,22 @@ int main(int argc, char **argv) } } - if (use_debug) { + if (use_debug || flush_cache) { // Debug information would leak information about which files exist, // so drop setgid before we open the file; one would either need to run - // as root, or use a locally-built file. + // as root, or use a locally-built file. Doing the same thing for + // flush_cache is mostly paranoia, in an attempt to prevent random users + // from making plocate slow for everyone else. if (setgid(getgid()) != 0) { perror("setgid"); exit(EXIT_FAILURE); } } + if (!print_nul) { + stdout_is_tty = isatty(1); + } + vector needles; for (int i = optind; i < argc; ++i) { Needle needle; @@ -830,5 +974,30 @@ int main(int argc, char **argv) fprintf(stderr, "plocate: no pattern to search for specified\n"); exit(0); } - do_search_file(needles, dbpath); + + if (dbpaths.empty()) { + // No -d given, so use our default. Note that this happens + // even if LOCATE_PATH exists, to match mlocate behavior. + dbpaths.push_back(DBFILE); + } + + const char *locate_path = getenv("LOCATE_PATH"); + if (locate_path != nullptr) { + parse_dbpaths(locate_path, &dbpaths); + } + + uint64_t matched = 0; + for (size_t i = 0; i < dbpaths.size(); ++i) { + uint64_t this_matched; + if (i != dbpaths.size() - 1) { + this_matched = do_search_file_in_child(needles, dbpaths[i]); + } else { + this_matched = do_search_file(needles, dbpaths[i]); + } + matched += this_matched; + limit_left -= this_matched; + } + if (only_count) { + printf("%" PRId64 "\n", matched); + } }