ninja reconfigure
ninja bench
-Copyright 2020 Steinar H. Gunderson <steinar+plocate@gunderson.no>.
+plocate (except updatedb), and the plocate-specific changes to updatedb,
+is Copyright 2020 Steinar H. Gunderson <steinar+plocate@gunderson.no>.
Licensed under the GNU General Public License, either version 2,
or (at your option) any later version. See the included file COPYING.
+
+updatedb is Copyright (C) 2005, 2007 Red Hat, Inc. All rights reserved.
+Licensed under the GNU General Public License, version 2. See the
+included file COPYING.
--- /dev/null
+/* Bind mount detection. Note: if you change this, change tmpwatch as well.
+
+Copyright (C) 2005, 2007, 2008, 2012 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#include "bind-mount.h"
+
+#include "conf.h"
+#include "lib.h"
+
+#include <atomic>
+#include <fcntl.h>
+#include <limits.h>
+#include <map>
+#include <poll.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <thread>
+
+using namespace std;
+
+/* mountinfo handling */
+
+/* A single mountinfo entry, filled in by read_mount_entry () from one line
+   of the mountinfo file. Fields are listed in the order they are parsed. */
+struct mount {
+ /* The first two integer fields of the line. */
+ int id, parent_id;
+ /* The "MAJOR:MINOR" device number field. */
+ unsigned dev_major, dev_minor;
+ /* Next field; compared between mounts of the same device to detect
+    bind mounts. */
+ string root;
+ /* Next field; the path recorded in bind_mount_paths. */
+ string mount_point;
+ /* First field after the "-" separator. */
+ string fs_type;
+ /* Second field after the "-" separator. */
+ string source;
+};
+
+/* Path to mountinfo; set by bind_mount_init () */
+static const char *mountinfo_path;
+/* Set to true by the polling thread started in bind_mount_init () when
+   poll () reports POLLPRI on the mountinfo descriptor; tested and cleared
+   atomically by is_bind_mount (). */
+atomic<bool> mountinfo_updated{ false };
+
+/* All entries from the last successful read_mount_entries () call. */
+multimap<pair<int, int>, mount> mount_entries; // Keyed by device major/minor.
+
+/* Read one line of arbitrary length from F and return it without the
+   trailing newline. A final line lacking a newline is returned as-is.
+   An empty string is returned on a read error (and also for an empty
+   line, or when F is already at end of file). */
+static string read_mount_line(FILE *f)
+{
+    string result;
+    char chunk[LINE_MAX];
+
+    while (fgets(chunk, sizeof(chunk), f) != nullptr) {
+        const size_t len = strlen(chunk);
+        const bool complete = len > 0 && chunk[len - 1] == '\n';
+        result.append(chunk, complete ? len - 1 : len);
+        if (complete) {
+            return result;
+        }
+    }
+    /* fgets () gave up: keep the partial line at end of file, drop it
+       on a read error. */
+    return feof(f) ? result : string();
+}
+
+/* Parse one space- or tab-delimited field starting at *STR, decoding
+   three-digit octal escapes (\ooo), and advance *STR past the field.
+   The decoded text is moved into DEST unless DEST is nullptr.
+   Return 0 if OK, -1 if only whitespace remains before the end of the
+   string (in which case *STR and DEST are left untouched). */
+static int parse_mount_string(string *dest, const char **str)
+{
+    const auto is_blank = [](char ch) { return ch == ' ' || ch == '\t'; };
+    const auto is_octal = [](char ch) { return ch >= '0' && ch <= '7'; };
+
+    const char *cursor = *str;
+    while (is_blank(*cursor)) {
+        ++cursor;
+    }
+    if (*cursor == 0) {
+        return -1;
+    }
+
+    string decoded;
+    while (*cursor != 0 && !is_blank(*cursor)) {
+        if (*cursor == '\\' && is_octal(cursor[1]) && is_octal(cursor[2]) && is_octal(cursor[3])) {
+            const unsigned value = ((cursor[1] - '0') << 6) | ((cursor[2] - '0') << 3) | (cursor[3] - '0');
+            if (value <= UCHAR_MAX) {
+                decoded.push_back(value);
+                cursor += 4;
+                continue;
+            }
+        }
+        /* Ordinary character, or a backslash that does not start a valid
+           escape: copy it verbatim. */
+        decoded.push_back(*cursor);
+        ++cursor;
+    }
+
+    *str = cursor;
+    if (dest != nullptr) {
+        *dest = move(decoded);
+    }
+    return 0;
+}
+
+/* Read a single entry from F into ME. Return true if successful; false
+   when no (or an empty) line could be read or the line does not parse.
+   ME may be partially overwritten on failure. */
+static bool read_mount_entry(FILE *f, mount *me)
+{
+    const string line = read_mount_line(f);
+    if (line.empty()) {
+        return false;
+    }
+
+    /* Leading numeric fields "ID PARENT MAJOR:MINOR"; %zn records how many
+       characters they occupied. */
+    size_t consumed;
+    const int matched = sscanf(line.c_str(), "%d %d %u:%u%zn", &me->id, &me->parent_id,
+                               &me->dev_major, &me->dev_minor, &consumed);
+    if (matched != 4) {
+        return false;
+    }
+
+    const char *cursor = line.c_str() + consumed;
+    if (parse_mount_string(&me->root, &cursor) != 0) {
+        return false;
+    }
+    if (parse_mount_string(&me->mount_point, &cursor) != 0) {
+        return false;
+    }
+    /* One field is parsed and discarded here. */
+    if (parse_mount_string(nullptr, &cursor) != 0) {
+        return false;
+    }
+
+    /* Discard further fields up to and including the lone "-" separator. */
+    for (;;) {
+        string field;
+        if (parse_mount_string(&field, &cursor) != 0) {
+            return false;
+        }
+        if (strcmp(field.c_str(), "-") == 0) {
+            break;
+        }
+    }
+
+    if (parse_mount_string(&me->fs_type, &cursor) != 0) {
+        return false;
+    }
+    if (parse_mount_string(&me->source, &cursor) != 0) {
+        return false;
+    }
+    /* One more field must be present; its value is not kept. */
+    return parse_mount_string(nullptr, &cursor) == 0;
+}
+
+/* Read mount information from mountinfo_path and replace the contents of
+   mount_entries with it. Reading stops at the first line that
+   read_mount_entry () rejects.
+   Return 0 if OK, -1 if mountinfo_path cannot be opened (mount_entries is
+   then left unchanged). */
+static int read_mount_entries(void)
+{
+    FILE *fp = fopen(mountinfo_path, "r");
+    if (fp == nullptr) {
+        return -1;
+    }
+
+    mount_entries.clear();
+
+    for (mount entry; read_mount_entry(fp, &entry);) {
+        if (conf_debug_pruning) {
+            /* This is debugging output, don't mark anything for translation */
+            fprintf(stderr,
+                    " `%s' (%d on %d) is `%s' of `%s' (%u:%u), type `%s'\n",
+                    entry.mount_point.c_str(), entry.id, entry.parent_id,
+                    entry.root.c_str(), entry.source.c_str(),
+                    entry.dev_major, entry.dev_minor, entry.fs_type.c_str());
+        }
+        mount_entries.emplace(make_pair(entry.dev_major, entry.dev_minor), entry);
+    }
+
+    fclose(fp);
+    return 0;
+}
+
+/* Bind mount path list maintenance and top-level interface. */
+
+/* mountinfo_path file descriptor, or -1; opened by bind_mount_init () and
+   polled by the thread it starts */
+static int mountinfo_fd;
+
+/* Known bind mount paths; rebuilt and sorted by rebuild_bind_mount_paths () */
+static struct vector<string> bind_mount_paths; /* = { 0, }; */
+
+/* Next bind_mount_paths entry; handed to string_list_contains_dir_path ()
+   and reset to 0 by is_bind_mount () after a rebuild */
+static size_t bind_mount_paths_index; /* = 0; */
+
+/* Rebuild bind_mount_paths from a fresh read of the mountinfo file.
+   A mount point is recorded when another mount of the same device either
+   has a root that is a proper prefix of this mount's root, or has the same
+   root and a lower mount ID. If the mountinfo file cannot be read, the
+   previous list is kept. */
+static void rebuild_bind_mount_paths(void)
+{
+    /* All fprintf () calls below are debugging output, don't mark anything
+       for translation */
+    if (conf_debug_pruning) {
+        fprintf(stderr, "Rebuilding bind_mount_paths:\n");
+    }
+    if (read_mount_entries() != 0) {
+        return;
+    }
+    if (conf_debug_pruning) {
+        fprintf(stderr, "Matching bind_mount_paths:\n");
+    }
+
+    bind_mount_paths.clear();
+
+    for (const auto &entry : mount_entries) {
+        const mount &me = entry.second;
+        const auto same_device = mount_entries.equal_range(make_pair(me.dev_major, me.dev_minor));
+        for (auto it = same_device.first; it != same_device.second; ++it) {
+            const mount &other = it->second;
+            if (other.id == me.id) {
+                // Don't compare an element to itself.
+                continue;
+            }
+            // We have two mounts from the same device. Is one a prefix of the other?
+            // If there are two that are equal, prefer the one with lowest ID.
+            const bool is_child = me.root.size() > other.root.size() &&
+                                  me.root.compare(0, other.root.size(), other.root) == 0;
+            const bool is_later_duplicate = me.root == other.root && me.id > other.id;
+            if (!is_child && !is_later_duplicate) {
+                continue;
+            }
+            if (conf_debug_pruning) {
+                if (is_child) {
+                    fprintf(stderr, " => adding `%s' (root `%s' is a child of `%s', mounted on `%s')\n",
+                            me.mount_point.c_str(), me.root.c_str(), other.root.c_str(), other.mount_point.c_str());
+                } else {
+                    fprintf(stderr, " => adding `%s' (duplicate of mount point `%s')\n",
+                            me.mount_point.c_str(), other.mount_point.c_str());
+                }
+            }
+            bind_mount_paths.push_back(me.mount_point);
+            break;
+        }
+    }
+    if (conf_debug_pruning) {
+        fprintf(stderr, "...done\n");
+    }
+    string_list_dir_path_sort(&bind_mount_paths);
+}
+
+/* Return true if PATH is a destination of a bind mount.
+   (Bind mounts "to self" are ignored.)
+   If the polling thread has flagged a mountinfo change since the last
+   call, the path list is rebuilt and the lookup cursor reset first. */
+bool is_bind_mount(const char *path)
+{
+    const bool stale = mountinfo_updated.exchange(false); // Atomic test-and-clear.
+    if (stale) {
+        rebuild_bind_mount_paths();
+        bind_mount_paths_index = 0;
+    }
+    return string_list_contains_dir_path(&bind_mount_paths, &bind_mount_paths_index, path);
+}
+
+/* Initialize state for is_bind_mount(), to read data from MOUNTINFO.
+   Builds the initial bind mount list and starts a detached thread that
+   flags later changes. If MOUNTINFO cannot be opened, nothing else is
+   done. */
+void bind_mount_init(const char *mountinfo)
+{
+    mountinfo_path = mountinfo;
+    mountinfo_fd = open(mountinfo_path, O_RDONLY);
+    if (mountinfo_fd == -1) {
+        return;
+    }
+    rebuild_bind_mount_paths();
+
+    // mlocate re-polls this for each and every directory it wants to check,
+    // for unclear reasons; it's possible that it's worried about a new recursive
+    // bind mount being made while updatedb is running, causing an infinite loop?
+    // Since it's probably for some good reason, we do the same, but we don't
+    // want the barrage of syscalls. It's not synchronous, but the poll signal
+    // isn't either; there's a slight race condition, but one that could only
+    // be exploited by root.
+    //
+    // The thread is forcibly terminated on exit(), so we just let it loop forever.
+    const auto watch_mountinfo = [] {
+        while (true) {
+            /* Unfortunately (mount --bind $path $path/subdir) would leave st_dev
+               unchanged between $path and $path/subdir, so we must keep reparsing
+               mountinfo_path each time it changes. */
+            struct pollfd pfd;
+            pfd.fd = mountinfo_fd;
+            pfd.events = POLLPRI;
+            const int ready = poll(&pfd, 1, /*timeout=*/-1);
+            if (ready == -1) {
+                perror("poll()");
+                exit(1);
+            }
+            if (pfd.revents & POLLPRI) {
+                mountinfo_updated = true;
+            }
+        }
+    };
+    thread(watch_mountinfo).detach();
+}
--- /dev/null
+/* Bind mount detection.
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#ifndef BIND_MOUNT_H__
+#define BIND_MOUNT_H__
+
+/* System mount information file */
+#define MOUNTINFO_PATH "/proc/self/mountinfo"
+
+/* Return true if PATH is a destination of a bind mount.
+ (Bind mounts "to self" are ignored.)
+ bind_mount_init () should have been called first. */
+extern bool is_bind_mount(const char *path);
+
+/* Initialize state for is_bind_mount(), to read data from MOUNTINFO
+ (typically MOUNTINFO_PATH). The pointer is stored, not copied. */
+extern void bind_mount_init(const char *mountinfo);
+
+#endif
#include <stdlib.h>
#include <unistd.h>
-void complete_pread(int fd, void *ptr, size_t len, off_t offset)
+// Read exactly LEN bytes from FD into PTR, starting at file position OFFSET,
+// calling pread() again after a short read. Returns true once everything
+// has been read; returns false as soon as pread() reports an error or
+// end-of-file (ret <= 0), in which case PTR may be partially filled.
+bool try_complete_pread(int fd, void *ptr, size_t len, off_t offset)
{
while (len > 0) {
ssize_t ret = pread(fd, ptr, len, offset);
continue;
}
if (ret <= 0) {
- perror("pread");
- exit(1);
+ return false;
}
+ // Short read: continue right after the bytes already read, so both
+ // the destination pointer and the file offset move forward.
ptr = reinterpret_cast<char *>(ptr) + ret;
len -= ret;
offset += ret;
}
+ return true;
+}
+
+// Same as try_complete_pread(), except that a failed read is fatal: the
+// error is reported with perror("pread") and the process exits with status 1.
+void complete_pread(int fd, void *ptr, size_t len, off_t offset)
+{
+ const bool ok = try_complete_pread(fd, ptr, len, offset);
+ if (ok) {
+  return;
+ }
+ perror("pread");
+ exit(1);
}
#include <unistd.h>
-// A wrapper around pread() that returns an incomplete read.
-// Always synchronous (no io_uring).
+// A wrapper around pread() that retries on short reads and EINTR,
+// so you never need to call it twice. Always synchronous (no io_uring).
+bool try_complete_pread(int fd, void *ptr, size_t len, off_t offset);
+
+// Same, but exit on failure, so never returns a short read.
void complete_pread(int fd, void *ptr, size_t len, off_t offset);
#endif // !defined(COMPLETE_PREAD_H)
--- /dev/null
+/* updatedb configuration.
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+ */
+
+#include "conf.h"
+
+#include "error.h"
+#include "lib.h"
+
+#include <algorithm>
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+using namespace std;
+
+/* true if locate(1) should check whether files are visible before reporting
+ them */
+bool conf_check_visibility = true;
+
+/* Filesystems to skip, converted to uppercase and sorted by name */
+vector<string> conf_prunefs;
+
+/* Directory names to skip, sorted by name */
+vector<string> conf_prunenames;
+
+/* Paths to skip, sorted by name using dir_path_cmp () */
+vector<string> conf_prunepaths;
+
+/* true if bind mounts should be skipped */
+bool conf_prune_bind_mounts; /* = false; */
+
+/* true if pruning debug output was requested */
+bool conf_debug_pruning; /* = false; */
+
+/* Root of the directory tree to store in the database (canonical) */
+char *conf_scan_root; /* = NULL; */
+
+/* Absolute (not necessarily canonical) path to the database */
+string conf_output;
+
+/* true if file names should be written to stdout as they are found */
+bool conf_verbose; /* = false; */
+
+/* Configuration representation for the database configuration block */
+string conf_block;
+
+/* Number of filenames to store in each block (-b, --block-size) */
+int conf_block_size = 32;
+/* true if the undocumented -D, --debug option was given */
+bool use_debug = false;
+
+/* Interpret STR as a boolean: "0" and "no" mean false, "1" and "yes" mean
+   true (exact, case-sensitive matches only).
+   Store the value to DEST and return 0; return -1 and leave DEST alone
+   for any other input. */
+static int
+parse_bool(bool *dest, const char *str)
+{
+    const bool is_false = strcmp(str, "0") == 0 || strcmp(str, "no") == 0;
+    const bool is_true = strcmp(str, "1") == 0 || strcmp(str, "yes") == 0;
+
+    if (!is_false && !is_true) {
+        return -1;
+    }
+    *dest = is_true;
+    return 0;
+}
+
+/* String list handling */
+
+/* Split VAL at runs of whitespace and append each non-empty word to LIST,
+   in order of appearance. */
+static void
+var_add_values(vector<string> *list, const char *val)
+{
+    const char *p = val;
+
+    while (*p != 0) {
+        if (isspace((unsigned char)*p)) {
+            ++p;
+            continue;
+        }
+        /* P is at the first character of a word; find where it ends. */
+        const char *word = p;
+        while (*p != 0 && !isspace((unsigned char)*p)) {
+            ++p;
+        }
+        list->emplace_back(word, p - word);
+    }
+}
+
+/* Finish variable LIST: leave it sorted in ascending order with every
+   duplicate entry removed. */
+static void
+var_finish(vector<string> *list)
+{
+    vector<string> &values = *list;
+
+    sort(values.begin(), values.end());
+    values.erase(unique(values.begin(), values.end()), values.end());
+}
+
+/* UPDATEDB_CONF parsing */
+
+/* UPDATEDB_CONF (locked) */
+static FILE *uc_file;
+/* Line number at token start; type matches error_at_line () */
+static unsigned uc_line;
+/* Current line number; type matches error_at_line () */
+static unsigned uc_current_line;
+/* Last string returned by uc_lex */
+static string uc_lex_buf;
+
+/* Token types returned by uc_lex () */
+enum {
+ UCT_EOF, /* end of file */
+ UCT_EOL, /* newline, possibly preceded by a '#' comment */
+ UCT_IDENTIFIER, /* identifier that is not one of the keywords below */
+ UCT_EQUAL, /* '=' */
+ UCT_QUOTED, /* double-quoted string */
+ UCT_OTHER, /* any other character */
+ UCT_PRUNE_BIND_MOUNTS, /* keyword PRUNE_BIND_MOUNTS */
+ UCT_PRUNEFS, /* keyword PRUNEFS */
+ UCT_PRUNENAMES, /* keyword PRUNENAMES */
+ UCT_PRUNEPATHS /* keyword PRUNEPATHS */
+};
+
+/* Return next token from uc_file; for UCT_IDENTIFIER, UCT_QUOTED or keywords,
+ store the data to uc_lex_buf (valid until next call).
+ Sets uc_line to the line number at which the token starts and increments
+ uc_current_line whenever a UCT_EOL token is returned. */
+static int
+uc_lex(void)
+{
+ int c;
+
+ uc_lex_buf.clear();
+ uc_line = uc_current_line;
+ /* Skip whitespace other than newline, which is a token of its own. */
+ do {
+ c = getc_unlocked(uc_file);
+ if (c == EOF)
+ return UCT_EOF;
+ } while (c != '\n' && isspace((unsigned char)c));
+ switch (c) {
+ case '#':
+ /* Comment: discard the rest of the line, then report its newline. */
+ do {
+ c = getc_unlocked(uc_file);
+ if (c == EOF)
+ return UCT_EOF;
+ } while (c != '\n');
+ /* Fall through */
+ case '\n':
+ uc_current_line++;
+ /* The unsigned line counter wrapped around to 0. */
+ if (uc_current_line == 0) {
+ error_at_line(0, 0, UPDATEDB_CONF, uc_current_line - 1,
+ _("warning: Line number overflow"));
+ error_message_count--; /* Don't count as an error */
+ }
+ return UCT_EOL;
+
+ case '=':
+ return UCT_EQUAL;
+
+ case '"': {
+ /* Quoted value: collect everything up to the closing quote; no
+ escape sequences are interpreted. */
+ while ((c = getc_unlocked(uc_file)) != '"') {
+ if (c == EOF || c == '\n') {
+ error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+ _("missing closing `\"'"));
+ /* Push the newline back so the next call returns UCT_EOL
+ (pushing back EOF leaves the stream unchanged). */
+ ungetc(c, uc_file);
+ break;
+ }
+ uc_lex_buf.push_back(c);
+ }
+ return UCT_QUOTED;
+ }
+
+ default: {
+ /* Identifier or keyword: a letter or '_' followed by letters,
+ digits or '_'. Anything else is a single-character UCT_OTHER. */
+ if (!isalpha((unsigned char)c) && c != '_')
+ return UCT_OTHER;
+ do {
+ uc_lex_buf.push_back(c);
+ c = getc_unlocked(uc_file);
+ } while (c != EOF && (isalnum((unsigned char)c) || c == '_'));
+ /* The character that ended the identifier belongs to the next token. */
+ ungetc(c, uc_file);
+ if (uc_lex_buf == "PRUNE_BIND_MOUNTS")
+ return UCT_PRUNE_BIND_MOUNTS;
+ if (uc_lex_buf == "PRUNEFS")
+ return UCT_PRUNEFS;
+ if (uc_lex_buf == "PRUNENAMES")
+ return UCT_PRUNENAMES;
+ if (uc_lex_buf == "PRUNEPATHS")
+ return UCT_PRUNEPATHS;
+ return UCT_IDENTIFIER;
+ }
+ }
+}
+
+/* Parse UPDATEDB_CONF, one `VARIABLE = "value"' assignment per line, into
+ conf_prune_bind_mounts, conf_prunefs, conf_prunenames and conf_prunepaths.
+ A missing file is silently ignored. Exit on I/O or syntax error; syntax
+ errors are all reported before exiting. */
+static void
+parse_updatedb_conf(void)
+{
+ int old_error_one_per_line;
+ unsigned old_error_message_count;
+ bool had_prune_bind_mounts, had_prunefs, had_prunenames, had_prunepaths;
+
+ uc_file = fopen(UPDATEDB_CONF, "r");
+ if (uc_file == NULL) {
+ /* Only a nonexistent file is tolerated. */
+ if (errno != ENOENT)
+ error(EXIT_FAILURE, errno, _("can not open `%s'"), UPDATEDB_CONF);
+ goto err;
+ }
+ /* uc_lex () reads with getc_unlocked (), so hold the stream lock here. */
+ flockfile(uc_file);
+ uc_current_line = 1;
+ /* Remember the error count so syntax errors can be detected at the end. */
+ old_error_message_count = error_message_count;
+ old_error_one_per_line = error_one_per_line;
+ error_one_per_line = 1;
+ had_prune_bind_mounts = false;
+ had_prunefs = false;
+ had_prunenames = false;
+ had_prunepaths = false;
+ /* Each iteration handles one line. */
+ for (;;) {
+ bool *had_var;
+ int var_token, token;
+
+ token = uc_lex();
+ switch (token) {
+ case UCT_EOF:
+ goto eof;
+
+ case UCT_EOL:
+ continue;
+
+ case UCT_PRUNE_BIND_MOUNTS:
+ had_var = &had_prune_bind_mounts;
+ break;
+
+ case UCT_PRUNEFS:
+ had_var = &had_prunefs;
+ break;
+
+ case UCT_PRUNENAMES:
+ had_var = &had_prunenames;
+ break;
+
+ case UCT_PRUNEPATHS:
+ had_var = &had_prunepaths;
+ break;
+
+ case UCT_IDENTIFIER:
+ error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+ _("unknown variable `%s'"), uc_lex_buf.c_str());
+ goto skip_to_eol;
+
+ default:
+ error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+ _("variable name expected"));
+ goto skip_to_eol;
+ }
+ /* Each variable may be assigned at most once in the file. */
+ if (*had_var != false) {
+ error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+ _("variable `%s' was already defined"), uc_lex_buf.c_str());
+ goto skip_to_eol;
+ }
+ *had_var = true;
+ var_token = token;
+ token = uc_lex();
+ if (token != UCT_EQUAL) {
+ error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+ _("`=' expected after variable name"));
+ goto skip_to_eol;
+ }
+ token = uc_lex();
+ if (token != UCT_QUOTED) {
+ error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+ _("value in quotes expected after `='"));
+ goto skip_to_eol;
+ }
+ /* uc_lex_buf now holds the quoted value. */
+ if (var_token == UCT_PRUNE_BIND_MOUNTS) {
+ if (parse_bool(&conf_prune_bind_mounts, uc_lex_buf.c_str()) != 0) {
+ error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+ _("invalid value `%s' of PRUNE_BIND_MOUNTS"),
+ uc_lex_buf.c_str());
+ goto skip_to_eol;
+ }
+ } else if (var_token == UCT_PRUNEFS)
+ var_add_values(&conf_prunefs, uc_lex_buf.c_str());
+ else if (var_token == UCT_PRUNENAMES)
+ var_add_values(&conf_prunenames, uc_lex_buf.c_str());
+ else if (var_token == UCT_PRUNEPATHS)
+ var_add_values(&conf_prunepaths, uc_lex_buf.c_str());
+ else
+ abort();
+ token = uc_lex();
+ if (token != UCT_EOL && token != UCT_EOF) {
+ error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+ _("unexpected data after variable value"));
+ goto skip_to_eol;
+ }
+ /* Fall through */
+ skip_to_eol:
+ /* Error recovery: drop the rest of the line and resume with the next. */
+ while (token != UCT_EOL) {
+ if (token == UCT_EOF)
+ goto eof;
+ token = uc_lex();
+ }
+ }
+eof:
+ if (ferror(uc_file))
+ error(EXIT_FAILURE, 0, _("I/O error reading `%s'"), UPDATEDB_CONF);
+ error_one_per_line = old_error_one_per_line;
+ funlockfile(uc_file);
+ fclose(uc_file);
+ /* Any error_at_line () call above raised the count: fail now. */
+ if (error_message_count != old_error_message_count)
+ exit(EXIT_FAILURE);
+err:;
+}
+
+/* Command-line argument parsing */
+
+/* Output --help text to stdout: the option summary (with DBFILE and
+ UPDATEDB_CONF filled in) followed by the bug-report address. */
+static void
+help(void)
+{
+ printf(_("Usage: updatedb [OPTION]...\n"
+ "Update a mlocate database.\n"
+ "\n"
+ " -f, --add-prunefs FS omit also FS\n"
+ " -n, --add-prunenames NAMES omit also NAMES\n"
+ " -e, --add-prunepaths PATHS omit also PATHS\n"
+ " -U, --database-root PATH the subtree to store in "
+ "database (default \"/\")\n"
+ " -h, --help print this help\n"
+ " -o, --output FILE database to update (default\n"
+ " `%s')\n"
+ " -b, --block-size SIZE number of filenames to store\n"
+ " in each block (default 32)\n"
+ " --prune-bind-mounts FLAG omit bind mounts (default "
+ "\"no\")\n"
+ " --prunefs FS filesystems to omit from "
+ "database\n"
+ " --prunenames NAMES directory names to omit from "
+ "database\n"
+ " --prunepaths PATHS paths to omit from database\n"
+ " -l, --require-visibility FLAG check visibility before "
+ "reporting files\n"
+ " (default \"yes\")\n"
+ " -v, --verbose print paths of files as they "
+ "are found\n"
+ " -V, --version print version information\n"
+ "\n"
+ "The configuration defaults to values read from\n"
+ "`%s'.\n"),
+ DBFILE, UPDATEDB_CONF);
+ printf(_("\n"
+ "Report bugs to %s.\n"),
+ PACKAGE_BUGREPORT);
+}
+
+/* Return PATH prefixed with the current working directory and a '/'.
+   Exits with an error message if the working directory cannot be
+   determined. */
+static string
+prepend_cwd(const string &path)
+{
+    string cwd;
+    cwd.resize(BUFSIZ); /* Not PATH_MAX because it is not defined on some platforms. */
+
+    /* Grow the buffer by half before every attempt until getcwd () fits. */
+    for (;;) {
+        cwd.resize(cwd.size() * 1.5);
+        if (getcwd(cwd.data(), cwd.size()) != NULL) {
+            break;
+        }
+        if (errno != ERANGE) {
+            error(EXIT_FAILURE, errno, _("can not get current working directory"));
+        }
+    }
+
+    /* Trim the buffer down to the NUL-terminated result. */
+    cwd.resize(strlen(cwd.data()));
+    return cwd + '/' + path;
+}
+
+/* Parse ARGC, ARGV. Exit on error or --help, --version. */
+static void
+parse_arguments(int argc, char *argv[])
+{
+ enum { OPT_DEBUG_PRUNING = CHAR_MAX + 1 };
+
+ static const struct option options[] = {
+ { "add-prunefs", required_argument, NULL, 'f' },
+ { "add-prunenames", required_argument, NULL, 'n' },
+ { "add-prunepaths", required_argument, NULL, 'e' },
+ { "database-root", required_argument, NULL, 'U' },
+ { "debug-pruning", no_argument, NULL, OPT_DEBUG_PRUNING },
+ { "help", no_argument, NULL, 'h' },
+ { "output", required_argument, NULL, 'o' },
+ { "prune-bind-mounts", required_argument, NULL, 'B' },
+ { "prunefs", required_argument, NULL, 'F' },
+ { "prunenames", required_argument, NULL, 'N' },
+ { "prunepaths", required_argument, NULL, 'P' },
+ { "require-visibility", required_argument, NULL, 'l' },
+ { "verbose", no_argument, NULL, 'v' },
+ { "version", no_argument, NULL, 'V' },
+ { "block-size", required_argument, 0, 'b' },
+ { "debug", no_argument, 0, 'D' }, // Not documented.
+ { NULL, 0, NULL, 0 }
+ };
+
+ bool prunefs_changed, prunenames_changed, prunepaths_changed;
+ bool got_prune_bind_mounts, got_visibility;
+
+ prunefs_changed = false;
+ prunenames_changed = false;
+ prunepaths_changed = false;
+ got_prune_bind_mounts = false;
+ got_visibility = false;
+ for (;;) {
+ int opt, idx;
+
+ opt = getopt_long(argc, argv, "U:Ve:f:hl:n:o:vb:D", options, &idx);
+ switch (opt) {
+ case -1:
+ goto options_done;
+
+ case '?':
+ exit(EXIT_FAILURE);
+
+ case 'B':
+ if (got_prune_bind_mounts != false)
+ error(EXIT_FAILURE, 0,
+ _("--%s would override earlier command-line argument"),
+ "prune-bind-mounts");
+ got_prune_bind_mounts = true;
+ if (parse_bool(&conf_prune_bind_mounts, optarg) != 0)
+ error(EXIT_FAILURE, 0, _("invalid value `%s' of --%s"), optarg,
+ "prune-bind-mounts");
+ break;
+
+ case 'F':
+ if (prunefs_changed != false)
+ error(EXIT_FAILURE, 0,
+ _("--%s would override earlier command-line argument"),
+ "prunefs");
+ prunefs_changed = true;
+ conf_prunefs.clear();
+ var_add_values(&conf_prunefs, optarg);
+ break;
+
+ case 'N':
+ if (prunenames_changed != false)
+ error(EXIT_FAILURE, 0,
+ _("--%s would override earlier command-line argument"),
+ "prunenames");
+ prunenames_changed = true;
+ conf_prunenames.clear();
+ var_add_values(&conf_prunenames, optarg);
+ break;
+
+ case 'P':
+ if (prunepaths_changed != false)
+ error(EXIT_FAILURE, 0,
+ _("--%s would override earlier command-line argument"),
+ "prunepaths");
+ prunepaths_changed = true;
+ conf_prunepaths.clear(),
+ var_add_values(&conf_prunepaths, optarg);
+ break;
+
+ case 'U':
+ if (conf_scan_root != NULL)
+ error(EXIT_FAILURE, 0, _("--%s specified twice"),
+ "database-root");
+ conf_scan_root = canonicalize_file_name(optarg);
+ if (conf_scan_root == NULL)
+ error(EXIT_FAILURE, errno, _("invalid value `%s' of --%s"), optarg,
+ "database-root");
+ break;
+
+ case 'V':
+ puts("updatedb (" PACKAGE_NAME ") " PACKAGE_VERSION);
+ puts(_("Copyright (C) 2007 Red Hat, Inc. All rights reserved.\n"
+ "This software is distributed under the GPL v.2.\n"
+ "\n"
+ "This program is provided with NO WARRANTY, to the extent "
+ "permitted by law."));
+ exit(EXIT_SUCCESS);
+
+ case 'e':
+ prunepaths_changed = true;
+ var_add_values(&conf_prunepaths, optarg);
+ break;
+
+ case 'f':
+ prunefs_changed = true;
+ var_add_values(&conf_prunefs, optarg);
+ break;
+
+ case 'h':
+ help();
+ exit(EXIT_SUCCESS);
+
+ case 'l':
+ if (got_visibility != false)
+ error(EXIT_FAILURE, 0, _("--%s specified twice"),
+ "require-visibility");
+ got_visibility = true;
+ if (parse_bool(&conf_check_visibility, optarg) != 0)
+ error(EXIT_FAILURE, 0, _("invalid value `%s' of --%s"), optarg,
+ "require-visibility");
+ break;
+
+ case 'n':
+ prunenames_changed = true;
+ var_add_values(&conf_prunenames, optarg);
+ break;
+
+ case 'o':
+ if (!conf_output.empty())
+ error(EXIT_FAILURE, 0, _("--%s specified twice"), "output");
+ conf_output = optarg;
+ break;
+
+ case 'v':
+ conf_verbose = true;
+ break;
+
+ case 'b':
+ conf_block_size = atoi(optarg);
+ break;
+
+ case 'D':
+ use_debug = true;
+ break;
+
+ case OPT_DEBUG_PRUNING:
+ conf_debug_pruning = true;
+ break;
+
+ default:
+ abort();
+ }
+ }
+options_done:
+ if (optind != argc)
+ error(EXIT_FAILURE, 0, _("unexpected operand on command line"));
+ if (conf_scan_root == NULL) {
+ static char root[] = "/";
+
+ conf_scan_root = root;
+ }
+ if (conf_output.empty())
+ conf_output = DBFILE;
+ if (conf_output[0] != '/')
+ conf_output = prepend_cwd(conf_output);
+}
+
+/* Conversion of configuration for main code */
+
+/* Append STRINGS to OBSTACK: each string followed by a NUL byte, then one
+   more NUL byte to terminate the list. */
+static void
+gen_conf_block_string_list(string *obstack,
+                           const vector<string> *strings)
+{
+    for (auto it = strings->begin(); it != strings->end(); ++it) {
+        obstack->append(*it);
+        obstack->push_back('\0');
+    }
+    obstack->push_back('\0');
+}
+
+/* Generate conf_block: for each of prune_bind_mounts, prunefs, prunenames
+   and prunepaths, the NUL-terminated variable name followed by its value. */
+static void
+gen_conf_block(void)
+{
+    /* Append a string literal including its terminating NUL byte. */
+    const auto add_name = [](const auto &literal) {
+        conf_block.append(literal, sizeof(literal));
+    };
+
+    conf_block.clear();
+
+    /* conf_check_visibility value is stored in the header */
+    add_name("prune_bind_mounts");
+    /* Add two NUL bytes after the value */
+    const char *flag = conf_prune_bind_mounts ? "1\0" : "0\0";
+    conf_block.append(flag, 3);
+
+    add_name("prunefs");
+    gen_conf_block_string_list(&conf_block, &conf_prunefs);
+
+    add_name("prunenames");
+    gen_conf_block_string_list(&conf_block, &conf_prunenames);
+
+    add_name("prunepaths");
+    gen_conf_block_string_list(&conf_block, &conf_prunepaths);
+    /* scan_root is contained directly in the header */
+    /* conf_output, conf_verbose are not relevant */
+}
+
+/* Parse /etc/updatedb.conf and command-line arguments ARGC, ARGV.
+   Exit on error or --help, --version.
+   On return the prune lists are upper-cased (filesystems only), sorted and
+   free of duplicates, conf_block has been generated, and conf_prunepaths
+   has been re-sorted with string_list_dir_path_sort (). With
+   --debug-pruning, conf_block is also dumped to stderr. */
+void conf_prepare(int argc, char *argv[])
+{
+    parse_updatedb_conf();
+    parse_arguments(argc, argv);
+    for (string &str : conf_prunefs) {
+        /* Assuming filesystem names are ASCII-only. Go through unsigned
+           char anyway: toupper () is undefined for negative values other
+           than EOF, which a plain char may hold. */
+        for (char &c : str)
+            c = toupper((unsigned char)c);
+    }
+    /* Finish the variable only after converting filesystem names to upper case
+       to avoid keeping duplicates that originally differed in case and to sort
+       them correctly. */
+    var_finish(&conf_prunefs);
+    var_finish(&conf_prunenames);
+    var_finish(&conf_prunepaths);
+    /* conf_block is generated from the plainly sorted lists, before
+       conf_prunepaths is re-sorted for path lookups. */
+    gen_conf_block();
+    string_list_dir_path_sort(&conf_prunepaths);
+
+    if (conf_debug_pruning) {
+        /* This is debugging output, don't mark anything for translation */
+        fprintf(stderr, "conf_block:\n");
+        for (char c : conf_block) {
+            if (isascii((unsigned char)c) && isprint((unsigned char)c) && c != '\\')
+                putc(c, stderr);
+            else {
+                /* Non-printable bytes and backslashes are shown as octal
+                   escapes; each NUL also ends the output line. */
+                fprintf(stderr, "\\%03o", (unsigned)(unsigned char)c);
+                if (c == 0)
+                    putc('\n', stderr);
+            }
+        }
+        fprintf(stderr, "\n-----------------------\n");
+    }
+}
--- /dev/null
+/* updatedb configuration.
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#ifndef CONF_H__
+#define CONF_H__
+
+#include <stddef.h>
+#include <string>
+#include <vector>
+
+/* true if locate(1) should check whether files are visible before reporting
+ them */
+extern bool conf_check_visibility;
+
+/* Filesystems to skip, converted to uppercase and sorted by name */
+extern std::vector<std::string> conf_prunefs;
+
+/* Directory names to skip, sorted by name */
+extern std::vector<std::string> conf_prunenames;
+
+/* Paths to skip, sorted by name using dir_path_cmp () */
+extern std::vector<std::string> conf_prunepaths;
+
+/* true if bind mounts should be skipped */
+extern bool conf_prune_bind_mounts;
+
+/* true if pruning debug output was requested */
+extern bool conf_debug_pruning;
+
+/* Root of the directory tree to store in the database (canonical) */
+extern char *conf_scan_root;
+
+/* Absolute (not necessarily canonical) path to the database */
+extern std::string conf_output;
+
+/* true if file names should be written to stdout as they are found */
+extern bool conf_verbose;
+
+/* Configuration representation for the database configuration block */
+extern std::string conf_block;
+
+/* Parse /etc/updatedb.conf and command-line arguments ARGC, ARGV.
+ Exit on error or --help, --version. */
+extern void conf_prepare(int argc, char *argv[]);
+
+/* Number of filenames to store in each block (-b, --block-size) */
+extern int conf_block_size;
+/* true if the undocumented -D, --debug option was given */
+extern bool use_debug;
+
+#endif
encoded.append(reinterpret_cast<char *>(buf), end - buf);
}
-void DictionaryBuilder::add_file(string filename)
+void DictionaryBuilder::add_file(string filename, dir_time)
{
if (keep_current_block) { // Only bother saving the filenames if we're actually keeping the block.
if (!current_block.empty()) {
return buf;
}
-Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict)
- : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), cdict(cdict)
+// Set up an empty corpus writing to OUTFP. When STORE_DIR_TIMES is set,
+// a zstd compression stream (level 6) is also created for the
+// directory timestamps passed to add_file().
+Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times)
+ : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
{
fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr);
+ if (store_dir_times) {
+ // NOTE(review): neither call's result is checked here.
+ dir_time_ctx = ZSTD_createCStream();
+ ZSTD_initCStream(dir_time_ctx, /*level=*/6);
+ }
}
Corpus::~Corpus()
return *invindex[trgm];
}
-void Corpus::add_file(string filename)
+void Corpus::add_file(string filename, dir_time dt)
{
++num_files;
if (!current_block.empty()) {
if (++num_files_in_block == block_size) {
flush_block();
}
+
+ if (store_dir_times) {
+ if (dt.sec == -1) {
+ // Not a directory.
+ dir_times.push_back('\0');
+ } else {
+ dir_times.push_back('\1');
+ dir_times.append(reinterpret_cast<char *>(&dt.sec), sizeof(dt.sec));
+ dir_times.append(reinterpret_cast<char *>(&dt.nsec), sizeof(dt.nsec));
+ }
+ compress_dir_times(/*allowed_slop=*/4096);
+ }
+}
+
+// Feed buffered directory timestamps (dir_times) through the zstd stream,
+// appending whatever it emits to dir_times_compressed, for as long as at
+// least ALLOWED_SLOP bytes are buffered. Consumed input is removed from
+// dir_times. Returns early if the compressor neither consumed nor produced
+// anything. Exits the process if zstd reports an error.
+void Corpus::compress_dir_times(size_t allowed_slop) {
+    constexpr size_t chunk_size = 4096;
+
+    while (dir_times.size() >= allowed_slop) {
+        // Make room for up to one more chunk of compressed output.
+        size_t old_size = dir_times_compressed.size();
+        dir_times_compressed.resize(old_size + chunk_size);
+
+        ZSTD_outBuffer outbuf;
+        outbuf.dst = dir_times_compressed.data() + old_size;
+        outbuf.size = chunk_size;
+        outbuf.pos = 0;
+
+        ZSTD_inBuffer inbuf;
+        inbuf.src = dir_times.data();
+        inbuf.size = dir_times.size();
+        inbuf.pos = 0;
+
+        // The result is a size_t that is either a hint or an error code;
+        // errors must be detected with ZSTD_isError(), not a sign check.
+        size_t ret = ZSTD_compressStream(dir_time_ctx, &outbuf, &inbuf);
+        if (ZSTD_isError(ret)) {
+            fprintf(stderr, "ZSTD_compressStream() failed\n");
+            exit(1);
+        }
+
+        // Keep only the bytes actually written and drop the input consumed.
+        dir_times_compressed.resize(old_size + outbuf.pos);
+        dir_times.erase(dir_times.begin(), dir_times.begin() + inbuf.pos);
+
+        if (outbuf.pos == 0 && inbuf.pos == 0) {
+            // Nothing happened (not enough data?), try again later.
+            return;
+        }
+    }
}
void Corpus::flush_block()
return num;
}
+// Pushes all pending directory-time data through the compressor, ends the
+// zstd stream and returns the complete compressed data. Returns an empty
+// string when directory times are not being stored. Exits the process on a
+// compression error.
+string Corpus::get_compressed_dir_times()
+{
+	if (!store_dir_times) {
+		return "";
+	}
+	compress_dir_times(/*allowed_slop=*/0);
+	assert(dir_times.empty());
+
+	for ( ;; ) {
+		// Reserve room for up to 4096 more output bytes; trimmed below.
+		size_t old_size = dir_times_compressed.size();
+		dir_times_compressed.resize(old_size + 4096);
+
+		ZSTD_outBuffer outbuf;
+		outbuf.dst = dir_times_compressed.data() + old_size;
+		outbuf.size = 4096;
+		outbuf.pos = 0;
+
+		// The result is either an error code (to be tested with
+		// ZSTD_isError(), never by sign) or the amount of data zstd still
+		// has to flush; zero means the stream is complete.
+		size_t ret = ZSTD_endStream(dir_time_ctx, &outbuf);
+		if (ZSTD_isError(ret)) {
+			fprintf(stderr, "ZSTD_endStream() failed\n");
+			exit(1);
+		}
+
+		dir_times_compressed.resize(old_size + outbuf.pos);
+
+		if (ret == 0) {
+			// All done.
+			break;
+		}
+	}
+
+	return dir_times_compressed;
+}
+
string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf)
{
static ZSTD_CCtx *ctx = nullptr;
return ht;
}
-DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dictionary)
+DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_size, string dictionary)
: outfile(outfile), block_size(block_size)
{
umask(0027);
string path = outfile;
path.resize(path.find_last_of('/') + 1);
+ if (path.empty()) {
+ path = ".";
+ }
int fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
if (fd == -1) {
perror(path.c_str());
exit(1);
}
+ if (owner != (gid_t)-1) {
+ if (fchown(fd, (uid_t)-1, owner) == -1) {
+ perror("fchown");
+ exit(1);
+ }
+ }
+
outfp = fdopen(fd, "wb");
if (outfp == nullptr) {
perror(outfile);
hdr.extra_ht_slots = num_overflow_slots;
hdr.num_docids = 0;
hdr.hash_table_offset_bytes = -1; // We don't know these offsets yet.
- hdr.max_version = 1;
+ hdr.max_version = 2;
hdr.filename_index_offset_bytes = -1;
hdr.zstd_dictionary_length_bytes = -1;
fwrite(&hdr, sizeof(hdr), 1, outfp);
hdr.zstd_dictionary_length_bytes = dictionary.size();
cdict = ZSTD_createCDict(dictionary.data(), dictionary.size(), /*level=*/6);
}
+
+ hdr.directory_data_length_bytes = 0;
+ hdr.directory_data_offset_bytes = 0;
+ hdr.next_zstd_dictionary_length_bytes = 0;
+ hdr.next_zstd_dictionary_offset_bytes = 0;
+ hdr.conf_block_length_bytes = 0;
+ hdr.conf_block_offset_bytes = 0;
}
-Corpus *DatabaseBuilder::start_corpus()
+Corpus *DatabaseBuilder::start_corpus(bool store_dir_times)
{
corpus_start = steady_clock::now();
- corpus = new Corpus(outfp, block_size, cdict);
+ corpus = new Corpus(outfp, block_size, cdict, store_dir_times);
return corpus;
}
+// Stores the dictionary recommended for the next update; finish_corpus()
+// writes it out if it is non-empty.
+void DatabaseBuilder::set_next_dictionary(std::string next_dictionary)
+{
+	std::string &dest = this->next_dictionary;
+	dest = std::move(next_dictionary);
+}
+
+// Stores the configuration block; finish_corpus() writes it out if it is
+// non-empty.
+void DatabaseBuilder::set_conf_block(std::string conf_block)
+{
+	std::string &dest = this->conf_block;
+	dest = std::move(conf_block);
+}
+
void DatabaseBuilder::finish_corpus()
{
corpus->finish();
fwrite(encoded.data(), encoded.size(), 1, outfp);
}
+ // Finally, write the directory times (for updatedb).
+ string compressed_dir_times = corpus->get_compressed_dir_times();
+ size_t bytes_for_compressed_dir_times = 0;
+ if (!compressed_dir_times.empty()) {
+ hdr.directory_data_offset_bytes = ftell(outfp);
+ hdr.directory_data_length_bytes = compressed_dir_times.size();
+ fwrite(compressed_dir_times.data(), compressed_dir_times.size(), 1, outfp);
+ bytes_for_compressed_dir_times = compressed_dir_times.size();
+ compressed_dir_times.clear();
+ }
+
+ // Write the recommended dictionary for next update.
+ if (!next_dictionary.empty()) {
+ hdr.next_zstd_dictionary_offset_bytes = ftell(outfp);
+ hdr.next_zstd_dictionary_length_bytes = next_dictionary.size();
+ fwrite(next_dictionary.data(), next_dictionary.size(), 1, outfp);
+ }
+
+	// And the configuration block. It is recorded in its own header fields;
+	// reusing the next_zstd_dictionary_* fields here would overwrite the
+	// values stored just above and leave conf_block_* at zero.
+	if (!conf_block.empty()) {
+		hdr.conf_block_offset_bytes = ftell(outfp);
+		hdr.conf_block_length_bytes = conf_block.size();
+		fwrite(conf_block.data(), conf_block.size(), 1, outfp);
+	}
+
// Rewind, and write the updated header.
hdr.version = 1;
fseek(outfp, 0, SEEK_SET);
fclose(outfp);
- size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+ size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames + bytes_for_compressed_dir_times);
dprintf("Block size: %7d files\n", block_size);
dprintf("Dictionary: %'7.1f MB\n", hdr.zstd_dictionary_length_bytes / 1048576.0);
dprintf("Posting lists: %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
dprintf("Filenames: %'7.1f MB\n", bytes_for_filenames / 1048576.0);
+ if (bytes_for_compressed_dir_times != 0) {
+ dprintf("Modify times: %'7.1f MB\n", bytes_for_compressed_dir_times / 1048576.0);
+ }
dprintf("Total: %'7.1f MB\n", total_bytes / 1048576.0);
dprintf("\n");
}
#include <random>
#include <stddef.h>
#include <string>
+#include <utility>
#include <vector>
#include <zstd.h>
class PostingListBuilder;
+// {0,0} means unknown or so current that it should never match.
+// {-1,0} means it's not a directory.
+struct dir_time {
+	int64_t sec;
+	int32_t nsec;
+
+	// Orders by seconds first, then by nanoseconds.
+	bool operator<(const dir_time &other) const
+	{
+		if (sec != other.sec)
+			return sec < other.sec;
+		return nsec < other.nsec;
+	}
+	// a >= b is exactly "not (a < b)". (Negating "other < *this" instead
+	// would yield a <= b.)
+	bool operator>=(const dir_time &other) const
+	{
+		return !(*this < other);
+	}
+};
+constexpr dir_time unknown_dir_time{ 0, 0 };
+constexpr dir_time not_a_dir{ -1, 0 };
+
class DatabaseReceiver {
public:
virtual ~DatabaseReceiver() = default;
- virtual void add_file(std::string filename) = 0;
+ virtual void add_file(std::string filename, dir_time dt) = 0;
virtual void flush_block() = 0;
virtual void finish() { flush_block(); }
};
public:
DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
: blocks_to_keep(blocks_to_keep), block_size(block_size) {}
- void add_file(std::string filename) override;
+ void add_file(std::string filename, dir_time dt) override;
void flush_block() override;
std::string train(size_t buf_size);
class Corpus : public DatabaseReceiver {
public:
- Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict);
+ Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times);
~Corpus();
- void add_file(std::string filename) override;
+ void add_file(std::string filename, dir_time dt) override;
void flush_block() override;
void finish() override;
}
PostingListBuilder &get_pl_builder(uint32_t trgm);
size_t num_trigrams() const;
+ std::string get_compressed_dir_times();
private:
+ void compress_dir_times(size_t allowed_slop);
+
std::unique_ptr<PostingListBuilder *[]> invindex;
FILE *outfp;
std::string current_block;
std::string tempbuf;
const size_t block_size;
+ const bool store_dir_times;
ZSTD_CDict *cdict;
+
+ ZSTD_CStream *dir_time_ctx = nullptr;
+ std::string dir_times; // Buffer of still-uncompressed data.
+ std::string dir_times_compressed;
};
class DatabaseBuilder {
public:
- DatabaseBuilder(const char *outfile, int block_size, std::string dictionary);
- Corpus *start_corpus();
+ DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary);
+ Corpus *start_corpus(bool store_dir_times);
+ void set_next_dictionary(std::string next_dictionary);
+ void set_conf_block(std::string conf_block);
void finish_corpus();
private:
std::chrono::steady_clock::time_point corpus_start;
Corpus *corpus = nullptr;
ZSTD_CDict *cdict = nullptr;
+ std::string next_dictionary, conf_block;
};
#endif // !defined(_DATABASE_BUILDER_H)
uint64_t filename_index_offset_bytes;
// Version 1 and up only.
- uint32_t max_version; // Nominally 1, but can be increased if more features are added in a backward-compatible way.
+ uint32_t max_version; // Nominally 1 or 2, but can be increased if more features are added in a backward-compatible way.
uint32_t zstd_dictionary_length_bytes;
uint64_t zstd_dictionary_offset_bytes;
+
+ // Only if max_version >= 2, and only relevant for updatedb.
+ uint64_t directory_data_length_bytes;
+ uint64_t directory_data_offset_bytes;
+ uint64_t next_zstd_dictionary_length_bytes;
+ uint64_t next_zstd_dictionary_offset_bytes;
+ uint64_t conf_block_length_bytes;
+ uint64_t conf_block_offset_bytes;
};
struct Trigram {
--- /dev/null
+/* Common functions.
+
+Copyright (C) 2005, 2007 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+using namespace std;
+
+#include "lib.h"
+
+#include "db.h"
+#include "error.h"
+
+#include <algorithm>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Compare two path names using the database directory order. This is not
+ exactly strcmp () order: "a" < "a.b", so "a/z" < "a.b". */
+// Returns a negative value, zero or a positive value if A sorts before, equal
+// to or after B. At the first differing character, '/' sorts before every
+// other character; otherwise characters compare as unsigned bytes. A proper
+// prefix sorts before the longer string.
+int dir_path_cmp(const string &a, const string &b)
+{
+	auto [ai, bi] = mismatch(a.begin(), a.end(), b.begin(), b.end());
+	if (ai == a.end() && bi == b.end()) {
+		return 0;
+	}
+	if (ai == a.end()) {
+		return -1;
+	}
+	if (bi == b.end()) {
+		return 1;
+	}
+	// From here on *ai != *bi: mismatch() only stops before the end of both
+	// ranges at a position where the two characters differ.
+	if (*ai == '/') {
+		return -1;
+	}
+	if (*bi == '/') {
+		return 1;
+	}
+	return int((unsigned char)*ai) - int((unsigned char)*bi);
+}
+
+/* Sort LIST using dir_path_cmp () */
+void string_list_dir_path_sort(vector<string> *list)
+{
+	// Less-than predicate built from the three-way dir_path_cmp().
+	const auto comes_before = [](const string &lhs, const string &rhs) {
+		return dir_path_cmp(lhs, rhs) < 0;
+	};
+	sort(list->begin(), list->end(), comes_before);
+}
+
+/* Is PATH included in LIST? Update *IDX to move within LIST.
+
+ LIST is assumed to be sorted using dir_path_cmp (), successive calls to this
+ function are assumed to use PATH values increasing in dir_path_cmp (). */
+bool string_list_contains_dir_path(const vector<string> *list, size_t *idx,
+                                   const string &path)
+{
+	size_t &pos = *idx;
+	const size_t count = list->size();
+
+	// Step over every entry that sorts before PATH; the cursor is left on
+	// the first entry that does not.
+	for (; pos < count; ++pos) {
+		const int order = dir_path_cmp((*list)[pos], path);
+		if (order < 0) {
+			continue;
+		}
+		if (order == 0) {
+			// Exact hit: consume it so the next call starts after it.
+			++pos;
+			return true;
+		}
+		// The entry sorts after PATH; leave the cursor on it.
+		return false;
+	}
+	return false;
+}
--- /dev/null
+/* Common functions.
+
+Copyright (C) 2005, 2007 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#ifndef LIB_H__
+#define LIB_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string>
+#include <sys/types.h>
+#include <vector>
+
+#define _(X) (X)
+
+/* Compare two path names using the database directory order. This is not
+ exactly strcmp () order: "a" < "a.b", so "a/z" < "a.b". */
+extern int dir_path_cmp(const std::string &a, const std::string &b);
+
+/* Sort LIST using dir_path_cmp () */
+extern void string_list_dir_path_sort(std::vector<std::string> *list);
+
+/* Is PATH included in LIST? Update *IDX to move within LIST.
+
+ LIST is assumed to be sorted using dir_path_cmp (), successive calls to this
+ function are assumed to use PATH values increasing in dir_path_cmp (). */
+extern bool string_list_contains_dir_path(const std::vector<std::string> *list,
+ size_t *idx, const std::string &path);
+
+#endif
project('plocate', 'cpp', default_options: ['buildtype=debugoptimized','cpp_std=c++17'], version: '1.0.8-pre')
-# Make the version available as a #define.
-add_project_arguments('-DPLOCATE_VERSION="' + meson.project_version() + '"', language: 'cpp')
+add_project_arguments('-DGROUPNAME="' + get_option('locategroup') + '"', language: 'cpp')
+add_project_arguments('-DUPDATEDB_CONF="/etc/updatedb.conf"', language: 'cpp')
+add_project_arguments('-DDBFILE="/var/lib/mlocate/plocate.db"', language: 'cpp')
+add_project_arguments('-DPACKAGE_NAME="plocate"', language: 'cpp')
+add_project_arguments('-DPACKAGE_VERSION="' + meson.project_version() + '"', language: 'cpp')
+add_project_arguments('-DPACKAGE_BUGREPORT="steinar+plocate@gunderson.no"', language: 'cpp')
cxx = meson.get_compiler('cpp')
uringdep = dependency('liburing', required: false)
dependencies: [zstddep],
install: true,
install_dir: get_option('sbindir'))
+executable('updatedb', ['updatedb.cpp', 'database-builder.cpp', 'conf.cpp', 'lib.cpp', 'bind-mount.cpp', 'complete_pread.cpp'],
+ dependencies: [zstddep, threaddep],
+ install: true,
+ install_dir: get_option('sbindir'))
conf_data = configuration_data()
conf_data.set('PROCESSED_BY_MESON', '1')
int type = getc(fp);
if (type == DBE_NORMAL) {
string filename = read_cstr(fp);
- receiver->add_file(dir_path + "/" + filename);
+ receiver->add_file(dir_path + "/" + filename, unknown_dir_time);
} else if (type == DBE_DIRECTORY) {
string dirname = read_cstr(fp);
- receiver->add_file(dir_path + "/" + dirname);
+ receiver->add_file(dir_path + "/" + dirname, unknown_dir_time);
} else {
return; // Probably end.
}
}
if (!s.empty() && s.back() == '\n')
s.pop_back();
- receiver->add_file(move(s));
+ receiver->add_file(move(s), unknown_dir_time);
}
}
}
string dictionary = builder.train(1024);
- DatabaseBuilder db(outfile, block_size, dictionary);
- Corpus *corpus = db.start_corpus();
+ DatabaseBuilder db(outfile, /*owner=*/-1, block_size, dictionary);
+ Corpus *corpus = db.start_corpus(/*store_dir_times=*/false);
if (plaintext) {
read_plaintext(infp, corpus);
} else {
void version()
{
- printf("plocate-build %s\n", PLOCATE_VERSION);
+ printf("plocate-build %s\n", PACKAGE_VERSION);
printf("Copyright 2020 Steinar H. Gunderson\n");
printf("License GPLv2+: GNU GPL version 2 or later <https://gnu.org/licenses/gpl.html>.\n");
printf("This is free software: you are free to change and redistribute it.\n");
using namespace std;
using namespace std::chrono;
-#define DEFAULT_DBPATH "/var/lib/mlocate/plocate.db"
-
-const char *dbpath = DEFAULT_DBPATH;
+const char *dbpath = DBFILE;
bool ignore_case = false;
bool only_count = false;
bool print_nul = false;
" -b, --basename search only the file name portion of path names\n"
" -c, --count print number of matches instead of the matches\n"
" -d, --database DBPATH search for files in DBPATH\n"
- " (default is " DEFAULT_DBPATH ")\n"
+ " (default is " DBFILE ")\n"
" -i, --ignore-case search case-insensitively\n"
" -l, --limit LIMIT stop after LIMIT matches\n"
" -0, --null delimit matches by NUL instead of newline\n"
void version()
{
- printf("plocate %s\n", PLOCATE_VERSION);
+ printf("%s %s\n", PACKAGE_NAME, PACKAGE_VERSION);
printf("Copyright 2020 Steinar H. Gunderson\n");
printf("License GPLv2+: GNU GPL version 2 or later <https://gnu.org/licenses/gpl.html>.\n");
printf("This is free software: you are free to change and redistribute it.\n");
--- /dev/null
+/* updatedb(8).
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+ */
+
+#include "bind-mount.h"
+#include "complete_pread.h"
+#include "conf.h"
+#include "database-builder.h"
+#include "db.h"
+#include "dprintf.h"
+#include "io_uring_engine.h"
+#include "lib.h"
+
+#include <algorithm>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <chrono>
+#include <dirent.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <grp.h>
+#include <iosfwd>
+#include <math.h>
+#include <memory>
+#include <mntent.h>
+#include <random>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <utility>
+#include <vector>
+
+using namespace std;
+using namespace std::chrono;
+
+/* Next conf_prunepaths entry */
+static size_t conf_prunepaths_index; /* = 0; */
+
+// Prints the command-line help text to stdout.
+// NOTE(review): the text below talks about generating an index from
+// mlocate.db and about --block-size/--plaintext, which reads like help text
+// for a conversion tool rather than for updatedb; confirm it against the
+// options this program actually parses.
+void usage()
+{
+ printf(
+ "Usage: updatedb PLOCATE_DB\n"
+ "\n"
+ "Generate plocate index from mlocate.db, typically /var/lib/mlocate/mlocate.db.\n"
+ "Normally, the destination should be /var/lib/mlocate/plocate.db.\n"
+ "\n"
+ " -b, --block-size SIZE number of filenames to store in each block (default 32)\n"
+ " -p, --plaintext input is a plaintext file, not an mlocate database\n"
+ " --help print this help\n"
+ " --version print version information\n");
+}
+
+// Prints the program name, version, copyright and license notice to stdout.
+void version()
+{
+	// Fixed lines that follow the "updatedb <version>" banner.
+	static const char *const trailer[] = {
+		"Copyright (C) 2007 Red Hat, Inc. All rights reserved.\n",
+		"Copyright 2020 Steinar H. Gunderson\n",
+		"This software is distributed under the GPL v.2.\n",
+		"\n",
+		"This program is provided with NO WARRANTY, to the extent permitted by law.\n",
+	};
+
+	printf("updatedb %s\n", PACKAGE_VERSION);
+	for (const char *line : trailer) {
+		fputs(line, stdout);
+	}
+}
+
+// Opens the directory "path" (relative to "dirfd") read-only, preferring
+// O_NOATIME. Returns the new file descriptor, or -1 on failure. After the
+// first EPERM from an O_NOATIME attempt, O_NOATIME is never tried again.
+int opendir_noatime(int dirfd, const char *path)
+{
+	static bool noatime_failed = false;
+	const int base_flags = O_RDONLY | O_DIRECTORY;
+
+	if (noatime_failed) {
+		return openat(dirfd, path, base_flags);
+	}
+
+	const int fd = openat(dirfd, path, base_flags | O_NOATIME);
+	if (fd != -1) {
+		return fd;
+	}
+	if (errno != EPERM) {
+		return -1;
+	}
+
+	/* EPERM is fairly O_NOATIME-specific; missing access rights cause
+	   EACCES. Remember the failure and retry without the flag. */
+	noatime_failed = true;
+	return openat(dirfd, path, base_flags);
+}
+
+// Decides whether timestamp T is too close to "now" to be trusted.
+bool time_is_current(const dir_time &t)
+{
+	/* Why a whole 3-second margin: Linux uses a cheaper time source for
+	   filesystem timestamps than for gettimeofday(), and the two can get
+	   slightly out of sync, see
+	   https://bugzilla.redhat.com/show_bug.cgi?id=244697 . This affects even
+	   nanosecond timestamps (and the existence of tv_nsec does not guarantee
+	   that the underlying filesystem has such resolution - it might be
+	   microseconds or even coarser).
+
+	   The worst case is probably FAT timestamps with 2-second resolution
+	   (although using such a filesystem violates POSIX file times
+	   requirements).
+
+	   So, to be on the safe side, require a >3.0 second difference (2 seconds
+	   to make sure the FAT timestamp changed, 1 more to account for the Linux
+	   timestamp races). This large margin might make updatedb marginally more
+	   expensive, but it only makes a difference if the directory was very
+	   recently updated _and_ will not be updated again until the next
+	   updatedb run; this is not likely to happen for most directories. */
+
+	/* The earliest time rejected as too current, as of the last
+	   gettimeofday() call. It lets obviously old timestamps be ruled out
+	   without asking for the time again. */
+	static dir_time earliest_rejected{ 0, 0 };
+	if (t < earliest_rejected) {
+		return false;
+	}
+
+	struct timeval now;
+	gettimeofday(&now, nullptr);
+	earliest_rejected.sec = now.tv_sec - 3;
+	earliest_rejected.nsec = now.tv_usec * 1000;
+
+	return t >= earliest_rejected;
+}
+
+// One directory entry, as collected by scan(). Every member has a default, so
+// a default-constructed entry can be copied into a vector without touching an
+// indeterminate value.
+struct entry {
+	string name;
+	bool is_directory = false;
+
+	// For directories only:
+	int fd = -1;
+	dir_time dt = unknown_dir_time;
+	dir_time db_modified = unknown_dir_time;
+	dev_t dev = 0;
+};
+
+// Returns true if "path" is exactly the mount point of a mounted filesystem
+// whose type, upper-cased, is listed in conf_prunefs. Returns false if
+// /proc/mounts cannot be opened. With conf_debug_pruning set, every mount
+// examined is traced to stderr.
+bool filesystem_is_excluded(const char *path)
+{
+	if (conf_debug_pruning) {
+		/* This is debugging output, don't mark anything for translation */
+		fprintf(stderr, "Checking whether filesystem `%s' is excluded:\n", path);
+	}
+	FILE *f = setmntent("/proc/mounts", "r");
+	if (f == nullptr) {
+		return false;
+	}
+
+	struct mntent *me;
+	while ((me = getmntent(f)) != nullptr) {
+		if (conf_debug_pruning) {
+			/* This is debugging output, don't mark anything for translation */
+			fprintf(stderr, " `%s', type `%s'\n", me->mnt_dir, me->mnt_type);
+		}
+		string type(me->mnt_type);
+		for (char &p : type) {
+			// toupper() requires a value representable as unsigned char
+			// (or EOF); a plain char may be negative.
+			p = toupper((unsigned char)p);
+		}
+		if (find(conf_prunefs.begin(), conf_prunefs.end(), type) == conf_prunefs.end()) {
+			continue;
+		}
+
+		/* Paths in /proc/self/mounts contain no symbolic links. Besides
+		   avoiding a few system calls, avoiding the realpath () avoids hangs
+		   if the filesystem is unavailable hard-mounted NFS. So the mount
+		   point is compared as-is; there is no separately allocated copy
+		   that would need freeing. */
+		if (conf_debug_pruning) {
+			/* This is debugging output, don't mark anything for translation */
+			fprintf(stderr, " => type matches, dir `%s'\n", me->mnt_dir);
+		}
+		if (strcmp(path, me->mnt_dir) == 0) {
+			endmntent(f);
+			return true;
+		}
+	}
+	if (conf_debug_pruning) {
+		/* This is debugging output, don't mark anything for translation */
+		fprintf(stderr, "...done\n");
+	}
+	endmntent(f);
+	return false;
+}
+
+// Returns the later of the ctime and mtime in "buf", or unknown_dir_time if
+// that timestamp is too recent to be trusted.
+dir_time get_dirtime_from_stat(const struct stat &buf)
+{
+	const dir_time changed{ buf.st_ctim.tv_sec, int32_t(buf.st_ctim.tv_nsec) };
+	const dir_time modified{ buf.st_mtim.tv_sec, int32_t(buf.st_mtim.tv_nsec) };
+	const dir_time newest = (changed < modified) ? modified : changed;
+
+	if (!time_is_current(newest)) {
+		return newest;
+	}
+
+	/* The directory might be changing right now and we can't be sure the
+	   timestamp will be changed again if more changes happen very soon, so
+	   report the timestamp as unknown to force rescanning the directory the
+	   next time updatedb is run. */
+	return unknown_dir_time;
+}
+
+// Represents the old database we are updating.
+// Sequential reader for the old database: hands out (filename, directory
+// time) records one at a time, with a one-record push-back via unread().
+class ExistingDB {
+public:
+	explicit ExistingDB(int fd);
+	~ExistingDB();
+
+	// Returns the next record; see the definition for the end/error contract.
+	pair<string, dir_time> read_next();
+	// Pushes one record back; the next read_next() call returns it first.
+	void unread(pair<string, dir_time> record)
+	{
+		unread_record = move(record);
+	}
+	string read_next_dictionary() const;
+	bool get_error() const { return error; }
+
+private:
+	const int fd;
+	// Zero-initialized, so that members reading it (read_next_dictionary()
+	// does so unconditionally) see zero lengths rather than indeterminate
+	// values when the constructor bails out before the header is read.
+	Header hdr{};
+
+	uint32_t current_docid = 0;
+
+	string current_filename_block;
+	const char *current_filename_ptr = nullptr, *current_filename_end = nullptr;
+
+	off_t compressed_dir_time_pos = 0;
+	string compressed_dir_time;
+	string current_dir_time_block;
+	const char *current_dir_time_ptr = nullptr, *current_dir_time_end = nullptr;
+
+	pair<string, dir_time> unread_record;
+
+	// Used in one-shot mode, repeatedly.
+	ZSTD_DCtx *ctx = nullptr;
+
+	// Used in streaming mode.
+	ZSTD_DCtx *dir_time_ctx = nullptr;
+
+	ZSTD_DDict *ddict = nullptr;
+
+	// If true, we've discovered an error or EOF, and will return only
+	// empty data from here.
+	bool eof = false, error = false;
+};
+
+// Prepares to read the old database open on "fd". The database is accepted
+// only if the header can be read, carries the expected magic and version, and
+// its stored configuration block is identical to the current conf_block.
+// In every other case (including fd == -1) the error flag is set and the
+// object yields no records. Diagnostics are printed only if conf_verbose.
+ExistingDB::ExistingDB(int fd)
+	: fd(fd)
+{
+	if (fd == -1) {
+		error = true;
+		return;
+	}
+
+	if (!try_complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0)) {
+		if (conf_verbose) {
+			perror("pread(header)");
+		}
+		error = true;
+		return;
+	}
+	if (memcmp(hdr.magic, "\0plocate", 8) != 0) {
+		if (conf_verbose) {
+			fprintf(stderr, "Old database had header mismatch, ignoring.\n");
+		}
+		error = true;
+		return;
+	}
+	if (hdr.version != 1 || hdr.max_version < 2) {
+		if (conf_verbose) {
+			// Print as unsigned with an explicit conversion, so that the
+			// format specifier always matches the argument type.
+			fprintf(stderr, "Old database had version mismatch (version=%u max_version=%u), ignoring.\n",
+			        static_cast<unsigned>(hdr.version), static_cast<unsigned>(hdr.max_version));
+		}
+		error = true;
+		return;
+	}
+
+	// Compare the configuration block with our current one. The size is
+	// checked first, which also bounds the read below.
+	if (hdr.conf_block_length_bytes != conf_block.size()) {
+		if (conf_verbose) {
+			fprintf(stderr, "Old database had different configuration block (size mismatch), ignoring.\n");
+		}
+		error = true;
+		return;
+	}
+	string str;
+	str.resize(hdr.conf_block_length_bytes);
+	if (!try_complete_pread(fd, str.data(), hdr.conf_block_length_bytes, hdr.conf_block_offset_bytes)) {
+		if (conf_verbose) {
+			perror("pread(conf_block)");
+		}
+		error = true;
+		return;
+	}
+	if (str != conf_block) {
+		if (conf_verbose) {
+			fprintf(stderr, "Old database had different configuration block (contents mismatch), ignoring.\n");
+		}
+		error = true;
+		return;
+	}
+
+	// Read dictionary, if it exists.
+	if (hdr.zstd_dictionary_length_bytes > 0) {
+		string dictionary;
+		dictionary.resize(hdr.zstd_dictionary_length_bytes);
+		if (try_complete_pread(fd, &dictionary[0], hdr.zstd_dictionary_length_bytes, hdr.zstd_dictionary_offset_bytes)) {
+			ddict = ZSTD_createDDict(dictionary.data(), dictionary.size());
+		} else {
+			if (conf_verbose) {
+				perror("pread(dictionary)");
+			}
+			error = true;
+			return;
+		}
+	}
+	// Directory-time data is read incrementally from this offset.
+	compressed_dir_time_pos = hdr.directory_data_offset_bytes;
+
+	ctx = ZSTD_createDCtx();
+	dir_time_ctx = ZSTD_createDCtx();
+}
+
+// Closes the database file descriptor, unless the object was constructed
+// with fd == -1.
+ExistingDB::~ExistingDB()
+{
+	if (fd == -1) {
+		return;
+	}
+	close(fd);
+}
+
+// Returns the next (filename, directory time) record from the old database.
+// A record pushed back with unread() is returned first. Entries stored as
+// non-directories come back with not_a_dir as their time. Once the end of the
+// database has been reached, or a read/decompression problem has been found,
+// this and every later call return { "", not_a_dir }; problems also set the
+// error flag. Diagnostics are printed only if conf_verbose is set.
+pair<string, dir_time> ExistingDB::read_next()
+{
+	if (!unread_record.first.empty()) {
+		auto ret = move(unread_record);
+		unread_record.first.clear();
+		return ret;
+	}
+
+	if (eof || error) {
+		return { "", not_a_dir };
+	}
+
+	// See if we need to read a new filename block.
+	if (current_filename_ptr == nullptr) {
+		if (current_docid >= hdr.num_docids) {
+			eof = true;
+			return { "", not_a_dir };
+		}
+
+		// Read the file offset from this docid and the next one.
+		// This is always allowed, since we have a sentinel block at the end.
+		off_t offset_for_block = hdr.filename_index_offset_bytes + current_docid * sizeof(uint64_t);
+		uint64_t vals[2];
+		if (!try_complete_pread(fd, vals, sizeof(vals), offset_for_block)) {
+			if (conf_verbose) {
+				perror("pread(offset)");
+			}
+			error = true;
+			return { "", not_a_dir };
+		}
+		if (vals[1] < vals[0]) {
+			// Damaged index: the length computed below would wrap around
+			// to an enormous value.
+			if (conf_verbose) {
+				fprintf(stderr, "Old database had inconsistent filename block offsets, ignoring.\n");
+			}
+			error = true;
+			return { "", not_a_dir };
+		}
+
+		off_t offset = vals[0];
+		size_t compressed_len = vals[1] - vals[0];
+		unique_ptr<char[]> compressed(new char[compressed_len]);
+		if (!try_complete_pread(fd, compressed.get(), compressed_len, offset)) {
+			if (conf_verbose) {
+				perror("pread(block)");
+			}
+			error = true;
+			return { "", not_a_dir };
+		}
+
+		unsigned long long uncompressed_len = ZSTD_getFrameContentSize(compressed.get(), compressed_len);
+		if (uncompressed_len == ZSTD_CONTENTSIZE_UNKNOWN || uncompressed_len == ZSTD_CONTENTSIZE_ERROR) {
+			if (conf_verbose) {
+				fprintf(stderr, "ZSTD_getFrameContentSize() failed\n");
+			}
+			error = true;
+			return { "", not_a_dir };
+		}
+
+		// One extra byte, so that the last filename can be NUL-terminated.
+		string block;
+		block.resize(uncompressed_len + 1);
+
+		size_t err;
+		if (ddict != nullptr) {
+			err = ZSTD_decompress_usingDDict(ctx, &block[0], block.size(), compressed.get(),
+			                                 compressed_len, ddict);
+		} else {
+			err = ZSTD_decompressDCtx(ctx, &block[0], block.size(), compressed.get(),
+			                          compressed_len);
+		}
+		if (ZSTD_isError(err)) {
+			if (conf_verbose) {
+				fprintf(stderr, "ZSTD_decompress(): %s\n", ZSTD_getErrorName(err));
+			}
+			error = true;
+			return { "", not_a_dir };
+		}
+		block[block.size() - 1] = '\0';
+		current_filename_block = move(block);
+		current_filename_ptr = current_filename_block.data();
+		current_filename_end = current_filename_block.data() + current_filename_block.size();
+		++current_docid;
+	}
+
+	// A stored directory-time record is a one-byte tag, followed (only if the
+	// tag is nonzero) by the raw sec and nsec fields. That is what is parsed
+	// at the bottom of this function, and it is not the same as
+	// sizeof(dir_time) + 1, since the in-memory struct may contain padding.
+	// Demanding the padded size would make a directory record at the very end
+	// of the data look truncated.
+	constexpr size_t dir_time_record_bytes = 1 + sizeof(dir_time::sec) + sizeof(dir_time::nsec);
+
+	// See if we need to read more directory time data.
+	while (current_dir_time_ptr == current_dir_time_end ||
+	       (*current_dir_time_ptr != 0 &&
+	        size_t(current_dir_time_end - current_dir_time_ptr) < dir_time_record_bytes)) {
+		if (current_dir_time_ptr != nullptr) {
+			// Drop what has been parsed already, keeping any partial record.
+			const size_t bytes_consumed = current_dir_time_ptr - current_dir_time_block.data();
+			current_dir_time_block.erase(current_dir_time_block.begin(), current_dir_time_block.begin() + bytes_consumed);
+		}
+
+		// See if we can get more data out without reading more.
+		const size_t existing_data = current_dir_time_block.size();
+		current_dir_time_block.resize(existing_data + 4096);
+
+		ZSTD_outBuffer outbuf;
+		outbuf.dst = current_dir_time_block.data() + existing_data;
+		outbuf.size = 4096;
+		outbuf.pos = 0;
+
+		ZSTD_inBuffer inbuf;
+		inbuf.src = compressed_dir_time.data();
+		inbuf.size = compressed_dir_time.size();
+		inbuf.pos = 0;
+
+		// zstd reports errors in-band in a size_t; test with ZSTD_isError()
+		// rather than by sign.
+		size_t err = ZSTD_decompressStream(dir_time_ctx, &outbuf, &inbuf);
+		if (ZSTD_isError(err)) {
+			if (conf_verbose) {
+				fprintf(stderr, "ZSTD_decompress(): %s\n", ZSTD_getErrorName(err));
+			}
+			error = true;
+			return { "", not_a_dir };
+		}
+		compressed_dir_time.erase(compressed_dir_time.begin(), compressed_dir_time.begin() + inbuf.pos);
+		current_dir_time_block.resize(existing_data + outbuf.pos);
+
+		if (inbuf.pos == 0 && outbuf.pos == 0) {
+			// No movement, we'll need to try to read more data.
+			char buf[4096];
+			size_t bytes_to_read = min<size_t>(
+				hdr.directory_data_offset_bytes + hdr.directory_data_length_bytes - compressed_dir_time_pos,
+				sizeof(buf));
+			if (bytes_to_read == 0) {
+				// Ran out of directory-time data before running out of
+				// filenames.
+				error = true;
+				return { "", not_a_dir };
+			}
+			if (!try_complete_pread(fd, buf, bytes_to_read, compressed_dir_time_pos)) {
+				if (conf_verbose) {
+					perror("pread(dirtime)");
+				}
+				error = true;
+				return { "", not_a_dir };
+			}
+			compressed_dir_time_pos += bytes_to_read;
+			compressed_dir_time.insert(compressed_dir_time.end(), buf, buf + bytes_to_read);
+
+			// Next iteration will now try decompressing more.
+		}
+
+		// The buffer may have been reallocated above; refresh the cursors.
+		current_dir_time_ptr = current_dir_time_block.data();
+		current_dir_time_end = current_dir_time_block.data() + current_dir_time_block.size();
+	}
+
+	string filename = current_filename_ptr;
+	current_filename_ptr += filename.size() + 1;
+	if (current_filename_ptr == current_filename_end) {
+		// End of this block.
+		current_filename_ptr = nullptr;
+	}
+
+	if (*current_dir_time_ptr == 0) {
+		++current_dir_time_ptr;
+		return { move(filename), not_a_dir };
+	} else {
+		++current_dir_time_ptr;
+		dir_time dt;
+		memcpy(&dt.sec, current_dir_time_ptr, sizeof(dt.sec));
+		current_dir_time_ptr += sizeof(dt.sec);
+		memcpy(&dt.nsec, current_dir_time_ptr, sizeof(dt.nsec));
+		current_dir_time_ptr += sizeof(dt.nsec);
+		return { move(filename), dt };
+	}
+}
+
+// Returns the "next dictionary" blob stored in the old database, or an empty
+// string if the header records none, records one larger than 1 MiB, or the
+// read fails.
+string ExistingDB::read_next_dictionary() const
+{
+	const uint64_t len = hdr.next_zstd_dictionary_length_bytes;
+	const bool plausible = (len != 0 && len <= 1048576);
+	if (!plausible) {
+		return "";
+	}
+
+	string dict(len, '\0');
+	if (try_complete_pread(fd, dict.data(), len, hdr.next_zstd_dictionary_offset_bytes)) {
+		return dict;
+	}
+
+	if (conf_verbose) {
+		perror("pread(next_dictionary)");
+	}
+	return "";
+}
+
+// Scans the directory with absolute path “path”, which is opened as “fd”.
+// Uses relative paths and openat() only, evading any issues with PATH_MAX
+// and time-of-check-time-of-use race conditions. (mlocate's updatedb
+// does a much more complicated dance with changing the current working
+// directory, probably in the interest of portability to old platforms.)
+// “parent_dev” must be the device of the parent directory of “path”.
+//
+// “modified” is the directory's current modification time and “db_modified”
+// the one recorded for it in the existing database. When they match (and
+// the existing database has not reported an error), the list of entries is
+// taken from “existing_db” instead of readdir(). Every entry is added to both
+// “corpus” and “dict_builder”, and subdirectories are then scanned recursively.
+//
+// Takes ownership of fd.
+//
+// Returns 0 on success (including when the directory is pruned), or -1 if
+// a recursive scan returned -1. Several unrecoverable errors (fdopendir or
+// fstat failure, running out of file descriptors) terminate the process with
+// exit(1) instead of returning.
+int scan(const string &path, int fd, dev_t parent_dev, dir_time modified, dir_time db_modified, ExistingDB *existing_db, Corpus *corpus, DictionaryBuilder *dict_builder)
+{
+	if (string_list_contains_dir_path(&conf_prunepaths, &conf_prunepaths_index, path)) {
+		if (conf_debug_pruning) {
+			/* This is debugging output, don't mark anything for translation */
+			fprintf(stderr, "Skipping `%s': in prunepaths\n", path.c_str());
+		}
+		close(fd);
+		return 0;
+	}
+	if (conf_prune_bind_mounts && is_bind_mount(path.c_str())) {
+		if (conf_debug_pruning) {
+			/* This is debugging output, don't mark anything for translation */
+			fprintf(stderr, "Skipping `%s': bind mount\n", path.c_str());
+		}
+		close(fd);
+		return 0;
+	}
+
+	// We read in the old directory no matter whether it is current or not,
+	// because even if we're not going to use it, we'll need the modification
+	// time of any subdirectories.
+
+	// Skip over anything before this directory; it is stuff that we would have
+	// consumed earlier if we wanted it.
+	for (;;) {
+		pair<string, dir_time> record = existing_db->read_next();
+		if (record.first.empty()) {
+			break;
+		}
+		if (dir_path_cmp(path, record.first) <= 0) {
+			existing_db->unread(move(record));
+			break;
+		}
+	}
+
+	// Now read everything in this directory.
+	vector<entry> db_entries;
+	const string path_plus_slash = path.back() == '/' ? path : path + '/';
+	for (;;) {
+		pair<string, dir_time> record = existing_db->read_next();
+		if (record.first.empty()) {
+			break;
+		}
+
+		if (record.first.rfind(path_plus_slash, 0) != 0) {
+			// No longer starts with path, so we're in a different directory.
+			existing_db->unread(move(record));
+			break;
+		}
+		if (record.first.find_first_of('/', path_plus_slash.size()) != string::npos) {
+			// Entered into a subdirectory of a subdirectory.
+			// Due to our ordering, this also means we're done.
+			existing_db->unread(move(record));
+			break;
+		}
+
+		entry e;
+		e.name = record.first.substr(path_plus_slash.size());
+		e.is_directory = (record.second.sec >= 0);
+		e.db_modified = record.second;
+		db_entries.push_back(e);
+	}
+
+	// “dir” stays nullptr when the listing comes from the old database; in that
+	// case we still own the raw fd and must close() it ourselves.
+	DIR *dir = nullptr;
+	vector<entry> entries;
+	if (!existing_db->get_error() && db_modified.sec > 0 &&
+	    modified.sec == db_modified.sec && modified.nsec == db_modified.nsec) {
+		// Not changed since the last database, so we can replace the readdir()
+		// by reading from the database. (We still need to open and stat everything,
+		// though, but that happens in a later step.)
+		entries = move(db_entries);
+	} else {
+		dir = fdopendir(fd);  // Takes over ownership of fd.
+		if (dir == nullptr) {
+			perror("fdopendir");
+			exit(1);
+		}
+
+		dirent *de;
+		while ((de = readdir(dir)) != nullptr) {
+			if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) {
+				continue;
+			}
+			if (strlen(de->d_name) == 0) {
+				/* Unfortunately, this does happen, and mere assert() does not give
+				   users enough information to complain to the right people. */
+				fprintf(stderr, "file system error: zero-length file name in directory %s\n", path.c_str());
+				continue;
+			}
+
+			entry e;
+			e.name = de->d_name;
+			e.is_directory = (de->d_type == DT_DIR);
+
+			if (conf_verbose) {
+				// Use path_plus_slash so that a path already ending in '/'
+				// (e.g. the root) is not printed with a doubled slash.
+				printf("%s%s\n", path_plus_slash.c_str(), de->d_name);
+			}
+			entries.push_back(move(e));
+		}
+
+		sort(entries.begin(), entries.end(), [](const entry &a, const entry &b) {
+			return a.name < b.name;
+		});
+
+		// Load directory modification times from the old database.
+		// Both lists are walked in step; db_it only ever moves forward.
+		auto db_it = db_entries.begin();
+		for (entry &e : entries) {
+			for (; db_it != db_entries.end(); ++db_it) {
+				if (e.name < db_it->name) {
+					break;
+				}
+				if (e.name == db_it->name) {
+					e.db_modified = db_it->db_modified;
+					break;
+				}
+			}
+		}
+	}
+
+	// For each entry, we want to add it to the database, but this includes the modification time
+	// for directories, which means we need to open and stat it at this point.
+	//
+	// This means we may need to have many directories open at the same time, but it seems to be
+	// the simplest (only?) way of being compatible with mlocate's notion of listing all contents
+	// of a given directory before recursing, without buffering even more information. Hopefully,
+	// we won't go out of file descriptors here (it could happen if someone has tens of thousands
+	// of subdirectories in a single directory); if so, the admin will need to raise the limit.
+	for (entry &e : entries) {
+		if (!e.is_directory) {
+			e.dt = not_a_dir;
+			continue;
+		}
+
+		if (find(conf_prunenames.begin(), conf_prunenames.end(), e.name) != conf_prunenames.end()) {
+			if (conf_debug_pruning) {
+				/* This is debugging output, don't mark anything for translation */
+				fprintf(stderr, "Skipping `%s': in prunenames\n", e.name.c_str());
+			}
+			continue;
+		}
+
+		e.fd = opendir_noatime(fd, e.name.c_str());
+		if (e.fd == -1) {
+			if (errno == EMFILE || errno == ENFILE) {
+				// The admin probably wants to know about this.
+				perror((path_plus_slash + e.name).c_str());
+
+				rlimit rlim;
+				if (getrlimit(RLIMIT_NOFILE, &rlim) == -1) {
+					fprintf(stderr, "Hint: Try `ulimit -n 131072' or similar.\n");
+				} else {
+					// rlim_t is not necessarily unsigned long, so convert
+					// explicitly to a type with a portable printf specifier.
+					fprintf(stderr, "Hint: Try `ulimit -n %llu' or similar (current limit is %llu).\n",
+					        static_cast<unsigned long long>(rlim.rlim_cur) * 2,
+					        static_cast<unsigned long long>(rlim.rlim_cur));
+				}
+				exit(1);
+			}
+			// Any other failure: leave e.fd at -1 so the entry is not recursed into.
+			continue;
+		}
+
+		struct stat buf;
+		if (fstat(e.fd, &buf) != 0) {
+			perror(path.c_str());
+			exit(1);
+		}
+
+		e.dev = buf.st_dev;
+		if (buf.st_dev != parent_dev) {
+			if (filesystem_is_excluded((path_plus_slash + e.name).c_str())) {
+				close(e.fd);
+				e.fd = -1;
+				continue;
+			}
+		}
+
+		e.dt = get_dirtime_from_stat(buf);
+	}
+
+	// Actually add all the entries we figured out dates for above.
+	for (const entry &e : entries) {
+		corpus->add_file(path_plus_slash + e.name, e.dt);
+		dict_builder->add_file(path_plus_slash + e.name, e.dt);
+	}
+
+	// Now scan subdirectories. Each recursive call takes ownership of e.fd.
+	for (const entry &e : entries) {
+		if (e.is_directory && e.fd != -1) {
+			int ret = scan(path_plus_slash + e.name, e.fd, e.dev, e.dt, e.db_modified, existing_db, corpus, dict_builder);
+			if (ret == -1) {
+				// TODO: The unscanned file descriptors will leak, but it doesn't really matter,
+				// as we're about to exit.
+				//
+				// “dir” is null when the listing came from the database, so
+				// release whichever handle we actually own.
+				if (dir == nullptr) {
+					close(fd);
+				} else {
+					closedir(dir);
+				}
+				return -1;
+			}
+		}
+	}
+
+	if (dir == nullptr) {
+		close(fd);
+	} else {
+		closedir(dir);
+	}
+	return 0;
+}
+
+// Entry point: scans the file system from conf_scan_root and writes a new
+// database to conf_output, using the database already at conf_output (opened
+// read-only) as the source of old directory modification times and of the
+// stored zstd dictionary.
+//
+// Exits with EXIT_SUCCESS on completion, or with status 1 if the group
+// GROUPNAME is unknown (when conf_check_visibility is set), if the scan root
+// cannot be opened or stat'ed, or if the scan itself fails.
+int main(int argc, char **argv)
+{
+	// We want to bump the file limit; do it if we can (usually we are root
+	// and can set whatever we want). 128k should be ample for most setups.
+	rlimit rlim;
+	if (getrlimit(RLIMIT_NOFILE, &rlim) != -1) {
+		rlim_t wanted = std::max<rlim_t>(rlim.rlim_cur, 131072);
+		rlim.rlim_cur = std::min<rlim_t>(wanted, rlim.rlim_max);
+		setrlimit(RLIMIT_NOFILE, &rlim);  // Ignore errors.
+	}
+
+	conf_prepare(argc, argv);
+	if (conf_prune_bind_mounts) {
+		bind_mount_init(MOUNTINFO_PATH);
+	}
+
+	// NOTE(review): open() may return -1 here (e.g. no previous database);
+	// presumably ExistingDB copes with an invalid fd — confirm.
+	int fd = open(conf_output.c_str(), O_RDONLY);
+	ExistingDB existing_db(fd);
+
+	DictionaryBuilder dict_builder(/*blocks_to_keep=*/1000, conf_block_size);
+
+	gid_t owner = -1;
+	if (conf_check_visibility) {
+		group *grp = getgrnam(GROUPNAME);
+		if (grp == nullptr) {
+			fprintf(stderr, "Unknown group %s\n", GROUPNAME);
+			exit(1);
+		}
+		owner = grp->gr_gid;
+	}
+
+	DatabaseBuilder db(conf_output.c_str(), owner, conf_block_size, existing_db.read_next_dictionary());
+	Corpus *corpus = db.start_corpus(/*store_dir_times=*/true);
+
+	int root_fd = opendir_noatime(AT_FDCWD, conf_scan_root);
+	if (root_fd == -1) {
+		// Report the path we actually tried to open, not “.”.
+		perror(conf_scan_root);
+		exit(1);
+	}
+
+	struct stat buf;
+	if (fstat(root_fd, &buf) == -1) {
+		perror(conf_scan_root);
+		exit(1);
+	}
+
+	// scan() takes ownership of root_fd. A -1 return means the tree was not
+	// fully scanned, so do not go on to finish a database from partial data.
+	int ret = scan(conf_scan_root, root_fd, buf.st_dev, get_dirtime_from_stat(buf), /*db_modified=*/unknown_dir_time, &existing_db, corpus, &dict_builder);
+	if (ret == -1) {
+		fprintf(stderr, "Scanning %s failed\n", conf_scan_root);
+		exit(1);
+	}
+
+	// It's too late to use the dictionary for the data we already compressed,
+	// unless we wanted to either scan the entire file system again (acceptable
+	// for plocate-build where it's cheap, less so for us), or uncompressing
+	// and recompressing. Instead, we store it for next time, assuming that the
+	// data changes fairly little from time to time.
+	string next_dictionary = dict_builder.train(1024);
+	db.set_next_dictionary(next_dictionary);
+	db.finish_corpus();
+
+	exit(EXIT_SUCCESS);
+}