]> git.sesse.net Git - plocate/commitdiff
Add a native updatedb.
authorSteinar H. Gunderson <steinar+nageru@gunderson.no>
Sat, 21 Nov 2020 17:23:20 +0000 (18:23 +0100)
committerSteinar H. Gunderson <steinar+git@gunderson.no>
Tue, 24 Nov 2020 23:58:09 +0000 (00:58 +0100)
This incorporates some code from mlocate's updatedb, and thus is compatible
with /etc/updatedb.conf, and supports all the pruning options from it.
All the code has been heavily modified, e.g. the gnulib dependency has been
removed and replaced with STL code (kicking 10k+ lines of code), the bind
mount code has been fixed (it was all broken since the switch from /etc/mtab
to /proc/self/mountinfo) and everything has been reformatted. As with mlocate,
plocate's updatedb is merging, i.e., it can skip readdir() on unchanged
directories. (The logic here is also copied pretty much verbatim from mlocate.)
updatedb reads plocate's native format; there's a new max_version 2 that
contains directory timestamps (without it, updatedb will fall back to a full
scan). The timestamps increase the database size by only about 1%, which is a
good tradeoff when we're getting rid of the entire mlocate database.

We liberally use modern features to simplify the implementation; in particular,
openat() to avoid race conditions, instead of mlocate's complicated chdir() dance.
Unfortunately, the combination of the slightly strange storage order from mlocate
and openat() means we may need to keep a bunch of file descriptors open,
but they are not an expensive resource these days, and we try to bump the
limit ourselves if we are allowed to. We also use O_TMPFILE, to make sure we
never leave a half-finished file lying around (mlocate's updatedb tries to
catch signals instead). All of this may hinder portability, so we might ease up
on the requirements later. We don't use io_uring for updatedb at this point.

plocate-build does not write the needed timestamps, so the first upgrade from
mlocate to native plocate requires a full rescan.

NOTE: The format is _not_ frozen yet, and won't be until actual release.

16 files changed:
README
bind-mount.cpp [new file with mode: 0644]
bind-mount.h [new file with mode: 0644]
complete_pread.cpp
complete_pread.h
conf.cpp [new file with mode: 0644]
conf.h [new file with mode: 0644]
database-builder.cpp
database-builder.h
db.h
lib.cpp [new file with mode: 0644]
lib.h [new file with mode: 0644]
meson.build
plocate-build.cpp
plocate.cpp
updatedb.cpp [new file with mode: 0644]

diff --git a/README b/README
index dcb60c827403d6817727a3323ea16e254561682e..5071a6b57e2e5424168a08ee33db0b6b88e52dac 100644 (file)
--- a/README
+++ b/README
@@ -33,6 +33,11 @@ the reference implementation, you can check it out and run as follows:
   ninja reconfigure
   ninja bench
 
-Copyright 2020 Steinar H. Gunderson <steinar+plocate@gunderson.no>.
+plocate (except updatedb), and the plocate-specific changes to updatedb,
+is Copyright 2020 Steinar H. Gunderson <steinar+plocate@gunderson.no>.
 Licensed under the GNU General Public License, either version 2,
 or (at your option) any later version. See the included file COPYING.
+
+updatedb is Copyright (C) 2005, 2007 Red Hat, Inc. All rights reserved.
+Licensed under the GNU General Public License, version 2. See the
+included file COPYING.
diff --git a/bind-mount.cpp b/bind-mount.cpp
new file mode 100644 (file)
index 0000000..50cf7a5
--- /dev/null
@@ -0,0 +1,310 @@
+/* Bind mount detection.  Note: if you change this, change tmpwatch as well.
+
+Copyright (C) 2005, 2007, 2008, 2012 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#include "bind-mount.h"
+
+#include "conf.h"
+#include "lib.h"
+
+#include <atomic>
+#include <fcntl.h>
+#include <limits.h>
+#include <map>
+#include <poll.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <thread>
+
+using namespace std;
+
+/* mountinfo handling */
+
+/* A single mountinfo entry, as filled in by read_mount_entry() from one
+   line of the mountinfo file. */
+struct mount {
+       int id, parent_id;  // The two leading integer fields of the line.
+       unsigned dev_major, dev_minor;  // The "major:minor" device field.
+       string root;  // First string field after the device; compared between mounts of one device.
+       string mount_point;  // Second string field after the device; what ends up in bind_mount_paths.
+       string fs_type;  // First string field after the lone "-" separator.
+       string source;  // Second string field after the lone "-" separator.
+};
+
+/* Path to mountinfo; set by bind_mount_init() */
+static const char *mountinfo_path;
+atomic<bool> mountinfo_updated{ false };  // Set by the poll thread in bind_mount_init(), test-and-cleared by is_bind_mount().
+
+multimap<pair<int, int>, mount> mount_entries;  // Keyed by device major/minor; refilled by read_mount_entries().
+
+/* Read one line from F, however long it is, and return it without its
+   trailing newline. A last line that lacks a newline is returned as-is
+   once EOF is reached.
+   Return an empty string on read error (or when nothing is left to read). */
+static string read_mount_line(FILE *f)
+{
+       string result;
+       char chunk[LINE_MAX];
+
+       while (fgets(chunk, sizeof(chunk), f) != nullptr) {
+               const size_t len = strlen(chunk);
+               const bool complete = len > 0 && chunk[len - 1] == '\n';
+               result.append(chunk, complete ? len - 1 : len);
+               if (complete) {
+                       return result;
+               }
+       }
+       // fgets() gave up: at EOF keep what was collected, on error discard it.
+       return feof(f) ? result : string();
+}
+
+/* Parse one space- or tab-delimited field starting at *STR, decoding \ooo
+   octal escapes (three octal digits, value at most UCHAR_MAX), and store the
+   decoded field in *DEST unless DEST is nullptr. Leading spaces and tabs are
+   skipped first. A backslash that does not start a valid escape is copied
+   through literally.
+   On success, *STR is advanced to the delimiter (or terminating NUL) that
+   ended the field.
+   Return 0 if OK, -1 if no field is left (*STR is then unchanged). */
+static int parse_mount_string(string *dest, const char **str)
+{
+       const auto is_octal_digit = [](char ch) { return ch >= '0' && ch <= '7'; };
+
+       const char *src = *str;
+       while (*src == ' ' || *src == '\t') {
+               src++;
+       }
+       if (*src == 0) {
+               return -1;
+       }
+
+       string mount_string;
+       while (*src != 0 && *src != ' ' && *src != '\t') {
+               // The && chain stops at the first non-digit, so a NUL is never read past.
+               if (*src == '\\' && is_octal_digit(src[1]) && is_octal_digit(src[2]) && is_octal_digit(src[3])) {
+                       const unsigned v = ((src[1] - '0') << 6) | ((src[2] - '0') << 3) | (src[3] - '0');
+                       if (v <= UCHAR_MAX) {
+                               mount_string.push_back(static_cast<char>(v));
+                               src += 4;
+                               continue;
+                       }
+               }
+               // Ordinary character, or a backslash that is not a valid escape.
+               mount_string.push_back(*src);
+               src++;
+       }
+
+       *str = src;
+       if (dest != nullptr) {
+               *dest = std::move(mount_string);
+       }
+       return 0;
+}
+
+/* Read a single entry (one line) from F into *ME.
+   Return true if successful; false if no line could be read or the line
+   could not be parsed (*ME may then be partially overwritten). */
+static bool read_mount_entry(FILE *f, mount *me)
+{
+       const string line = read_mount_line(f);
+       if (line.empty()) {
+               return false;
+       }
+
+       // %n stores the number of characters consumed so far and does not count
+       // towards sscanf()'s return value. Plain %n takes an int *; the previous
+       // %zn was handed a size_t *, which does not match its (signed) type.
+       int offset = 0;
+       if (sscanf(line.c_str(), "%d %d %u:%u%n", &me->id, &me->parent_id, &me->dev_major,
+                  &me->dev_minor, &offset) != 4) {
+               return false;
+       }
+       const char *ptr = line.c_str() + offset;
+
+       // Root and mount point, followed by one field that is parsed but unused.
+       if (parse_mount_string(&me->root, &ptr) != 0 ||
+           parse_mount_string(&me->mount_point, &ptr) != 0 ||
+           parse_mount_string(nullptr, &ptr) != 0) {
+               return false;
+       }
+
+       // Skip a variable number of fields, up to and including the lone "-".
+       for (;;) {
+               string option;
+               if (parse_mount_string(&option, &ptr) != 0) {
+                       return false;
+               }
+               if (option == "-") {
+                       break;
+               }
+       }
+
+       // File system type and source, followed by one more required field
+       // that is parsed but unused.
+       if (parse_mount_string(&me->fs_type, &ptr) != 0 ||
+           parse_mount_string(&me->source, &ptr) != 0 ||
+           parse_mount_string(nullptr, &ptr) != 0) {
+               return false;
+       }
+       return true;
+}
+
+/* Re-read mountinfo_path and replace the contents of mount_entries with
+   the entries found there. Reading stops at the first line that
+   read_mount_entry() rejects. If the file cannot be opened, mount_entries
+   is left untouched.
+   Return 0 if OK, -1 on error. */
+static int read_mount_entries(void)
+{
+       FILE *mountinfo = fopen(mountinfo_path, "r");
+       if (mountinfo == nullptr) {
+               return -1;
+       }
+
+       mount_entries.clear();
+
+       for (mount entry; read_mount_entry(mountinfo, &entry);) {
+               if (conf_debug_pruning) {
+                       /* This is debugging output, don't mark anything for translation */
+                       fprintf(stderr,
+                               " `%s' (%d on %d) is `%s' of `%s' (%u:%u), type `%s'\n",
+                               entry.mount_point.c_str(), entry.id, entry.parent_id,
+                               entry.root.c_str(), entry.source.c_str(),
+                               entry.dev_major, entry.dev_minor, entry.fs_type.c_str());
+               }
+               mount_entries.emplace(make_pair(entry.dev_major, entry.dev_minor), entry);
+       }
+
+       fclose(mountinfo);
+       return 0;
+}
+
+/* Bind mount path list maintenance and top-level interface. */
+
+/* mountinfo_path file descriptor, or -1; opened by bind_mount_init() and
+   polled by its background thread */
+static int mountinfo_fd;
+
+/* Known bind mount paths; rebuilt and sorted by rebuild_bind_mount_paths() */
+static struct vector<string> bind_mount_paths; /* = { 0, }; */
+
+/* Next bind_mount_paths entry; cursor handed to
+   string_list_contains_dir_path(), reset to 0 after each rebuild */
+static size_t bind_mount_paths_index; /* = 0; */
+
+/* Rebuild bind_mount_paths from the current contents of mountinfo_path.
+   A mount point is recorded as a bind mount if another mount of the same
+   device either has a root that is a proper ancestor directory of this
+   mount's root, or has the same root and a lower mount ID.
+   If mountinfo cannot be read, the previous list is kept unchanged.
+   The resulting list is sorted with string_list_dir_path_sort(). */
+static void rebuild_bind_mount_paths(void)
+{
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "Rebuilding bind_mount_paths:\n");
+       }
+       if (read_mount_entries() != 0) {
+               return;
+       }
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "Matching bind_mount_paths:\n");
+       }
+
+       bind_mount_paths.clear();
+
+       for (const auto &[dev_id, me] : mount_entries) {
+               const auto &[first, second] = mount_entries.equal_range(make_pair(me.dev_major, me.dev_minor));
+               for (auto it = first; it != second; ++it) {
+                       const mount &other = it->second;
+                       if (other.id == me.id) {
+                               // Don't compare an element to itself.
+                               continue;
+                       }
+                       // We have two mounts from the same device. Is one root an ancestor
+                       // directory of the other? The match must end on a path component
+                       // boundary; a plain string prefix test would make root `/@home'
+                       // look like a child of root `/@' and wrongly prune it.
+                       // If there are two that are equal, prefer the one with lowest ID.
+                       const size_t parent_len = other.root.size();
+                       const bool other_is_ancestor =
+                               me.root.size() > parent_len &&
+                               me.root.compare(0, parent_len, other.root) == 0 &&
+                               (parent_len == 0 || other.root.back() == '/' || me.root[parent_len] == '/');
+                       if (other_is_ancestor) {
+                               if (conf_debug_pruning) {
+                                       /* This is debugging output, don't mark anything for translation */
+                                       fprintf(stderr, " => adding `%s' (root `%s' is a child of `%s', mounted on `%s')\n",
+                                               me.mount_point.c_str(), me.root.c_str(), other.root.c_str(), other.mount_point.c_str());
+                               }
+                               bind_mount_paths.push_back(me.mount_point);
+                               break;
+                       }
+                       if (me.root == other.root && me.id > other.id) {
+                               if (conf_debug_pruning) {
+                                       /* This is debugging output, don't mark anything for translation */
+                                       fprintf(stderr, " => adding `%s' (duplicate of mount point `%s')\n",
+                                               me.mount_point.c_str(), other.mount_point.c_str());
+                               }
+                               bind_mount_paths.push_back(me.mount_point);
+                               break;
+                       }
+               }
+       }
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "...done\n");
+       }
+       string_list_dir_path_sort(&bind_mount_paths);
+}
+
+/* Return true if PATH is a destination of a bind mount.
+   (Bind mounts "to self" are ignored.)
+   If the poll thread has flagged a mountinfo change since the last call,
+   the bind mount list is rebuilt and the lookup cursor reset first. */
+bool is_bind_mount(const char *path)
+{
+       // exchange() reads the flag and clears it in a single atomic step.
+       const bool needs_rebuild = mountinfo_updated.exchange(false);
+       if (needs_rebuild) {
+               rebuild_bind_mount_paths();
+               bind_mount_paths_index = 0;
+       }
+
+       const bool found = string_list_contains_dir_path(&bind_mount_paths,
+                                                        &bind_mount_paths_index, path);
+       return found;
+}
+
+/* Initialize state for is_bind_mount(), to read data from MOUNTINFO.
+   Opens MOUNTINFO, builds the initial bind mount list and starts a detached
+   thread that flags later changes. If MOUNTINFO cannot be opened, nothing
+   else is set up. */
+void bind_mount_init(const char *mountinfo)
+{
+       mountinfo_path = mountinfo;
+       mountinfo_fd = open(mountinfo_path, O_RDONLY);
+       if (mountinfo_fd == -1)
+               return;
+       rebuild_bind_mount_paths();
+
+       // mlocate re-polls mountinfo for every single directory it checks, for
+       // unclear reasons; perhaps it is worried that a new recursive bind mount
+       // made while updatedb runs could cause an infinite loop? We assume there
+       // is a good reason and do the same, but without the barrage of syscalls:
+       // a background thread does the polling and raises a flag. This is not
+       // synchronous, but neither is the poll signal; the remaining slight race
+       // could only be exploited by root.
+       //
+       // exit() forcibly terminates the thread, so it simply loops forever.
+       thread([] {
+               while (true) {
+                       /* Unfortunately (mount --bind $path $path/subdir) would leave st_dev
+                          unchanged between $path and $path/subdir, so we must keep reparsing
+                          mountinfo_path each time it changes. */
+                       struct pollfd pfd;
+                       pfd.fd = mountinfo_fd;
+                       pfd.events = POLLPRI;
+                       const int rc = poll(&pfd, 1, /*timeout=*/-1);
+                       if (rc == -1) {
+                               perror("poll()");
+                               exit(1);
+                       }
+                       if ((pfd.revents & POLLPRI) != 0) {
+                               mountinfo_updated.store(true);
+                       }
+               }
+       }).detach();
+}
diff --git a/bind-mount.h b/bind-mount.h
new file mode 100644 (file)
index 0000000..b8c9e5f
--- /dev/null
@@ -0,0 +1,36 @@
+/* Bind mount detection.
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#ifndef BIND_MOUNT_H__
+#define BIND_MOUNT_H__
+
+/* System mount information file */
+#define MOUNTINFO_PATH "/proc/self/mountinfo"
+
+/* Return true if PATH is a destination of a bind mount.
+   (Bind mounts "to self" are ignored.)
+   PATH is looked up in the list built from the file given to
+   bind_mount_init(). */
+extern bool is_bind_mount(const char *path);
+
+/* Initialize state for is_bind_mount(), to read data from MOUNTINFO.
+   The MOUNTINFO pointer itself is retained, so the string must stay valid. */
+extern void bind_mount_init(const char *mountinfo);
+
+#endif
index 49136747b94fb680a8fe4285771f75c6a98e018a..8571b33682deb85261d5bb13bfdcc91cfb41ac9f 100644 (file)
@@ -3,7 +3,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 
-void complete_pread(int fd, void *ptr, size_t len, off_t offset)
+bool try_complete_pread(int fd, void *ptr, size_t len, off_t offset)
 {
        while (len > 0) {
                ssize_t ret = pread(fd, ptr, len, offset);
@@ -11,11 +11,19 @@ void complete_pread(int fd, void *ptr, size_t len, off_t offset)
                        continue;
                }
                if (ret <= 0) {
-                       perror("pread");
-                       exit(1);
+                       return false;
                }
                ptr = reinterpret_cast<char *>(ptr) + ret;
                len -= ret;
                offset -= ret;
        }
+       return true;
+}
+
+// Like try_complete_pread(), but a failed read is fatal: the error is
+// reported with perror() and the process exits with status 1.
+void complete_pread(int fd, void *ptr, size_t len, off_t offset)
+{
+       if (try_complete_pread(fd, ptr, len, offset)) {
+               return;
+       }
+       perror("pread");
+       exit(1);
 }
index fec57870c1c45d1a031118eee424c19a6f956ba1..b0f2bd7bc3e33adf9c8346b32b90c1a38e81c369 100644 (file)
@@ -3,8 +3,11 @@
 
 #include <unistd.h>
 
-// A wrapper around pread() that returns an incomplete read.
-// Always synchronous (no io_uring).
+// A wrapper around pread() that retries on short reads and EINTR,
+// so you never need to call it twice. Always synchronous (no io_uring).
+bool try_complete_pread(int fd, void *ptr, size_t len, off_t offset);
+
+// Same, but exit on failure, so never returns a short read.
 void complete_pread(int fd, void *ptr, size_t len, off_t offset);
 
 #endif  // !defined(COMPLETE_PREAD_H)
diff --git a/conf.cpp b/conf.cpp
new file mode 100644 (file)
index 0000000..a8ec972
--- /dev/null
+++ b/conf.cpp
@@ -0,0 +1,636 @@
+/* updatedb configuration.
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+ */
+
+#include "conf.h"
+
+#include "error.h"
+#include "lib.h"
+
+#include <algorithm>
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+using namespace std;
+
+/* true if locate(1) should check whether files are visible before reporting
+   them */
+bool conf_check_visibility = true;
+
+/* Filesystems to skip, converted to uppercase and sorted by name */
+vector<string> conf_prunefs;
+
+/* Directory names to skip, sorted by name */
+vector<string> conf_prunenames;
+
+/* Paths to skip, sorted by name using dir_path_cmp () */
+vector<string> conf_prunepaths;
+
+/* true if bind mounts should be skipped */
+bool conf_prune_bind_mounts; /* = false; */
+
+/* true if pruning debug output was requested */
+bool conf_debug_pruning; /* = false; */
+
+/* Root of the directory tree to store in the database (canonical) */
+char *conf_scan_root; /* = NULL; */
+
+/* Absolute (not necessarily canonical) path to the database */
+string conf_output;
+
+/* true if file names should be written to stdout as they are found */
+bool conf_verbose; /* = false; */
+
+/* Configuration representation for the database configuration block */
+string conf_block;
+
+int conf_block_size = 32;  // Number of filenames to store in each block (-b / --block-size).
+bool use_debug = false;  // Presumably set by the undocumented --debug / -D option; confirm in parse_arguments().
+
+/* Interpret STR as a boolean: "0" and "no" mean false, "1" and "yes" mean
+   true. The result is stored in *DEST, which is left alone for any other
+   string.
+   Return 0 if OK, -1 on error. */
+static int
+parse_bool(bool *dest, const char *str)
+{
+       const bool is_false = !strcmp(str, "0") || !strcmp(str, "no");
+       const bool is_true = !strcmp(str, "1") || !strcmp(str, "yes");
+
+       if (!is_false && !is_true) {
+               return -1;
+       }
+       *dest = is_true;
+       return 0;
+}
+
+/* String list handling */
+
+/* Split VAL on whitespace and append every non-empty word to LIST */
+static void
+var_add_values(vector<string> *list, const char *val)
+{
+       const auto is_space = [](char ch) {
+               return isspace(static_cast<unsigned char>(ch)) != 0;
+       };
+
+       const char *cursor = val;
+       while (*cursor != 0) {
+               if (is_space(*cursor)) {
+                       cursor++;
+                       continue;
+               }
+               // Start of a word: scan to its end, then store it.
+               const char *word_begin = cursor;
+               while (*cursor != 0 && !is_space(*cursor)) {
+                       cursor++;
+               }
+               list->emplace_back(word_begin, cursor - word_begin);
+       }
+}
+
+/* Finalize variable LIST: order its entries and drop repeated ones */
+static void
+var_finish(vector<string> *list)
+{
+       vector<string> &values = *list;
+       sort(values.begin(), values.end());
+       values.erase(unique(values.begin(), values.end()), values.end());
+}
+
+/* UPDATEDB_CONF parsing */
+
+/* UPDATEDB_CONF (locked) */
+static FILE *uc_file;
+/* Line number at token start; type matches error_at_line () */
+static unsigned uc_line;
+/* Current line number; type matches error_at_line () */
+static unsigned uc_current_line;
+/* Last string returned by uc_lex */
+static string uc_lex_buf;
+
+/* Token types */
+enum {
+       UCT_EOF,  // End of file.
+       UCT_EOL,  // Newline, possibly preceded by a '#' comment.
+       UCT_IDENTIFIER,  // A word that is not one of the keywords below.
+       UCT_EQUAL,  // '='
+       UCT_QUOTED,  // A double-quoted string; contents in uc_lex_buf.
+       UCT_OTHER,  // Any other character.
+       UCT_PRUNE_BIND_MOUNTS,  // Keyword PRUNE_BIND_MOUNTS.
+       UCT_PRUNEFS,  // Keyword PRUNEFS.
+       UCT_PRUNENAMES,  // Keyword PRUNENAMES.
+       UCT_PRUNEPATHS  // Keyword PRUNEPATHS.
+};
+
+/* Return next token from uc_file; for UCT_IDENTIFIER, UCT_QUOTED or keywords,
+   store the data to uc_lex_buf (valid until next call).
+   Whitespace other than newline is skipped. uc_line is set to the line on
+   which the token starts; uc_current_line is advanced for every newline
+   that is consumed. */
+static int
+uc_lex(void)
+{
+       int c;
+
+       uc_lex_buf.clear();
+       uc_line = uc_current_line;
+       do {
+               c = getc_unlocked(uc_file);
+               if (c == EOF)
+                       return UCT_EOF;
+       } while (c != '\n' && isspace((unsigned char)c));  // Newline is a token of its own, so don't skip it.
+       switch (c) {
+       case '#':  // Comment: discard everything up to and including the newline.
+               do {
+                       c = getc_unlocked(uc_file);
+                       if (c == EOF)
+                               return UCT_EOF;
+               } while (c != '\n');
+               /* Fall through */
+       case '\n':
+               uc_current_line++;
+               if (uc_current_line == 0) {  // The unsigned counter wrapped around.
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_current_line - 1,
+                                     _("warning: Line number overflow"));
+                       error_message_count--; /* Don't count as an error */
+               }
+               return UCT_EOL;
+
+       case '=':
+               return UCT_EQUAL;
+
+       case '"': {
+               while ((c = getc_unlocked(uc_file)) != '"') {
+                       if (c == EOF || c == '\n') {
+                               error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                             _("missing closing `\"'"));
+                               ungetc(c, uc_file);  // Let the next call see the newline as UCT_EOL.
+                               break;
+                       }
+                       uc_lex_buf.push_back(c);
+               }
+               return UCT_QUOTED;  // Also returned for an unterminated string, after reporting it.
+       }
+
+       default: {
+               if (!isalpha((unsigned char)c) && c != '_')
+                       return UCT_OTHER;
+               do {
+                       uc_lex_buf.push_back(c);
+                       c = getc_unlocked(uc_file);
+               } while (c != EOF && (isalnum((unsigned char)c) || c == '_'));
+               ungetc(c, uc_file);  // Put back the character that ended the word.
+               if (uc_lex_buf == "PRUNE_BIND_MOUNTS")
+                       return UCT_PRUNE_BIND_MOUNTS;
+               if (uc_lex_buf == "PRUNEFS")
+                       return UCT_PRUNEFS;
+               if (uc_lex_buf == "PRUNENAMES")
+                       return UCT_PRUNENAMES;
+               if (uc_lex_buf == "PRUNEPATHS")
+                       return UCT_PRUNEPATHS;
+               return UCT_IDENTIFIER;
+       }
+       }
+}
+
+/* Parse /etc/updatedb.conf.  Exit on I/O or syntax error.
+   A missing file (ENOENT) is not an error; the function then does nothing.
+   Each line is expected to be VARIABLE = "value", with VARIABLE one of
+   PRUNE_BIND_MOUNTS, PRUNEFS, PRUNENAMES or PRUNEPATHS, each set at most
+   once. Syntax errors are reported with error_at_line () and the rest of
+   the offending line is skipped; the process exits only after the whole
+   file has been read, if any error was counted. */
+static void
+parse_updatedb_conf(void)
+{
+       int old_error_one_per_line;
+       unsigned old_error_message_count;
+       bool had_prune_bind_mounts, had_prunefs, had_prunenames, had_prunepaths;
+
+       uc_file = fopen(UPDATEDB_CONF, "r");
+       if (uc_file == NULL) {
+               if (errno != ENOENT)
+                       error(EXIT_FAILURE, errno, _("can not open `%s'"), UPDATEDB_CONF);
+               goto err;
+       }
+       flockfile(uc_file);  // uc_lex() reads with getc_unlocked().
+       uc_current_line = 1;
+       old_error_message_count = error_message_count;
+       old_error_one_per_line = error_one_per_line;
+       error_one_per_line = 1;
+       had_prune_bind_mounts = false;
+       had_prunefs = false;
+       had_prunenames = false;
+       had_prunepaths = false;
+       for (;;) {
+               bool *had_var;
+               int var_token, token;
+
+               token = uc_lex();
+               switch (token) {
+               case UCT_EOF:
+                       goto eof;
+
+               case UCT_EOL:
+                       continue;
+
+               case UCT_PRUNE_BIND_MOUNTS:
+                       had_var = &had_prune_bind_mounts;
+                       break;
+
+               case UCT_PRUNEFS:
+                       had_var = &had_prunefs;
+                       break;
+
+               case UCT_PRUNENAMES:
+                       had_var = &had_prunenames;
+                       break;
+
+               case UCT_PRUNEPATHS:
+                       had_var = &had_prunepaths;
+                       break;
+
+               case UCT_IDENTIFIER:
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("unknown variable `%s'"), uc_lex_buf.c_str());
+                       goto skip_to_eol;
+
+               default:
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("variable name expected"));
+                       goto skip_to_eol;
+               }
+               if (*had_var != false) {
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("variable `%s' was already defined"), uc_lex_buf.c_str());
+                       goto skip_to_eol;
+               }
+               *had_var = true;
+               var_token = token;  // Remember which variable this is; token is reused below.
+               token = uc_lex();
+               if (token != UCT_EQUAL) {
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("`=' expected after variable name"));
+                       goto skip_to_eol;
+               }
+               token = uc_lex();
+               if (token != UCT_QUOTED) {
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("value in quotes expected after `='"));
+                       goto skip_to_eol;
+               }
+               if (var_token == UCT_PRUNE_BIND_MOUNTS) {
+                       if (parse_bool(&conf_prune_bind_mounts, uc_lex_buf.c_str()) != 0) {
+                               error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                             _("invalid value `%s' of PRUNE_BIND_MOUNTS"),
+                                             uc_lex_buf.c_str());
+                               goto skip_to_eol;
+                       }
+               } else if (var_token == UCT_PRUNEFS)
+                       var_add_values(&conf_prunefs, uc_lex_buf.c_str());
+               else if (var_token == UCT_PRUNENAMES)
+                       var_add_values(&conf_prunenames, uc_lex_buf.c_str());
+               else if (var_token == UCT_PRUNEPATHS)
+                       var_add_values(&conf_prunepaths, uc_lex_buf.c_str());
+               else
+                       abort();  // Unreachable: only the four keyword tokens get this far.
+               token = uc_lex();
+               if (token != UCT_EOL && token != UCT_EOF) {
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("unexpected data after variable value"));
+                       goto skip_to_eol;
+               }
+               /* Fall through */
+       skip_to_eol:  // Discard tokens up to the end of the current line.
+               while (token != UCT_EOL) {
+                       if (token == UCT_EOF)
+                               goto eof;
+                       token = uc_lex();
+               }
+       }
+eof:
+       if (ferror(uc_file))
+               error(EXIT_FAILURE, 0, _("I/O error reading `%s'"), UPDATEDB_CONF);
+       error_one_per_line = old_error_one_per_line;
+       funlockfile(uc_file);
+       fclose(uc_file);
+       if (error_message_count != old_error_message_count)  // Some syntax error was reported above.
+               exit(EXIT_FAILURE);
+err:;
+}
+
+/* Command-line argument parsing */
+
+/* Output --help text to stdout: the usage summary (with DBFILE and
+   UPDATEDB_CONF filled in as the defaults) followed by the bug report
+   address. */
+static void
+help(void)
+{
+       /* This updatedb writes plocate's database format, so say so. */
+       printf(_("Usage: updatedb [OPTION]...\n"
+                "Update a plocate database.\n"
+                "\n"
+                "  -f, --add-prunefs FS           omit also FS\n"
+                "  -n, --add-prunenames NAMES     omit also NAMES\n"
+                "  -e, --add-prunepaths PATHS     omit also PATHS\n"
+                "  -U, --database-root PATH       the subtree to store in "
+                "database (default \"/\")\n"
+                "  -h, --help                     print this help\n"
+                "  -o, --output FILE              database to update (default\n"
+                "                                 `%s')\n"
+                "  -b, --block-size SIZE          number of filenames to store\n"
+                "                                 in each block (default 32)\n"
+                "      --prune-bind-mounts FLAG   omit bind mounts (default "
+                "\"no\")\n"
+                "      --prunefs FS               filesystems to omit from "
+                "database\n"
+                "      --prunenames NAMES         directory names to omit from "
+                "database\n"
+                "      --prunepaths PATHS         paths to omit from database\n"
+                "  -l, --require-visibility FLAG  check visibility before "
+                "reporting files\n"
+                "                                 (default \"yes\")\n"
+                "  -v, --verbose                  print paths of files as they "
+                "are found\n"
+                "  -V, --version                  print version information\n"
+                "\n"
+                "The configuration defaults to values read from\n"
+                "`%s'.\n"),
+              DBFILE, UPDATEDB_CONF);
+       printf(_("\n"
+                "Report bugs to %s.\n"),
+              PACKAGE_BUGREPORT);
+}
+
+/* Return PATH prefixed with the current working directory and a '/'.
+   Exits with an error message if the working directory cannot be
+   determined. */
+static string
+prepend_cwd(const string &path)
+{
+       string cwd;
+       cwd.resize(BUFSIZ); /* Not PATH_MAX because it is not defined on some platforms. */
+
+       /* Grow the buffer until getcwd() stops complaining that it is too small. */
+       const char *res;
+       for (;;) {
+               cwd.resize(cwd.size() * 1.5);
+               res = getcwd(cwd.data(), cwd.size());
+               if (res != NULL || errno != ERANGE)
+                       break;
+       }
+       if (res == NULL)
+               error(EXIT_FAILURE, errno, _("can not get current working directory"));
+
+       /* Trim the buffer down to the NUL-terminated directory name. */
+       cwd.resize(strlen(cwd.data()));
+       return cwd + '/' + path;
+}
+
+/* Parse ARGC, ARGV.  Exit on error or --help, --version.
+
+   Command-line values are applied on top of whatever the conf_* variables
+   already hold.  On return conf_scan_root is non-NULL ("/" unless
+   --database-root was given) and conf_output is an absolute path (DBFILE
+   unless --output was given; a relative path is resolved against the current
+   working directory). */
+static void
+parse_arguments(int argc, char *argv[])
+{
+       enum { OPT_DEBUG_PRUNING = CHAR_MAX + 1 };
+
+       /* 'B', 'F', 'N' and 'P' are not in the short option string below, so they
+          are reachable only through their long names. */
+       static const struct option options[] = {
+               { "add-prunefs", required_argument, NULL, 'f' },
+               { "add-prunenames", required_argument, NULL, 'n' },
+               { "add-prunepaths", required_argument, NULL, 'e' },
+               { "database-root", required_argument, NULL, 'U' },
+               { "debug-pruning", no_argument, NULL, OPT_DEBUG_PRUNING },
+               { "help", no_argument, NULL, 'h' },
+               { "output", required_argument, NULL, 'o' },
+               { "prune-bind-mounts", required_argument, NULL, 'B' },
+               { "prunefs", required_argument, NULL, 'F' },
+               { "prunenames", required_argument, NULL, 'N' },
+               { "prunepaths", required_argument, NULL, 'P' },
+               { "require-visibility", required_argument, NULL, 'l' },
+               { "verbose", no_argument, NULL, 'v' },
+               { "version", no_argument, NULL, 'V' },
+               { "block-size", required_argument, 0, 'b' },
+               { "debug", no_argument, 0, 'D' },  // Not documented.
+               { NULL, 0, NULL, 0 }
+       };
+
+       /* The *_changed flags record that a list was already touched on the
+          command line, so that a later replacing option (--prunefs etc.) can
+          refuse to throw away an earlier argument. */
+       bool prunefs_changed, prunenames_changed, prunepaths_changed;
+       bool got_prune_bind_mounts, got_visibility;
+
+       prunefs_changed = false;
+       prunenames_changed = false;
+       prunepaths_changed = false;
+       got_prune_bind_mounts = false;
+       got_visibility = false;
+       for (;;) {
+               int opt, idx;
+
+               opt = getopt_long(argc, argv, "U:Ve:f:hl:n:o:vb:D", options, &idx);
+               switch (opt) {
+               case -1:
+                       goto options_done;
+
+               case '?':
+                       exit(EXIT_FAILURE);
+
+               case 'B':
+                       if (got_prune_bind_mounts != false)
+                               error(EXIT_FAILURE, 0,
+                                     _("--%s would override earlier command-line argument"),
+                                     "prune-bind-mounts");
+                       got_prune_bind_mounts = true;
+                       if (parse_bool(&conf_prune_bind_mounts, optarg) != 0)
+                               error(EXIT_FAILURE, 0, _("invalid value `%s' of --%s"), optarg,
+                                     "prune-bind-mounts");
+                       break;
+
+               case 'F':
+                       if (prunefs_changed != false)
+                               error(EXIT_FAILURE, 0,
+                                     _("--%s would override earlier command-line argument"),
+                                     "prunefs");
+                       prunefs_changed = true;
+                       conf_prunefs.clear();
+                       var_add_values(&conf_prunefs, optarg);
+                       break;
+
+               case 'N':
+                       if (prunenames_changed != false)
+                               error(EXIT_FAILURE, 0,
+                                     _("--%s would override earlier command-line argument"),
+                                     "prunenames");
+                       prunenames_changed = true;
+                       conf_prunenames.clear();
+                       var_add_values(&conf_prunenames, optarg);
+                       break;
+
+               case 'P':
+                       if (prunepaths_changed != false)
+                               error(EXIT_FAILURE, 0,
+                                     _("--%s would override earlier command-line argument"),
+                                     "prunepaths");
+                       prunepaths_changed = true;
+                       conf_prunepaths.clear();
+                       var_add_values(&conf_prunepaths, optarg);
+                       break;
+
+               case 'U':
+                       if (conf_scan_root != NULL)
+                               error(EXIT_FAILURE, 0, _("--%s specified twice"),
+                                     "database-root");
+                       conf_scan_root = canonicalize_file_name(optarg);
+                       if (conf_scan_root == NULL)
+                               error(EXIT_FAILURE, errno, _("invalid value `%s' of --%s"), optarg,
+                                     "database-root");
+                       break;
+
+               case 'V':
+                       puts("updatedb (" PACKAGE_NAME ") " PACKAGE_VERSION);
+                       puts(_("Copyright (C) 2007 Red Hat, Inc. All rights reserved.\n"
+                              "This software is distributed under the GPL v.2.\n"
+                              "\n"
+                              "This program is provided with NO WARRANTY, to the extent "
+                              "permitted by law."));
+                       exit(EXIT_SUCCESS);
+
+               case 'e':
+                       prunepaths_changed = true;
+                       var_add_values(&conf_prunepaths, optarg);
+                       break;
+
+               case 'f':
+                       prunefs_changed = true;
+                       var_add_values(&conf_prunefs, optarg);
+                       break;
+
+               case 'h':
+                       help();
+                       exit(EXIT_SUCCESS);
+
+               case 'l':
+                       if (got_visibility != false)
+                               error(EXIT_FAILURE, 0, _("--%s specified twice"),
+                                     "require-visibility");
+                       got_visibility = true;
+                       if (parse_bool(&conf_check_visibility, optarg) != 0)
+                               error(EXIT_FAILURE, 0, _("invalid value `%s' of --%s"), optarg,
+                                     "require-visibility");
+                       break;
+
+               case 'n':
+                       prunenames_changed = true;
+                       var_add_values(&conf_prunenames, optarg);
+                       break;
+
+               case 'o':
+                       if (!conf_output.empty())
+                               error(EXIT_FAILURE, 0, _("--%s specified twice"), "output");
+                       conf_output = optarg;
+                       break;
+
+               case 'v':
+                       conf_verbose = true;
+                       break;
+
+               case 'b': {
+                       /* Reject garbage, trailing junk and non-positive sizes here;
+                          atoi () would silently turn them into 0 or a negative
+                          block size. */
+                       char *end;
+                       errno = 0;
+                       long block_size = strtol(optarg, &end, 10);
+                       if (errno != 0 || end == optarg || *end != '\0' ||
+                           block_size <= 0 || block_size > INT_MAX)
+                               error(EXIT_FAILURE, 0, _("invalid value `%s' of --%s"), optarg,
+                                     "block-size");
+                       conf_block_size = block_size;
+                       break;
+               }
+
+               case 'D':
+                       use_debug = true;
+                       break;
+
+               case OPT_DEBUG_PRUNING:
+                       conf_debug_pruning = true;
+                       break;
+
+               default:
+                       abort();
+               }
+       }
+options_done:
+       /* updatedb takes no positional arguments. */
+       if (optind != argc)
+               error(EXIT_FAILURE, 0, _("unexpected operand on command line"));
+       if (conf_scan_root == NULL) {
+               /* Writable static storage, because conf_scan_root is a char *. */
+               static char root[] = "/";
+
+               conf_scan_root = root;
+       }
+       if (conf_output.empty())
+               conf_output = DBFILE;
+       if (conf_output[0] != '/')
+               conf_output = prepend_cwd(conf_output);
+}
+
+/* Conversion of configuration for main code */
+
+/* Append STRINGS to OBSTACK: every string is followed by a NUL byte, and one
+   extra NUL byte terminates the whole list. */
+static void
+gen_conf_block_string_list(string *obstack,
+                           const vector<string> *strings)
+{
+       for (auto it = strings->begin(); it != strings->end(); ++it) {
+               obstack->append(*it);
+               obstack->push_back('\0');
+       }
+       obstack->push_back('\0');
+}
+
+/* Rebuild conf_block from the current pruning configuration: each entry is a
+   NUL-terminated key followed by its NUL-terminated value(s) and one more
+   NUL byte. */
+static void
+gen_conf_block(void)
+{
+       /* Appends a string literal together with its terminating NUL byte. */
+       const auto append_key = [](const auto &literal) {
+               conf_block.append(literal, sizeof(literal));
+       };
+
+       conf_block.clear();
+
+       /* conf_check_visibility value is stored in the header */
+       append_key("prune_bind_mounts");
+       /* Three bytes: the digit, the explicit NUL and the literal's own NUL,
+          i.e. two NUL bytes after the value */
+       conf_block.append(conf_prune_bind_mounts ? "1\0" : "0\0", 3);
+       append_key("prunefs");
+       gen_conf_block_string_list(&conf_block, &conf_prunefs);
+       append_key("prunenames");
+       gen_conf_block_string_list(&conf_block, &conf_prunenames);
+       append_key("prunepaths");
+       gen_conf_block_string_list(&conf_block, &conf_prunepaths);
+       /* scan_root is contained directly in the header */
+       /* conf_output, conf_verbose are not relevant */
+}
+
+/* Parse /etc/updatedb.conf and command-line arguments ARGC, ARGV.
+   Exit on error or --help, --version.
+
+   Afterwards the prune lists are finished, conf_block is generated and
+   conf_prunepaths is sorted using dir_path_cmp (). */
+void conf_prepare(int argc, char *argv[])
+{
+       parse_updatedb_conf();
+       parse_arguments(argc, argv);
+       for (string &str : conf_prunefs) {
+               /* Assuming filesystem names are ASCII-only.  Still go through
+                  unsigned char: toupper () is undefined for negative values other
+                  than EOF, which a stray non-ASCII byte yields where char is
+                  signed. */
+               for (char &c : str)
+                       c = toupper((unsigned char)c);
+       }
+       /* Finish the variable only after converting filesystem names to upper case
+          to avoid keeping duplicates that originally differed in case and to sort
+          them correctly. */
+       var_finish(&conf_prunefs);
+       var_finish(&conf_prunenames);
+       var_finish(&conf_prunepaths);
+       /* conf_block is generated before conf_prunepaths is re-sorted into
+          dir_path_cmp () order below. */
+       gen_conf_block();
+       string_list_dir_path_sort(&conf_prunepaths);
+
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "conf_block:\n");
+               for (char c : conf_block) {
+                       if (isascii((unsigned char)c) && isprint((unsigned char)c) && c != '\\')
+                               putc(c, stderr);
+                       else {
+                               /* Octal escape; a NUL also ends the output line. */
+                               fprintf(stderr, "\\%03o", (unsigned)(unsigned char)c);
+                               if (c == 0)
+                                       putc('\n', stderr);
+                       }
+               }
+               fprintf(stderr, "\n-----------------------\n");
+       }
+}
diff --git a/conf.h b/conf.h
new file mode 100644 (file)
index 0000000..388e59f
--- /dev/null
+++ b/conf.h
@@ -0,0 +1,68 @@
+/* updatedb configuration.
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#ifndef CONF_H__
+#define CONF_H__
+
+#include <stddef.h>
+#include <string>
+#include <vector>
+
+/* true if locate(1) should check whether files are visible before reporting
+   them */
+extern bool conf_check_visibility;
+
+/* Filesystems to skip, converted to uppercase and sorted by name */
+extern std::vector<std::string> conf_prunefs;
+
+/* Directory names to skip, sorted by name */
+extern std::vector<std::string> conf_prunenames;
+
+/* Paths to skip, sorted by name using dir_path_cmp () */
+extern std::vector<std::string> conf_prunepaths;
+
+/* true if bind mounts should be skipped */
+extern bool conf_prune_bind_mounts;
+
+/* true if pruning debug output was requested */
+extern bool conf_debug_pruning;
+
+/* Root of the directory tree to store in the database (canonical) */
+extern char *conf_scan_root;
+
+/* Absolute (not necessarily canonical) path to the database */
+extern std::string conf_output;
+
+/* true if file names should be written to stdout as they are found */
+extern bool conf_verbose;
+
+/* Configuration representation for the database configuration block */
+extern std::string conf_block;
+
+/* Parse /etc/updatedb.conf and command-line arguments ARGC, ARGV.
+   Exit on error or --help, --version. */
+extern void conf_prepare(int argc, char *argv[]);
+
+/* Number of filenames to store in each block; set by -b/--block-size */
+extern int conf_block_size;
+/* true if -D/--debug was given */
+extern bool use_debug;
+
+#endif
index a35be2a55702cad0e8b5bdf6b1a29767da3ef8a3..642420f3c5942b6021157a2e19d559b751cdb255 100644 (file)
@@ -113,7 +113,7 @@ void PostingListBuilder::write_header(uint32_t docid)
        encoded.append(reinterpret_cast<char *>(buf), end - buf);
 }
 
-void DictionaryBuilder::add_file(string filename)
+void DictionaryBuilder::add_file(string filename, dir_time)
 {
        if (keep_current_block) {  // Only bother saving the filenames if we're actually keeping the block.
                if (!current_block.empty()) {
@@ -175,10 +175,14 @@ string DictionaryBuilder::train(size_t buf_size)
        return buf;
 }
 
-Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict)
-       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), cdict(cdict)
+Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times)
+       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
 {
        fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr);
+       if (store_dir_times) {
+               dir_time_ctx = ZSTD_createCStream();
+               ZSTD_initCStream(dir_time_ctx, /*level=*/6);
+       }
 }
 
 Corpus::~Corpus()
@@ -196,7 +200,7 @@ PostingListBuilder &Corpus::get_pl_builder(uint32_t trgm)
        return *invindex[trgm];
 }
 
-void Corpus::add_file(string filename)
+void Corpus::add_file(string filename, dir_time dt)
 {
        ++num_files;
        if (!current_block.empty()) {
@@ -206,6 +210,49 @@ void Corpus::add_file(string filename)
        if (++num_files_in_block == block_size) {
                flush_block();
        }
+
+       if (store_dir_times) {
+               if (dt.sec == -1) {
+                       // Not a directory.
+                       dir_times.push_back('\0');
+               } else {
+                       dir_times.push_back('\1');
+                       dir_times.append(reinterpret_cast<char *>(&dt.sec), sizeof(dt.sec));
+                       dir_times.append(reinterpret_cast<char *>(&dt.nsec), sizeof(dt.nsec));
+               }
+               compress_dir_times(/*allowed_slop=*/4096);
+       }
+}
+
+// Feeds buffered directory times from dir_times into the zstd stream,
+// appending whatever the compressor emits to dir_times_compressed.
+// Keeps going while at least allowed_slop bytes are still buffered,
+// and stops early if a round makes no progress at all.
+// Exits the process if zstd reports an error.
+void Corpus::compress_dir_times(size_t allowed_slop) {
+       while (dir_times.size() >= allowed_slop) {
+               // Make room for up to 4096 more bytes of output.
+               size_t old_size = dir_times_compressed.size();
+               dir_times_compressed.resize(old_size + 4096);
+
+               ZSTD_outBuffer outbuf;
+               outbuf.dst = dir_times_compressed.data() + old_size;
+               outbuf.size = 4096;
+               outbuf.pos = 0;
+
+               ZSTD_inBuffer inbuf;
+               inbuf.src = dir_times.data();
+               inbuf.size = dir_times.size();
+               inbuf.pos = 0;
+
+               // The result is a size_t; errors must be detected with
+               // ZSTD_isError(), not by comparing a narrowed int against zero.
+               size_t ret = ZSTD_compressStream(dir_time_ctx, &outbuf, &inbuf);
+               if (ZSTD_isError(ret)) {
+                       fprintf(stderr, "ZSTD_compressStream() failed: %s\n", ZSTD_getErrorName(ret));
+                       exit(1);
+               }
+
+               // Trim the unused output space and drop the consumed input.
+               dir_times_compressed.resize(old_size + outbuf.pos);
+               dir_times.erase(dir_times.begin(), dir_times.begin() + inbuf.pos);
+
+               if (outbuf.pos == 0 && inbuf.pos == 0) {
+                       // Nothing happened (not enough data?), try again later.
+                       return;
+               }
+       }
 }
 
 void Corpus::flush_block()
@@ -258,6 +305,40 @@ size_t Corpus::num_trigrams() const
        return num;
 }
 
+// Compresses everything still buffered in dir_times, finishes the zstd
+// stream and returns the accumulated compressed data. Returns an empty
+// string if directory times are not being stored.
+// Exits the process if zstd reports an error.
+string Corpus::get_compressed_dir_times()
+{
+       if (!store_dir_times) {
+               return "";
+       }
+       compress_dir_times(/*allowed_slop=*/0);
+       assert(dir_times.empty());
+
+       for ( ;; ) {
+               // Make room for up to 4096 more bytes of output.
+               size_t old_size = dir_times_compressed.size();
+               dir_times_compressed.resize(old_size + 4096);
+
+               ZSTD_outBuffer outbuf;
+               outbuf.dst = dir_times_compressed.data() + old_size;
+               outbuf.size = 4096;
+               outbuf.pos = 0;
+
+               // The result is a size_t; errors must be detected with
+               // ZSTD_isError(), not by comparing a narrowed int against zero.
+               size_t ret = ZSTD_endStream(dir_time_ctx, &outbuf);
+               if (ZSTD_isError(ret)) {
+                       fprintf(stderr, "ZSTD_endStream() failed: %s\n", ZSTD_getErrorName(ret));
+                       exit(1);
+               }
+
+               dir_times_compressed.resize(old_size + outbuf.pos);
+
+               if (ret == 0) {
+                       // All done.
+                       break;
+               }
+       }
+
+       return dir_times_compressed;
+}
+
 string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf)
 {
        static ZSTD_CCtx *ctx = nullptr;
@@ -335,19 +416,29 @@ unique_ptr<Trigram[]> create_hashtable(Corpus &corpus, const vector<uint32_t> &a
        return ht;
 }
 
-DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dictionary)
+DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_size, string dictionary)
        : outfile(outfile), block_size(block_size)
 {
        umask(0027);
 
        string path = outfile;
        path.resize(path.find_last_of('/') + 1);
+       if (path.empty()) {
+               path = ".";
+       }
        int fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
        if (fd == -1) {
                perror(path.c_str());
                exit(1);
        }
 
+       if (owner != (gid_t)-1) {
+               if (fchown(fd, (uid_t)-1, owner) == -1) {
+                       perror("fchown");
+                       exit(1);
+               }
+       }
+
        outfp = fdopen(fd, "wb");
        if (outfp == nullptr) {
                perror(outfile);
@@ -361,7 +452,7 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dic
        hdr.extra_ht_slots = num_overflow_slots;
        hdr.num_docids = 0;
        hdr.hash_table_offset_bytes = -1;  // We don't know these offsets yet.
-       hdr.max_version = 1;
+       hdr.max_version = 2;
        hdr.filename_index_offset_bytes = -1;
        hdr.zstd_dictionary_length_bytes = -1;
        fwrite(&hdr, sizeof(hdr), 1, outfp);
@@ -375,15 +466,32 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dic
                hdr.zstd_dictionary_length_bytes = dictionary.size();
                cdict = ZSTD_createCDict(dictionary.data(), dictionary.size(), /*level=*/6);
        }
+
+       hdr.directory_data_length_bytes = 0;
+       hdr.directory_data_offset_bytes = 0;
+       hdr.next_zstd_dictionary_length_bytes = 0;
+       hdr.next_zstd_dictionary_offset_bytes = 0;
+       hdr.conf_block_length_bytes = 0;
+       hdr.conf_block_offset_bytes = 0;
 }
 
-Corpus *DatabaseBuilder::start_corpus()
+Corpus *DatabaseBuilder::start_corpus(bool store_dir_times)
 {
        corpus_start = steady_clock::now();
-       corpus = new Corpus(outfp, block_size, cdict);
+       corpus = new Corpus(outfp, block_size, cdict, store_dir_times);
        return corpus;
 }
 
+// Takes over the dictionary that finish_corpus() writes out as the
+// recommended one for the next update.
+void DatabaseBuilder::set_next_dictionary(std::string next_dictionary)
+{
+       this->next_dictionary.swap(next_dictionary);
+}
+
+// Takes over the configuration block that finish_corpus() writes out.
+void DatabaseBuilder::set_conf_block(std::string conf_block)
+{
+       this->conf_block.swap(conf_block);
+}
+
 void DatabaseBuilder::finish_corpus()
 {
        corpus->finish();
@@ -468,6 +576,31 @@ void DatabaseBuilder::finish_corpus()
                fwrite(encoded.data(), encoded.size(), 1, outfp);
        }
 
+       // Finally, write the directory times (for updatedb).
+       string compressed_dir_times = corpus->get_compressed_dir_times();
+       size_t bytes_for_compressed_dir_times = 0;
+       if (!compressed_dir_times.empty()) {
+               hdr.directory_data_offset_bytes = ftell(outfp);
+               hdr.directory_data_length_bytes = compressed_dir_times.size();
+               fwrite(compressed_dir_times.data(), compressed_dir_times.size(), 1, outfp);
+               bytes_for_compressed_dir_times = compressed_dir_times.size();
+               compressed_dir_times.clear();
+       }
+
+       // Write the recommended dictionary for next update.
+       if (!next_dictionary.empty()) {
+               hdr.next_zstd_dictionary_offset_bytes = ftell(outfp);
+               hdr.next_zstd_dictionary_length_bytes = next_dictionary.size();
+               fwrite(next_dictionary.data(), next_dictionary.size(), 1, outfp);
+       }
+
+       // And the configuration block. It has its own header fields;
+       // it must not overwrite the next-dictionary offset and length.
+       if (!conf_block.empty()) {
+               hdr.conf_block_offset_bytes = ftell(outfp);
+               hdr.conf_block_length_bytes = conf_block.size();
+               fwrite(conf_block.data(), conf_block.size(), 1, outfp);
+       }
+
        // Rewind, and write the updated header.
        hdr.version = 1;
        fseek(outfp, 0, SEEK_SET);
@@ -485,7 +618,7 @@ void DatabaseBuilder::finish_corpus()
 
        fclose(outfp);
 
-       size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+       size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames + bytes_for_compressed_dir_times);
 
        dprintf("Block size:     %7d files\n", block_size);
        dprintf("Dictionary:     %'7.1f MB\n", hdr.zstd_dictionary_length_bytes / 1048576.0);
@@ -493,6 +626,9 @@ void DatabaseBuilder::finish_corpus()
        dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
        dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
        dprintf("Filenames:      %'7.1f MB\n", bytes_for_filenames / 1048576.0);
+       if (bytes_for_compressed_dir_times != 0) {
+               dprintf("Modify times:   %'7.1f MB\n", bytes_for_compressed_dir_times / 1048576.0);
+       }
        dprintf("Total:          %'7.1f MB\n", total_bytes / 1048576.0);
        dprintf("\n");
 }
index e799105563270690099bffcbc9f9b196d49beb6f..e2c0c19f54c291d19d99edd87c47b23c48b2ac50 100644 (file)
@@ -8,15 +8,36 @@
 #include <random>
 #include <stddef.h>
 #include <string>
+#include <utility>
 #include <vector>
 #include <zstd.h>
 
 class PostingListBuilder;
 
+// Timestamp (seconds and nanoseconds) stored per database entry.
+// {0,0} means unknown or so current that it should never match.
+// {-1,0} means it's not a directory.
+struct dir_time {
+       int64_t sec;
+       int32_t nsec;
+
+       // Orders by seconds first, then by nanoseconds.
+       bool operator<(const dir_time &other) const
+       {
+               if (sec != other.sec)
+                       return sec < other.sec;
+               return nsec < other.nsec;
+       }
+       // a >= b is exactly !(a < b). Swapping the operands here would
+       // compute a <= b instead.
+       bool operator>=(const dir_time &other) const
+       {
+               return !(*this < other);
+       }
+};
+constexpr dir_time unknown_dir_time{ 0, 0 };
+constexpr dir_time not_a_dir{ -1, 0 };
+
 class DatabaseReceiver {
 public:
        virtual ~DatabaseReceiver() = default;
-       virtual void add_file(std::string filename) = 0;
+       virtual void add_file(std::string filename, dir_time dt) = 0;
        virtual void flush_block() = 0;
        virtual void finish() { flush_block(); }
 };
@@ -25,7 +46,7 @@ class DictionaryBuilder : public DatabaseReceiver {
 public:
        DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
                : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
-       void add_file(std::string filename) override;
+       void add_file(std::string filename, dir_time dt) override;
        void flush_block() override;
        std::string train(size_t buf_size);
 
@@ -45,10 +66,10 @@ private:
 
 class Corpus : public DatabaseReceiver {
 public:
-       Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict);
+       Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times);
        ~Corpus();
 
-       void add_file(std::string filename) override;
+       void add_file(std::string filename, dir_time dt) override;
        void flush_block() override;
        void finish() override;
 
@@ -60,20 +81,30 @@ public:
        }
        PostingListBuilder &get_pl_builder(uint32_t trgm);
        size_t num_trigrams() const;
+       std::string get_compressed_dir_times();
 
 private:
+       void compress_dir_times(size_t allowed_slop);
+
        std::unique_ptr<PostingListBuilder *[]> invindex;
        FILE *outfp;
        std::string current_block;
        std::string tempbuf;
        const size_t block_size;
+       const bool store_dir_times;
        ZSTD_CDict *cdict;
+
+       ZSTD_CStream *dir_time_ctx = nullptr;
+       std::string dir_times;  // Buffer of still-uncompressed data.
+       std::string dir_times_compressed;
 };
 
 class DatabaseBuilder {
 public:
-       DatabaseBuilder(const char *outfile, int block_size, std::string dictionary);
-       Corpus *start_corpus();
+       DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary);
+       Corpus *start_corpus(bool store_dir_times);
+       void set_next_dictionary(std::string next_dictionary);
+       void set_conf_block(std::string conf_block);
        void finish_corpus();
 
 private:
@@ -84,6 +115,7 @@ private:
        std::chrono::steady_clock::time_point corpus_start;
        Corpus *corpus = nullptr;
        ZSTD_CDict *cdict = nullptr;
+       std::string next_dictionary, conf_block;
 };
 
 #endif  // !defined(_DATABASE_BUILDER_H)
diff --git a/db.h b/db.h
index df79904d94594753bf960538e0f006d82b614437..e23d47885ecedb85039fca8a71206ec765d071ba 100644 (file)
--- a/db.h
+++ b/db.h
@@ -13,9 +13,17 @@ struct Header {
        uint64_t filename_index_offset_bytes;
 
        // Version 1 and up only.
-       uint32_t max_version;  // Nominally 1, but can be increased if more features are added in a backward-compatible way.
+       uint32_t max_version;  // Nominally 1 or 2, but can be increased if more features are added in a backward-compatible way.
        uint32_t zstd_dictionary_length_bytes;
        uint64_t zstd_dictionary_offset_bytes;
+
+       // Only if max_version >= 2, and only relevant for updatedb.
+       uint64_t directory_data_length_bytes;
+       uint64_t directory_data_offset_bytes;
+       uint64_t next_zstd_dictionary_length_bytes;
+       uint64_t next_zstd_dictionary_offset_bytes;
+       uint64_t conf_block_length_bytes;
+       uint64_t conf_block_offset_bytes;
 };
 
 struct Trigram {
diff --git a/lib.cpp b/lib.cpp
new file mode 100644 (file)
index 0000000..171eced
--- /dev/null
+++ b/lib.cpp
@@ -0,0 +1,92 @@
+/* Common functions.
+
+Copyright (C) 2005, 2007 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+using namespace std;
+
+#include "lib.h"
+
+#include "db.h"
+#include "error.h"
+
+#include <algorithm>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Compare two path names using the database directory order. This is not
+   exactly strcmp () order: "a" < "a.b", so "a/z" < "a.b".
+
+   Returns a negative value, zero or a positive value if A sorts before, is
+   equal to or sorts after B. A string that is a proper prefix of the other
+   sorts first; at the first differing byte '/' sorts before anything else,
+   and other bytes compare as unsigned char. */
+int dir_path_cmp(const string &a, const string &b)
+{
+       auto [ai, bi] = mismatch(a.begin(), a.end(), b.begin(), b.end());
+       if (ai == a.end() && bi == b.end()) {
+               return 0;
+       }
+       if (ai == a.end()) {
+               return -1;
+       }
+       if (bi == b.end()) {
+               return 1;
+       }
+       /* mismatch () stopped at the first difference, so *ai != *bi here. */
+       if (*ai == '/') {
+               return -1;
+       }
+       if (*bi == '/') {
+               return 1;
+       }
+       return int((unsigned char)*ai) - int((unsigned char)*bi);
+}
+
+/* Sort LIST in place into dir_path_cmp () order */
+void string_list_dir_path_sort(vector<string> *list)
+{
+       const auto precedes = [](const string &lhs, const string &rhs) {
+               return dir_path_cmp(lhs, rhs) < 0;
+       };
+       sort(list->begin(), list->end(), precedes);
+}
+
+/* Is PATH included in LIST?  Update *IDX to move within LIST.
+
+   LIST is assumed to be sorted using dir_path_cmp (), successive calls to this
+   function are assumed to use PATH values increasing in dir_path_cmp ().
+
+   *IDX is advanced past every entry that sorts before PATH, and past the
+   matching entry when there is one. */
+bool string_list_contains_dir_path(const vector<string> *list, size_t *idx,
+                                   const string &path)
+{
+       for (; *idx < list->size(); ++*idx) {
+               const int order = dir_path_cmp((*list)[*idx], path);
+               if (order > 0) {
+                       /* This entry sorts after PATH; leave *IDX on it. */
+                       return false;
+               }
+               if (order == 0) {
+                       ++*idx;
+                       return true;
+               }
+       }
+       return false;
+}
diff --git a/lib.h b/lib.h
new file mode 100644 (file)
index 0000000..0239dcb
--- /dev/null
+++ b/lib.h
@@ -0,0 +1,49 @@
+/* Common functions.
+
+Copyright (C) 2005, 2007 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#ifndef LIB_H__
+#define LIB_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string>
+#include <sys/types.h>
+#include <vector>
+
+#define _(X) (X)
+
+/* Compare two path names using the database directory order. This is not
+   exactly strcmp () order: "a" < "a.b", so "a/z" < "a.b". */
+extern int dir_path_cmp(const std::string &a, const std::string &b);
+
+/* Sort LIST using dir_path_cmp () */
+extern void string_list_dir_path_sort(std::vector<std::string> *list);
+
+/* Is PATH included in LIST?  Update *IDX to move within LIST.
+
+   LIST is assumed to be sorted using dir_path_cmp (), successive calls to this
+   function are assumed to use PATH values increasing in dir_path_cmp (). */
+extern bool string_list_contains_dir_path(const std::vector<std::string> *list,
+                                          size_t *idx, const std::string &path);
+
+#endif
index 9dc7c1514acc323d551f7ea018aef22a58b58d26..feb6c5a57f7c27a994189c2c36146acd4cb32449 100644 (file)
@@ -1,7 +1,11 @@
 project('plocate', 'cpp', default_options: ['buildtype=debugoptimized','cpp_std=c++17'], version: '1.0.8-pre')
 
-# Make the version available as a #define.
-add_project_arguments('-DPLOCATE_VERSION="' + meson.project_version() + '"', language: 'cpp')
+add_project_arguments('-DGROUPNAME="' + get_option('locategroup') + '"', language: 'cpp')
+add_project_arguments('-DUPDATEDB_CONF="/etc/updatedb.conf"', language: 'cpp')
+add_project_arguments('-DDBFILE="/var/lib/mlocate/plocate.db"', language: 'cpp')
+add_project_arguments('-DPACKAGE_NAME="plocate"', language: 'cpp')
+add_project_arguments('-DPACKAGE_VERSION="' + meson.project_version() + '"', language: 'cpp')
+add_project_arguments('-DPACKAGE_BUGREPORT="steinar+plocate@gunderson.no"', language: 'cpp')
 
 cxx = meson.get_compiler('cpp')
 uringdep = dependency('liburing', required: false)
@@ -33,6 +37,10 @@ executable('plocate-build', ['plocate-build.cpp', 'database-builder.cpp'],
        dependencies: [zstddep],
        install: true,
        install_dir: get_option('sbindir'))
+executable('updatedb', ['updatedb.cpp', 'database-builder.cpp', 'conf.cpp', 'lib.cpp', 'bind-mount.cpp', 'complete_pread.cpp'],
+       dependencies: [zstddep, threaddep],
+       install: true,
+       install_dir: get_option('sbindir'))
 
 conf_data = configuration_data()
 conf_data.set('PROCESSED_BY_MESON', '1')
index 02b18d0e42e00dc577ad9d7160d2f53d05433bfb..5c205aabef6d636f5c40cc009920ef61d5cdd1ea 100644 (file)
@@ -83,10 +83,10 @@ void handle_directory(FILE *fp, DatabaseReceiver *receiver)
                int type = getc(fp);
                if (type == DBE_NORMAL) {
                        string filename = read_cstr(fp);
-                       receiver->add_file(dir_path + "/" + filename);
+                       receiver->add_file(dir_path + "/" + filename, unknown_dir_time);
                } else if (type == DBE_DIRECTORY) {
                        string dirname = read_cstr(fp);
-                       receiver->add_file(dir_path + "/" + dirname);
+                       receiver->add_file(dir_path + "/" + dirname, unknown_dir_time);
                } else {
                        return;  // Probably end.
                }
@@ -116,7 +116,7 @@ void read_plaintext(FILE *fp, DatabaseReceiver *receiver)
                }
                if (!s.empty() && s.back() == '\n')
                        s.pop_back();
-               receiver->add_file(move(s));
+               receiver->add_file(move(s), unknown_dir_time);
        }
 }
 
@@ -166,8 +166,8 @@ void do_build(const char *infile, const char *outfile, int block_size, bool plai
        }
        string dictionary = builder.train(1024);
 
-       DatabaseBuilder db(outfile, block_size, dictionary);
-       Corpus *corpus = db.start_corpus();
+       DatabaseBuilder db(outfile, /*owner=*/-1, block_size, dictionary);
+       Corpus *corpus = db.start_corpus(/*store_dir_times=*/false);
        if (plaintext) {
                read_plaintext(infp, corpus);
        } else {
@@ -195,7 +195,7 @@ void usage()
 
 void version()
 {
-       printf("plocate-build %s\n", PLOCATE_VERSION);
+       printf("plocate-build %s\n", PACKAGE_VERSION);
        printf("Copyright 2020 Steinar H. Gunderson\n");
        printf("License GPLv2+: GNU GPL version 2 or later <https://gnu.org/licenses/gpl.html>.\n");
        printf("This is free software: you are free to change and redistribute it.\n");
index 079eb2996cf40e7d4a8c3fe5b1d37d7485e6f9a7..6fe46a4f77e36a1aa1f75ec2fc7e2036079b25fc 100644 (file)
@@ -43,9 +43,7 @@
 using namespace std;
 using namespace std::chrono;
 
-#define DEFAULT_DBPATH "/var/lib/mlocate/plocate.db"
-
-const char *dbpath = DEFAULT_DBPATH;
+const char *dbpath = DBFILE;
 bool ignore_case = false;
 bool only_count = false;
 bool print_nul = false;
@@ -656,7 +654,7 @@ void usage()
                "  -b, --basename         search only the file name portion of path names\n"
                "  -c, --count            print number of matches instead of the matches\n"
                "  -d, --database DBPATH  search for files in DBPATH\n"
-               "                         (default is " DEFAULT_DBPATH ")\n"
+               "                         (default is " DBFILE ")\n"
                "  -i, --ignore-case      search case-insensitively\n"
                "  -l, --limit LIMIT      stop after LIMIT matches\n"
                "  -0, --null             delimit matches by NUL instead of newline\n"
@@ -669,7 +667,7 @@ void usage()
 
 void version()
 {
-       printf("plocate %s\n", PLOCATE_VERSION);
+       printf("%s %s\n", PACKAGE_NAME, PACKAGE_VERSION);
        printf("Copyright 2020 Steinar H. Gunderson\n");
        printf("License GPLv2+: GNU GPL version 2 or later <https://gnu.org/licenses/gpl.html>.\n");
        printf("This is free software: you are free to change and redistribute it.\n");
diff --git a/updatedb.cpp b/updatedb.cpp
new file mode 100644 (file)
index 0000000..908f477
--- /dev/null
@@ -0,0 +1,792 @@
+/* updatedb(8).
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+ */
+
+#include "bind-mount.h"
+#include "complete_pread.h"
+#include "conf.h"
+#include "database-builder.h"
+#include "db.h"
+#include "dprintf.h"
+#include "io_uring_engine.h"
+#include "lib.h"
+
+#include <algorithm>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <chrono>
+#include <dirent.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <grp.h>
+#include <iosfwd>
+#include <math.h>
+#include <memory>
+#include <mntent.h>
+#include <random>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <utility>
+#include <vector>
+
+using namespace std;
+using namespace std::chrono;
+
+/* Next conf_prunepaths entry */
+static size_t conf_prunepaths_index; /* = 0; */
+
+void usage()
+{
+       printf(
+               "Usage: updatedb PLOCATE_DB\n"
+               "\n"
+               "Generate plocate index from mlocate.db, typically /var/lib/mlocate/mlocate.db.\n"
+               "Normally, the destination should be /var/lib/mlocate/plocate.db.\n"
+               "\n"
+               "  -b, --block-size SIZE  number of filenames to store in each block (default 32)\n"
+               "  -p, --plaintext        input is a plaintext file, not an mlocate database\n"
+               "      --help             print this help\n"
+               "      --version          print version information\n");
+}
+
+// Prints the program name with PACKAGE_VERSION, followed by the copyright
+// and license notice, to stdout.
+void version()
+{
+       // One format string instead of one printf() per line; only the first
+       // line contains a conversion specification.
+       printf("updatedb %s\n"
+              "Copyright (C) 2007 Red Hat, Inc. All rights reserved.\n"
+              "Copyright 2020 Steinar H. Gunderson\n"
+              "This software is distributed under the GPL v.2.\n"
+              "\n"
+              "This program is provided with NO WARRANTY, to the extent permitted by law.\n",
+              PACKAGE_VERSION);
+}
+
+int opendir_noatime(int dirfd, const char *path)
+{
+       static bool noatime_failed = false;
+
+       if (!noatime_failed) {
+               int fd = openat(dirfd, path, O_RDONLY | O_DIRECTORY | O_NOATIME);
+               if (fd != -1) {
+                       return fd;
+               } else if (errno == EPERM) {
+                       /* EPERM is fairly O_NOATIME-specific; missing access rights cause
+                          EACCES. */
+                       noatime_failed = true;
+                       // Retry below.
+               } else {
+                       return -1;
+               }
+       }
+       return openat(dirfd, path, O_RDONLY | O_DIRECTORY);
+}
+
+bool time_is_current(const dir_time &t)
+{
+       static dir_time cache{ 0, 0 };
+
+       /* This is more difficult than it should be because Linux uses a cheaper time
+          source for filesystem timestamps than for gettimeofday() and they can get
+          slightly out of sync, see
+          https://bugzilla.redhat.com/show_bug.cgi?id=244697 .  This affects even
+          nanosecond timestamps (and don't forget that tv_nsec existence doesn't
+          guarantee that the underlying filesystem has such resolution - it might be
+          microseconds or even coarser).
+
+          The worst case is probably FAT timestamps with 2-second resolution
+          (although using such a filesystem violates POSIX file times requirements).
+
+          So, to be on the safe side, require a >3.0 second difference (2 seconds to
+          make sure the FAT timestamp changed, 1 more to account for the Linux
+          timestamp races).  This large margin might make updatedb marginally more
+          expensive, but it only makes a difference if the directory was very
+          recently updated _and_ is will not be updated again until the next
+          updatedb run; this is not likely to happen for most directories. */
+
+       /* Cache gettimeofday () results to rule out obviously old time stamps;
+          CACHE contains the earliest time we reject as too current. */
+       if (t < cache) {
+               return false;
+       }
+
+       struct timeval tv;
+       gettimeofday(&tv, nullptr);
+       cache.sec = tv.tv_sec - 3;
+       cache.nsec = tv.tv_usec * 1000;
+
+       return t >= cache;
+}
+
+// One entry (file or subdirectory) of the directory currently being scanned,
+// either as returned by readdir() or as read back from the old database.
+//
+// Every member has an initializer so that copying a partially filled-in
+// entry (e.g. push_back() of a record that never had `dev' assigned) never
+// reads an indeterminate value.
+struct entry {
+       // Name within the parent directory (no path components).
+       string name;
+       bool is_directory = false;
+
+       // For directories only:
+
+       // Open descriptor for the directory, or -1 if it is not (or no
+       // longer) open.
+       int fd = -1;
+       // Timestamp to store in the new database.
+       dir_time dt = unknown_dir_time;
+       // Timestamp recorded for this directory in the old database, if any.
+       dir_time db_modified = unknown_dir_time;
+       // st_dev of the directory; only meaningful once it has been stat()ed.
+       dev_t dev = 0;
+};
+
+bool filesystem_is_excluded(const char *path)
+{
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "Checking whether filesystem `%s' is excluded:\n", path);
+       }
+       FILE *f = setmntent("/proc/mounts", "r");
+       if (f == nullptr) {
+               return false;
+       }
+
+       struct mntent *me;
+       while ((me = getmntent(f)) != nullptr) {
+               if (conf_debug_pruning) {
+                       /* This is debugging output, don't mark anything for translation */
+                       fprintf(stderr, " `%s', type `%s'\n", me->mnt_dir, me->mnt_type);
+               }
+               string type(me->mnt_type);
+               for (char &p : type) {
+                       p = toupper(p);
+               }
+               if (find(conf_prunefs.begin(), conf_prunefs.end(), type) != conf_prunefs.end()) {
+                       /* Paths in /proc/self/mounts contain no symbolic links.  Besides
+                          avoiding a few system calls, avoiding the realpath () avoids hangs
+                          if the filesystem is unavailable hard-mounted NFS. */
+                       char *dir = me->mnt_dir;
+                       if (conf_debug_pruning) {
+                               /* This is debugging output, don't mark anything for translation */
+                               fprintf(stderr, " => type matches, dir `%s'\n", dir);
+                       }
+                       bool res = (strcmp(path, dir) == 0);
+                       if (dir != me->mnt_dir)
+                               free(dir);
+                       if (res) {
+                               endmntent(f);
+                               return true;
+                       }
+               }
+       }
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "...done\n");
+       }
+       endmntent(f);
+       return false;
+}
+
+dir_time get_dirtime_from_stat(const struct stat &buf)
+{
+       dir_time ctime{ buf.st_ctim.tv_sec, int32_t(buf.st_ctim.tv_nsec) };
+       dir_time mtime{ buf.st_mtim.tv_sec, int32_t(buf.st_mtim.tv_nsec) };
+       dir_time dt = max(ctime, mtime);
+
+       if (time_is_current(dt)) {
+               /* The directory might be changing right now and we can't be sure the
+                  timestamp will be changed again if more changes happen very soon, mark
+                  the timestamp as invalid to force rescanning the directory next time
+                  updatedb is run. */
+               return unknown_dir_time;
+       } else {
+               return dt;
+       }
+}
+
+// Represents the old database we are updating.
+class ExistingDB {
+public:
+       explicit ExistingDB(int fd);
+       ~ExistingDB();
+
+       pair<string, dir_time> read_next();
+       void unread(pair<string, dir_time> record)
+       {
+               unread_record = move(record);
+       }
+       string read_next_dictionary() const;
+       bool get_error() const { return error; }
+
+private:
+       const int fd;
+       Header hdr;
+
+       uint32_t current_docid = 0;
+
+       string current_filename_block;
+       const char *current_filename_ptr = nullptr, *current_filename_end = nullptr;
+
+       off_t compressed_dir_time_pos;
+       string compressed_dir_time;
+       string current_dir_time_block;
+       const char *current_dir_time_ptr = nullptr, *current_dir_time_end = nullptr;
+
+       pair<string, dir_time> unread_record;
+
+       // Used in one-shot mode, repeatedly.
+       ZSTD_DCtx *ctx;
+
+       // Used in streaming mode.
+       ZSTD_DCtx *dir_time_ctx;
+
+       ZSTD_DDict *ddict = nullptr;
+
+       // If true, we've discovered an error or EOF, and will return only
+       // empty data from here.
+       bool eof = false, error = false;
+};
+
+ExistingDB::ExistingDB(int fd)
+       : fd(fd)
+{
+       if (fd == -1) {
+               error = true;
+               return;
+       }
+
+       if (!try_complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0)) {
+               if (conf_verbose) {
+                       perror("pread(header)");
+               }
+               error = true;
+               return;
+       }
+       if (memcmp(hdr.magic, "\0plocate", 8) != 0) {
+               if (conf_verbose) {
+                       fprintf(stderr, "Old database had header mismatch, ignoring.\n");
+               }
+               error = true;
+               return;
+       }
+       if (hdr.version != 1 || hdr.max_version < 2) {
+               if (conf_verbose) {
+                       fprintf(stderr, "Old database had version mismatch (version=%d max_version=%d), ignoring.\n",
+                               hdr.version, hdr.max_version);
+               }
+               error = true;
+               return;
+       }
+
+       // Compare the configuration block with our current one.
+       if (hdr.conf_block_length_bytes != conf_block.size()) {
+               if (conf_verbose) {
+                       fprintf(stderr, "Old database had different configuration block (size mismatch), ignoring.\n");
+               }
+               error = true;
+               return;
+       }
+       string str;
+       str.resize(hdr.conf_block_length_bytes);
+       if (!try_complete_pread(fd, str.data(), hdr.conf_block_length_bytes, hdr.conf_block_offset_bytes)) {
+               if (conf_verbose) {
+                       perror("pread(conf_block)");
+               }
+               error = true;
+               return;
+       }
+       if (str != conf_block) {
+               if (conf_verbose) {
+                       fprintf(stderr, "Old database had different configuration block (contents mismatch), ignoring.\n");
+               }
+               error = true;
+               return;
+       }
+
+       // Read dictionary, if it exists.
+       if (hdr.zstd_dictionary_length_bytes > 0) {
+               string dictionary;
+               dictionary.resize(hdr.zstd_dictionary_length_bytes);
+               if (try_complete_pread(fd, &dictionary[0], hdr.zstd_dictionary_length_bytes, hdr.zstd_dictionary_offset_bytes)) {
+                       ddict = ZSTD_createDDict(dictionary.data(), dictionary.size());
+               } else {
+                       if (conf_verbose) {
+                               perror("pread(dictionary)");
+                       }
+                       error = true;
+                       return;
+               }
+       }
+       compressed_dir_time_pos = hdr.directory_data_offset_bytes;
+
+       ctx = ZSTD_createDCtx();
+       dir_time_ctx = ZSTD_createDCtx();
+}
+
+ExistingDB::~ExistingDB()
+{
+       if (fd != -1) {
+               close(fd);
+       }
+}
+
+pair<string, dir_time> ExistingDB::read_next()
+{
+       if (!unread_record.first.empty()) {
+               auto ret = move(unread_record);
+               unread_record.first.clear();
+               return ret;
+       }
+
+       if (eof || error) {
+               return { "", not_a_dir };
+       }
+
+       // See if we need to read a new filename block.
+       if (current_filename_ptr == nullptr) {
+               if (current_docid >= hdr.num_docids) {
+                       eof = true;
+                       return { "", not_a_dir };
+               }
+
+               // Read the file offset from this docid and the next one.
+               // This is always allowed, since we have a sentinel block at the end.
+               off_t offset_for_block = hdr.filename_index_offset_bytes + current_docid * sizeof(uint64_t);
+               uint64_t vals[2];
+               if (!try_complete_pread(fd, vals, sizeof(vals), offset_for_block)) {
+                       if (conf_verbose) {
+                               perror("pread(offset)");
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+
+               off_t offset = vals[0];
+               size_t compressed_len = vals[1] - vals[0];
+               unique_ptr<char[]> compressed(new char[compressed_len]);
+               if (!try_complete_pread(fd, compressed.get(), compressed_len, offset)) {
+                       if (conf_verbose) {
+                               perror("pread(block)");
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+
+               unsigned long long uncompressed_len = ZSTD_getFrameContentSize(compressed.get(), compressed_len);
+               if (uncompressed_len == ZSTD_CONTENTSIZE_UNKNOWN || uncompressed_len == ZSTD_CONTENTSIZE_ERROR) {
+                       if (conf_verbose) {
+                               fprintf(stderr, "ZSTD_getFrameContentSize() failed\n");
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+
+               string block;
+               block.resize(uncompressed_len + 1);
+
+               size_t err;
+               if (ddict != nullptr) {
+                       err = ZSTD_decompress_usingDDict(ctx, &block[0], block.size(), compressed.get(),
+                                                        compressed_len, ddict);
+               } else {
+                       err = ZSTD_decompressDCtx(ctx, &block[0], block.size(), compressed.get(),
+                                                 compressed_len);
+               }
+               if (ZSTD_isError(err)) {
+                       if (conf_verbose) {
+                               fprintf(stderr, "ZSTD_decompress(): %s\n", ZSTD_getErrorName(err));
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+               block[block.size() - 1] = '\0';
+               current_filename_block = move(block);
+               current_filename_ptr = current_filename_block.data();
+               current_filename_end = current_filename_block.data() + current_filename_block.size();
+               ++current_docid;
+       }
+
+       // See if we need to read more directory time data.
+       while (current_dir_time_ptr == current_dir_time_end ||
+              (*current_dir_time_ptr != 0 &&
+               size_t(current_dir_time_end - current_dir_time_ptr) < sizeof(dir_time) + 1)) {
+               if (current_dir_time_ptr != nullptr) {
+                       const size_t bytes_consumed = current_dir_time_ptr - current_dir_time_block.data();
+                       current_dir_time_block.erase(current_dir_time_block.begin(), current_dir_time_block.begin() + bytes_consumed);
+               }
+
+               // See if we can get more data out without reading more.
+               const size_t existing_data = current_dir_time_block.size();
+               current_dir_time_block.resize(existing_data + 4096);
+
+               ZSTD_outBuffer outbuf;
+               outbuf.dst = current_dir_time_block.data() + existing_data;
+               outbuf.size = 4096;
+               outbuf.pos = 0;
+
+               ZSTD_inBuffer inbuf;
+               inbuf.src = compressed_dir_time.data();
+               inbuf.size = compressed_dir_time.size();
+               inbuf.pos = 0;
+
+               int err = ZSTD_decompressStream(dir_time_ctx, &outbuf, &inbuf);
+               if (err < 0) {
+                       if (conf_verbose) {
+                               fprintf(stderr, "ZSTD_decompress(): %s\n", ZSTD_getErrorName(err));
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+               compressed_dir_time.erase(compressed_dir_time.begin(), compressed_dir_time.begin() + inbuf.pos);
+               current_dir_time_block.resize(existing_data + outbuf.pos);
+
+               if (inbuf.pos == 0 && outbuf.pos == 0) {
+                       // No movement, we'll need to try to read more data.
+                       char buf[4096];
+                       size_t bytes_to_read = min<size_t>(
+                               hdr.directory_data_offset_bytes + hdr.directory_data_length_bytes - compressed_dir_time_pos,
+                               sizeof(buf));
+                       if (bytes_to_read == 0) {
+                               error = true;
+                               return { "", not_a_dir };
+                       }
+                       if (!try_complete_pread(fd, buf, bytes_to_read, compressed_dir_time_pos)) {
+                               if (conf_verbose) {
+                                       perror("pread(dirtime)");
+                               }
+                               error = true;
+                               return { "", not_a_dir };
+                       }
+                       compressed_dir_time_pos += bytes_to_read;
+                       compressed_dir_time.insert(compressed_dir_time.end(), buf, buf + bytes_to_read);
+
+                       // Next iteration will now try decompressing more.
+               }
+
+               current_dir_time_ptr = current_dir_time_block.data();
+               current_dir_time_end = current_dir_time_block.data() + current_dir_time_block.size();
+       }
+
+       string filename = current_filename_ptr;
+       current_filename_ptr += filename.size() + 1;
+       if (current_filename_ptr == current_filename_end) {
+               // End of this block.
+               current_filename_ptr = nullptr;
+       }
+
+       if (*current_dir_time_ptr == 0) {
+               ++current_dir_time_ptr;
+               return { move(filename), not_a_dir };
+       } else {
+               ++current_dir_time_ptr;
+               dir_time dt;
+               memcpy(&dt.sec, current_dir_time_ptr, sizeof(dt.sec));
+               current_dir_time_ptr += sizeof(dt.sec);
+               memcpy(&dt.nsec, current_dir_time_ptr, sizeof(dt.nsec));
+               current_dir_time_ptr += sizeof(dt.nsec);
+               return { move(filename), dt };
+       }
+}
+
+string ExistingDB::read_next_dictionary() const
+{
+       if (hdr.next_zstd_dictionary_length_bytes == 0 || hdr.next_zstd_dictionary_length_bytes > 1048576) {
+               return "";
+       }
+       string str;
+       str.resize(hdr.next_zstd_dictionary_length_bytes);
+       if (!try_complete_pread(fd, str.data(), hdr.next_zstd_dictionary_length_bytes, hdr.next_zstd_dictionary_offset_bytes)) {
+               if (conf_verbose) {
+                       perror("pread(next_dictionary)");
+               }
+               return "";
+       }
+       return str;
+}
+
+// Scans the directory with absolute path “path”, which is opened as “fd”.
+// Uses relative paths and openat() only, evading any issues with PATH_MAX
+// and time-of-check-time-of-use race conditions. (mlocate's updatedb
+// does a much more complicated dance with changing the current working
+// directory, probably in the interest of portability to old platforms.)
+// “parent_dev” must be the device of the parent directory of “path”.
+//
+// Takes ownership of fd.
+int scan(const string &path, int fd, dev_t parent_dev, dir_time modified, dir_time db_modified, ExistingDB *existing_db, Corpus *corpus, DictionaryBuilder *dict_builder)
+{
+       if (string_list_contains_dir_path(&conf_prunepaths, &conf_prunepaths_index, path)) {
+               if (conf_debug_pruning) {
+                       /* This is debugging output, don't mark anything for translation */
+                       fprintf(stderr, "Skipping `%s': in prunepaths\n", path.c_str());
+               }
+               close(fd);
+               return 0;
+       }
+       if (conf_prune_bind_mounts && is_bind_mount(path.c_str())) {
+               if (conf_debug_pruning) {
+                       /* This is debugging output, don't mark anything for translation */
+                       fprintf(stderr, "Skipping `%s': bind mount\n", path.c_str());
+               }
+               close(fd);
+               return 0;
+       }
+
+       // We read in the old directory no matter whether it is current or not,
+       // because even if we're not going to use it, we'll need the modification
+       // times of any subdirectories.
+
+       // Skip over anything before this directory; it is stuff that we would have
+       // consumed earlier if we wanted it.
+       for (;;) {
+               pair<string, dir_time> record = existing_db->read_next();
+               if (record.first.empty()) {
+                       break;
+               }
+               if (dir_path_cmp(path, record.first) <= 0) {
+                       existing_db->unread(move(record));
+                       break;
+               }
+       }
+
+       // Now read everything in this directory.
+       vector<entry> db_entries;
+       const string path_plus_slash = path.back() == '/' ? path : path + '/';
+       for (;;) {
+               pair<string, dir_time> record = existing_db->read_next();
+               if (record.first.empty()) {
+                       break;
+               }
+
+               if (record.first.rfind(path_plus_slash, 0) != 0) {
+                       // No longer starts with path, so we're in a different directory.
+                       existing_db->unread(move(record));
+                       break;
+               }
+               if (record.first.find_first_of('/', path_plus_slash.size()) != string::npos) {
+                       // Entered into a subdirectory of a subdirectory.
+                       // Due to our ordering, this also means we're done.
+                       existing_db->unread(move(record));
+                       break;
+               }
+
+               entry e;
+               e.name = record.first.substr(path_plus_slash.size());
+               e.is_directory = (record.second.sec >= 0);
+               e.db_modified = record.second;
+               db_entries.push_back(e);
+       }
+
+       DIR *dir = nullptr;
+       vector<entry> entries;
+       if (!existing_db->get_error() && db_modified.sec > 0 &&
+           modified.sec == db_modified.sec && modified.nsec == db_modified.nsec) {
+               // Not changed since the last database, so we can replace the readdir()
+               // by reading from the database. (We still need to open and stat everything,
+               // though, but that happens in a later step.)
+               entries = move(db_entries);
+       } else {
+               dir = fdopendir(fd);  // Takes over ownership of fd.
+               if (dir == nullptr) {
+                       perror("fdopendir");
+                       exit(1);
+               }
+
+               dirent *de;
+               while ((de = readdir(dir)) != nullptr) {
+                       if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) {
+                               continue;
+                       }
+                       if (strlen(de->d_name) == 0) {
+                               /* Unfortunately, this does happen, and mere assert() does not give
+                                  users enough information to complain to the right people. */
+                               fprintf(stderr, "file system error: zero-length file name in directory %s", path.c_str());
+                               continue;
+                       }
+
+                       entry e;
+                       e.name = de->d_name;
+                       e.is_directory = (de->d_type == DT_DIR);
+
+                       if (conf_verbose) {
+                               printf("%s/%s\n", path.c_str(), de->d_name);
+                       }
+                       entries.push_back(move(e));
+               }
+
+               sort(entries.begin(), entries.end(), [](const entry &a, const entry &b) {
+                       return a.name < b.name;
+               });
+
+               // Load directory modification times from the old database.
+               auto db_it = db_entries.begin();
+               for (entry &e : entries) {
+                       for (; db_it != db_entries.end(); ++db_it) {
+                               if (e.name < db_it->name) {
+                                       break;
+                               }
+                               if (e.name == db_it->name) {
+                                       e.db_modified = db_it->db_modified;
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       // For each entry, we want to add it to the database. but this includes the modification time
+       // for directories, which means we need to open and stat it at this point.
+       //
+       // This means we may need to have many directories open at the same time, but it seems to be
+       // the simplest (only?) way of being compatible with mlocate's notion of listing all contents
+       // of a given directory before recursing, without buffering even more information. Hopefully,
+       // we won't go out of file descriptors here (it could happen if someone has tens of thousands
+       // of subdirectories in a single directory); if so, the admin will need to raise the limit.
+       for (entry &e : entries) {
+               if (!e.is_directory) {
+                       e.dt = not_a_dir;
+                       continue;
+               }
+
+               if (find(conf_prunenames.begin(), conf_prunenames.end(), e.name) != conf_prunenames.end()) {
+                       if (conf_debug_pruning) {
+                               /* This is debugging output, don't mark anything for translation */
+                               fprintf(stderr, "Skipping `%s': in prunenames\n", e.name.c_str());
+                       }
+                       continue;
+               }
+
+               e.fd = opendir_noatime(fd, e.name.c_str());
+               if (e.fd == -1) {
+                       if (errno == EMFILE || errno == ENFILE) {
+                               // The admin probably wants to know about this.
+                               perror((path_plus_slash + e.name).c_str());
+
+                               rlimit rlim;
+                               if (getrlimit(RLIMIT_NOFILE, &rlim) == -1) {
+                                       fprintf(stderr, "Hint: Try `ulimit -n 131072' or similar.\n");
+                               } else {
+                                       fprintf(stderr, "Hint: Try `ulimit -n %lu' or similar (current limit is %lu).\n",
+                                               rlim.rlim_cur * 2, rlim.rlim_cur);
+                               }
+                               exit(1);
+                       }
+                       continue;
+               }
+
+               struct stat buf;
+               if (fstat(e.fd, &buf) != 0) {
+                       perror(path.c_str());
+                       exit(1);
+               }
+
+               e.dev = buf.st_dev;
+               if (buf.st_dev != parent_dev) {
+                       if (filesystem_is_excluded((path_plus_slash + e.name).c_str())) {
+                               close(e.fd);
+                               e.fd = -1;
+                               continue;
+                       }
+               }
+
+               e.dt = get_dirtime_from_stat(buf);
+       }
+
+       // Actually add all the entries we figured out dates for above.
+       for (const entry &e : entries) {
+               corpus->add_file(path_plus_slash + e.name, e.dt);
+               dict_builder->add_file(path_plus_slash + e.name, e.dt);
+       }
+
+       // Now scan subdirectories.
+       for (const entry &e : entries) {
+               if (e.is_directory && e.fd != -1) {
+                       int ret = scan(path_plus_slash + e.name, e.fd, e.dev, e.dt, e.db_modified, existing_db, corpus, dict_builder);
+                       if (ret == -1) {
+                               // TODO: The unscanned file descriptors will leak, but it doesn't really matter,
+                               // as we're about to exit.
+                               closedir(dir);
+                               return -1;
+                       }
+               }
+       }
+
+       if (dir == nullptr) {
+               close(fd);
+       } else {
+               closedir(dir);
+       }
+       return 0;
+}
+
+// Entry point for updatedb: raises the open-file limit, loads the
+// configuration, opens the previous database for merging, scans the tree
+// rooted at conf_scan_root, and finishes the new database together with a
+// dictionary trained for the next run.
+//
+// main() itself exits with status 1 if the visibility group cannot be
+// resolved, or if the scan root cannot be opened or stat()ed; on the normal
+// path it exits with EXIT_SUCCESS.
+int main(int argc, char **argv)
+{
+       // We want to bump the file limit; do it if we can (usually we are root
+       // and can set whatever we want). 128k should be ample for most setups.
+       // The soft limit is clamped to the hard limit, so we never ask for more
+       // than we are allowed to.
+       rlimit rlim;
+       if (getrlimit(RLIMIT_NOFILE, &rlim) != -1) {
+               rlim_t wanted = std::max<rlim_t>(rlim.rlim_cur, 131072);
+               rlim.rlim_cur = std::min<rlim_t>(wanted, rlim.rlim_max);
+               setrlimit(RLIMIT_NOFILE, &rlim);  // Ignore errors.
+       }
+
+       conf_prepare(argc, argv);
+       if (conf_prune_bind_mounts) {
+               bind_mount_init(MOUNTINFO_PATH);
+       }
+
+       // Open the previous database read-only; the result of open() is handed
+       // to ExistingDB without being checked here.
+       // NOTE(review): presumably ExistingDB copes with fd == -1 (no old
+       // database yet) -- confirm against its constructor.
+       int fd = open(conf_output.c_str(), O_RDONLY);
+       ExistingDB existing_db(fd);
+
+       DictionaryBuilder dict_builder(/*blocks_to_keep=*/1000, conf_block_size);
+
+       // The group is only looked up when visibility checking is enabled;
+       // otherwise owner keeps its initial value of (gid_t)-1.
+       gid_t owner = -1;
+       if (conf_check_visibility) {
+               group *grp = getgrnam(GROUPNAME);
+               if (grp == nullptr) {
+                       fprintf(stderr, "Unknown group %s\n", GROUPNAME);
+                       exit(1);
+               }
+               owner = grp->gr_gid;
+       }
+
+       DatabaseBuilder db(conf_output.c_str(), owner, conf_block_size, existing_db.read_next_dictionary());
+       Corpus *corpus = db.start_corpus(/*store_dir_times=*/true);
+
+       int root_fd = opendir_noatime(AT_FDCWD, conf_scan_root);
+       if (root_fd == -1) {
+               // Report the directory we actually tried to open, not a hard-coded ".".
+               perror(conf_scan_root);
+               exit(1);
+       }
+
+       struct stat buf;
+       if (fstat(root_fd, &buf) == -1) {
+               perror(conf_scan_root);
+               exit(1);
+       }
+
+       // The root has no entry in the old database to compare against, so its
+       // database timestamp is passed as unknown.
+       // NOTE(review): the return value of scan() is not checked; the database
+       // is finished below regardless -- confirm that this is intended.
+       scan(conf_scan_root, root_fd, buf.st_dev, get_dirtime_from_stat(buf), /*db_modified=*/unknown_dir_time, &existing_db, corpus, &dict_builder);
+
+       // It's too late to use the dictionary for the data we already compressed,
+       // unless we wanted to either scan the entire file system again (acceptable
+       // for plocate-build where it's cheap, less so for us), or uncompress
+       // and recompress. Instead, we store it for next time, assuming that the
+       // data changes fairly little from time to time.
+       string next_dictionary = dict_builder.train(1024);
+       db.set_next_dictionary(next_dictionary);
+       db.finish_corpus();
+
+       exit(EXIT_SUCCESS);
+}