Add a native updatedb.

author Steinar H. Gunderson <steinar+nageru@gunderson.no>

Sat, 21 Nov 2020 17:23:20 +0000 (18:23 +0100)

committer Steinar H. Gunderson <steinar+git@gunderson.no>

Tue, 24 Nov 2020 23:58:09 +0000 (00:58 +0100)
author Steinar H. Gunderson <steinar+nageru@gunderson.no>
Sat, 21 Nov 2020 17:23:20 +0000 (18:23 +0100)
committer Steinar H. Gunderson <steinar+git@gunderson.no>
Tue, 24 Nov 2020 23:58:09 +0000 (00:58 +0100)
diff --git a/README b/README

index dcb60c827403d6817727a3323ea16e254561682e..5071a6b57e2e5424168a08ee33db0b6b88e52dac 100644 (file)
--- a/README
+++ b/README
@@ -33,6 +33,11 @@ the reference implementation, you can check it out and run as follows:
    ninja reconfigure
    ninja bench
  
-Copyright 2020 Steinar H. Gunderson <steinar+plocate@gunderson.no>.
+plocate (except updatedb), and the plocate-specific changes to updatedb,
+is Copyright 2020 Steinar H. Gunderson <steinar+plocate@gunderson.no>.
  Licensed under the GNU General Public License, either version 2,
  or (at your option) any later version. See the included file COPYING.
+
+updatedb is Copyright (C) 2005, 2007 Red Hat, Inc. All rights reserved.
+Licensed under the GNU General Public License, version 2. See the
+included file COPYING.
diff --git a/bind-mount.cpp b/bind-mount.cpp

new file mode 100644 (file)

index 0000000..50cf7a5
--- /dev/null
+++ b/bind-mount.cpp
@@ -0,0 +1,310 @@
+/* Bind mount detection.  Note: if you change this, change tmpwatch as well.
+
+Copyright (C) 2005, 2007, 2008, 2012 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#include "bind-mount.h"
+
+#include "conf.h"
+#include "lib.h"
+
+#include <atomic>
+#include <fcntl.h>
+#include <limits.h>
+#include <map>
+#include <poll.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <thread>
+
+using namespace std;
+
+/* mountinfo handling */
+
+/* A single mountinfo entry; read_mount_entry() fills one in per line. */
+struct mount {
+       int id, parent_id;  /* the first two integers on the line; id tells entries apart */
+       unsigned dev_major, dev_minor;  /* the "major:minor" pair; used as the key of mount_entries */
+       string root;  /* first string field; compared between mounts of the same device */
+       string mount_point;  /* second string field; this is what ends up in bind_mount_paths */
+       string fs_type;  /* first string field after the "-" separator */
+       string source;  /* second string field after the "-" separator */
+};
+
+/* Path to mountinfo */
+static const char *mountinfo_path;
+atomic<bool> mountinfo_updated{ false };
+
+multimap<pair<int, int>, mount> mount_entries;  // Keyed by device major/minor.
+
+/* Read one line of any length from F, with the trailing newline removed.
+   Return the line; "" on a read error or when nothing was left before EOF. */
+static string read_mount_line(FILE *f)
+{
+       string line;
+
+       for (;;) {
+               char buf[LINE_MAX];  /* one fgets() chunk; longer lines take several rounds */
+
+               if (fgets(buf, sizeof(buf), f) == nullptr) {
+                       if (feof(f))
+                               break;  /* EOF: return whatever was accumulated so far */
+                       return "";
+               }
+               size_t chunk_length = strlen(buf);
+               if (chunk_length > 0 && buf[chunk_length - 1] == '\n') {
+                       line.append(buf, chunk_length - 1);  /* drop the '\n' */
+                       break;
+               }
+               line.append(buf, chunk_length);
+       }
+       return line;
+}
+
+/* Parse one space- or tab-delimited field at *STR, decoding \ooo octal escapes.
+   Advance *STR past the field; store the result in *DEST unless DEST is nullptr.
+   Return 0 if OK, -1 (with *STR left unchanged) if only whitespace remains. */
+static int parse_mount_string(string *dest, const char **str)
+{
+       const char *src = *str;
+       while (*src == ' ' || *src == '\t') {
+               src++;
+       }
+       if (*src == 0) {
+               return -1;
+       }
+       string mount_string;
+       for (;;) {
+               char c = *src;
+
+               switch (c) {
+               case 0:
+               case ' ':
+               case '\t':
+                       goto done;  /* end of field; the delimiter itself is not consumed */
+
+               case '\\':
+                       if (src[1] >= '0' && src[1] <= '7' && src[2] >= '0' && src[2] <= '7' && src[3] >= '0' && src[3] <= '7') {
+                               unsigned v;
+
+                               v = ((src[1] - '0') << 6) | ((src[2] - '0') << 3) | (src[3] - '0');
+                               if (v <= UCHAR_MAX) {
+                                       mount_string.push_back(v);
+                                       src += 4;
+                                       break;
+                               }
+                       }
+                       /* Not a usable escape (or value above UCHAR_MAX): fall through, copy the backslash as is */
+
+               default:
+                       mount_string.push_back(c);
+                       src++;
+               }
+       }
+
+done:
+       *str = src;
+       if (dest != nullptr) {
+               *dest = move(mount_string);
+       }
+       return 0;
+}
+
+/* Read one line from F and parse it into *ME (may be partly filled on failure). Return true if successful. */
+static bool read_mount_entry(FILE *f, mount *me)
+{
+       string line = read_mount_line(f);
+       if (line.empty()) {
+               return false;
+       }
+       size_t offset;  /* set by %zn: number of characters consumed by the numeric prefix */
+       if (sscanf(line.c_str(), "%d %d %u:%u%zn", &me->id, &me->parent_id, &me->dev_major,
+                  &me->dev_minor, &offset) != 4) {
+               return false;
+       }
+       const char *ptr = line.c_str() + offset;
+       if (parse_mount_string(&me->root, &ptr) != 0 ||
+           parse_mount_string(&me->mount_point, &ptr) != 0 ||
+           parse_mount_string(nullptr, &ptr) != 0) {  /* third field is skipped -- presumably the mount options of proc(5); confirm */
+               return false;
+       }
+       bool separator_found;
+       do {  /* skip any number of fields up to and including the "-" separator */
+               string option;
+               if (parse_mount_string(&option, &ptr) != 0) {
+                       return false;
+               }
+               separator_found = strcmp(option.c_str(), "-") == 0;
+       } while (!separator_found);
+
+       if (parse_mount_string(&me->fs_type, &ptr) != 0 ||
+           parse_mount_string(&me->source, &ptr) != 0 ||
+           parse_mount_string(nullptr, &ptr) != 0) {  /* one more field must exist, but its value is not kept */
+               return false;
+       }
+       return true;
+}
+
+/* Read mount information from mountinfo_path and replace the contents of
+   mount_entries with it. Reading stops at the first line that fails to parse.
+   Return 0 if OK, -1 if mountinfo_path could not be opened (mount_entries is then untouched). */
+static int read_mount_entries(void)
+{
+       FILE *f = fopen(mountinfo_path, "r");
+       if (f == nullptr) {
+               return -1;
+       }
+
+       mount_entries.clear();
+
+       mount me;  /* reused for every line; copied into the map below */
+       while (read_mount_entry(f, &me)) {
+               if (conf_debug_pruning) {
+                       /* This is debugging output, don't mark anything for translation */
+                       fprintf(stderr,
+                               " `%s' (%d on %d) is `%s' of `%s' (%u:%u), type `%s'\n",
+                               me.mount_point.c_str(), me.id, me.parent_id, me.root.c_str(), me.source.c_str(),
+                               me.dev_major, me.dev_minor, me.fs_type.c_str());
+               }
+               mount_entries.emplace(make_pair(me.dev_major, me.dev_minor), me);
+       }
+       fclose(f);
+       return 0;
+}
+
+/* Bind mount path list maintenance and top-level interface. */
+
+/* mountinfo_path file descriptor, or -1 */
+static int mountinfo_fd;
+
+/* Known bind mount paths */
+static struct vector<string> bind_mount_paths; /* = { 0, }; */
+
+/* Next bind_mount_paths entry */
+static size_t bind_mount_paths_index; /* = 0; */
+
+/* Rebuild bind_mount_paths */
+static void rebuild_bind_mount_paths(void)
+{
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "Rebuilding bind_mount_paths:\n");
+       }
+       if (read_mount_entries() != 0) {
+               return;
+       }
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "Matching bind_mount_paths:\n");
+       }
+
+       bind_mount_paths.clear();
+
+       for (const auto &[dev_id, me] : mount_entries) {
+               const auto &[first, second] = mount_entries.equal_range(make_pair(me.dev_major, me.dev_minor));
+               for (auto it = first; it != second; ++it) {
+                       const mount &other = it->second;
+                       if (other.id == me.id) {
+                               // Don't compare an element to itself.
+                               continue;
+                       }
+                       // We have two mounts from the same device. Is one a prefix of the other?
+                       // If there are two that are equal, prefer the one with lowest ID.
+                       if (me.root.size() > other.root.size() && me.root.find(other.root) == 0) {
+                               if (conf_debug_pruning) {
+                                       /* This is debugging output, don't mark anything for translation */
+                                       fprintf(stderr, " => adding `%s' (root `%s' is a child of `%s', mounted on `%s')\n",
+                                               me.mount_point.c_str(), me.root.c_str(), other.root.c_str(), other.mount_point.c_str());
+                               }
+                               bind_mount_paths.push_back(me.mount_point);
+                               break;
+                       }
+                       if (me.root == other.root && me.id > other.id) {
+                               if (conf_debug_pruning) {
+                                       /* This is debugging output, don't mark anything for translation */
+                                       fprintf(stderr, " => adding `%s' (duplicate of mount point `%s')\n",
+                                               me.mount_point.c_str(), other.mount_point.c_str());
+                               }
+                               bind_mount_paths.push_back(me.mount_point);
+                               break;
+                       }
+               }
+       }
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "...done\n");
+       }
+       string_list_dir_path_sort(&bind_mount_paths);
+}
+
+/* Return true if PATH is a destination of a bind mount, rebuilding the list
+   first if a mountinfo change was flagged. (Bind mounts "to self" are ignored.) */
+bool is_bind_mount(const char *path)
+{
+       if (mountinfo_updated.exchange(false)) {  // Atomic test-and-clear.
+               rebuild_bind_mount_paths();
+               bind_mount_paths_index = 0;  // NOTE(review): presumably a cursor into the freshly sorted list -- confirm in lib.h
+       }
+       return string_list_contains_dir_path(&bind_mount_paths,
+                                            &bind_mount_paths_index, path);
+}
+
+/* Initialize state for is_bind_mount(), to read data from MOUNTINFO, and start a detached thread watching it. */
+void bind_mount_init(const char *mountinfo)
+{
+       mountinfo_path = mountinfo;  // The pointer is stored, not copied.
+       mountinfo_fd = open(mountinfo_path, O_RDONLY);
+       if (mountinfo_fd == -1)
+               return;  // No error is reported; the list is never built and no watcher is started.
+       rebuild_bind_mount_paths();
+
+       // mlocate re-polls this for each and every directory it wants to check,
+       // for unclear reasons; it's possible that it's worried about a new recursive
+       // bind mount being made while updatedb is running, causing an infinite loop?
+       // Since it's probably for some good reason, we do the same, but we don't
+       // want the barrage of syscalls. It's not synchronous, but the poll signal
+       // isn't either; there's a slight race condition, but one that could only
+       // be exploited by root.
+       //
+       // The thread is forcibly terminated on exit(), so we just let it loop forever.
+       thread poll_thread([&] {  // Only globals are used inside, so nothing local is actually captured.
+               for (;;) {
+                       struct pollfd pfd;
+                       /* Unfortunately (mount --bind $path $path/subdir) would leave st_dev
+                          unchanged between $path and $path/subdir, so we must keep reparsing
+                          mountinfo_path each time it changes. */
+                       pfd.fd = mountinfo_fd;
+                       pfd.events = POLLPRI;
+                       if (poll(&pfd, 1, /*timeout=*/-1) == -1) {
+                               perror("poll()");
+                               exit(1);
+                       }
+                       if ((pfd.revents & POLLPRI) != 0) {
+                               mountinfo_updated = true;  // Picked up (and cleared) by the next is_bind_mount() call.
+                       }
+               }
+       });
+       poll_thread.detach();
+}
diff --git a/bind-mount.h b/bind-mount.h

new file mode 100644 (file)

index 0000000..b8c9e5f
--- /dev/null
+++ b/bind-mount.h
@@ -0,0 +1,36 @@
+/* Bind mount detection.
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#ifndef BIND_MOUNT_H__
+#define BIND_MOUNT_H__
+
+/* System mount information file */
+#define MOUNTINFO_PATH "/proc/self/mountinfo"
+
+/* Return true if PATH is a destination of a bind mount.
+   (Bind mounts "to self" are ignored.) */
+extern bool is_bind_mount(const char *path);
+
+/* Initialize state for is_bind_mount(), to read data from MOUNTINFO. */
+extern void bind_mount_init(const char *mountinfo);
+
+#endif
diff --git a/complete_pread.cpp b/complete_pread.cpp

index 49136747b94fb680a8fe4285771f75c6a98e018a..8571b33682deb85261d5bb13bfdcc91cfb41ac9f 100644 (file)
--- a/complete_pread.cpp
+++ b/complete_pread.cpp
@@ -3,7 +3,7 @@
  #include <stdlib.h>
  #include <unistd.h>
  
-void complete_pread(int fd, void *ptr, size_t len, off_t offset)
+bool try_complete_pread(int fd, void *ptr, size_t len, off_t offset)
  {
         while (len > 0) {
                 ssize_t ret = pread(fd, ptr, len, offset);
@@ -11,11 +11,19 @@ void complete_pread(int fd, void *ptr, size_t len, off_t offset)
                         continue;
                 }
                 if (ret <= 0) {
-                       perror("pread");
-                       exit(1);
+                       return false;
                 }
                 ptr = reinterpret_cast<char *>(ptr) + ret;
                 len -= ret;
                 offset -= ret;
         }
+       return true;
+}
+
+void complete_pread(int fd, void *ptr, size_t len, off_t offset)
+{
+       if (!try_complete_pread(fd, ptr, len, offset)) {  // Same read, but any failure is fatal.
+               perror("pread");  // NOTE(review): errno may be stale if the failure was pread() returning 0 -- confirm
+               exit(1);
+       }
  }
diff --git a/complete_pread.h b/complete_pread.h

index fec57870c1c45d1a031118eee424c19a6f956ba1..b0f2bd7bc3e33adf9c8346b32b90c1a38e81c369 100644 (file)
--- a/complete_pread.h
+++ b/complete_pread.h
@@ -3,8 +3,11 @@
  
  #include <unistd.h>
  
-// A wrapper around pread() that returns an incomplete read.
-// Always synchronous (no io_uring).
+// A wrapper around pread() that retries on short reads and EINTR,
+// so you never need to call it twice. Always synchronous (no io_uring).
+bool try_complete_pread(int fd, void *ptr, size_t len, off_t offset);
+
+// Same, but exit on failure, so never returns a short read.
  void complete_pread(int fd, void *ptr, size_t len, off_t offset);
  
  #endif  // !defined(COMPLETE_PREAD_H)
diff --git a/conf.cpp b/conf.cpp

new file mode 100644 (file)

index 0000000..a8ec972
--- /dev/null
+++ b/conf.cpp
@@ -0,0 +1,636 @@
+/* updatedb configuration.
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+ */
+
+#include "conf.h"
+
+#include "error.h"
+#include "lib.h"
+
+#include <algorithm>
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+using namespace std;
+
+/* true if locate(1) should check whether files are visible before reporting
+   them */
+bool conf_check_visibility = true;
+
+/* Filesystems to skip, converted to uppercase and sorted by name */
+vector<string> conf_prunefs;
+
+/* Directory names to skip, sorted by name */
+vector<string> conf_prunenames;
+
+/* Paths to skip, sorted by name using dir_path_cmp () */
+vector<string> conf_prunepaths;
+
+/* true if bind mounts should be skipped */
+bool conf_prune_bind_mounts; /* = false; */
+
+/* true if pruning debug output was requested */
+bool conf_debug_pruning; /* = false; */
+
+/* Root of the directory tree to store in the database (canonical) */
+char *conf_scan_root; /* = NULL; */
+
+/* Absolute (not necessarily canonical) path to the database */
+string conf_output;
+
+/* true if file names should be written to stdout as they are found */
+bool conf_verbose; /* = false; */
+
+/* Configuration representation for the database configuration block */
+string conf_block;
+
+int conf_block_size = 32;
+bool use_debug = false;
+
+/* Parse STR as a boolean ("0"/"no" or "1"/"yes", exact match) into *DEST;
+   return 0 if OK, -1 (leaving *DEST untouched) for any other input. */
+static int
+parse_bool(bool *dest, const char *str)
+{
+       if (strcmp(str, "0") == 0 || strcmp(str, "no") == 0) {
+               *dest = false;
+               return 0;
+       }
+       if (strcmp(str, "1") == 0 || strcmp(str, "yes") == 0) {
+               *dest = true;
+               return 0;
+       }
+       return -1;
+}
+
+/* String list handling */
+
+/* Append each whitespace-separated word of VAL to LIST (no sorting or deduplication is done here) */
+static void
+var_add_values(vector<string> *list, const char *val)
+{
+       for (;;) {
+               const char *start;
+
+               while (isspace((unsigned char)*val))  /* skip the gap before the next word */
+                       val++;
+               if (*val == 0)
+                       break;
+               start = val;
+               do  /* *val is a non-space character here, so the word has at least one character */
+                       val++;
+               while (*val != 0 && !isspace((unsigned char)*val));
+               list->emplace_back(start, val - start);
+       }
+}
+
+/* Finish variable LIST: sort its contents (plain string order), then remove duplicates */
+static void
+var_finish(vector<string> *list)
+{
+       sort(list->begin(), list->end());
+       auto new_end = unique(list->begin(), list->end());  /* duplicates are adjacent once sorted */
+       list->erase(new_end, list->end());
+}
+
+/* UPDATEDB_CONF parsing */
+
+/* UPDATEDB_CONF (locked) */
+static FILE *uc_file;
+/* Line number at token start; type matches error_at_line () */
+static unsigned uc_line;
+/* Current line number; type matches error_at_line () */
+static unsigned uc_current_line;
+/* Last string returned by uc_lex */
+static string uc_lex_buf;
+
+/* Token types returned by uc_lex () */
+enum {
+       UCT_EOF, /* end of file */
+       UCT_EOL, /* a newline, or a `#' comment together with the newline ending it */
+       UCT_IDENTIFIER, /* a letter or `_' followed by letters, digits or `_'; not one of the keywords below */
+       UCT_EQUAL, /* `=' */
+       UCT_QUOTED, /* a double-quoted string; uc_lex_buf holds the text without the quotes */
+       UCT_OTHER, /* any other character */
+       UCT_PRUNE_BIND_MOUNTS, /* keyword PRUNE_BIND_MOUNTS */
+       UCT_PRUNEFS, /* keyword PRUNEFS */
+       UCT_PRUNENAMES, /* keyword PRUNENAMES */
+       UCT_PRUNEPATHS /* keyword PRUNEPATHS */
+};
+
+/* Return next token from uc_file; for UCT_IDENTIFIER, UCT_QUOTED or keywords,
+   store the data to uc_lex_buf (valid until next call). Sets uc_line to the line the token starts on. */
+static int
+uc_lex(void)
+{
+       int c;
+
+       uc_lex_buf.clear();
+       uc_line = uc_current_line;
+       do {
+               c = getc_unlocked(uc_file);
+               if (c == EOF)
+                       return UCT_EOF;
+       } while (c != '\n' && isspace((unsigned char)c)); /* skip blanks; '\n' is a token of its own */
+       switch (c) {
+       case '#': /* comment: discard everything up to the end of the line */
+               do {
+                       c = getc_unlocked(uc_file);
+                       if (c == EOF)
+                               return UCT_EOF;
+               } while (c != '\n');
+               /* Fall through */
+       case '\n':
+               uc_current_line++;
+               if (uc_current_line == 0) { /* the unsigned counter wrapped around */
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_current_line - 1,
+                                     _("warning: Line number overflow"));
+                       error_message_count--; /* Don't count as an error */
+               }
+               return UCT_EOL;
+
+       case '=':
+               return UCT_EQUAL;
+
+       case '"': {
+               while ((c = getc_unlocked(uc_file)) != '"') {
+                       if (c == EOF || c == '\n') {
+                               error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                             _("missing closing `\"'"));
+                               ungetc(c, uc_file); /* leave the '\n' (or EOF) for the next call */
+                               break;
+                       }
+                       uc_lex_buf.push_back(c);
+               }
+               return UCT_QUOTED; /* also returned, after the error above, when the closing quote is missing */
+       }
+
+       default: {
+               if (!isalpha((unsigned char)c) && c != '_')
+                       return UCT_OTHER;
+               do {
+                       uc_lex_buf.push_back(c);
+                       c = getc_unlocked(uc_file);
+               } while (c != EOF && (isalnum((unsigned char)c) || c == '_'));
+               ungetc(c, uc_file); /* the character that ended the word belongs to the next token */
+               if (uc_lex_buf == "PRUNE_BIND_MOUNTS")
+                       return UCT_PRUNE_BIND_MOUNTS;
+               if (uc_lex_buf == "PRUNEFS")
+                       return UCT_PRUNEFS;
+               if (uc_lex_buf == "PRUNENAMES")
+                       return UCT_PRUNENAMES;
+               if (uc_lex_buf == "PRUNEPATHS")
+                       return UCT_PRUNEPATHS;
+               return UCT_IDENTIFIER;
+       }
+       }
+}
+
+/* Parse UPDATEDB_CONF; a file that does not exist is silently skipped.  Exit on I/O or syntax error. */
+static void
+parse_updatedb_conf(void)
+{
+       int old_error_one_per_line;
+       unsigned old_error_message_count;
+       bool had_prune_bind_mounts, had_prunefs, had_prunenames, had_prunepaths;
+
+       uc_file = fopen(UPDATEDB_CONF, "r");
+       if (uc_file == NULL) {
+               if (errno != ENOENT) /* only a missing file is tolerated */
+                       error(EXIT_FAILURE, errno, _("can not open `%s'"), UPDATEDB_CONF);
+               goto err;
+       }
+       flockfile(uc_file); /* uc_lex () reads with getc_unlocked () */
+       uc_current_line = 1;
+       old_error_message_count = error_message_count;
+       old_error_one_per_line = error_one_per_line;
+       error_one_per_line = 1;
+       had_prune_bind_mounts = false;
+       had_prunefs = false;
+       had_prunenames = false;
+       had_prunepaths = false;
+       for (;;) { /* one iteration per line: NAME = "value" */
+               bool *had_var;
+               int var_token, token;
+
+               token = uc_lex();
+               switch (token) {
+               case UCT_EOF:
+                       goto eof;
+
+               case UCT_EOL:
+                       continue;
+
+               case UCT_PRUNE_BIND_MOUNTS:
+                       had_var = &had_prune_bind_mounts;
+                       break;
+
+               case UCT_PRUNEFS:
+                       had_var = &had_prunefs;
+                       break;
+
+               case UCT_PRUNENAMES:
+                       had_var = &had_prunenames;
+                       break;
+
+               case UCT_PRUNEPATHS:
+                       had_var = &had_prunepaths;
+                       break;
+
+               case UCT_IDENTIFIER:
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("unknown variable `%s'"), uc_lex_buf.c_str());
+                       goto skip_to_eol;
+
+               default:
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("variable name expected"));
+                       goto skip_to_eol;
+               }
+               if (*had_var != false) { /* each variable may be assigned only once in the file */
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("variable `%s' was already defined"), uc_lex_buf.c_str());
+                       goto skip_to_eol;
+               }
+               *had_var = true;
+               var_token = token;
+               token = uc_lex();
+               if (token != UCT_EQUAL) {
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("`=' expected after variable name"));
+                       goto skip_to_eol;
+               }
+               token = uc_lex();
+               if (token != UCT_QUOTED) {
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("value in quotes expected after `='"));
+                       goto skip_to_eol;
+               }
+               if (var_token == UCT_PRUNE_BIND_MOUNTS) {
+                       if (parse_bool(&conf_prune_bind_mounts, uc_lex_buf.c_str()) != 0) {
+                               error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                             _("invalid value `%s' of PRUNE_BIND_MOUNTS"),
+                                             uc_lex_buf.c_str());
+                               goto skip_to_eol;
+                       }
+               } else if (var_token == UCT_PRUNEFS)
+                       var_add_values(&conf_prunefs, uc_lex_buf.c_str());
+               else if (var_token == UCT_PRUNENAMES)
+                       var_add_values(&conf_prunenames, uc_lex_buf.c_str());
+               else if (var_token == UCT_PRUNEPATHS)
+                       var_add_values(&conf_prunepaths, uc_lex_buf.c_str());
+               else
+                       abort(); /* not reachable: the switch above lets only the four keyword tokens through */
+               token = uc_lex();
+               if (token != UCT_EOL && token != UCT_EOF) {
+                       error_at_line(0, 0, UPDATEDB_CONF, uc_line,
+                                     _("unexpected data after variable value"));
+                       goto skip_to_eol;
+               }
+               /* Fall through */
+       skip_to_eol: /* discard the rest of the current line, then go on with the next one */
+               while (token != UCT_EOL) {
+                       if (token == UCT_EOF)
+                               goto eof;
+                       token = uc_lex();
+               }
+       }
+eof:
+       if (ferror(uc_file))
+               error(EXIT_FAILURE, 0, _("I/O error reading `%s'"), UPDATEDB_CONF);
+       error_one_per_line = old_error_one_per_line;
+       funlockfile(uc_file);
+       fclose(uc_file);
+       if (error_message_count != old_error_message_count) /* presumably bumped by each error_at_line () above */
+               exit(EXIT_FAILURE);
+err:;
+}
+
+/* Command-line argument parsing */
+
+/* Output --help text */
+static void
+help(void)
+{
+       printf(_("Usage: updatedb [OPTION]...\n"
+                "Update a mlocate database.\n"
+                "\n"
+                "  -f, --add-prunefs FS           omit also FS\n"
+                "  -n, --add-prunenames NAMES     omit also NAMES\n"
+                "  -e, --add-prunepaths PATHS     omit also PATHS\n"
+                "  -U, --database-root PATH       the subtree to store in "
+                "database (default \"/\")\n"
+                "  -h, --help                     print this help\n"
+                "  -o, --output FILE              database to update (default\n"
+                "                                 `%s')\n"
+                "  -b, --block-size SIZE          number of filenames to store\n"
+                "                                 in each block (default 32)\n"
+                "      --prune-bind-mounts FLAG   omit bind mounts (default "
+                "\"no\")\n"
+                "      --prunefs FS               filesystems to omit from "
+                "database\n"
+                "      --prunenames NAMES         directory names to omit from "
+                "database\n"
+                "      --prunepaths PATHS         paths to omit from database\n"
+                "  -l, --require-visibility FLAG  check visibility before "
+                "reporting files\n"
+                "                                 (default \"yes\")\n"
+                "  -v, --verbose                  print paths of files as they "
+                "are found\n"
+                "  -V, --version                  print version information\n"
+                "\n"
+                "The configuration defaults to values read from\n"
+                "`%s'.\n"),
+              DBFILE, UPDATEDB_CONF);
+       printf(_("\n"
+                "Report bugs to %s.\n"),
+              PACKAGE_BUGREPORT);
+}
+
+/* Prepend the current working directory and a `/' to PATH; return the
+   resulting path.  Exits if the working directory can not be determined. */
+static string
+prepend_cwd(const string &path)
+{
+       const char *res;
+       string buf;
+       buf.resize(BUFSIZ); /* Not PATH_MAX because it is not defined on some platforms. */
+       do
+               buf.resize(buf.size() * 1.5); /* grown before every attempt, the first one included */
+       while ((res = getcwd(buf.data(), buf.size())) == NULL && errno == ERANGE);
+       if (res == NULL)
+               error(EXIT_FAILURE, errno, _("can not get current working directory"));
+       buf.resize(strlen(buf.data())); /* cut the buffer down to the length of the path itself */
+       return buf + '/' + path;
+}
+
+/* Parse ARGC, ARGV.  Exit on error or --help, --version. */
+static void
+parse_arguments(int argc, char *argv[])
+{
+       enum { OPT_DEBUG_PRUNING = CHAR_MAX + 1 };
+
+       static const struct option options[] = {
+               { "add-prunefs", required_argument, NULL, 'f' },
+               { "add-prunenames", required_argument, NULL, 'n' },
+               { "add-prunepaths", required_argument, NULL, 'e' },
+               { "database-root", required_argument, NULL, 'U' },
+               { "debug-pruning", no_argument, NULL, OPT_DEBUG_PRUNING },
+               { "help", no_argument, NULL, 'h' },
+               { "output", required_argument, NULL, 'o' },
+               { "prune-bind-mounts", required_argument, NULL, 'B' },
+               { "prunefs", required_argument, NULL, 'F' },
+               { "prunenames", required_argument, NULL, 'N' },
+               { "prunepaths", required_argument, NULL, 'P' },
+               { "require-visibility", required_argument, NULL, 'l' },
+               { "verbose", no_argument, NULL, 'v' },
+               { "version", no_argument, NULL, 'V' },
+               { "block-size", required_argument, 0, 'b' },
+               { "debug", no_argument, 0, 'D' },  // Not documented.
+               { NULL, 0, NULL, 0 }
+       };
+
+       bool prunefs_changed, prunenames_changed, prunepaths_changed;
+       bool got_prune_bind_mounts, got_visibility;
+
+       prunefs_changed = false;
+       prunenames_changed = false;
+       prunepaths_changed = false;
+       got_prune_bind_mounts = false;
+       got_visibility = false;
+       for (;;) {
+               int opt, idx;
+
+               opt = getopt_long(argc, argv, "U:Ve:f:hl:n:o:vb:D", options, &idx);
+               switch (opt) {
+               case -1:
+                       goto options_done;
+
+               case '?':
+                       exit(EXIT_FAILURE);
+
+               case 'B':
+                       if (got_prune_bind_mounts != false)
+                               error(EXIT_FAILURE, 0,
+                                     _("--%s would override earlier command-line argument"),
+                                     "prune-bind-mounts");
+                       got_prune_bind_mounts = true;
+                       if (parse_bool(&conf_prune_bind_mounts, optarg) != 0)
+                               error(EXIT_FAILURE, 0, _("invalid value `%s' of --%s"), optarg,
+                                     "prune-bind-mounts");
+                       break;
+
+               case 'F':
+                       if (prunefs_changed != false)
+                               error(EXIT_FAILURE, 0,
+                                     _("--%s would override earlier command-line argument"),
+                                     "prunefs");
+                       prunefs_changed = true;
+                       conf_prunefs.clear();
+                       var_add_values(&conf_prunefs, optarg);
+                       break;
+
+               case 'N':
+                       if (prunenames_changed != false)
+                               error(EXIT_FAILURE, 0,
+                                     _("--%s would override earlier command-line argument"),
+                                     "prunenames");
+                       prunenames_changed = true;
+                       conf_prunenames.clear();
+                       var_add_values(&conf_prunenames, optarg);
+                       break;
+
+               case 'P':
+                       if (prunepaths_changed != false)
+                               error(EXIT_FAILURE, 0,
+                                     _("--%s would override earlier command-line argument"),
+                                     "prunepaths");
+                       prunepaths_changed = true;
+                       conf_prunepaths.clear(),
+                               var_add_values(&conf_prunepaths, optarg);
+                       break;
+
+               case 'U':
+                       if (conf_scan_root != NULL)
+                               error(EXIT_FAILURE, 0, _("--%s specified twice"),
+                                     "database-root");
+                       conf_scan_root = canonicalize_file_name(optarg);
+                       if (conf_scan_root == NULL)
+                               error(EXIT_FAILURE, errno, _("invalid value `%s' of --%s"), optarg,
+                                     "database-root");
+                       break;
+
+               case 'V':
+                       puts("updatedb (" PACKAGE_NAME ") " PACKAGE_VERSION);
+                       puts(_("Copyright (C) 2007 Red Hat, Inc. All rights reserved.\n"
+                              "This software is distributed under the GPL v.2.\n"
+                              "\n"
+                              "This program is provided with NO WARRANTY, to the extent "
+                              "permitted by law."));
+                       exit(EXIT_SUCCESS);
+
+               case 'e':
+                       prunepaths_changed = true;
+                       var_add_values(&conf_prunepaths, optarg);
+                       break;
+
+               case 'f':
+                       prunefs_changed = true;
+                       var_add_values(&conf_prunefs, optarg);
+                       break;
+
+               case 'h':
+                       help();
+                       exit(EXIT_SUCCESS);
+
+               case 'l':
+                       if (got_visibility != false)
+                               error(EXIT_FAILURE, 0, _("--%s specified twice"),
+                                     "require-visibility");
+                       got_visibility = true;
+                       if (parse_bool(&conf_check_visibility, optarg) != 0)
+                               error(EXIT_FAILURE, 0, _("invalid value `%s' of --%s"), optarg,
+                                     "require-visibility");
+                       break;
+
+               case 'n':
+                       prunenames_changed = true;
+                       var_add_values(&conf_prunenames, optarg);
+                       break;
+
+               case 'o':
+                       if (!conf_output.empty())
+                               error(EXIT_FAILURE, 0, _("--%s specified twice"), "output");
+                       conf_output = optarg;
+                       break;
+
+               case 'v':
+                       conf_verbose = true;
+                       break;
+
+               case 'b':
+                       conf_block_size = atoi(optarg);
+                       break;
+
+               case 'D':
+                       use_debug = true;
+                       break;
+
+               case OPT_DEBUG_PRUNING:
+                       conf_debug_pruning = true;
+                       break;
+
+               default:
+                       abort();
+               }
+       }
+options_done:
+       if (optind != argc)
+               error(EXIT_FAILURE, 0, _("unexpected operand on command line"));
+       if (conf_scan_root == NULL) {
+               static char root[] = "/";
+
+               conf_scan_root = root;
+       }
+       if (conf_output.empty())
+               conf_output = DBFILE;
+       if (conf_output[0] != '/')
+               conf_output = prepend_cwd(conf_output);
+}
+
+/* Conversion of configuration for main code */
+
+/* Store a string list to OBSTACK */
+static void
+gen_conf_block_string_list(string *obstack,
+                           const vector<string> *strings)
+{
+       for (const string &str : *strings) {
+               *obstack += str;
+               *obstack += '\0';
+       }
+       *obstack += '\0';
+}
+
+/* Generate conf_block */
+static void
+gen_conf_block(void)
+{
+       conf_block.clear();
+
+#define CONST(S) conf_block.append(S, sizeof(S))
+       /* conf_check_visibility value is stored in the header */
+       CONST("prune_bind_mounts");
+       /* Add two NUL bytes after the value */
+       conf_block.append(conf_prune_bind_mounts != false ? "1\0" : "0\0", 3);
+       CONST("prunefs");
+       gen_conf_block_string_list(&conf_block, &conf_prunefs);
+       CONST("prunenames");
+       gen_conf_block_string_list(&conf_block, &conf_prunenames);
+       CONST("prunepaths");
+       gen_conf_block_string_list(&conf_block, &conf_prunepaths);
+       /* scan_root is contained directly in the header */
+       /* conf_output, conf_verbose are not relevant */
+#undef CONST
+}
+
+/* Parse /etc/updatedb.conf and command-line arguments ARGC, ARGV.
+   Exit on error or --help, --version. */
+void conf_prepare(int argc, char *argv[])
+{
+       parse_updatedb_conf();
+       parse_arguments(argc, argv);
+       for (string &str : conf_prunefs) {
+               /* Assuming filesystem names are ASCII-only */
+               for (char &c : str)
+                       c = toupper(c);
+       }
+       /* Finish the variable only after converting filesystem names to upper case
+          to avoid keeping duplicates that originally differed in case and to sort
+          them correctly. */
+       var_finish(&conf_prunefs);
+       var_finish(&conf_prunenames);
+       var_finish(&conf_prunepaths);
+       gen_conf_block();
+       string_list_dir_path_sort(&conf_prunepaths);
+
+       if (conf_debug_pruning) {
+               /* This is debuging output, don't mark anything for translation */
+               fprintf(stderr, "conf_block:\n");
+               for (char c : conf_block) {
+                       if (isascii((unsigned char)c) && isprint((unsigned char)c) && c != '\\')
+                               putc(c, stderr);
+                       else {
+                               fprintf(stderr, "\\%03o", (unsigned)(unsigned char)c);
+                               if (c == 0)
+                                       putc('\n', stderr);
+                       }
+               }
+               fprintf(stderr, "\n-----------------------\n");
+       }
+}
diff --git a/conf.h b/conf.h

new file mode 100644 (file)

index 0000000..388e59f
--- /dev/null
+++ b/conf.h
@@ -0,0 +1,68 @@
+/* updatedb configuration.
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#ifndef CONF_H__
+#define CONF_H__
+
+#include <stddef.h>
+#include <string>
+#include <vector>
+
+/* true if locate(1) should check whether files are visible before reporting
+   them */
+extern bool conf_check_visibility;
+
+/* Filesystems to skip, converted to uppercase and sorted by name */
+extern std::vector<std::string> conf_prunefs;
+
+/* Directory names to skip, sorted by name */
+extern std::vector<std::string> conf_prunenames;
+
+/* Paths to skip, sorted by name using dir_path_cmp () */
+extern std::vector<std::string> conf_prunepaths;
+
+/* true if bind mounts should be skipped */
+extern bool conf_prune_bind_mounts;
+
+/* true if pruning debug output was requested */
+extern bool conf_debug_pruning;
+
+/* Root of the directory tree to store in the database (canonical) */
+extern char *conf_scan_root;
+
+/* Absolute (not necessarily canonical) path to the database */
+extern std::string conf_output;
+
+/* true if file names should be written to stdout as they are found */
+extern bool conf_verbose;
+
+/* Configuration representation for the database configuration block */
+extern std::string conf_block;
+
+/* Parse /etc/updatedb.conf and command-line arguments ARGC, ARGV.
+   Exit on error or --help, --version. */
+extern void conf_prepare(int argc, char *argv[]);
+
+/* Value given with -b/--block-size on the command line */
+extern int conf_block_size;
+/* true if -D/--debug (an undocumented option) was given on the command line */
+extern bool use_debug;
+
+#endif
diff --git a/database-builder.cpp b/database-builder.cpp

index a35be2a55702cad0e8b5bdf6b1a29767da3ef8a3..642420f3c5942b6021157a2e19d559b751cdb255 100644 (file)
--- a/database-builder.cpp
+++ b/database-builder.cpp
@@ -113,7 +113,7 @@ void PostingListBuilder::write_header(uint32_t docid)
         encoded.append(reinterpret_cast<char *>(buf), end - buf);
  }
  
-void DictionaryBuilder::add_file(string filename)
+void DictionaryBuilder::add_file(string filename, dir_time)
  {
         if (keep_current_block) {  // Only bother saving the filenames if we're actually keeping the block.
                 if (!current_block.empty()) {
@@ -175,10 +175,14 @@ string DictionaryBuilder::train(size_t buf_size)
         return buf;
  }
  
-Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict)
-       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), cdict(cdict)
+// Starts with an inverted index whose NUM_TRIGRAMS slots are all null. If
+// store_dir_times is set, also creates the zstd stream (level 6) that
+// compress_dir_times() and get_compressed_dir_times() write through.
+Corpus::Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times)
+       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
  {
         fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr);
+       if (store_dir_times) {
+               // NOTE(review): neither return value is checked here; a failed
+               // creation would presumably only surface at the first compress call.
+               dir_time_ctx = ZSTD_createCStream();
+               ZSTD_initCStream(dir_time_ctx, /*level=*/6);
+       }
  }
  
  Corpus::~Corpus()
@@ -196,7 +200,7 @@ PostingListBuilder &Corpus::get_pl_builder(uint32_t trgm)
         return *invindex[trgm];
  }
  
-void Corpus::add_file(string filename)
+void Corpus::add_file(string filename, dir_time dt)
  {
         ++num_files;
         if (!current_block.empty()) {
@@ -206,6 +210,49 @@ void Corpus::add_file(string filename)
         if (++num_files_in_block == block_size) {
                 flush_block();
         }
+
+       if (store_dir_times) {
+               if (dt.sec == -1) {
+                       // Not a directory.
+                       dir_times.push_back('\0');
+               } else {
+                       dir_times.push_back('\1');
+                       dir_times.append(reinterpret_cast<char *>(&dt.sec), sizeof(dt.sec));
+                       dir_times.append(reinterpret_cast<char *>(&dt.nsec), sizeof(dt.nsec));
+               }
+               compress_dir_times(/*allowed_slop=*/4096);
+       }
+}
+
+void Corpus::compress_dir_times(size_t allowed_slop) {
+       while (dir_times.size() >= allowed_slop) {
+               size_t old_size = dir_times_compressed.size();
+               dir_times_compressed.resize(old_size + 4096);
+
+               ZSTD_outBuffer outbuf;
+               outbuf.dst = dir_times_compressed.data() + old_size;
+               outbuf.size = 4096;
+               outbuf.pos = 0;
+
+               ZSTD_inBuffer inbuf;
+               inbuf.src = dir_times.data();
+               inbuf.size = dir_times.size();
+               inbuf.pos = 0;
+
+               int ret = ZSTD_compressStream(dir_time_ctx, &outbuf, &inbuf);
+               if (ret < 0) {
+                       fprintf(stderr, "ZSTD_compressStream() failed\n");
+                       exit(1);
+               }
+
+               dir_times_compressed.resize(old_size + outbuf.pos);
+               dir_times.erase(dir_times.begin(), dir_times.begin() + inbuf.pos);
+
+               if (outbuf.pos == 0 && inbuf.pos == 0) {
+                       // Nothing happened (not enough data?), try again later.
+                       return;
+               }
+       }
  }
  
  void Corpus::flush_block()
@@ -258,6 +305,40 @@ size_t Corpus::num_trigrams() const
         return num;
  }
  
+string Corpus::get_compressed_dir_times()
+{
+       if (!store_dir_times) {
+               return "";
+       }
+       compress_dir_times(/*allowed_slop=*/0);
+       assert(dir_times.empty());
+
+       for ( ;; ) {
+               size_t old_size = dir_times_compressed.size();
+               dir_times_compressed.resize(old_size + 4096);
+
+               ZSTD_outBuffer outbuf;
+               outbuf.dst = dir_times_compressed.data() + old_size;
+               outbuf.size = 4096;
+               outbuf.pos = 0;
+
+               int ret = ZSTD_endStream(dir_time_ctx, &outbuf);
+               if (ret < 0) {
+                       fprintf(stderr, "ZSTD_compressStream() failed\n");
+                       exit(1);
+               }
+
+               dir_times_compressed.resize(old_size + outbuf.pos);
+
+               if (ret == 0) {
+                       // All done.
+                       break;
+               }
+       }
+
+       return dir_times_compressed;
+}
+
  string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf)
  {
         static ZSTD_CCtx *ctx = nullptr;
@@ -335,19 +416,29 @@ unique_ptr<Trigram[]> create_hashtable(Corpus &corpus, const vector<uint32_t> &a
         return ht;
  }
  
-DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dictionary)
+DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_size, string dictionary)
         : outfile(outfile), block_size(block_size)
  {
         umask(0027);
  
         string path = outfile;
         path.resize(path.find_last_of('/') + 1);
+       if (path.empty()) {
+               path = ".";
+       }
         int fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
         if (fd == -1) {
                 perror(path.c_str());
                 exit(1);
         }
  
+       if (owner != (gid_t)-1) {
+               if (fchown(fd, (uid_t)-1, owner) == -1) {
+                       perror("fchown");
+                       exit(1);
+               }
+       }
+
         outfp = fdopen(fd, "wb");
         if (outfp == nullptr) {
                 perror(outfile);
@@ -361,7 +452,7 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dic
         hdr.extra_ht_slots = num_overflow_slots;
         hdr.num_docids = 0;
         hdr.hash_table_offset_bytes = -1;  // We don't know these offsets yet.
-       hdr.max_version = 1;
+       hdr.max_version = 2;
         hdr.filename_index_offset_bytes = -1;
         hdr.zstd_dictionary_length_bytes = -1;
         fwrite(&hdr, sizeof(hdr), 1, outfp);
@@ -375,15 +466,32 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, int block_size, string dic
                 hdr.zstd_dictionary_length_bytes = dictionary.size();
                 cdict = ZSTD_createCDict(dictionary.data(), dictionary.size(), /*level=*/6);
         }
+
+       hdr.directory_data_length_bytes = 0;
+       hdr.directory_data_offset_bytes = 0;
+       hdr.next_zstd_dictionary_length_bytes = 0;
+       hdr.next_zstd_dictionary_offset_bytes = 0;
+       hdr.conf_block_length_bytes = 0;
+       hdr.conf_block_offset_bytes = 0;
  }
  
-Corpus *DatabaseBuilder::start_corpus()
+Corpus *DatabaseBuilder::start_corpus(bool store_dir_times)
  {
         corpus_start = steady_clock::now();
-       corpus = new Corpus(outfp, block_size, cdict);
+       corpus = new Corpus(outfp, block_size, cdict, store_dir_times);
         return corpus;
  }
  
+// Takes over next_dictionary; finish_corpus() writes it to the database as
+// the recommended dictionary for the next update if it is non-empty.
+void DatabaseBuilder::set_next_dictionary(std::string next_dictionary)
+{
+       // The by-value argument is ours to consume, so just trade buffers.
+       this->next_dictionary.swap(next_dictionary);
+}
+
+// Takes over conf_block; finish_corpus() writes it to the database if it is
+// non-empty.
+void DatabaseBuilder::set_conf_block(std::string conf_block)
+{
+       // The by-value argument is ours to consume, so just trade buffers.
+       this->conf_block.swap(conf_block);
+}
+
  void DatabaseBuilder::finish_corpus()
  {
         corpus->finish();
@@ -468,6 +576,31 @@ void DatabaseBuilder::finish_corpus()
                 fwrite(encoded.data(), encoded.size(), 1, outfp);
         }
  
+       // Finally, write the directory times (for updatedb).
+       string compressed_dir_times = corpus->get_compressed_dir_times();
+       size_t bytes_for_compressed_dir_times = 0;
+       if (!compressed_dir_times.empty()) {
+               hdr.directory_data_offset_bytes = ftell(outfp);
+               hdr.directory_data_length_bytes = compressed_dir_times.size();
+               fwrite(compressed_dir_times.data(), compressed_dir_times.size(), 1, outfp);
+               bytes_for_compressed_dir_times = compressed_dir_times.size();
+               compressed_dir_times.clear();
+       }
+
+       // Write the recommended dictionary for next update.
+       if (!next_dictionary.empty()) {
+               hdr.next_zstd_dictionary_offset_bytes = ftell(outfp);
+               hdr.next_zstd_dictionary_length_bytes = next_dictionary.size();
+               fwrite(next_dictionary.data(), next_dictionary.size(), 1, outfp);
+       }
+
+       // And the configuration block. It has its own header fields; the
+       // next_zstd_dictionary_* fields set just above must not be overwritten.
+       if (!conf_block.empty()) {
+               hdr.conf_block_offset_bytes = ftell(outfp);
+               hdr.conf_block_length_bytes = conf_block.size();
+               fwrite(conf_block.data(), conf_block.size(), 1, outfp);
+       }
+
         // Rewind, and write the updated header.
         hdr.version = 1;
         fseek(outfp, 0, SEEK_SET);
@@ -485,7 +618,7 @@ void DatabaseBuilder::finish_corpus()
  
         fclose(outfp);
  
-       size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames);
+       size_t total_bytes = (bytes_for_hashtable + bytes_for_posting_lists + bytes_for_filename_index + bytes_for_filenames + bytes_for_compressed_dir_times);
  
         dprintf("Block size:     %7d files\n", block_size);
         dprintf("Dictionary:     %'7.1f MB\n", hdr.zstd_dictionary_length_bytes / 1048576.0);
@@ -493,6 +626,9 @@ void DatabaseBuilder::finish_corpus()
         dprintf("Posting lists:  %'7.1f MB\n", bytes_for_posting_lists / 1048576.0);
         dprintf("Filename index: %'7.1f MB\n", bytes_for_filename_index / 1048576.0);
         dprintf("Filenames:      %'7.1f MB\n", bytes_for_filenames / 1048576.0);
+       if (bytes_for_compressed_dir_times != 0) {
+               dprintf("Modify times:   %'7.1f MB\n", bytes_for_compressed_dir_times / 1048576.0);
+       }
         dprintf("Total:          %'7.1f MB\n", total_bytes / 1048576.0);
         dprintf("\n");
  }
diff --git a/database-builder.h b/database-builder.h

index e799105563270690099bffcbc9f9b196d49beb6f..e2c0c19f54c291d19d99edd87c47b23c48b2ac50 100644 (file)
--- a/database-builder.h
+++ b/database-builder.h
@@ -8,15 +8,36 @@
  #include <random>
  #include <stddef.h>
  #include <string>
+#include <utility>
  #include <vector>
  #include <zstd.h>
  
  class PostingListBuilder;
  
+// {0,0} means unknown or so current that it should never match.
+// {-1,0} means it's not a directory.
+struct dir_time {
+       int64_t sec;
+       int32_t nsec;
+
+       bool operator<(const dir_time &other) const
+       {
+               if (sec != other.sec)
+                       return sec < other.sec;
+               return nsec < other.nsec;
+       }
+       bool operator>=(const dir_time &other) const
+       {
+               return !(other < *this);
+       }
+};
+constexpr dir_time unknown_dir_time{ 0, 0 };
+constexpr dir_time not_a_dir{ -1, 0 };
+
  class DatabaseReceiver {
  public:
         virtual ~DatabaseReceiver() = default;
-       virtual void add_file(std::string filename) = 0;
+       virtual void add_file(std::string filename, dir_time dt) = 0;
         virtual void flush_block() = 0;
         virtual void finish() { flush_block(); }
  };
@@ -25,7 +46,7 @@ class DictionaryBuilder : public DatabaseReceiver {
  public:
         DictionaryBuilder(size_t blocks_to_keep, size_t block_size)
                 : blocks_to_keep(blocks_to_keep), block_size(block_size) {}
-       void add_file(std::string filename) override;
+       void add_file(std::string filename, dir_time dt) override;
         void flush_block() override;
         std::string train(size_t buf_size);
  
@@ -45,10 +66,10 @@ private:
  
  class Corpus : public DatabaseReceiver {
  public:
-       Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict);
+       Corpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times);
         ~Corpus();
  
-       void add_file(std::string filename) override;
+       void add_file(std::string filename, dir_time dt) override;
         void flush_block() override;
         void finish() override;
  
@@ -60,20 +81,30 @@ public:
         }
         PostingListBuilder &get_pl_builder(uint32_t trgm);
         size_t num_trigrams() const;
+       std::string get_compressed_dir_times();
  
  private:
+       void compress_dir_times(size_t allowed_slop);
+
         std::unique_ptr<PostingListBuilder *[]> invindex;
         FILE *outfp;
         std::string current_block;
         std::string tempbuf;
         const size_t block_size;
+       const bool store_dir_times;
         ZSTD_CDict *cdict;
+
+       ZSTD_CStream *dir_time_ctx = nullptr;
+       std::string dir_times;  // Buffer of still-uncompressed data.
+       std::string dir_times_compressed;
  };
  
  class DatabaseBuilder {
  public:
-       DatabaseBuilder(const char *outfile, int block_size, std::string dictionary);
-       Corpus *start_corpus();
+       DatabaseBuilder(const char *outfile, gid_t owner, int block_size, std::string dictionary);
+       Corpus *start_corpus(bool store_dir_times);
+       void set_next_dictionary(std::string next_dictionary);
+       void set_conf_block(std::string conf_block);
         void finish_corpus();
  
  private:
@@ -84,6 +115,7 @@ private:
         std::chrono::steady_clock::time_point corpus_start;
         Corpus *corpus = nullptr;
         ZSTD_CDict *cdict = nullptr;
+       std::string next_dictionary, conf_block;
  };
  
  #endif  // !defined(_DATABASE_BUILDER_H)
diff --git a/db.h b/db.h

index df79904d94594753bf960538e0f006d82b614437..e23d47885ecedb85039fca8a71206ec765d071ba 100644 (file)
--- a/db.h
+++ b/db.h
@@ -13,9 +13,17 @@ struct Header {
         uint64_t filename_index_offset_bytes;
  
         // Version 1 and up only.
-       uint32_t max_version;  // Nominally 1, but can be increased if more features are added in a backward-compatible way.
+       uint32_t max_version;  // Nominally 1 or 2, but can be increased if more features are added in a backward-compatible way.
         uint32_t zstd_dictionary_length_bytes;
         uint64_t zstd_dictionary_offset_bytes;
+
+       // Only if max_version >= 2, and only relevant for updatedb.
+       uint64_t directory_data_length_bytes;
+       uint64_t directory_data_offset_bytes;
+       uint64_t next_zstd_dictionary_length_bytes;
+       uint64_t next_zstd_dictionary_offset_bytes;
+       uint64_t conf_block_length_bytes;
+       uint64_t conf_block_offset_bytes;
  };
  
  struct Trigram {
diff --git a/lib.cpp b/lib.cpp

new file mode 100644 (file)

index 0000000..171eced
--- /dev/null
+++ b/lib.cpp
@@ -0,0 +1,92 @@
+/* Common functions.
+
+Copyright (C) 2005, 2007 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+using namespace std;
+
+#include "lib.h"
+
+#include "db.h"
+#include "error.h"
+
+#include <algorithm>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Compare two path names using the database directory order. This is not
+   exactly strcmp () order: "a" < "a.b", so "a/z" < "a.b". */
+int dir_path_cmp(const string &a, const string &b)
+{
+       auto [ai, bi] = mismatch(a.begin(), a.end(), b.begin(), b.end());
+       if (ai == a.end() && bi == b.end()) {
+               return 0;
+       }
+       if (ai == a.end()) {
+               return -1;
+       }
+       if (bi == b.end()) {
+               return 1;
+       }
+       if (*ai == *bi) {
+               return 0;
+       }
+       if (*ai == '/') {
+               return -1;
+       }
+       if (*bi == '/') {
+               return 1;
+       }
+       return int((unsigned char)*ai) - int((unsigned char)*bi);
+}
+
+/* Sort LIST using dir_path_cmp () */
+void string_list_dir_path_sort(vector<string> *list)
+{
+       /* True when LHS orders before RHS according to dir_path_cmp (). */
+       const auto sorts_before = [](const string &lhs, const string &rhs) {
+               return dir_path_cmp(lhs, rhs) < 0;
+       };
+       sort(list->begin(), list->end(), sorts_before);
+}
+
+/* Is PATH included in LIST?  Update *IDX to move within LIST.
+
+   LIST is assumed to be sorted using dir_path_cmp (), successive calls to this
+   function are assumed to use PATH values increasing in dir_path_cmp (). */
+bool string_list_contains_dir_path(const vector<string> *list, size_t *idx,
+                                   const string &path)
+{
+       int cmp = 0;
+       while (*idx < list->size() && (cmp = dir_path_cmp((*list)[*idx], path)) < 0) {
+               (*idx)++;
+       }
+       if (*idx < list->size() && cmp == 0) {
+               (*idx)++;
+               return true;
+       }
+       return false;
+}
diff --git a/lib.h b/lib.h

new file mode 100644 (file)

index 0000000..0239dcb
--- /dev/null
+++ b/lib.h
@@ -0,0 +1,49 @@
+/* Common functions.
+
+Copyright (C) 2005, 2007 Red Hat, Inc. All rights reserved.
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+*/
+
+#ifndef LIB_H__
+#define LIB_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string>
+#include <sys/types.h>
+#include <vector>
+
+/* Marker wrapped around user-visible message strings; it expands to its
+   argument unchanged. */
+#define _(X) (X)
+
+/* Compare two path names using the database directory order. This is not
+   exactly strcmp () order: "a" < "a.b", so "a/z" < "a.b". */
+extern int dir_path_cmp(const std::string &a, const std::string &b);
+
+/* Sort LIST using dir_path_cmp () */
+extern void string_list_dir_path_sort(std::vector<std::string> *list);
+
+/* Is PATH included in LIST?  Update *IDX to move within LIST.
+
+   LIST is assumed to be sorted using dir_path_cmp (), successive calls to this
+   function are assumed to use PATH values increasing in dir_path_cmp (). */
+extern bool string_list_contains_dir_path(const std::vector<std::string> *list,
+                                          size_t *idx, const std::string &path);
+
+#endif
diff --git a/meson.build b/meson.build

index 9dc7c1514acc323d551f7ea018aef22a58b58d26..feb6c5a57f7c27a994189c2c36146acd4cb32449 100644 (file)
--- a/meson.build
+++ b/meson.build
@@ -1,7 +1,11 @@
  project('plocate', 'cpp', default_options: ['buildtype=debugoptimized','cpp_std=c++17'], version: '1.0.8-pre')
  
-# Make the version available as a #define.
-add_project_arguments('-DPLOCATE_VERSION="' + meson.project_version() + '"', language: 'cpp')
+add_project_arguments('-DGROUPNAME="' + get_option('locategroup') + '"', language: 'cpp')
+add_project_arguments('-DUPDATEDB_CONF="/etc/updatedb.conf"', language: 'cpp')
+add_project_arguments('-DDBFILE="/var/lib/mlocate/plocate.db"', language: 'cpp')
+add_project_arguments('-DPACKAGE_NAME="plocate"', language: 'cpp')
+add_project_arguments('-DPACKAGE_VERSION="' + meson.project_version() + '"', language: 'cpp')
+add_project_arguments('-DPACKAGE_BUGREPORT="steinar+plocate@gunderson.no"', language: 'cpp')
  
  cxx = meson.get_compiler('cpp')
  uringdep = dependency('liburing', required: false)
@@ -33,6 +37,10 @@ executable('plocate-build', ['plocate-build.cpp', 'database-builder.cpp'],
         dependencies: [zstddep],
         install: true,
         install_dir: get_option('sbindir'))
+executable('updatedb', ['updatedb.cpp', 'database-builder.cpp', 'conf.cpp', 'lib.cpp', 'bind-mount.cpp', 'complete_pread.cpp'],
+       dependencies: [zstddep, threaddep],
+       install: true,
+       install_dir: get_option('sbindir'))
  
  conf_data = configuration_data()
  conf_data.set('PROCESSED_BY_MESON', '1')
diff --git a/plocate-build.cpp b/plocate-build.cpp

index 02b18d0e42e00dc577ad9d7160d2f53d05433bfb..5c205aabef6d636f5c40cc009920ef61d5cdd1ea 100644 (file)
--- a/plocate-build.cpp
+++ b/plocate-build.cpp
@@ -83,10 +83,10 @@ void handle_directory(FILE *fp, DatabaseReceiver *receiver)
                 int type = getc(fp);
                 if (type == DBE_NORMAL) {
                         string filename = read_cstr(fp);
-                       receiver->add_file(dir_path + "/" + filename);
+                       receiver->add_file(dir_path + "/" + filename, unknown_dir_time);
                 } else if (type == DBE_DIRECTORY) {
                         string dirname = read_cstr(fp);
-                       receiver->add_file(dir_path + "/" + dirname);
+                       receiver->add_file(dir_path + "/" + dirname, unknown_dir_time);
                 } else {
                         return;  // Probably end.
                 }
@@ -116,7 +116,7 @@ void read_plaintext(FILE *fp, DatabaseReceiver *receiver)
                 }
                 if (!s.empty() && s.back() == '\n')
                         s.pop_back();
-               receiver->add_file(move(s));
+               receiver->add_file(move(s), unknown_dir_time);
         }
  }
  
@@ -166,8 +166,8 @@ void do_build(const char *infile, const char *outfile, int block_size, bool plai
         }
         string dictionary = builder.train(1024);
  
-       DatabaseBuilder db(outfile, block_size, dictionary);
-       Corpus *corpus = db.start_corpus();
+       DatabaseBuilder db(outfile, /*owner=*/-1, block_size, dictionary);
+       Corpus *corpus = db.start_corpus(/*store_dir_times=*/false);
         if (plaintext) {
                 read_plaintext(infp, corpus);
         } else {
@@ -195,7 +195,7 @@ void usage()
  
  void version()
  {
-       printf("plocate-build %s\n", PLOCATE_VERSION);
+       printf("plocate-build %s\n", PACKAGE_VERSION);
         printf("Copyright 2020 Steinar H. Gunderson\n");
         printf("License GPLv2+: GNU GPL version 2 or later <https://gnu.org/licenses/gpl.html>.\n");
         printf("This is free software: you are free to change and redistribute it.\n");
diff --git a/plocate.cpp b/plocate.cpp

index 079eb2996cf40e7d4a8c3fe5b1d37d7485e6f9a7..6fe46a4f77e36a1aa1f75ec2fc7e2036079b25fc 100644 (file)
--- a/plocate.cpp
+++ b/plocate.cpp
@@ -43,9 +43,7 @@
  using namespace std;
  using namespace std::chrono;
  
-#define DEFAULT_DBPATH "/var/lib/mlocate/plocate.db"
-
-const char *dbpath = DEFAULT_DBPATH;
+const char *dbpath = DBFILE;
  bool ignore_case = false;
  bool only_count = false;
  bool print_nul = false;
@@ -656,7 +654,7 @@ void usage()
                 "  -b, --basename         search only the file name portion of path names\n"
                 "  -c, --count            print number of matches instead of the matches\n"
                 "  -d, --database DBPATH  search for files in DBPATH\n"
-               "                         (default is " DEFAULT_DBPATH ")\n"
+               "                         (default is " DBFILE ")\n"
                 "  -i, --ignore-case      search case-insensitively\n"
                 "  -l, --limit LIMIT      stop after LIMIT matches\n"
                 "  -0, --null             delimit matches by NUL instead of newline\n"
@@ -669,7 +667,7 @@ void usage()
  
  void version()
  {
-       printf("plocate %s\n", PLOCATE_VERSION);
+       printf("%s %s\n", PACKAGE_NAME, PACKAGE_VERSION);
         printf("Copyright 2020 Steinar H. Gunderson\n");
         printf("License GPLv2+: GNU GPL version 2 or later <https://gnu.org/licenses/gpl.html>.\n");
         printf("This is free software: you are free to change and redistribute it.\n");
diff --git a/updatedb.cpp b/updatedb.cpp

new file mode 100644 (file)

index 0000000..908f477
--- /dev/null
+++ b/updatedb.cpp
@@ -0,0 +1,792 @@
+/* updatedb(8).
+
+Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.
+
+This copyrighted material is made available to anyone wishing to use, modify,
+copy, or redistribute it subject to the terms and conditions of the GNU General
+Public License v.2.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
+Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+Author: Miloslav Trmac <mitr@redhat.com>
+
+
+plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
+plocate parts and modifications are licensed under the GPLv2 or, at your option,
+any later version.
+ */
+
+#include "bind-mount.h"
+#include "complete_pread.h"
+#include "conf.h"
+#include "database-builder.h"
+#include "db.h"
+#include "dprintf.h"
+#include "io_uring_engine.h"
+#include "lib.h"
+
+#include <algorithm>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <chrono>
+#include <dirent.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <grp.h>
+#include <iosfwd>
+#include <math.h>
+#include <memory>
+#include <mntent.h>
+#include <random>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <utility>
+#include <vector>
+
+using namespace std;
+using namespace std::chrono;
+
+/* Next conf_prunepaths entry.  scan () passes the address of this cursor to
+   string_list_contains_dir_path (), which may update it as it moves within
+   the list. */
+static size_t conf_prunepaths_index; /* = 0; */
+
+void usage()
+{
+       printf(
+               "Usage: updatedb PLOCATE_DB\n"
+               "\n"
+               "Generate plocate index from mlocate.db, typically /var/lib/mlocate/mlocate.db.\n"
+               "Normally, the destination should be /var/lib/mlocate/plocate.db.\n"
+               "\n"
+               "  -b, --block-size SIZE  number of filenames to store in each block (default 32)\n"
+               "  -p, --plaintext        input is a plaintext file, not an mlocate database\n"
+               "      --help             print this help\n"
+               "      --version          print version information\n");
+}
+
+// Writes the program name, version (PACKAGE_VERSION) and license blurb to
+// stdout.
+void version()
+{
+       printf(
+               "updatedb %s\n"
+               "Copyright (C) 2007 Red Hat, Inc. All rights reserved.\n"
+               "Copyright 2020 Steinar H. Gunderson\n"
+               "This software is distributed under the GPL v.2.\n"
+               "\n"
+               "This program is provided with NO WARRANTY, to the extent permitted by law.\n",
+               PACKAGE_VERSION);
+}
+
+int opendir_noatime(int dirfd, const char *path)
+{
+       static bool noatime_failed = false;
+
+       if (!noatime_failed) {
+               int fd = openat(dirfd, path, O_RDONLY | O_DIRECTORY | O_NOATIME);
+               if (fd != -1) {
+                       return fd;
+               } else if (errno == EPERM) {
+                       /* EPERM is fairly O_NOATIME-specific; missing access rights cause
+                          EACCES. */
+                       noatime_failed = true;
+                       // Retry below.
+               } else {
+                       return -1;
+               }
+       }
+       return openat(dirfd, path, O_RDONLY | O_DIRECTORY);
+}
+
+bool time_is_current(const dir_time &t)
+{
+       static dir_time cache{ 0, 0 };
+
+       /* This is more difficult than it should be because Linux uses a cheaper time
+          source for filesystem timestamps than for gettimeofday() and they can get
+          slightly out of sync, see
+          https://bugzilla.redhat.com/show_bug.cgi?id=244697 .  This affects even
+          nanosecond timestamps (and don't forget that tv_nsec existence doesn't
+          guarantee that the underlying filesystem has such resolution - it might be
+          microseconds or even coarser).
+
+          The worst case is probably FAT timestamps with 2-second resolution
+          (although using such a filesystem violates POSIX file times requirements).
+
+          So, to be on the safe side, require a >3.0 second difference (2 seconds to
+          make sure the FAT timestamp changed, 1 more to account for the Linux
+          timestamp races).  This large margin might make updatedb marginally more
+          expensive, but it only makes a difference if the directory was very
+          recently updated _and_ is will not be updated again until the next
+          updatedb run; this is not likely to happen for most directories. */
+
+       /* Cache gettimeofday () results to rule out obviously old time stamps;
+          CACHE contains the earliest time we reject as too current. */
+       if (t < cache) {
+               return false;
+       }
+
+       struct timeval tv;
+       gettimeofday(&tv, nullptr);
+       cache.sec = tv.tv_sec - 3;
+       cache.nsec = tv.tv_usec * 1000;
+
+       return t >= cache;
+}
+
+// One name inside a directory being scanned, obtained either from readdir ()
+// or from the old database.
+struct entry {
+       // Name relative to the containing directory (no slashes).
+       string name;
+       bool is_directory = false;
+
+       // For directories only:
+
+       // Open descriptor for the directory, or -1 if it was not (or could not
+       // be) opened.
+       int fd = -1;
+       // Timestamp to store in the new database.
+       dir_time dt = unknown_dir_time;
+       // Timestamp recorded for this directory in the old database, if any.
+       dir_time db_modified = unknown_dir_time;
+       // Device of the directory as reported by fstat ().  Initialized so that
+       // copying an entry that never had it filled in does not read an
+       // indeterminate value.
+       dev_t dev = 0;
+};
+
+bool filesystem_is_excluded(const char *path)
+{
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "Checking whether filesystem `%s' is excluded:\n", path);
+       }
+       FILE *f = setmntent("/proc/mounts", "r");
+       if (f == nullptr) {
+               return false;
+       }
+
+       struct mntent *me;
+       while ((me = getmntent(f)) != nullptr) {
+               if (conf_debug_pruning) {
+                       /* This is debugging output, don't mark anything for translation */
+                       fprintf(stderr, " `%s', type `%s'\n", me->mnt_dir, me->mnt_type);
+               }
+               string type(me->mnt_type);
+               for (char &p : type) {
+                       p = toupper(p);
+               }
+               if (find(conf_prunefs.begin(), conf_prunefs.end(), type) != conf_prunefs.end()) {
+                       /* Paths in /proc/self/mounts contain no symbolic links.  Besides
+                          avoiding a few system calls, avoiding the realpath () avoids hangs
+                          if the filesystem is unavailable hard-mounted NFS. */
+                       char *dir = me->mnt_dir;
+                       if (conf_debug_pruning) {
+                               /* This is debugging output, don't mark anything for translation */
+                               fprintf(stderr, " => type matches, dir `%s'\n", dir);
+                       }
+                       bool res = (strcmp(path, dir) == 0);
+                       if (dir != me->mnt_dir)
+                               free(dir);
+                       if (res) {
+                               endmntent(f);
+                               return true;
+                       }
+               }
+       }
+       if (conf_debug_pruning) {
+               /* This is debugging output, don't mark anything for translation */
+               fprintf(stderr, "...done\n");
+       }
+       endmntent(f);
+       return false;
+}
+
+dir_time get_dirtime_from_stat(const struct stat &buf)
+{
+       dir_time ctime{ buf.st_ctim.tv_sec, int32_t(buf.st_ctim.tv_nsec) };
+       dir_time mtime{ buf.st_mtim.tv_sec, int32_t(buf.st_mtim.tv_nsec) };
+       dir_time dt = max(ctime, mtime);
+
+       if (time_is_current(dt)) {
+               /* The directory might be changing right now and we can't be sure the
+                  timestamp will be changed again if more changes happen very soon, mark
+                  the timestamp as invalid to force rescanning the directory next time
+                  updatedb is run. */
+               return unknown_dir_time;
+       } else {
+               return dt;
+       }
+}
+
+// Represents the old database we are updating.
+class ExistingDB {
+public:
+       explicit ExistingDB(int fd);
+       ~ExistingDB();
+
+       pair<string, dir_time> read_next();
+       void unread(pair<string, dir_time> record)
+       {
+               unread_record = move(record);
+       }
+       string read_next_dictionary() const;
+       bool get_error() const { return error; }
+
+private:
+       const int fd;
+       Header hdr;
+
+       uint32_t current_docid = 0;
+
+       string current_filename_block;
+       const char *current_filename_ptr = nullptr, *current_filename_end = nullptr;
+
+       off_t compressed_dir_time_pos;
+       string compressed_dir_time;
+       string current_dir_time_block;
+       const char *current_dir_time_ptr = nullptr, *current_dir_time_end = nullptr;
+
+       pair<string, dir_time> unread_record;
+
+       // Used in one-shot mode, repeatedly.
+       ZSTD_DCtx *ctx;
+
+       // Used in streaming mode.
+       ZSTD_DCtx *dir_time_ctx;
+
+       ZSTD_DDict *ddict = nullptr;
+
+       // If true, we've discovered an error or EOF, and will return only
+       // empty data from here.
+       bool eof = false, error = false;
+};
+
+ExistingDB::ExistingDB(int fd)
+       : fd(fd)
+{
+       if (fd == -1) {
+               error = true;
+               return;
+       }
+
+       if (!try_complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0)) {
+               if (conf_verbose) {
+                       perror("pread(header)");
+               }
+               error = true;
+               return;
+       }
+       if (memcmp(hdr.magic, "\0plocate", 8) != 0) {
+               if (conf_verbose) {
+                       fprintf(stderr, "Old database had header mismatch, ignoring.\n");
+               }
+               error = true;
+               return;
+       }
+       if (hdr.version != 1 || hdr.max_version < 2) {
+               if (conf_verbose) {
+                       fprintf(stderr, "Old database had version mismatch (version=%d max_version=%d), ignoring.\n",
+                               hdr.version, hdr.max_version);
+               }
+               error = true;
+               return;
+       }
+
+       // Compare the configuration block with our current one.
+       if (hdr.conf_block_length_bytes != conf_block.size()) {
+               if (conf_verbose) {
+                       fprintf(stderr, "Old database had different configuration block (size mismatch), ignoring.\n");
+               }
+               error = true;
+               return;
+       }
+       string str;
+       str.resize(hdr.conf_block_length_bytes);
+       if (!try_complete_pread(fd, str.data(), hdr.conf_block_length_bytes, hdr.conf_block_offset_bytes)) {
+               if (conf_verbose) {
+                       perror("pread(conf_block)");
+               }
+               error = true;
+               return;
+       }
+       if (str != conf_block) {
+               if (conf_verbose) {
+                       fprintf(stderr, "Old database had different configuration block (contents mismatch), ignoring.\n");
+               }
+               error = true;
+               return;
+       }
+
+       // Read dictionary, if it exists.
+       if (hdr.zstd_dictionary_length_bytes > 0) {
+               string dictionary;
+               dictionary.resize(hdr.zstd_dictionary_length_bytes);
+               if (try_complete_pread(fd, &dictionary[0], hdr.zstd_dictionary_length_bytes, hdr.zstd_dictionary_offset_bytes)) {
+                       ddict = ZSTD_createDDict(dictionary.data(), dictionary.size());
+               } else {
+                       if (conf_verbose) {
+                               perror("pread(dictionary)");
+                       }
+                       error = true;
+                       return;
+               }
+       }
+       compressed_dir_time_pos = hdr.directory_data_offset_bytes;
+
+       ctx = ZSTD_createDCtx();
+       dir_time_ctx = ZSTD_createDCtx();
+}
+
+ExistingDB::~ExistingDB()
+{
+       if (fd != -1) {
+               close(fd);
+       }
+}
+
+pair<string, dir_time> ExistingDB::read_next()
+{
+       if (!unread_record.first.empty()) {
+               auto ret = move(unread_record);
+               unread_record.first.clear();
+               return ret;
+       }
+
+       if (eof || error) {
+               return { "", not_a_dir };
+       }
+
+       // See if we need to read a new filename block.
+       if (current_filename_ptr == nullptr) {
+               if (current_docid >= hdr.num_docids) {
+                       eof = true;
+                       return { "", not_a_dir };
+               }
+
+               // Read the file offset from this docid and the next one.
+               // This is always allowed, since we have a sentinel block at the end.
+               off_t offset_for_block = hdr.filename_index_offset_bytes + current_docid * sizeof(uint64_t);
+               uint64_t vals[2];
+               if (!try_complete_pread(fd, vals, sizeof(vals), offset_for_block)) {
+                       if (conf_verbose) {
+                               perror("pread(offset)");
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+
+               off_t offset = vals[0];
+               size_t compressed_len = vals[1] - vals[0];
+               unique_ptr<char[]> compressed(new char[compressed_len]);
+               if (!try_complete_pread(fd, compressed.get(), compressed_len, offset)) {
+                       if (conf_verbose) {
+                               perror("pread(block)");
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+
+               unsigned long long uncompressed_len = ZSTD_getFrameContentSize(compressed.get(), compressed_len);
+               if (uncompressed_len == ZSTD_CONTENTSIZE_UNKNOWN || uncompressed_len == ZSTD_CONTENTSIZE_ERROR) {
+                       if (conf_verbose) {
+                               fprintf(stderr, "ZSTD_getFrameContentSize() failed\n");
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+
+               string block;
+               block.resize(uncompressed_len + 1);
+
+               size_t err;
+               if (ddict != nullptr) {
+                       err = ZSTD_decompress_usingDDict(ctx, &block[0], block.size(), compressed.get(),
+                                                        compressed_len, ddict);
+               } else {
+                       err = ZSTD_decompressDCtx(ctx, &block[0], block.size(), compressed.get(),
+                                                 compressed_len);
+               }
+               if (ZSTD_isError(err)) {
+                       if (conf_verbose) {
+                               fprintf(stderr, "ZSTD_decompress(): %s\n", ZSTD_getErrorName(err));
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+               block[block.size() - 1] = '\0';
+               current_filename_block = move(block);
+               current_filename_ptr = current_filename_block.data();
+               current_filename_end = current_filename_block.data() + current_filename_block.size();
+               ++current_docid;
+       }
+
+       // See if we need to read more directory time data.
+       while (current_dir_time_ptr == current_dir_time_end ||
+              (*current_dir_time_ptr != 0 &&
+               size_t(current_dir_time_end - current_dir_time_ptr) < sizeof(dir_time) + 1)) {
+               if (current_dir_time_ptr != nullptr) {
+                       const size_t bytes_consumed = current_dir_time_ptr - current_dir_time_block.data();
+                       current_dir_time_block.erase(current_dir_time_block.begin(), current_dir_time_block.begin() + bytes_consumed);
+               }
+
+               // See if we can get more data out without reading more.
+               const size_t existing_data = current_dir_time_block.size();
+               current_dir_time_block.resize(existing_data + 4096);
+
+               ZSTD_outBuffer outbuf;
+               outbuf.dst = current_dir_time_block.data() + existing_data;
+               outbuf.size = 4096;
+               outbuf.pos = 0;
+
+               ZSTD_inBuffer inbuf;
+               inbuf.src = compressed_dir_time.data();
+               inbuf.size = compressed_dir_time.size();
+               inbuf.pos = 0;
+
+               int err = ZSTD_decompressStream(dir_time_ctx, &outbuf, &inbuf);
+               if (err < 0) {
+                       if (conf_verbose) {
+                               fprintf(stderr, "ZSTD_decompress(): %s\n", ZSTD_getErrorName(err));
+                       }
+                       error = true;
+                       return { "", not_a_dir };
+               }
+               compressed_dir_time.erase(compressed_dir_time.begin(), compressed_dir_time.begin() + inbuf.pos);
+               current_dir_time_block.resize(existing_data + outbuf.pos);
+
+               if (inbuf.pos == 0 && outbuf.pos == 0) {
+                       // No movement, we'll need to try to read more data.
+                       char buf[4096];
+                       size_t bytes_to_read = min<size_t>(
+                               hdr.directory_data_offset_bytes + hdr.directory_data_length_bytes - compressed_dir_time_pos,
+                               sizeof(buf));
+                       if (bytes_to_read == 0) {
+                               error = true;
+                               return { "", not_a_dir };
+                       }
+                       if (!try_complete_pread(fd, buf, bytes_to_read, compressed_dir_time_pos)) {
+                               if (conf_verbose) {
+                                       perror("pread(dirtime)");
+                               }
+                               error = true;
+                               return { "", not_a_dir };
+                       }
+                       compressed_dir_time_pos += bytes_to_read;
+                       compressed_dir_time.insert(compressed_dir_time.end(), buf, buf + bytes_to_read);
+
+                       // Next iteration will now try decompressing more.
+               }
+
+               current_dir_time_ptr = current_dir_time_block.data();
+               current_dir_time_end = current_dir_time_block.data() + current_dir_time_block.size();
+       }
+
+       string filename = current_filename_ptr;
+       current_filename_ptr += filename.size() + 1;
+       if (current_filename_ptr == current_filename_end) {
+               // End of this block.
+               current_filename_ptr = nullptr;
+       }
+
+       if (*current_dir_time_ptr == 0) {
+               ++current_dir_time_ptr;
+               return { move(filename), not_a_dir };
+       } else {
+               ++current_dir_time_ptr;
+               dir_time dt;
+               memcpy(&dt.sec, current_dir_time_ptr, sizeof(dt.sec));
+               current_dir_time_ptr += sizeof(dt.sec);
+               memcpy(&dt.nsec, current_dir_time_ptr, sizeof(dt.nsec));
+               current_dir_time_ptr += sizeof(dt.nsec);
+               return { move(filename), dt };
+       }
+}
+
+string ExistingDB::read_next_dictionary() const
+{
+       if (hdr.next_zstd_dictionary_length_bytes == 0 || hdr.next_zstd_dictionary_length_bytes > 1048576) {
+               return "";
+       }
+       string str;
+       str.resize(hdr.next_zstd_dictionary_length_bytes);
+       if (!try_complete_pread(fd, str.data(), hdr.next_zstd_dictionary_length_bytes, hdr.next_zstd_dictionary_offset_bytes)) {
+               if (conf_verbose) {
+                       perror("pread(next_dictionary)");
+               }
+               return "";
+       }
+       return str;
+}
+
+// Scans the directory with absolute path “path”, which is opened as “fd”.
+// Uses relative paths and openat() only, evading any issues with PATH_MAX
+// and time-of-check-time-of-use race conditions. (mlocate's updatedb
+// does a much more complicated dance with changing the current working
+// directory, probably in the interest of portability to old platforms.)
+// “parent_dev” must be the device of the parent directory of “path”.
+//
+// Takes ownership of fd.
+int scan(const string &path, int fd, dev_t parent_dev, dir_time modified, dir_time db_modified, ExistingDB *existing_db, Corpus *corpus, DictionaryBuilder *dict_builder)
+{
+       if (string_list_contains_dir_path(&conf_prunepaths, &conf_prunepaths_index, path)) {
+               if (conf_debug_pruning) {
+                       /* This is debugging output, don't mark anything for translation */
+                       fprintf(stderr, "Skipping `%s': in prunepaths\n", path.c_str());
+               }
+               close(fd);
+               return 0;
+       }
+       if (conf_prune_bind_mounts && is_bind_mount(path.c_str())) {
+               if (conf_debug_pruning) {
+                       /* This is debugging output, don't mark anything for translation */
+                       fprintf(stderr, "Skipping `%s': bind mount\n", path.c_str());
+               }
+               close(fd);
+               return 0;
+       }
+
+       // We read in the old directory no matter whether it is current or not,
+       // because even if we're not going to use it, we'll need the modification directory
+       // of any subdirectories.
+
+       // Skip over anything before this directory; it is stuff that we would have
+       // consumed earlier if we wanted it.
+       for (;;) {
+               pair<string, dir_time> record = existing_db->read_next();
+               if (record.first.empty()) {
+                       break;
+               }
+               if (dir_path_cmp(path, record.first) <= 0) {
+                       existing_db->unread(move(record));
+                       break;
+               }
+       }
+
+       // Now read everything in this directory.
+       vector<entry> db_entries;
+       const string path_plus_slash = path.back() == '/' ? path : path + '/';
+       for (;;) {
+               pair<string, dir_time> record = existing_db->read_next();
+               if (record.first.empty()) {
+                       break;
+               }
+
+               if (record.first.rfind(path_plus_slash, 0) != 0) {
+                       // No longer starts with path, so we're in a different directory.
+                       existing_db->unread(move(record));
+                       break;
+               }
+               if (record.first.find_first_of('/', path_plus_slash.size()) != string::npos) {
+                       // Entered into a subdirectory of a subdirectory.
+                       // Due to our ordering, this also means we're done.
+                       existing_db->unread(move(record));
+                       break;
+               }
+
+               entry e;
+               e.name = record.first.substr(path_plus_slash.size());
+               e.is_directory = (record.second.sec >= 0);
+               e.db_modified = record.second;
+               db_entries.push_back(e);
+       }
+
+       DIR *dir = nullptr;
+       vector<entry> entries;
+       if (!existing_db->get_error() && db_modified.sec > 0 &&
+           modified.sec == db_modified.sec && modified.nsec == db_modified.nsec) {
+               // Not changed since the last database, so we can replace the readdir()
+               // by reading from the database. (We still need to open and stat everything,
+               // though, but that happens in a later step.)
+               entries = move(db_entries);
+       } else {
+               dir = fdopendir(fd);  // Takes over ownership of fd.
+               if (dir == nullptr) {
+                       perror("fdopendir");
+                       exit(1);
+               }
+
+               dirent *de;
+               while ((de = readdir(dir)) != nullptr) {
+                       if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) {
+                               continue;
+                       }
+                       if (strlen(de->d_name) == 0) {
+                               /* Unfortunately, this does happen, and mere assert() does not give
+                                  users enough information to complain to the right people. */
+                               fprintf(stderr, "file system error: zero-length file name in directory %s", path.c_str());
+                               continue;
+                       }
+
+                       entry e;
+                       e.name = de->d_name;
+                       e.is_directory = (de->d_type == DT_DIR);
+
+                       if (conf_verbose) {
+                               printf("%s/%s\n", path.c_str(), de->d_name);
+                       }
+                       entries.push_back(move(e));
+               }
+
+               sort(entries.begin(), entries.end(), [](const entry &a, const entry &b) {
+                       return a.name < b.name;
+               });
+
+               // Load directory modification times from the old database.
+               auto db_it = db_entries.begin();
+               for (entry &e : entries) {
+                       for (; db_it != db_entries.end(); ++db_it) {
+                               if (e.name < db_it->name) {
+                                       break;
+                               }
+                               if (e.name == db_it->name) {
+                                       e.db_modified = db_it->db_modified;
+                                       break;
+                               }
+                       }
+               }
+       }
+
+       // For each entry, we want to add it to the database. but this includes the modification time
+       // for directories, which means we need to open and stat it at this point.
+       //
+       // This means we may need to have many directories open at the same time, but it seems to be
+       // the simplest (only?) way of being compatible with mlocate's notion of listing all contents
+       // of a given directory before recursing, without buffering even more information. Hopefully,
+       // we won't go out of file descriptors here (it could happen if someone has tens of thousands
+       // of subdirectories in a single directory); if so, the admin will need to raise the limit.
+       for (entry &e : entries) {
+               if (!e.is_directory) {
+                       e.dt = not_a_dir;
+                       continue;
+               }
+
+               if (find(conf_prunenames.begin(), conf_prunenames.end(), e.name) != conf_prunenames.end()) {
+                       if (conf_debug_pruning) {
+                               /* This is debugging output, don't mark anything for translation */
+                               fprintf(stderr, "Skipping `%s': in prunenames\n", e.name.c_str());
+                       }
+                       continue;
+               }
+
+               e.fd = opendir_noatime(fd, e.name.c_str());
+               if (e.fd == -1) {
+                       if (errno == EMFILE || errno == ENFILE) {
+                               // The admin probably wants to know about this.
+                               perror((path_plus_slash + e.name).c_str());
+
+                               rlimit rlim;
+                               if (getrlimit(RLIMIT_NOFILE, &rlim) == -1) {
+                                       fprintf(stderr, "Hint: Try `ulimit -n 131072' or similar.\n");
+                               } else {
+                                       fprintf(stderr, "Hint: Try `ulimit -n %lu' or similar (current limit is %lu).\n",
+                                               rlim.rlim_cur * 2, rlim.rlim_cur);
+                               }
+                               exit(1);
+                       }
+                       continue;
+               }
+
+               struct stat buf;
+               if (fstat(e.fd, &buf) != 0) {
+                       perror(path.c_str());
+                       exit(1);
+               }
+
+               e.dev = buf.st_dev;
+               if (buf.st_dev != parent_dev) {
+                       if (filesystem_is_excluded((path_plus_slash + e.name).c_str())) {
+                               close(e.fd);
+                               e.fd = -1;
+                               continue;
+                       }
+               }
+
+               e.dt = get_dirtime_from_stat(buf);
+       }
+
+       // Actually add all the entries we figured out dates for above.
+       for (const entry &e : entries) {
+               corpus->add_file(path_plus_slash + e.name, e.dt);
+               dict_builder->add_file(path_plus_slash + e.name, e.dt);
+       }
+
+       // Now scan subdirectories.
+       for (const entry &e : entries) {
+               if (e.is_directory && e.fd != -1) {
+                       int ret = scan(path_plus_slash + e.name, e.fd, e.dev, e.dt, e.db_modified, existing_db, corpus, dict_builder);
+                       if (ret == -1) {
+                               // TODO: The unscanned file descriptors will leak, but it doesn't really matter,
+                               // as we're about to exit.
+                               closedir(dir);
+                               return -1;
+                       }
+               }
+       }
+
+       if (dir == nullptr) {
+               close(fd);
+       } else {
+               closedir(dir);
+       }
+       return 0;
+}
+
+int main(int argc, char **argv)
+{
+       // We want to bump the file limit; do it if we can (usually we are root
+       // and can set whatever we want). 128k should be ample for most setups.
+       rlimit rlim;
+       if (getrlimit(RLIMIT_NOFILE, &rlim) != -1) {
+               rlim_t wanted = std::max<rlim_t>(rlim.rlim_cur, 131072);
+               rlim.rlim_cur = std::min<rlim_t>(wanted, rlim.rlim_max);
+               setrlimit(RLIMIT_NOFILE, &rlim);  // Ignore errors.
+       }
+
+       conf_prepare(argc, argv);
+       if (conf_prune_bind_mounts) {
+               bind_mount_init(MOUNTINFO_PATH);
+       }
+
+       int fd = open(conf_output.c_str(), O_RDONLY);
+       ExistingDB existing_db(fd);
+
+       DictionaryBuilder dict_builder(/*blocks_to_keep=*/1000, conf_block_size);
+
+       gid_t owner = -1;
+       if (conf_check_visibility) {
+               group *grp = getgrnam(GROUPNAME);
+               if (grp == nullptr) {
+                       fprintf(stderr, "Unknown group %s\n", GROUPNAME);
+                       exit(1);
+               }
+               owner = grp->gr_gid;
+       }
+
+       DatabaseBuilder db(conf_output.c_str(), owner, conf_block_size, existing_db.read_next_dictionary());
+       Corpus *corpus = db.start_corpus(/*store_dir_times=*/true);
+
+       int root_fd = opendir_noatime(AT_FDCWD, conf_scan_root);
+       if (root_fd == -1) {
+               perror(".");
+               exit(1);
+       }
+
+       struct stat buf;
+       if (fstat(root_fd, &buf) == -1) {
+               perror(".");
+               exit(1);
+       }
+
+       scan(conf_scan_root, root_fd, buf.st_dev, get_dirtime_from_stat(buf), /*db_modified=*/unknown_dir_time, &existing_db, corpus, &dict_builder);
+
+       // It's too late to use the dictionary for the data we already compressed,
+       // unless we wanted to either scan the entire file system again (acceptable
+       // for plocate-build where it's cheap, less so for us), or uncompressing
+       // and recompressing. Instead, we store it for next time, assuming that the
+       // data changes fairly little from time to time.
+       string next_dictionary = dict_builder.train(1024);
+       db.set_next_dictionary(next_dictionary);
+       db.finish_corpus();
+
+       exit(EXIT_SUCCESS);
+}
author	Steinar H. Gunderson <steinar+nageru@gunderson.no>
	Sat, 21 Nov 2020 17:23:20 +0000 (18:23 +0100)
committer	Steinar H. Gunderson <steinar+git@gunderson.no>
	Tue, 24 Nov 2020 23:58:09 +0000 (00:58 +0100)
README		patch \| blob \| history
bind-mount.cpp	[new file with mode: 0644]	patch \| blob
bind-mount.h	[new file with mode: 0644]	patch \| blob
complete_pread.cpp		patch \| blob \| history
complete_pread.h		patch \| blob \| history
conf.cpp	[new file with mode: 0644]	patch \| blob
conf.h	[new file with mode: 0644]	patch \| blob
database-builder.cpp		patch \| blob \| history
database-builder.h		patch \| blob \| history
db.h		patch \| blob \| history
lib.cpp	[new file with mode: 0644]	patch \| blob
lib.h	[new file with mode: 0644]	patch \| blob
meson.build		patch \| blob \| history
plocate-build.cpp		patch \| blob \| history
plocate.cpp		patch \| blob \| history
updatedb.cpp	[new file with mode: 0644]	patch \| blob