/* Copyright (C) 2005, 2007, 2008 Red Hat, Inc. All rights reserved.

   This copyrighted material is made available to anyone wishing to use, modify,
   copy, or redistribute it subject to the terms and conditions of the GNU General
   Public License v.2.

   This program is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
   PARTICULAR PURPOSE. See the GNU General Public License for more details.

   You should have received a copy of the GNU General Public License along with
   this program; if not, write to the Free Software Foundation, Inc., 51 Franklin
   Street, Fifth Floor, Boston, MA 02110-1301, USA.

   Author: Miloslav Trmac <mitr@redhat.com>

   plocate modifications: Copyright (C) 2020 Steinar H. Gunderson.
   plocate parts and modifications are licensed under the GPLv2 or, at your option,
   any later version. */
25 #include "bind-mount.h"
26 #include "complete_pread.h"
28 #include "database-builder.h"
31 #include "io_uring_engine.h"
35 #include <arpa/inet.h>
53 #include <sys/resource.h>
56 #include <sys/types.h>
using namespace std::chrono;

/* Next conf_prunepaths entry; carried across scan() calls so that the
   sorted prunepaths list is walked at most once per updatedb run. */
static size_t conf_prunepaths_index; /* = 0; */
70 "Usage: updatedb PLOCATE_DB\n"
72 "Generate plocate index from mlocate.db, typically /var/lib/mlocate/mlocate.db.\n"
73 "Normally, the destination should be /var/lib/mlocate/plocate.db.\n"
75 " -b, --block-size SIZE number of filenames to store in each block (default 32)\n"
76 " -p, --plaintext input is a plaintext file, not an mlocate database\n"
77 " --help print this help\n"
78 " --version print version information\n");
83 printf("updatedb %s\n", PACKAGE_VERSION);
84 printf("Copyright (C) 2007 Red Hat, Inc. All rights reserved.\n");
85 printf("Copyright 2020 Steinar H. Gunderson\n");
86 printf("This software is distributed under the GPL v.2.\n");
88 printf("This program is provided with NO WARRANTY, to the extent permitted by law.\n");
// Opens the directory "path" (relative to "dirfd") read-only, preferring
// O_NOATIME so that scanning does not dirty access times all over the
// filesystem. Returns the new file descriptor, or -1 on error (errno is
// set by openat()).
//
// O_NOATIME is only permitted when we own the file or are privileged, so
// the first EPERM we see disables the O_NOATIME attempt for the rest of
// the run; the failure would just repeat for every directory.
int opendir_noatime(int dirfd, const char *path)
{
	static bool noatime_failed = false;

	if (!noatime_failed) {
#ifdef O_NOATIME
		int fd = openat(dirfd, path, O_RDONLY | O_DIRECTORY | O_NOATIME);
#else
		int fd = openat(dirfd, path, O_RDONLY | O_DIRECTORY);
#endif
		if (fd != -1) {
			return fd;
		} else if (errno == EPERM) {
			/* EPERM is fairly O_NOATIME-specific; missing access rights cause
			   EACCES. */
			noatime_failed = true;
			/* Fall through to the plain retry below. */
		} else {
			return -1;
		}
	}
	return openat(dirfd, path, O_RDONLY | O_DIRECTORY);
}
115 bool time_is_current(const dir_time &t)
117 static dir_time cache{ 0, 0 };
119 /* This is more difficult than it should be because Linux uses a cheaper time
120 source for filesystem timestamps than for gettimeofday() and they can get
121 slightly out of sync, see
122 https://bugzilla.redhat.com/show_bug.cgi?id=244697 . This affects even
123 nanosecond timestamps (and don't forget that tv_nsec existence doesn't
124 guarantee that the underlying filesystem has such resolution - it might be
125 microseconds or even coarser).
127 The worst case is probably FAT timestamps with 2-second resolution
128 (although using such a filesystem violates POSIX file times requirements).
130 So, to be on the safe side, require a >3.0 second difference (2 seconds to
131 make sure the FAT timestamp changed, 1 more to account for the Linux
132 timestamp races). This large margin might make updatedb marginally more
133 expensive, but it only makes a difference if the directory was very
134 recently updated _and_ is will not be updated again until the next
135 updatedb run; this is not likely to happen for most directories. */
137 /* Cache gettimeofday () results to rule out obviously old time stamps;
138 CACHE contains the earliest time we reject as too current. */
144 gettimeofday(&tv, nullptr);
145 cache.sec = tv.tv_sec - 3;
146 cache.nsec = tv.tv_usec * 1000;
155 // For directories only:
157 dir_time dt = unknown_dir_time;
158 dir_time db_modified = unknown_dir_time;
162 bool filesystem_is_excluded(const char *path)
164 if (conf_debug_pruning) {
165 /* This is debugging output, don't mark anything for translation */
166 fprintf(stderr, "Checking whether filesystem `%s' is excluded:\n", path);
168 FILE *f = setmntent("/proc/mounts", "r");
174 while ((me = getmntent(f)) != nullptr) {
175 if (conf_debug_pruning) {
176 /* This is debugging output, don't mark anything for translation */
177 fprintf(stderr, " `%s', type `%s'\n", me->mnt_dir, me->mnt_type);
179 string type(me->mnt_type);
180 for (char &p : type) {
183 if (find(conf_prunefs.begin(), conf_prunefs.end(), type) != conf_prunefs.end()) {
184 /* Paths in /proc/self/mounts contain no symbolic links. Besides
185 avoiding a few system calls, avoiding the realpath () avoids hangs
186 if the filesystem is unavailable hard-mounted NFS. */
187 char *dir = me->mnt_dir;
188 if (conf_debug_pruning) {
189 /* This is debugging output, don't mark anything for translation */
190 fprintf(stderr, " => type matches, dir `%s'\n", dir);
192 bool res = (strcmp(path, dir) == 0);
193 if (dir != me->mnt_dir)
201 if (conf_debug_pruning) {
202 /* This is debugging output, don't mark anything for translation */
203 fprintf(stderr, "...done\n");
209 dir_time get_dirtime_from_stat(const struct stat &buf)
211 dir_time ctime{ buf.st_ctim.tv_sec, int32_t(buf.st_ctim.tv_nsec) };
212 dir_time mtime{ buf.st_mtim.tv_sec, int32_t(buf.st_mtim.tv_nsec) };
213 dir_time dt = max(ctime, mtime);
215 if (time_is_current(dt)) {
216 /* The directory might be changing right now and we can't be sure the
217 timestamp will be changed again if more changes happen very soon, mark
218 the timestamp as invalid to force rescanning the directory next time
220 return unknown_dir_time;
226 // Represents the old database we are updating.
229 explicit ExistingDB(int fd);
232 pair<string, dir_time> read_next();
233 void unread(pair<string, dir_time> record)
235 unread_record = move(record);
237 string read_next_dictionary() const;
238 bool get_error() const { return error; }
244 uint32_t current_docid = 0;
246 string current_filename_block;
247 const char *current_filename_ptr = nullptr, *current_filename_end = nullptr;
249 off_t compressed_dir_time_pos;
250 string compressed_dir_time;
251 string current_dir_time_block;
252 const char *current_dir_time_ptr = nullptr, *current_dir_time_end = nullptr;
254 pair<string, dir_time> unread_record;
256 // Used in one-shot mode, repeatedly.
259 // Used in streaming mode.
260 ZSTD_DCtx *dir_time_ctx;
262 ZSTD_DDict *ddict = nullptr;
264 // If true, we've discovered an error or EOF, and will return only
265 // empty data from here.
266 bool eof = false, error = false;
269 ExistingDB::ExistingDB(int fd)
277 if (!try_complete_pread(fd, &hdr, sizeof(hdr), /*offset=*/0)) {
279 perror("pread(header)");
284 if (memcmp(hdr.magic, "\0plocate", 8) != 0) {
286 fprintf(stderr, "Old database had header mismatch, ignoring.\n");
291 if (hdr.version != 1 || hdr.max_version < 2) {
293 fprintf(stderr, "Old database had version mismatch (version=%d max_version=%d), ignoring.\n",
294 hdr.version, hdr.max_version);
300 // Compare the configuration block with our current one.
301 if (hdr.conf_block_length_bytes != conf_block.size()) {
303 fprintf(stderr, "Old database had different configuration block (size mismatch), ignoring.\n");
309 str.resize(hdr.conf_block_length_bytes);
310 if (!try_complete_pread(fd, str.data(), hdr.conf_block_length_bytes, hdr.conf_block_offset_bytes)) {
312 perror("pread(conf_block)");
317 if (str != conf_block) {
319 fprintf(stderr, "Old database had different configuration block (contents mismatch), ignoring.\n");
325 // Read dictionary, if it exists.
326 if (hdr.zstd_dictionary_length_bytes > 0) {
328 dictionary.resize(hdr.zstd_dictionary_length_bytes);
329 if (try_complete_pread(fd, &dictionary[0], hdr.zstd_dictionary_length_bytes, hdr.zstd_dictionary_offset_bytes)) {
330 ddict = ZSTD_createDDict(dictionary.data(), dictionary.size());
333 perror("pread(dictionary)");
339 compressed_dir_time_pos = hdr.directory_data_offset_bytes;
341 ctx = ZSTD_createDCtx();
342 dir_time_ctx = ZSTD_createDCtx();
345 ExistingDB::~ExistingDB()
352 pair<string, dir_time> ExistingDB::read_next()
354 if (!unread_record.first.empty()) {
355 auto ret = move(unread_record);
356 unread_record.first.clear();
361 return { "", not_a_dir };
364 // See if we need to read a new filename block.
365 if (current_filename_ptr == nullptr) {
366 if (current_docid >= hdr.num_docids) {
368 return { "", not_a_dir };
371 // Read the file offset from this docid and the next one.
372 // This is always allowed, since we have a sentinel block at the end.
373 off_t offset_for_block = hdr.filename_index_offset_bytes + current_docid * sizeof(uint64_t);
375 if (!try_complete_pread(fd, vals, sizeof(vals), offset_for_block)) {
377 perror("pread(offset)");
380 return { "", not_a_dir };
383 off_t offset = vals[0];
384 size_t compressed_len = vals[1] - vals[0];
385 unique_ptr<char[]> compressed(new char[compressed_len]);
386 if (!try_complete_pread(fd, compressed.get(), compressed_len, offset)) {
388 perror("pread(block)");
391 return { "", not_a_dir };
394 unsigned long long uncompressed_len = ZSTD_getFrameContentSize(compressed.get(), compressed_len);
395 if (uncompressed_len == ZSTD_CONTENTSIZE_UNKNOWN || uncompressed_len == ZSTD_CONTENTSIZE_ERROR) {
397 fprintf(stderr, "ZSTD_getFrameContentSize() failed\n");
400 return { "", not_a_dir };
404 block.resize(uncompressed_len + 1);
407 if (ddict != nullptr) {
408 err = ZSTD_decompress_usingDDict(ctx, &block[0], block.size(), compressed.get(),
409 compressed_len, ddict);
411 err = ZSTD_decompressDCtx(ctx, &block[0], block.size(), compressed.get(),
414 if (ZSTD_isError(err)) {
416 fprintf(stderr, "ZSTD_decompress(): %s\n", ZSTD_getErrorName(err));
419 return { "", not_a_dir };
421 block[block.size() - 1] = '\0';
422 current_filename_block = move(block);
423 current_filename_ptr = current_filename_block.data();
424 current_filename_end = current_filename_block.data() + current_filename_block.size();
428 // See if we need to read more directory time data.
429 while (current_dir_time_ptr == current_dir_time_end ||
430 (*current_dir_time_ptr != 0 &&
431 size_t(current_dir_time_end - current_dir_time_ptr) < sizeof(dir_time) + 1)) {
432 if (current_dir_time_ptr != nullptr) {
433 const size_t bytes_consumed = current_dir_time_ptr - current_dir_time_block.data();
434 current_dir_time_block.erase(current_dir_time_block.begin(), current_dir_time_block.begin() + bytes_consumed);
437 // See if we can get more data out without reading more.
438 const size_t existing_data = current_dir_time_block.size();
439 current_dir_time_block.resize(existing_data + 4096);
441 ZSTD_outBuffer outbuf;
442 outbuf.dst = current_dir_time_block.data() + existing_data;
447 inbuf.src = compressed_dir_time.data();
448 inbuf.size = compressed_dir_time.size();
451 int err = ZSTD_decompressStream(dir_time_ctx, &outbuf, &inbuf);
454 fprintf(stderr, "ZSTD_decompress(): %s\n", ZSTD_getErrorName(err));
457 return { "", not_a_dir };
459 compressed_dir_time.erase(compressed_dir_time.begin(), compressed_dir_time.begin() + inbuf.pos);
460 current_dir_time_block.resize(existing_data + outbuf.pos);
462 if (inbuf.pos == 0 && outbuf.pos == 0) {
463 // No movement, we'll need to try to read more data.
465 size_t bytes_to_read = min<size_t>(
466 hdr.directory_data_offset_bytes + hdr.directory_data_length_bytes - compressed_dir_time_pos,
468 if (bytes_to_read == 0) {
470 return { "", not_a_dir };
472 if (!try_complete_pread(fd, buf, bytes_to_read, compressed_dir_time_pos)) {
474 perror("pread(dirtime)");
477 return { "", not_a_dir };
479 compressed_dir_time_pos += bytes_to_read;
480 compressed_dir_time.insert(compressed_dir_time.end(), buf, buf + bytes_to_read);
482 // Next iteration will now try decompressing more.
485 current_dir_time_ptr = current_dir_time_block.data();
486 current_dir_time_end = current_dir_time_block.data() + current_dir_time_block.size();
489 string filename = current_filename_ptr;
490 current_filename_ptr += filename.size() + 1;
491 if (current_filename_ptr == current_filename_end) {
492 // End of this block.
493 current_filename_ptr = nullptr;
496 if (*current_dir_time_ptr == 0) {
497 ++current_dir_time_ptr;
498 return { move(filename), not_a_dir };
500 ++current_dir_time_ptr;
502 memcpy(&dt.sec, current_dir_time_ptr, sizeof(dt.sec));
503 current_dir_time_ptr += sizeof(dt.sec);
504 memcpy(&dt.nsec, current_dir_time_ptr, sizeof(dt.nsec));
505 current_dir_time_ptr += sizeof(dt.nsec);
506 return { move(filename), dt };
510 string ExistingDB::read_next_dictionary() const
512 if (hdr.next_zstd_dictionary_length_bytes == 0 || hdr.next_zstd_dictionary_length_bytes > 1048576) {
516 str.resize(hdr.next_zstd_dictionary_length_bytes);
517 if (!try_complete_pread(fd, str.data(), hdr.next_zstd_dictionary_length_bytes, hdr.next_zstd_dictionary_offset_bytes)) {
519 perror("pread(next_dictionary)");
526 // Scans the directory with absolute path “path”, which is opened as “fd”.
527 // Uses relative paths and openat() only, evading any issues with PATH_MAX
528 // and time-of-check-time-of-use race conditions. (mlocate's updatedb
529 // does a much more complicated dance with changing the current working
530 // directory, probably in the interest of portability to old platforms.)
531 // “parent_dev” must be the device of the parent directory of “path”.
533 // Takes ownership of fd.
534 int scan(const string &path, int fd, dev_t parent_dev, dir_time modified, dir_time db_modified, ExistingDB *existing_db, DatabaseReceiver *corpus, DictionaryBuilder *dict_builder)
536 if (conf_prune_bind_mounts && is_bind_mount(path.c_str())) {
537 if (conf_debug_pruning) {
538 /* This is debugging output, don't mark anything for translation */
539 fprintf(stderr, "Skipping `%s': bind mount\n", path.c_str());
545 // We read in the old directory no matter whether it is current or not,
546 // because even if we're not going to use it, we'll need the modification directory
547 // of any subdirectories.
549 // Skip over anything before this directory; it is stuff that we would have
550 // consumed earlier if we wanted it.
552 pair<string, dir_time> record = existing_db->read_next();
553 if (record.first.empty()) {
556 if (dir_path_cmp(path, record.first) <= 0) {
557 existing_db->unread(move(record));
562 // Now read everything in this directory.
563 vector<entry> db_entries;
564 const string path_plus_slash = path.back() == '/' ? path : path + '/';
566 pair<string, dir_time> record = existing_db->read_next();
567 if (record.first.empty()) {
571 if (record.first.rfind(path_plus_slash, 0) != 0) {
572 // No longer starts with path, so we're in a different directory.
573 existing_db->unread(move(record));
576 if (record.first.find_first_of('/', path_plus_slash.size()) != string::npos) {
577 // Entered into a subdirectory of a subdirectory.
578 // Due to our ordering, this also means we're done.
579 existing_db->unread(move(record));
584 e.name = record.first.substr(path_plus_slash.size());
585 e.is_directory = (record.second.sec >= 0);
586 e.db_modified = record.second;
587 db_entries.push_back(e);
591 vector<entry> entries;
592 if (!existing_db->get_error() && db_modified.sec > 0 &&
593 modified.sec == db_modified.sec && modified.nsec == db_modified.nsec) {
594 // Not changed since the last database, so we can replace the readdir()
595 // by reading from the database. (We still need to open and stat everything,
596 // though, but that happens in a later step.)
597 entries = move(db_entries);
599 for (const entry &e : entries) {
600 printf("%s/%s\n", path.c_str(), e.name.c_str());
604 dir = fdopendir(fd); // Takes over ownership of fd.
605 if (dir == nullptr) {
606 // fdopendir() wants to fstat() the fd to verify that it's indeed
607 // a directory, which can seemingly fail on at least CIFS filesystems
608 // if the server feels like it. We treat this as if we had an error
609 // on opening it, ie., ignore the directory.
615 while ((de = readdir(dir)) != nullptr) {
616 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) {
619 if (strlen(de->d_name) == 0) {
620 /* Unfortunately, this does happen, and mere assert() does not give
621 users enough information to complain to the right people. */
622 fprintf(stderr, "file system error: zero-length file name in directory %s", path.c_str());
628 if (de->d_type == DT_UNKNOWN) {
629 // Evidently some file systems, like older versions of XFS
630 // (mkfs.xfs -m crc=0 -n ftype=0), can return this,
631 // and we need a stat(). If we wanted to optimize for this,
632 // we could probably defer it to later (we're stat-ing directories
633 // when recursing), but this is rare, and not really worth it --
634 // the second stat() will be cached anyway.
636 if (fstatat(fd, de->d_name, &buf, AT_SYMLINK_NOFOLLOW) == 0 &&
637 S_ISDIR(buf.st_mode)) {
638 e.is_directory = true;
640 e.is_directory = false;
643 e.is_directory = (de->d_type == DT_DIR);
647 printf("%s/%s\n", path.c_str(), de->d_name);
649 entries.push_back(move(e));
652 sort(entries.begin(), entries.end(), [](const entry &a, const entry &b) {
653 return a.name < b.name;
656 // Load directory modification times from the old database.
657 auto db_it = db_entries.begin();
658 for (entry &e : entries) {
659 for (; db_it != db_entries.end(); ++db_it) {
660 if (e.name < db_it->name) {
663 if (e.name == db_it->name) {
664 e.db_modified = db_it->db_modified;
671 // For each entry, we want to add it to the database. but this includes the modification time
672 // for directories, which means we need to open and stat it at this point.
674 // This means we may need to have many directories open at the same time, but it seems to be
675 // the simplest (only?) way of being compatible with mlocate's notion of listing all contents
676 // of a given directory before recursing, without buffering even more information. Hopefully,
677 // we won't go out of file descriptors here (it could happen if someone has tens of thousands
678 // of subdirectories in a single directory); if so, the admin will need to raise the limit.
679 for (entry &e : entries) {
680 if (!e.is_directory) {
685 if (find(conf_prunenames.begin(), conf_prunenames.end(), e.name) != conf_prunenames.end()) {
686 if (conf_debug_pruning) {
687 /* This is debugging output, don't mark anything for translation */
688 fprintf(stderr, "Skipping `%s': in prunenames\n", e.name.c_str());
692 if (string_list_contains_dir_path(&conf_prunepaths, &conf_prunepaths_index, (path_plus_slash + e.name).c_str())) {
693 if (conf_debug_pruning) {
694 /* This is debugging output, don't mark anything for translation */
695 fprintf(stderr, "Skipping `%s/%s': in prunepaths\n", path.c_str(), e.name.c_str());
700 e.fd = opendir_noatime(fd, e.name.c_str());
702 if (errno == EMFILE || errno == ENFILE) {
703 // The admin probably wants to know about this.
704 perror((path_plus_slash + e.name).c_str());
707 if (getrlimit(RLIMIT_NOFILE, &rlim) == -1) {
708 fprintf(stderr, "Hint: Try `ulimit -n 131072' or similar.\n");
710 fprintf(stderr, "Hint: Try `ulimit -n %" PRIu64 " or similar (current limit is %" PRIu64 ").\n",
711 static_cast<uint64_t>(rlim.rlim_cur * 2), static_cast<uint64_t>(rlim.rlim_cur));
719 if (fstat(e.fd, &buf) != 0) {
720 // It's possible that this is a filesystem that's excluded
721 // (and the failure is e.g. because the network is down).
722 // As a last-ditch effort, we try to check that before dying,
723 // i.e., duplicate the check from further down.
725 // It would be better to be able to run filesystem_is_excluded()
726 // for cheap on everything and just avoid the stat, but it seems
727 // hard to do that without any kind of raciness.
728 if (filesystem_is_excluded((path_plus_slash + e.name).c_str())) {
734 perror((path_plus_slash + e.name).c_str());
739 if (buf.st_dev != parent_dev) {
740 if (filesystem_is_excluded((path_plus_slash + e.name).c_str())) {
747 e.dt = get_dirtime_from_stat(buf);
750 // Actually add all the entries we figured out dates for above.
751 for (const entry &e : entries) {
752 corpus->add_file(path_plus_slash + e.name, e.dt);
753 dict_builder->add_file(path_plus_slash + e.name, e.dt);
756 // Now scan subdirectories.
757 for (const entry &e : entries) {
758 if (e.is_directory && e.fd != -1) {
759 int ret = scan(path_plus_slash + e.name, e.fd, e.dev, e.dt, e.db_modified, existing_db, corpus, dict_builder);
761 // TODO: The unscanned file descriptors will leak, but it doesn't really matter,
762 // as we're about to exit.
769 if (dir == nullptr) {
777 int main(int argc, char **argv)
779 // We want to bump the file limit; do it if we can (usually we are root
780 // and can set whatever we want). 128k should be ample for most setups.
782 if (getrlimit(RLIMIT_NOFILE, &rlim) != -1) {
783 // Even root cannot increase rlim_cur beyond rlim_max,
784 // so we need to try to increase rlim_max first.
785 // Ignore errors, though.
786 if (rlim.rlim_max < 131072) {
787 rlim.rlim_max = 131072;
788 setrlimit(RLIMIT_NOFILE, &rlim);
789 getrlimit(RLIMIT_NOFILE, &rlim);
792 rlim_t wanted = std::max<rlim_t>(rlim.rlim_cur, 131072);
793 rlim.rlim_cur = std::min<rlim_t>(wanted, rlim.rlim_max);
794 setrlimit(RLIMIT_NOFILE, &rlim); // Ignore errors.
797 conf_prepare(argc, argv);
798 if (conf_prune_bind_mounts) {
799 bind_mount_init(MOUNTINFO_PATH);
802 int fd = open(conf_output.c_str(), O_RDONLY);
803 ExistingDB existing_db(fd);
805 DictionaryBuilder dict_builder(/*blocks_to_keep=*/1000, conf_block_size);
808 if (conf_check_visibility) {
809 group *grp = getgrnam(GROUPNAME);
810 if (grp == nullptr) {
811 fprintf(stderr, "Unknown group %s\n", GROUPNAME);
817 DatabaseBuilder db(conf_output.c_str(), owner, conf_block_size, existing_db.read_next_dictionary(), conf_check_visibility);
818 db.set_conf_block(conf_block);
819 DatabaseReceiver *corpus = db.start_corpus(/*store_dir_times=*/true);
821 int root_fd = opendir_noatime(AT_FDCWD, conf_scan_root);
828 if (fstat(root_fd, &buf) == -1) {
833 scan(conf_scan_root, root_fd, buf.st_dev, get_dirtime_from_stat(buf), /*db_modified=*/unknown_dir_time, &existing_db, corpus, &dict_builder);
835 // It's too late to use the dictionary for the data we already compressed,
836 // unless we wanted to either scan the entire file system again (acceptable
837 // for plocate-build where it's cheap, less so for us), or uncompressing
838 // and recompressing. Instead, we store it for next time, assuming that the
839 // data changes fairly little from time to time.
840 string next_dictionary = dict_builder.train(1024);
841 db.set_next_dictionary(next_dictionary);