Release plocate 1.1.22.

[plocate] / database-builder.cpp
diff --git a/database-builder.cpp b/database-builder.cpp

index 76479aa3dde8673e5a5f389f244d8895f3370b31..f9dfb715391b3be7c4c521710d48f86b3b10795b 100644 (file)
--- a/database-builder.cpp
+++ b/database-builder.cpp
@@ -29,16 +29,6 @@ constexpr unsigned num_overflow_slots = 16;
  
  string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf);
  
-// NOTE: Will read one byte past the end of the trigram, but it's OK,
-// since we always call it from contexts where there's a terminating zero byte.
-static inline uint32_t read_trigram(const string_view s, size_t start)
-{
-       uint32_t trgm;
-       memcpy(&trgm, s.data() + start, sizeof(trgm));
-       trgm = le32toh(trgm);
-       return trgm & 0xffffff;
-}
-
  class PostingListBuilder {
  public:
         inline void add_docid(uint32_t docid);
@@ -46,7 +36,12 @@ public:
         void finish();
  
         vector<unsigned char> encoded;
-       size_t num_docids = 0;
+       size_t get_num_docids() const
+       {
+               // Updated only when we flush, so check that we're finished.
+               assert(pending_deltas.empty());
+               return num_docids;
+       }
  
  private:
         void write_header(uint32_t docid);
@@ -54,6 +49,7 @@ private:
  
         vector<uint32_t> pending_deltas;
  
+       uint32_t num_docids = 0;  // Should be size_t, except the format only supports 2^32 docids per posting list anyway.
         uint32_t last_docid = -1;
  };
  
@@ -64,15 +60,13 @@ void PostingListBuilder::add_docid(uint32_t docid)
                 return;
         }
  
-       assert(num_docids != 0);
-
         pending_deltas.push_back(docid - last_docid - 1);
         last_docid = docid;
         if (pending_deltas.size() == 128) {
                 append_block();
                 pending_deltas.clear();
+               num_docids += 128;
         }
-       ++num_docids;
  }
  
  void PostingListBuilder::add_first_docid(uint32_t docid)
@@ -94,6 +88,9 @@ void PostingListBuilder::finish()
         unsigned char buf[P4NENC_BOUND(128)];
         unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, buf);
         encoded.insert(encoded.end(), buf, end);
+
+       num_docids += pending_deltas.size();
+       pending_deltas.clear();
  }
  
  void PostingListBuilder::append_block()
@@ -212,6 +209,7 @@ private:
  
         std::unique_ptr<PostingListBuilder *[]> invindex;
         FILE *outfp;
+       off_t outfp_pos;  // Cheaper than calling ftell(outfp) all the time.
         std::string current_block;
         std::string tempbuf;
         const size_t block_size;
@@ -223,9 +221,8 @@ private:
         std::string dir_times_compressed;
  };
  
-
  EncodingCorpus::EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times)
-       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
+       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), outfp_pos(ftell(outfp)), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
  {
         fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr);
         if (store_dir_times) {
@@ -307,24 +304,46 @@ void EncodingCorpus::flush_block()
  
         // Create trigrams.
         const char *ptr = current_block.c_str();
-       while (ptr < current_block.c_str() + current_block.size()) {
-               string_view s(ptr);
-               if (s.size() >= 3) {
-                       for (size_t j = 0; j < s.size() - 2; ++j) {
-                               uint32_t trgm = read_trigram(s, j);
-                               add_docid(trgm, docid);
+       const char *end = ptr + current_block.size();
+       while (ptr < end - 3) {  // Must be at least one filename left, that's at least three bytes.
+               if (ptr[0] == '\0') {
+                       // This filename is zero bytes, so skip it (and the zero terminator).
+                       ++ptr;
+                       continue;
+               } else if (ptr[1] == '\0') {
+                       // This filename is one byte, so skip it (and the zero terminator).
+                       ptr += 2;
+                       continue;
+               } else if (ptr[2] == '\0') {
+                       // This filename is two bytes, so skip it (and the zero terminator).
+                       ptr += 3;
+                       continue;
+               }
+               for (;;) {
+                       // NOTE: The memcpy reads one byte past the end of the trigram, but it's OK,
+                       // since every filename in the block is followed by a terminating zero byte.
+                       uint32_t trgm;
+                       memcpy(&trgm, ptr, sizeof(trgm));
+                       ++ptr;
+                       trgm = le32toh(trgm);
+                       add_docid(trgm & 0xffffff, docid);
+                       if (trgm <= 0xffffff) {
+                               // Terminating zero byte, so we're done with this filename.
+                               // Skip the remaining two bytes, and the zero terminator.
+                               ptr += 3;
+                               break;
                         }
                 }
-               ptr += s.size() + 1;
         }
  
         // Compress and add the filename block.
-       filename_blocks.push_back(ftell(outfp));
+       filename_blocks.push_back(outfp_pos);
         string compressed = zstd_compress(current_block, cdict, &tempbuf);
         if (fwrite(compressed.data(), compressed.size(), 1, outfp) != 1) {
                 perror("fwrite()");
                 exit(1);
         }
+       outfp_pos += compressed.size();
  
         current_block.clear();
         num_files_in_block = 0;
@@ -436,7 +455,7 @@ unique_ptr<Trigram[]> create_hashtable(EncodingCorpus &corpus, const vector<uint
         }
         for (uint32_t trgm : all_trigrams) {
                 // We don't know offset yet, so set it to zero.
-               Trigram to_insert{ trgm, uint32_t(corpus.get_pl_builder(trgm).num_docids), 0 };
+               Trigram to_insert{ trgm, uint32_t(corpus.get_pl_builder(trgm).get_num_docids()), 0 };
  
                 uint32_t bucket = hash_trigram(trgm, ht_size);
                 unsigned distance = 0;
@@ -468,24 +487,26 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_siz
         if (path.empty()) {
                 path = ".";
         }
+       int fd = -1;
  #ifdef O_TMPFILE
-       int fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
-       if (fd == -1) {
+       fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
+       if (fd == -1 && errno != EOPNOTSUPP && errno != EISDIR) {
                 perror(path.c_str());
                 exit(1);
         }
-#else
-       temp_filename = string(outfile) + ".XXXXXX";
-       int fd = mkstemp(&temp_filename[0]);
+#endif
         if (fd == -1) {
-               perror(temp_filename.c_str());
-               exit(1);
-       }
-       if (fchmod(fd, 0640) == -1) {
-               perror("fchmod");
-               exit(1);
+               temp_filename = string(outfile) + ".XXXXXX";
+               fd = mkstemp(&temp_filename[0]);
+               if (fd == -1) {
+                       perror(temp_filename.c_str());
+                       exit(1);
+               }
+               if (fchmod(fd, 0640) == -1) {
+                       perror("fchmod");
+                       exit(1);
+               }
         }
-#endif
  
         if (owner != (gid_t)-1) {
                 if (fchown(fd, (uid_t)-1, owner) == -1) {
@@ -572,8 +593,8 @@ void DatabaseBuilder::finish_corpus()
                         continue;
                 PostingListBuilder &pl_builder = corpus->get_pl_builder(trgm);
                 pl_builder.finish();
-               longest_posting_list = max(longest_posting_list, pl_builder.num_docids);
-               trigrams += pl_builder.num_docids;
+               longest_posting_list = max(longest_posting_list, pl_builder.get_num_docids());
+               trigrams += pl_builder.get_num_docids();
                 bytes_for_posting_lists += pl_builder.encoded.size();
         }
         size_t num_trigrams = corpus->num_trigrams();
@@ -662,22 +683,28 @@ void DatabaseBuilder::finish_corpus()
         fseek(outfp, 0, SEEK_SET);
         fwrite(&hdr, sizeof(hdr), 1, outfp);
  
+       // This is needed on systems that simulate linkat() by copying
+       // the contents of the file instead of linking.
+       fflush(outfp);
+
+       if (!temp_filename.empty()) {
+               if (rename(temp_filename.c_str(), outfile.c_str()) == -1) {
+                       perror("rename");
+                       exit(1);
+               }
+       } else {
  #ifdef O_TMPFILE
-       // Give the file a proper name, making it visible in the file system.
-       // TODO: It would be nice to be able to do this atomically, like with rename.
-       unlink(outfile.c_str());
-       char procpath[256];
-       snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fileno(outfp));
-       if (linkat(AT_FDCWD, procpath, AT_FDCWD, outfile.c_str(), AT_SYMLINK_FOLLOW) == -1) {
-               perror("linkat");
-               exit(1);
-       }
-#else
-       if (rename(temp_filename.c_str(), outfile.c_str()) == -1) {
-               perror("rename");
-               exit(1);
-       }
+               // Give the file a proper name, making it visible in the file system.
+               // TODO: It would be nice to be able to do this atomically, like with rename.
+               unlink(outfile.c_str());
+               char procpath[256];
+               snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fileno(outfp));
+               if (linkat(AT_FDCWD, procpath, AT_FDCWD, outfile.c_str(), AT_SYMLINK_FOLLOW) == -1) {
+                       perror("linkat");
+                       exit(1);
+               }
  #endif
+       }
  
         fclose(outfp);