git.sesse.net Git - plocate/blobdiff - database-builder.cpp
Release plocate 1.1.22.
[plocate] / database-builder.cpp
index cc88ea0489000618f781a920b4716be48befbae6..f9dfb715391b3be7c4c521710d48f86b3b10795b 100644 (file)
@@ -29,16 +29,6 @@ constexpr unsigned num_overflow_slots = 16;
 
 string zstd_compress(const string &src, ZSTD_CDict *cdict, string *tempbuf);
 
-// NOTE: Will read one byte past the end of the trigram, but it's OK,
-// since we always call it from contexts where there's a terminating zero byte.
-static inline uint32_t read_trigram(const string_view s, size_t start)
-{
-       uint32_t trgm;
-       memcpy(&trgm, s.data() + start, sizeof(trgm));
-       trgm = le32toh(trgm);
-       return trgm & 0xffffff;
-}
-
 class PostingListBuilder {
 public:
        inline void add_docid(uint32_t docid);
@@ -46,7 +36,8 @@ public:
        void finish();
 
        vector<unsigned char> encoded;
-       size_t get_num_docids() const {
+       size_t get_num_docids() const
+       {
                // Updated only when we flush, so check that we're finished.
                assert(pending_deltas.empty());
                return num_docids;
@@ -218,6 +209,7 @@ private:
 
        std::unique_ptr<PostingListBuilder *[]> invindex;
        FILE *outfp;
+       off_t outfp_pos;  // Cheaper than calling ftell(outfp) all the time.
        std::string current_block;
        std::string tempbuf;
        const size_t block_size;
@@ -229,9 +221,8 @@ private:
        std::string dir_times_compressed;
 };
 
-
 EncodingCorpus::EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times)
-       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
+       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), outfp_pos(ftell(outfp)), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
 {
        fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr);
        if (store_dir_times) {
@@ -313,24 +304,46 @@ void EncodingCorpus::flush_block()
 
        // Create trigrams.
        const char *ptr = current_block.c_str();
-       while (ptr < current_block.c_str() + current_block.size()) {
-               string_view s(ptr);
-               if (s.size() >= 3) {
-                       for (size_t j = 0; j < s.size() - 2; ++j) {
-                               uint32_t trgm = read_trigram(s, j);
-                               add_docid(trgm, docid);
+       const char *end = ptr + current_block.size();
+       while (ptr < end - 3) {  // Must be at least one filename left, that's at least three bytes.
+               if (ptr[0] == '\0') {
+                       // This filename is zero bytes, so skip it (and the zero terminator).
+                       ++ptr;
+                       continue;
+               } else if (ptr[1] == '\0') {
+                       // This filename is one byte, so skip it (and the zero terminator).
+                       ptr += 2;
+                       continue;
+               } else if (ptr[2] == '\0') {
+                       // This filename is two bytes, so skip it (and the zero terminator).
+                       ptr += 3;
+                       continue;
+               }
+               for (;;) {
+                       // NOTE: Reads one byte past the end of the trigram, but that's OK, since every
+                       // filename ends in a zero byte; that byte also feeds the end-of-filename check below.
+                       uint32_t trgm;
+                       memcpy(&trgm, ptr, sizeof(trgm));
+                       ++ptr;
+                       trgm = le32toh(trgm);
+                       add_docid(trgm & 0xffffff, docid);
+                       if (trgm <= 0xffffff) {
+                               // Terminating zero byte, so we're done with this filename.
+                               // Skip the remaining two bytes, and the zero terminator.
+                               ptr += 3;
+                               break;
                        }
                }
-               ptr += s.size() + 1;
        }
 
        // Compress and add the filename block.
-       filename_blocks.push_back(ftell(outfp));
+       filename_blocks.push_back(outfp_pos);
        string compressed = zstd_compress(current_block, cdict, &tempbuf);
        if (fwrite(compressed.data(), compressed.size(), 1, outfp) != 1) {
                perror("fwrite()");
                exit(1);
        }
+       outfp_pos += compressed.size();
 
        current_block.clear();
        num_files_in_block = 0;
@@ -474,24 +487,26 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_siz
        if (path.empty()) {
                path = ".";
        }
+       int fd = -1;
 #ifdef O_TMPFILE
-       int fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
-       if (fd == -1) {
+       fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
+       if (fd == -1 && errno != EOPNOTSUPP && errno != EISDIR) {
                perror(path.c_str());
                exit(1);
        }
-#else
-       temp_filename = string(outfile) + ".XXXXXX";
-       int fd = mkstemp(&temp_filename[0]);
+#endif
        if (fd == -1) {
-               perror(temp_filename.c_str());
-               exit(1);
-       }
-       if (fchmod(fd, 0640) == -1) {
-               perror("fchmod");
-               exit(1);
+               temp_filename = string(outfile) + ".XXXXXX";
+               fd = mkstemp(&temp_filename[0]);
+               if (fd == -1) {
+                       perror(temp_filename.c_str());
+                       exit(1);
+               }
+               if (fchmod(fd, 0640) == -1) {
+                       perror("fchmod");
+                       exit(1);
+               }
        }
-#endif
 
        if (owner != (gid_t)-1) {
                if (fchown(fd, (uid_t)-1, owner) == -1) {
@@ -668,22 +683,28 @@ void DatabaseBuilder::finish_corpus()
        fseek(outfp, 0, SEEK_SET);
        fwrite(&hdr, sizeof(hdr), 1, outfp);
 
+       // This is needed on systems that simulate linkat() by copying
+       // the contents of the file instead of linking.
+       fflush(outfp);
+
+       if (!temp_filename.empty()) {
+               if (rename(temp_filename.c_str(), outfile.c_str()) == -1) {
+                       perror("rename");
+                       exit(1);
+               }
+       } else {
 #ifdef O_TMPFILE
-       // Give the file a proper name, making it visible in the file system.
-       // TODO: It would be nice to be able to do this atomically, like with rename.
-       unlink(outfile.c_str());
-       char procpath[256];
-       snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fileno(outfp));
-       if (linkat(AT_FDCWD, procpath, AT_FDCWD, outfile.c_str(), AT_SYMLINK_FOLLOW) == -1) {
-               perror("linkat");
-               exit(1);
-       }
-#else
-       if (rename(temp_filename.c_str(), outfile.c_str()) == -1) {
-               perror("rename");
-               exit(1);
-       }
+               // Give the file a proper name, making it visible in the file system.
+               // TODO: It would be nice to be able to do this atomically, like with rename.
+               unlink(outfile.c_str());
+               char procpath[256];
+               snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fileno(outfp));
+               if (linkat(AT_FDCWD, procpath, AT_FDCWD, outfile.c_str(), AT_SYMLINK_FOLLOW) == -1) {
+                       perror("linkat");
+                       exit(1);
+               }
 #endif
+       }
 
        fclose(outfp);