git.sesse.net Git - plocate/blobdiff - database-builder.cpp
Proof-of-concept of using ICU for strength-zero searches.
[plocate] / database-builder.cpp
index da52cdb57764b6d60247c74e4318a84872503c49..e9c67a7fd4c315182248768b02bb2f2d512d98d3 100644 (file)
@@ -17,6 +17,8 @@
 #include <unistd.h>
 #include <zdict.h>
 #include <zstd.h>
+#include <unicode/coll.h>
+#include <unicode/locid.h>
 
 #define P4NENC_BOUND(n) ((n + 127) / 128 + (n + 32) * sizeof(uint32_t))
 
@@ -36,7 +38,8 @@ public:
        void finish();
 
        vector<unsigned char> encoded;
-       size_t get_num_docids() const {
+       size_t get_num_docids() const
+       {
                // Updated only when we flush, so check that we're finished.
                assert(pending_deltas.empty());
                return num_docids;
@@ -208,6 +211,7 @@ private:
 
        std::unique_ptr<PostingListBuilder *[]> invindex;
        FILE *outfp;
+       off_t outfp_pos;  // Cheaper than calling ftell(outfp) all the time.
        std::string current_block;
        std::string tempbuf;
        const size_t block_size;
@@ -219,9 +223,8 @@ private:
        std::string dir_times_compressed;
 };
 
-
 EncodingCorpus::EncodingCorpus(FILE *outfp, size_t block_size, ZSTD_CDict *cdict, bool store_dir_times)
-       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
+       : invindex(new PostingListBuilder *[NUM_TRIGRAMS]), outfp(outfp), outfp_pos(ftell(outfp)), block_size(block_size), store_dir_times(store_dir_times), cdict(cdict)
 {
        fill(invindex.get(), invindex.get() + NUM_TRIGRAMS, nullptr);
        if (store_dir_times) {
@@ -301,6 +304,52 @@ void EncodingCorpus::flush_block()
 
        uint32_t docid = num_blocks;
 
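+       // For every available ICU collation locale (except en_US_POSIX), index the trigrams of each filename's primary-strength sort key.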
+       vector<uint8_t> sort_key;
+       sort_key.resize(32);
+        int32_t num_locales;
+        const icu::Locale* locales = icu::Collator::getAvailableLocales(num_locales);
+        for (int i = 0; i < num_locales; ++i) {
+                const icu::Locale &loc = locales[i];
+               if (strcmp(loc.getName(), "en_US_POSIX") == 0) {
+                       continue;  // Too weird.
+               }
+                UErrorCode status = U_ZERO_ERROR;
+               icu::Collator *coll = icu::Collator::createInstance(loc, status);
+               if (U_FAILURE(status)) {
+                       fprintf(stderr, "ERROR: Failed to create collator\n");
+                       exit(1);
+               }
+               coll->setStrength(icu::Collator::PRIMARY);
+               const char *ptr = current_block.c_str();
+               const char *end = ptr + current_block.size();
+               while (ptr < end) {
+                       size_t len = strlen(ptr);
+                       int32_t sortkey_len;
+                       for ( ;; ) {
+                               sortkey_len = coll->getSortKey(icu::UnicodeString::fromUTF8(icu::StringPiece(ptr, len)), sort_key.data(), sort_key.size());
+                               if (sortkey_len < sort_key.size()) {  // Note <, not <=; we need to keep a slop byte.
+                                       break;
+                               }
+                               sort_key.resize(sortkey_len * 3 / 2);
+                       }
+
+                       const uint8_t *keyptr = &sort_key[0];
+                       const uint8_t *keyend = keyptr + sortkey_len;
+                       while (keyptr < keyend - 3) {
+                               // NOTE: Will read one byte past the end of the trigram, but it's OK,
+                               // since we always call it from contexts where there's a terminating zero byte.
+                               uint32_t trgm;
+                               memcpy(&trgm, keyptr, sizeof(trgm));
+                               ++keyptr;
+                               trgm = le32toh(trgm);
+                               add_docid(trgm & 0xffffff, docid);
+                       }
+
+                       ptr += len + 1;
+               }
+        }
+#if 0
        // Create trigrams.
        const char *ptr = current_block.c_str();
        const char *end = ptr + current_block.size();
@@ -318,7 +367,7 @@ void EncodingCorpus::flush_block()
                        ptr += 3;
                        continue;
                }
-               for ( ;; ) {
+               for (;;) {
                        // NOTE: Will read one byte past the end of the trigram, but it's OK,
                        // since we always call it from contexts where there's a terminating zero byte.
                        uint32_t trgm;
@@ -334,14 +383,16 @@ void EncodingCorpus::flush_block()
                        }
                }
        }
+#endif
 
        // Compress and add the filename block.
-       filename_blocks.push_back(ftell(outfp));
+       filename_blocks.push_back(outfp_pos);
        string compressed = zstd_compress(current_block, cdict, &tempbuf);
        if (fwrite(compressed.data(), compressed.size(), 1, outfp) != 1) {
                perror("fwrite()");
                exit(1);
        }
+       outfp_pos += compressed.size();
 
        current_block.clear();
        num_files_in_block = 0;
@@ -485,24 +536,26 @@ DatabaseBuilder::DatabaseBuilder(const char *outfile, gid_t owner, int block_siz
        if (path.empty()) {
                path = ".";
        }
+       int fd = -1;
 #ifdef O_TMPFILE
-       int fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
-       if (fd == -1) {
+       fd = open(path.c_str(), O_WRONLY | O_TMPFILE, 0640);
+       if (fd == -1 && errno != EOPNOTSUPP && errno != EISDIR) {
                perror(path.c_str());
                exit(1);
        }
-#else
-       temp_filename = string(outfile) + ".XXXXXX";
-       int fd = mkstemp(&temp_filename[0]);
+#endif
        if (fd == -1) {
-               perror(temp_filename.c_str());
-               exit(1);
-       }
-       if (fchmod(fd, 0640) == -1) {
-               perror("fchmod");
-               exit(1);
+               temp_filename = string(outfile) + ".XXXXXX";
+               fd = mkstemp(&temp_filename[0]);
+               if (fd == -1) {
+                       perror(temp_filename.c_str());
+                       exit(1);
+               }
+               if (fchmod(fd, 0640) == -1) {
+                       perror("fchmod");
+                       exit(1);
+               }
        }
-#endif
 
        if (owner != (gid_t)-1) {
                if (fchown(fd, (uid_t)-1, owner) == -1) {
@@ -679,22 +732,24 @@ void DatabaseBuilder::finish_corpus()
        fseek(outfp, 0, SEEK_SET);
        fwrite(&hdr, sizeof(hdr), 1, outfp);
 
+       if (!temp_filename.empty()) {
+               if (rename(temp_filename.c_str(), outfile.c_str()) == -1) {
+                       perror("rename");
+                       exit(1);
+               }
+       } else {
 #ifdef O_TMPFILE
-       // Give the file a proper name, making it visible in the file system.
-       // TODO: It would be nice to be able to do this atomically, like with rename.
-       unlink(outfile.c_str());
-       char procpath[256];
-       snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fileno(outfp));
-       if (linkat(AT_FDCWD, procpath, AT_FDCWD, outfile.c_str(), AT_SYMLINK_FOLLOW) == -1) {
-               perror("linkat");
-               exit(1);
-       }
-#else
-       if (rename(temp_filename.c_str(), outfile.c_str()) == -1) {
-               perror("rename");
-               exit(1);
-       }
+               // Give the file a proper name, making it visible in the file system.
+               // TODO: It would be nice to be able to do this atomically, like with rename.
+               unlink(outfile.c_str());
+               char procpath[256];
+               snprintf(procpath, sizeof(procpath), "/proc/self/fd/%d", fileno(outfp));
+               if (linkat(AT_FDCWD, procpath, AT_FDCWD, outfile.c_str(), AT_SYMLINK_FOLLOW) == -1) {
+                       perror("linkat");
+                       exit(1);
+               }
 #endif
+       }
 
        fclose(outfp);