git.sesse.net Git - plocate/blobdiff - plocate.cpp
Fix some warnings found by Clang.
[plocate] / plocate.cpp
index 8ba5aa8aeea1fb660798d06f10f3d6d160c3facf..336483b87efb96c8a863c4b9661715544f7fd2b9 100644 (file)
@@ -1,6 +1,5 @@
 #include "db.h"
 #include "io_uring_engine.h"
-#include "vp4.h"
 
 #include <algorithm>
 #include <arpa/inet.h>
@@ -10,6 +9,7 @@
 #include <fcntl.h>
 #include <functional>
 #include <getopt.h>
+#include <limits.h>
 #include <memory>
 #include <stdio.h>
 #include <string.h>
@@ -25,6 +25,8 @@ using namespace std::chrono;
 #define dprintf(...)
 //#define dprintf(...) fprintf(stderr, __VA_ARGS__);
 
+#include "turbopfor.h"
+
 const char *dbpath = "/var/lib/mlocate/plocate.db";
 bool print_nul = false;
 
@@ -50,7 +52,7 @@ private:
 
 void Serializer::print_delayed(int seq, const vector<string> msg)
 {
-       pending.push(Element{seq, move(msg)});
+       pending.push(Element{ seq, move(msg) });
 }
 
 void Serializer::release_current()
@@ -160,7 +162,7 @@ Corpus::~Corpus()
 void Corpus::find_trigram(uint32_t trgm, function<void(const Trigram *trgmptr, size_t len)> cb)
 {
        uint32_t bucket = hash_trigram(trgm, hdr.hashtable_size);
-       engine->submit_read(fd, sizeof(Trigram) * (hdr.extra_ht_slots + 2), hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket, [this, trgm, bucket, cb{ move(cb) }](string s) {
+       engine->submit_read(fd, sizeof(Trigram) * (hdr.extra_ht_slots + 2), hdr.hash_table_offset_bytes + sizeof(Trigram) * bucket, [this, trgm, cb{ move(cb) }](string s) {
                const Trigram *trgmptr = reinterpret_cast<const Trigram *>(s.data());
                for (unsigned i = 0; i < hdr.extra_ht_slots + 1; ++i) {
                        if (trgmptr[i].trgm == trgm) {
@@ -188,12 +190,7 @@ void Corpus::get_compressed_filename_block(uint32_t docid, function<void(string)
 
 size_t Corpus::get_num_filename_blocks() const
 {
-       // The beginning of the filename blocks is the end of the filename index blocks.
-       uint64_t end;
-       complete_pread(fd, &end, sizeof(end), hdr.filename_index_offset_bytes);
-
-       // Subtract the sentinel block.
-       return (end - hdr.filename_index_offset_bytes) / sizeof(uint64_t) - 1;
+       return hdr.num_docids;
 }
 
 size_t scan_file_block(const vector<string> &needles, string_view compressed,
@@ -321,16 +318,26 @@ void do_search_file(const vector<string> &needles, const char *filename)
        dprintf("Corpus init done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
 
        vector<pair<Trigram, size_t>> trigrams;
+       uint64_t shortest_so_far = numeric_limits<uint32_t>::max();
        for (const string &needle : needles) {
-               if (needle.size() < 3) continue;
+               if (needle.size() < 3)
+                       continue;
                for (size_t i = 0; i < needle.size() - 2; ++i) {
                        uint32_t trgm = read_trigram(needle, i);
-                       corpus.find_trigram(trgm, [trgm, &trigrams](const Trigram *trgmptr, size_t len) {
+                       corpus.find_trigram(trgm, [trgm, &trigrams, &shortest_so_far](const Trigram *trgmptr, size_t len) {
                                if (trgmptr == nullptr) {
-                                       dprintf("trigram %06x isn't found, we abort the search\n", trgm);
-                                       return;
+                                       dprintf("trigram '%c%c%c' isn't found, we abort the search\n",
+                                               trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff);
+                                       exit(0);
+                               }
+                               if (trgmptr->num_docids > shortest_so_far * 100) {
+                                       dprintf("not loading trigram '%c%c%c' with %u docids, it would be ignored later anyway\n",
+                                               trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff,
+                                               trgmptr->num_docids);
+                               } else {
+                                       trigrams.emplace_back(*trgmptr, len);
+                                       shortest_so_far = std::min<uint64_t>(shortest_so_far, trgmptr->num_docids);
                                }
-                               trigrams.emplace_back(*trgmptr, len);
                        });
                }
        }
@@ -375,7 +382,7 @@ void do_search_file(const vector<string> &needles, const char *filename)
                        if (done)
                                break;
                }
-               engine.submit_read(fd, len, trgmptr.offset, [trgmptr, len, &done, &in1, &in2, &out](string s) {
+               engine.submit_read(fd, len, trgmptr.offset, [trgmptr{trgmptr}, len{len}, &done, &in1, &in2, &out](string s) {
                        if (done)
                                return;
                        uint32_t trgm __attribute__((unused)) = trgmptr.trgm;
@@ -383,7 +390,7 @@ void do_search_file(const vector<string> &needles, const char *filename)
                        unsigned char *pldata = reinterpret_cast<unsigned char *>(s.data());
                        if (in1.empty()) {
                                in1.resize(num + 128);
-                               p4nd1dec128v32(pldata, num, &in1[0]);
+                               decode_pfor_delta1<128>(pldata, num, /*interleaved=*/true, &in1[0]);
                                in1.resize(num);
                                dprintf("trigram '%c%c%c' (%zu bytes) decoded to %zu entries\n", trgm & 0xff,
                                        (trgm >> 8) & 0xff, (trgm >> 16) & 0xff, len, num);
@@ -391,7 +398,7 @@ void do_search_file(const vector<string> &needles, const char *filename)
                                if (in2.size() < num + 128) {
                                        in2.resize(num + 128);
                                }
-                               p4nd1dec128v32(pldata, num, &in2[0]);
+                               decode_pfor_delta1<128>(pldata, num, /*interleaved=*/true, &in2[0]);
 
                                out.clear();
                                set_intersection(in1.begin(), in1.end(), in2.begin(), in2.begin() + num,
@@ -423,6 +430,7 @@ void usage()
 {
        // The help text comes from mlocate.
        printf("Usage: plocate [OPTION]... PATTERN...\n");
+       printf("\n");
        printf("  -d, --database DBPATH  use DBPATH instead of default database (which is\n");
        printf("                         %s)\n", dbpath);
        printf("  -h, --help             print this help\n");