Implement the --count option.

author Steinar H. Gunderson <steinar+git@gunderson.no>

Thu, 8 Oct 2020 18:05:41 +0000 (20:05 +0200)

committer Steinar H. Gunderson <steinar+git@gunderson.no>

Thu, 8 Oct 2020 18:05:41 +0000 (20:05 +0200)
author Steinar H. Gunderson <steinar+git@gunderson.no>
Thu, 8 Oct 2020 18:05:41 +0000 (20:05 +0200)
committer Steinar H. Gunderson <steinar+git@gunderson.no>
Thu, 8 Oct 2020 18:05:41 +0000 (20:05 +0200)
diff --git a/plocate.cpp b/plocate.cpp

index 336483b87efb96c8a863c4b9661715544f7fd2b9..fa25fc02a01d4fc3c35835196c632cb0267866d1 100644 (file)
--- a/plocate.cpp
+++ b/plocate.cpp
@@ -28,6 +28,7 @@ using namespace std::chrono;
  #include "turbopfor.h"
  
  const char *dbpath = "/var/lib/mlocate/plocate.db";
+bool only_count = false;
  bool print_nul = false;
  
  class Serializer {
@@ -193,11 +194,11 @@ size_t Corpus::get_num_filename_blocks() const
         return hdr.num_docids;
  }
  
-size_t scan_file_block(const vector<string> &needles, string_view compressed,
+uint64_t scan_file_block(const vector<string> &needles, string_view compressed,
                         unordered_map<string, bool> *access_rx_cache, int seq,
                         Serializer *serializer)
  {
-       size_t matched = 0;
+       uint64_t matched = 0;
  
         unsigned long long uncompressed_len = ZSTD_getFrameContentSize(compressed.data(), compressed.size());
         if (uncompressed_len == ZSTD_CONTENTSIZE_UNKNOWN || uncompressed_len == ZSTD_CONTENTSIZE_ERROR) {
@@ -231,6 +232,7 @@ size_t scan_file_block(const vector<string> &needles, string_view compressed,
                 }
                 if (found && has_access(filename, access_rx_cache)) {
                         ++matched;
+                       if (only_count) continue;
                         if (immediate_print) {
                                 if (print_nul) {
                                         printf("%s%c", filename, 0);
@@ -242,7 +244,7 @@ size_t scan_file_block(const vector<string> &needles, string_view compressed,
                         }
                 }
         }
-       if (serializer != nullptr) {
+       if (serializer != nullptr && !only_count) {
                 if (immediate_print) {
                         serializer->release_current();
                 } else {
@@ -256,7 +258,7 @@ size_t scan_docids(const vector<string> &needles, const vector<uint32_t> &docids
  {
         Serializer docids_in_order;
         unordered_map<string, bool> access_rx_cache;
-       size_t matched = 0;
+       uint64_t matched = 0;
         for (size_t i = 0; i < docids.size(); ++i) {
                 uint32_t docid = docids[i];
                 corpus.get_compressed_filename_block(docid, [i, &matched, &needles, &access_rx_cache, &docids_in_order](string compressed) {
@@ -270,13 +272,14 @@ size_t scan_docids(const vector<string> &needles, const vector<uint32_t> &docids
  // We do this sequentially, as it's faster than scattering
  // a lot of I/O through io_uring and hoping the kernel will
  // coalesce it plus readahead for us.
-void scan_all_docids(const vector<string> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
+uint64_t scan_all_docids(const vector<string> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
  {
         unordered_map<string, bool> access_rx_cache;
         uint32_t num_blocks = corpus.get_num_filename_blocks();
         unique_ptr<uint64_t[]> offsets(new uint64_t[num_blocks + 1]);
         complete_pread(fd, offsets.get(), (num_blocks + 1) * sizeof(uint64_t), corpus.offset_for_block(0));
         string compressed;
+       uint64_t matched = 0;
         for (uint32_t io_docid = 0; io_docid < num_blocks; io_docid += 32) {
                 uint32_t last_docid = std::min(io_docid + 32, num_blocks);
                 size_t io_len = offsets[last_docid] - offsets[io_docid];
@@ -288,9 +291,10 @@ void scan_all_docids(const vector<string> &needles, int fd, const Corpus &corpus
                 for (uint32_t docid = io_docid; docid < last_docid; ++docid) {
                         size_t relative_offset = offsets[docid] - offsets[io_docid];
                         size_t len = offsets[docid + 1] - offsets[docid];
-                       scan_file_block(needles, { &compressed[relative_offset], len }, &access_rx_cache, 0, nullptr);
+                       matched += scan_file_block(needles, { &compressed[relative_offset], len }, &access_rx_cache, 0, nullptr);
                 }
         }
+       return matched;
  }
  
  void do_search_file(const vector<string> &needles, const char *filename)
@@ -328,6 +332,9 @@ void do_search_file(const vector<string> &needles, const char *filename)
                                 if (trgmptr == nullptr) {
                                         dprintf("trigram '%c%c%c' isn't found, we abort the search\n",
                                                 trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff);
+                                       if (only_count) {
+                                               printf("0\n");
+                                       }
                                         exit(0);
                                 }
                                 if (trgmptr->num_docids > shortest_so_far * 100) {
@@ -349,7 +356,8 @@ void do_search_file(const vector<string> &needles, const char *filename)
                 // (We could have searched through all trigrams that matched
                 // the pattern and done a union of them, but that's a lot of
                 // work for fairly unclear gain.)
-               scan_all_docids(needles, fd, corpus, &engine);
+               uint64_t matched = scan_all_docids(needles, fd, corpus, &engine);
+               printf("%zu\n", matched);
                 return;
         }
         sort(trigrams.begin(), trigrams.end());
@@ -421,9 +429,13 @@ void do_search_file(const vector<string> &needles, const char *filename)
         dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
                 1e3 * duration<float>(steady_clock::now() - start).count());
  
-       size_t matched __attribute__((unused)) = scan_docids(needles, in1, corpus, &engine);
+       uint64_t matched = scan_docids(needles, in1, corpus, &engine);
         dprintf("Done in %.1f ms, found %zu matches.\n",
                 1e3 * duration<float>(steady_clock::now() - start).count(), matched);
+
+       if (only_count) {
+               printf("%zu\n", matched);
+       }
  }
  
  void usage()
@@ -431,6 +443,7 @@ void usage()
         // The help text comes from mlocate.
         printf("Usage: plocate [OPTION]... PATTERN...\n");
         printf("\n");
+       printf("  -c, --count            only print number of found entries\n");
         printf("  -d, --database DBPATH  use DBPATH instead of default database (which is\n");
         printf("                         %s)\n", dbpath);
         printf("  -h, --help             print this help\n");
@@ -441,6 +454,7 @@ int main(int argc, char **argv)
  {
         static const struct option long_options[] = {
                 { "help", no_argument, 0, 'h' },
+               { "count", no_argument, 0, 'c' },
                 { "database", required_argument, 0, 'd' },
                 { "null", no_argument, 0, '0' },
                 { 0, 0, 0, 0 }
@@ -448,11 +462,14 @@ int main(int argc, char **argv)
  
         for (;;) {
                 int option_index = 0;
-               int c = getopt_long(argc, argv, "d:h0", long_options, &option_index);
+               int c = getopt_long(argc, argv, "cd:h0", long_options, &option_index);
                 if (c == -1) {
                         break;
                 }
                 switch (c) {
+               case 'c':
+                       only_count = true;
+                       break;
                 case 'd':
                         dbpath = strdup(optarg);
                         break;
author	Steinar H. Gunderson <steinar+git@gunderson.no>
	Thu, 8 Oct 2020 18:05:41 +0000 (20:05 +0200)
committer	Steinar H. Gunderson <steinar+git@gunderson.no>
	Thu, 8 Oct 2020 18:05:41 +0000 (20:05 +0200)