From: Steinar H. Gunderson
Date: Sat, 13 Feb 2021 00:58:31 +0000 (+0100)
Subject: Microoptimizations to docid counting.
X-Git-Tag: 1.1.4~2
X-Git-Url: https://git.sesse.net/?p=plocate;a=commitdiff_plain;h=46e9aad1339d6d3dd3dc04181730fab2ce0e8ced

Microoptimizations to docid counting.
---

diff --git a/database-builder.cpp b/database-builder.cpp
index 75b96f8..cc88ea0 100644
--- a/database-builder.cpp
+++ b/database-builder.cpp
@@ -46,7 +46,11 @@ public:
 	void finish();
 
 	vector encoded;
-	size_t num_docids = 0;
+	size_t get_num_docids() const {
+		// Updated only when we flush, so check that we're finished.
+		assert(pending_deltas.empty());
+		return num_docids;
+	}
 
 private:
 	void write_header(uint32_t docid);
@@ -54,6 +58,7 @@ private:
 
 	vector pending_deltas;
 
+	uint32_t num_docids = 0; // Should be size_t, except the format only supports 2^32 docids per posting list anyway.
 	uint32_t last_docid = -1;
 };
 
@@ -69,8 +74,8 @@ void PostingListBuilder::add_docid(uint32_t docid)
 	if (pending_deltas.size() == 128) {
 		append_block();
 		pending_deltas.clear();
+		num_docids += 128;
 	}
-	++num_docids;
 }
 
 void PostingListBuilder::add_first_docid(uint32_t docid)
@@ -92,6 +97,9 @@ void PostingListBuilder::finish()
 	unsigned char buf[P4NENC_BOUND(128)];
 	unsigned char *end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, buf);
 	encoded.insert(encoded.end(), buf, end);
+
+	num_docids += pending_deltas.size();
+	pending_deltas.clear();
 }
 
 void PostingListBuilder::append_block()
@@ -434,7 +442,7 @@ unique_ptr create_hashtable(EncodingCorpus &corpus, const vectorget_pl_builder(trgm);
 		pl_builder.finish();
-		longest_posting_list = max(longest_posting_list, pl_builder.num_docids);
-		trigrams += pl_builder.num_docids;
+		longest_posting_list = max(longest_posting_list, pl_builder.get_num_docids());
+		trigrams += pl_builder.get_num_docids();
 		bytes_for_posting_lists += pl_builder.encoded.size();
 	}
 	size_t num_trigrams = corpus->num_trigrams();