+// Incrementally builds the encoded byte string for one posting list.
+//
+// The first docid is stored through write_header(); each later docid is
+// stored as (docid - previous docid - 1) and these deltas are encoded in
+// blocks of 128. finish() encodes whatever deltas did not fill a whole block.
+//
+// NOTE(review): the delta arithmetic is unsigned, so docids are presumably
+// expected to arrive in increasing order -- confirm against the callers.
+class PostingListBuilder {
+public:
+	// Adds one docid; a docid equal to the one added just before it is ignored.
+	inline void add_docid(uint32_t docid);
+
+	// Encodes and appends the deltas that are still pending, if any.
+	void finish();
+
+	// Encoded output so far: the header followed by the encoded blocks.
+	string encoded;
+
+	// Number of docids accepted by add_docid() (ignored repeats not counted).
+	size_t num_docids = 0;
+
+private:
+	// Appends the encoding of the very first docid to `encoded`.
+	void write_header(uint32_t docid);
+
+	// Encodes exactly 128 pending deltas and appends them to `encoded`.
+	void append_block();
+
+	// Deltas collected since the last full block was appended.
+	vector<uint32_t> pending_deltas;
+
+	// Docid at which the most recent full block ended (or the first docid if
+	// no full block has been appended yet). Given a defined starting value so
+	// it is never indeterminate before the first docid arrives.
+	uint32_t last_block_end = 0;
+
+	// Most recently accepted docid; starts out as the all-ones value.
+	uint32_t last_docid = -1;
+};
+
+// Adds one docid to the posting list.
+//
+// The very first docid is written through write_header(). Every later docid
+// is queued as the gap to its predecessor minus one; once 128 gaps have been
+// queued they are encoded as one block and the queue is emptied. A docid equal
+// to the one accepted immediately before it is ignored.
+void PostingListBuilder::add_docid(uint32_t docid)
+{
+	// Deduplicate against the last inserted value, if any. The comparison is
+	// only meaningful once something has been inserted; without the
+	// num_docids guard, a first docid that happens to equal last_docid's
+	// initial placeholder value would be dropped as a "repeat".
+	if (num_docids != 0 && docid == last_docid) {
+		return;
+	}
+
+	if (num_docids == 0) {
+		// Very first docid.
+		write_header(docid);
+		++num_docids;
+		last_block_end = last_docid = docid;
+		return;
+	}
+
+	// Store the gap minus one; consecutive duplicates were filtered above,
+	// so for increasing docids the gap is at least one.
+	pending_deltas.push_back(docid - last_docid - 1);
+	last_docid = docid;
+	if (pending_deltas.size() == 128) {
+		// A full block is ready: encode it and start collecting the next one.
+		append_block();
+		pending_deltas.clear();
+		last_block_end = docid;
+	}
+	++num_docids;
+}
+
+// Encodes the deltas that never filled a complete block of 128 and appends
+// the result to `encoded`. Does nothing when no deltas are pending.
+void PostingListBuilder::finish()
+{
+	if (pending_deltas.empty()) {
+		return;
+	}
+
+	// Deltas are only queued after the first docid, and the first docid
+	// always goes through write_header(), so output must already exist.
+	assert(!encoded.empty());
+
+	// The trailing partial block is encoded without interleaving.
+	unsigned char scratch[P4NENC_BOUND(128)];
+	unsigned char *scratch_end = encode_pfor_single_block<128>(pending_deltas.data(), pending_deltas.size(), /*interleaved=*/false, scratch);
+	encoded.append(reinterpret_cast<char *>(scratch), scratch_end - scratch);
+}
+
+// Encodes one complete block of 128 pending deltas (interleaved) and appends
+// the bytes to `encoded`. The pending deltas themselves are left untouched.
+void PostingListBuilder::append_block()
+{
+	unsigned char scratch[P4NENC_BOUND(128)];
+
+	// Only ever called with a completely filled block.
+	assert(pending_deltas.size() == 128);
+
+	unsigned char *scratch_end = encode_pfor_single_block<128>(pending_deltas.data(), 128, /*interleaved=*/true, scratch);
+	encoded.append(reinterpret_cast<char *>(scratch), scratch_end - scratch);
+}
+
+// Serializes `docid` with write_baseval() and appends the resulting bytes to
+// `encoded`.
+void PostingListBuilder::write_header(uint32_t docid)
+{
+	unsigned char scratch[P4NENC_BOUND(1)];
+	unsigned char *scratch_end = write_baseval(docid, scratch);
+
+	// Append the byte range [scratch, scratch_end).
+	encoded.append(reinterpret_cast<char *>(scratch), reinterpret_cast<char *>(scratch_end));
+}
+
+// State for the index being built: a table with one lazily created
+// PostingListBuilder per trigram, plus the output file and block bookkeeping
+// used by add_file() and flush_block().
+class Corpus {
+public:
+	// Stores `outfp` and `block_size` and allocates the trigram table with
+	// every slot set to null (the trailing "()" value-initializes the array).
+	Corpus(FILE *outfp, size_t block_size)
+		: invindex(new PostingListBuilder *[NUM_TRIGRAMS]()), outfp(outfp), block_size(block_size)
+	{
+	}
+
+	// Destroys every builder that was created; deleting a null slot is a no-op.
+	~Corpus()
+	{
+		PostingListBuilder **slots = invindex.get();
+		for (unsigned trgm = 0; trgm < NUM_TRIGRAMS; ++trgm) {
+			delete slots[trgm];
+		}
+	}
+
+	void add_file(string filename);
+	void flush_block();
+
+	vector<uint64_t> filename_blocks;
+	size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
+
+	// True if a builder has already been created for `trgm`.
+	bool seen_trigram(uint32_t trgm)
+	{
+		const PostingListBuilder *builder = invindex[trgm];
+		return builder != nullptr;
+	}
+
+	// Returns the builder for `trgm`, creating it on first use.
+	PostingListBuilder &get_pl_builder(uint32_t trgm)
+	{
+		PostingListBuilder *&slot = invindex[trgm];
+		if (slot == nullptr) {
+			slot = new PostingListBuilder;
+		}
+		return *slot;
+	}
+
+private:
+	// NUM_TRIGRAMS slots; the unique_ptr owns only the array, while the
+	// builders it points to are released in the destructor.
+	unique_ptr<PostingListBuilder *[]> invindex;
+	FILE *outfp;
+	string current_block;
+	string tempbuf;
+	const size_t block_size;
+};
+
+void Corpus::add_file(string filename)