- }
-
- pending_docids.push_back(docid);
- if (pending_docids.size() == 128) {
- append_block();
- pending_docids.clear();
- last_block_end = docid;
- }
- ++num_docids;
-}
-
-void PostingListBuilder::finish()
-{
- if (pending_docids.empty()) {
- return;
- }
-
- assert(!encoded.empty()); // write_header() should already have run.
-
- // No interleaving for partial blocks.
- unsigned char buf[P4NENC_BOUND(128)];
- unsigned char *end = p4d1enc32(pending_docids.data(), pending_docids.size(), buf, last_block_end);
- encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
-}
-
-void PostingListBuilder::append_block()
-{
- unsigned char buf[P4NENC_BOUND(128)];
- assert(pending_docids.size() == 128);
- unsigned char *end = p4d1enc128v32(pending_docids.data(), 128, buf, last_block_end);
- encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
-}
-
-void PostingListBuilder::write_header(uint32_t docid)
-{
- unsigned char buf[P4NENC_BOUND(1)];
- size_t bytes = p4nd1enc128v32(&docid, 1, buf);
- encoded.append(reinterpret_cast<char *>(buf), bytes);
-}
-
-class Corpus {
-public:
- Corpus(size_t block_size) : block_size(block_size) {}
- void add_file(string filename);
- void flush_block();
-
- vector<string> filename_blocks;
- unordered_map<uint32_t, PostingListBuilder> invindex;
- size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
-
-private:
- string current_block;
- string tempbuf;
- const size_t block_size;
-};
-
-void Corpus::add_file(string filename)
-{
- ++num_files;
- if (!current_block.empty()) {
- current_block.push_back('\0');
- }
- current_block += filename;
- if (++num_files_in_block == block_size) {
- flush_block();