/// Collects docids for a single posting list and encodes them into
/// `encoded`: a header for the first docid, followed by blocks that are
/// emitted each time 128 further docids have accumulated. Call finish()
/// once at the end to encode whatever is left in a partial block.
class PostingListBuilder {
public:
    /// Adds `docid` unless it equals the most recently added docid.
    inline void add_docid(uint32_t docid);

    /// Encodes the docids still pending in a partial block, if any.
    void finish();

    /// Encoded bytes produced so far.
    string encoded;

    /// Number of docids accepted (duplicates of the previous docid are not counted).
    size_t num_docids = 0;

private:
    void write_header(uint32_t docid);
    void append_block();

    /// Docids accepted since the last encoded block (fewer than 128 between calls).
    vector<uint32_t> pending_docids;

    /// Last docid of the previously encoded block (or the header docid);
    /// passed to the encoder as the start value for the next block.
    /// Explicitly initialized so the member never holds an indeterminate value.
    uint32_t last_block_end = 0;

    /// Most recently added docid; -1 (all bits set) until the first add.
    uint32_t last_docid = -1;
};
-
-void PostingListBuilder::add_docid(uint32_t docid)
-{
- // Deduplicate against the last inserted value, if any.
- if (docid == last_docid) {
- return;
- }
-
- if (num_docids == 0) {
- // Very first docid.
- write_header(docid);
- ++num_docids;
- last_block_end = last_docid = docid;
- return;
- }
-
- last_docid = docid;
- pending_docids.push_back(docid);
- if (pending_docids.size() == 128) {
- append_block();
- pending_docids.clear();
- last_block_end = docid;
- }
- ++num_docids;
-}
-
-void PostingListBuilder::finish()
-{
- if (pending_docids.empty()) {
- return;
- }
-
- assert(!encoded.empty()); // write_header() should already have run.
-
- // No interleaving for partial blocks.
- unsigned char buf[P4NENC_BOUND(128)];
- unsigned char *end = p4d1enc32(pending_docids.data(), pending_docids.size(), buf, last_block_end);
- encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
-}
-
-void PostingListBuilder::append_block()
-{
- unsigned char buf[P4NENC_BOUND(128)];
- assert(pending_docids.size() == 128);
- unsigned char *end = p4d1enc32(pending_docids.data(), 128, buf, last_block_end);
- encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));
-}
-
-void PostingListBuilder::write_header(uint32_t docid)
-{
- unsigned char buf[P4NENC_BOUND(1)];
- size_t bytes = p4nd1enc32(&docid, 1, buf);
- encoded.append(reinterpret_cast<char *>(buf), bytes);
-}
-
-class Corpus {
-public:
- Corpus(FILE *outfp, size_t block_size)
- : invindex(new PostingListBuilder*[NUM_TRIGRAMS]), outfp(outfp), block_size(block_size) {}
- void add_file(string filename);
- void flush_block();
-
- vector<uint64_t> filename_blocks;
- size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;
- bool seen_trigram(uint32_t trgm) {
- return invindex[trgm] != nullptr;
- }
- PostingListBuilder& get_pl_builder(uint32_t trgm) {
- if (invindex[trgm] == nullptr) {
- invindex[trgm] = new PostingListBuilder;
- }
- return *invindex[trgm];
- }
-
-private:
- unique_ptr<PostingListBuilder*[]> invindex;
- FILE *outfp;
- string current_block;
- string tempbuf;
- const size_t block_size;
-};
-
-void Corpus::add_file(string filename)
-{
- ++num_files;
- if (!current_block.empty()) {
- current_block.push_back('\0');
- }
- current_block += filename;
- if (++num_files_in_block == block_size) {
- flush_block();
- }
-}
-
-void Corpus::flush_block()
-{
- if (current_block.empty()) {
- return;
- }
-
- uint32_t docid = num_blocks;
-
- // Create trigrams.
- const char *ptr = current_block.c_str();
- while (ptr < current_block.c_str() + current_block.size()) {
- string_view s(ptr);
- if (s.size() >= 3) {
- for (size_t j = 0; j < s.size() - 2; ++j) {
- uint32_t trgm = read_trigram(s, j);
- get_pl_builder(trgm).add_docid(docid);
- }
- }
- ptr += s.size() + 1;
- }
-
- // Compress and add the filename block.
- filename_blocks.push_back(ftell(outfp));
- string compressed = zstd_compress(current_block, &tempbuf);
- if (fwrite(compressed.data(), compressed.size(), 1, outfp) != 1) {
- perror("fwrite()");
- exit(1);
- }
-
- current_block.clear();
- num_files_in_block = 0;
- ++num_blocks;
-}
-