+class PostingListBuilder {  // Collects the docids of one posting list and appends their encoded form to `encoded`.
+public:
+ void add_docid(uint32_t docid);  // Adds docid unless it equals the most recently added one; emits a block once 128 docids are pending.
+ void finish();  // Encodes any docids still pending (a partial block) onto the end of `encoded`.
+ // Output bytes: the header written for the first docid, followed by one encoded chunk per block.
+ string encoded;
+ size_t num_docids = 0;  // Number of docids accepted so far; duplicates rejected by add_docid() are not counted.
+ // Internal helpers and working state.
+private:
+ void write_header(uint32_t docid);  // Encodes the very first docid on its own and appends it to `encoded`.
+ void append_block();  // Encodes exactly 128 pending docids and appends them to `encoded`; does not clear them.
+ // Docids accepted since the last full block was written; add_docid() empties this whenever it reaches 128.
+ vector<uint32_t> pending_docids;
+ // Last docid already written to `encoded`; passed as the final encoder argument for the next block (presumably the delta base -- confirm against TurboPFor). Unset until the first add_docid().
+ uint32_t last_block_end;
+};
+
+void PostingListBuilder::add_docid(uint32_t docid)  // Records docid, skipping it if it repeats the previously added docid.
+{
+ // Deduplicate against the last inserted value, if any. Only the immediately preceding docid is compared. NOTE(review): presumably callers add docids in increasing order -- confirm.
+ if (pending_docids.empty()) {
+ if (encoded.empty()) {
+ // Very first docid: it is written as the header and never enters pending_docids.
+ write_header(docid);
+ ++num_docids;
+ last_block_end = docid;
+ return;
+ } else if (docid == last_block_end) {  // Nothing is pending, so the last inserted value is last_block_end.
+ return;
+ }
+ } else {
+ if (docid == pending_docids.back()) {
+ return;
+ }
+ }
+ // New docid: queue it, and emit a full block once 128 have accumulated.
+ pending_docids.push_back(docid);
+ if (pending_docids.size() == 128) {
+ append_block();
+ pending_docids.clear();
+ last_block_end = docid;  // Handed to the encoder by the next append_block() or finish() call.
+ }
+ ++num_docids;
+}
+
+void PostingListBuilder::finish()  // Flushes the final partial block, if any, into `encoded`.
+{
+ if (pending_docids.empty()) {  // Nothing left over: no docids at all, only the header, or only full blocks.
+ return;
+ }
+ // Docids are only queued after the first one has been written as the header, so `encoded` cannot be empty here.
+ assert(!encoded.empty()); // write_header() should already have run.
+ // NOTE(review): pending_docids is not cleared below, so a second finish() call would append the same bytes again.
+ // No interleaving for partial blocks: this path calls p4d1enc32 rather than the p4d1enc128v32 used by append_block().
+ unsigned char buf[P4NENC_BOUND(128)];
+ unsigned char *end = p4d1enc32(pending_docids.data(), pending_docids.size(), buf, last_block_end);  // The returned pointer is used as the end of the written bytes.
+ encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));  // Append the byte range [buf, end).
+}
+
+void PostingListBuilder::append_block()  // Encodes the 128 pending docids as one block and appends it to `encoded`.
+{
+ unsigned char buf[P4NENC_BOUND(128)];  // Scratch output buffer, sized by P4NENC_BOUND for 128 values.
+ assert(pending_docids.size() == 128);  // Within this chunk, add_docid() calls this only when exactly 128 docids are queued.
+ unsigned char *end = p4d1enc128v32(pending_docids.data(), 128, buf, last_block_end);  // The returned pointer is used as the end of the written bytes.
+ encoded.append(reinterpret_cast<char *>(buf), reinterpret_cast<char *>(end));  // Append [buf, end); clearing pending_docids is left to the caller.
+}
+
+void PostingListBuilder::write_header(uint32_t docid)  // Encodes a single docid on its own and appends the result to `encoded`.
+{
+ unsigned char buf[P4NENC_BOUND(1)];  // Scratch output buffer, sized by P4NENC_BOUND for one value.
+ size_t bytes = p4nd1enc128v32(&docid, 1, buf);  // Encode one value; the return value is used as the number of bytes produced.
+ encoded.append(reinterpret_cast<char *>(buf), bytes);
+}
+
+class Corpus {  // Groups filenames into blocks, stores each block compressed, and indexes the trigrams found in each block.
+public:
+ Corpus(size_t block_size) : block_size(block_size) {}  // block_size: number of files after which add_file() flushes the current block.
+ void add_file(string filename);  // Appends filename to the current block; flushes once block_size files are in it.
+ void flush_block();  // Indexes and compresses the current block; returns immediately if no bytes are buffered.
+ // One compressed string per flushed block, in flush order.
+ vector<string> filename_blocks;
+ unordered_map<uint32_t, PostingListBuilder> invindex;  // Trigram value (from read_trigram) -> builder collecting the block numbers that contain it.
+ size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;  // Total files added; files in the unflushed block; blocks flushed so far.
+ // Working state for the block currently being assembled.
+private:
+ string current_block;  // Filenames of the current block, separated by '\0'.
+ string tempbuf;  // Passed by pointer to zstd_compress() on every flush (presumably reusable scratch space -- confirm).
+ const size_t block_size;
+};
+
+void Corpus::add_file(string filename)  // Adds one filename to the current block, flushing when the block holds block_size files.
+{
+ ++num_files;
+ if (!current_block.empty()) {  // Separate from the previous filename with a NUL; none is written before the first name.
+ current_block.push_back('\0');
+ }
+ current_block += filename;  // NOTE(review): an empty filename at the start of a block leaves current_block empty, so the next name gets no separator.
+ if (++num_files_in_block == block_size) {
+ flush_block();  // Resets num_files_in_block and current_block for the next block.
+ }
+}
+
+void Corpus::flush_block()  // Indexes the trigrams of the current block, stores the block compressed, and starts a new block.
+{
+ if (current_block.empty()) {  // Nothing buffered. NOTE(review): num_files_in_block is not reset on this path.
+ return;
+ }
+ // The docid recorded in the index is the number of this block, not a per-file id.
+ uint32_t docid = num_blocks;
+ // Walk the NUL-separated filenames in current_block one at a time.
+ // Create trigrams.
+ const char *ptr = current_block.c_str();
+ while (ptr < current_block.c_str() + current_block.size()) {
+ string_view s(ptr);  // Constructed from a C string, so it ends at the next NUL and covers one filename.
+ if (s.size() >= 3) {  // Shorter names have no trigrams; the guard also keeps s.size() - 2 from wrapping around.
+ for (size_t j = 0; j < s.size() - 2; ++j) {
+ uint32_t trgm = read_trigram(s, j);  // read_trigram is defined outside this chunk; presumably packs the three bytes starting at j -- confirm.
+ invindex[trgm].add_docid(docid);  // operator[] creates the builder on first use; add_docid() drops repeats of the same block number.
+ }
+ }
+ ptr += s.size() + 1;  // Skip the name and the NUL that terminates it.
+ }
+ // zstd_compress is defined outside this chunk; it receives the raw block and a pointer to tempbuf.
+ // Compress and add the filename block.
+ filename_blocks.push_back(zstd_compress(current_block, &tempbuf));
+ // Reset the per-block state and advance the block counter.
+ current_block.clear();
+ num_files_in_block = 0;
+ ++num_blocks;
+}
+
+const char *handle_directory(const char *ptr, Corpus *corpus)