- unsigned char buf[P4NENC_BOUND(1)];
- size_t bytes = p4nd1enc128v32(&docid, 1, buf);
- encoded.append(reinterpret_cast<char *>(buf), bytes);
-}
-
-class Corpus {  // Groups added filenames into blocks of block_size files and feeds a trigram-keyed inverted index.
-public:
- Corpus(size_t block_size)  // block_size: file count at which add_file() triggers flush_block().
- : block_size(block_size) {}
- void add_file(string filename);  // Appends filename to the pending block; flushes once the block holds block_size files.
- void flush_block();  // Returns immediately if the pending block is empty; otherwise adds docid num_blocks to invindex for each trigram found (rest of its body is outside this view).
-
- vector<string> filename_blocks;  // Not touched by the code visible here; presumably one entry per flushed block -- TODO confirm.
- unordered_map<uint32_t, PostingListBuilder> invindex;  // Trigram value -> builder that flush_block() feeds via add_docid().
- size_t num_files = 0, num_files_in_block = 0, num_blocks = 0;  // Total files added; files counted toward the pending block; value flush_block() uses as the block's docid.
-
-private:
- string current_block;  // Filenames of the pending block, joined with '\0' separators.
- string tempbuf;  // Not used in the visible code; presumably scratch space for flush_block() -- TODO confirm.
- const size_t block_size;  // Fixed at construction; compared against num_files_in_block in add_file().
-};
-
-void Corpus::add_file(string filename)  // Appends filename to the pending block and flushes when the block reaches block_size files.
-{
- ++num_files;  // Counts every file ever added, independent of block boundaries.
- if (!current_block.empty()) {  // Separator goes between entries only; NOTE(review): an empty first filename leaves the block empty, so no separator precedes the next entry.
- current_block.push_back('\0');  // NUL delimits entries; flush_block() walks the buffer as consecutive C strings.
- }
- current_block += filename;
- if (++num_files_in_block == block_size) {  // Exact-match test: a block_size of 0 never triggers a flush from here.
- flush_block();  // Presumably also resets current_block and num_files_in_block (not visible here) -- TODO confirm.
- }
-}
-
-void Corpus::flush_block()
-{
- if (current_block.empty()) {
- return;
- }
-
- uint32_t docid = num_blocks;
-
- // Create trigrams.
- const char *ptr = current_block.c_str();
- while (ptr < current_block.c_str() + current_block.size()) {
- string_view s(ptr);
- if (s.size() >= 3) {
- for (size_t j = 0; j < s.size() - 2; ++j) {
- uint32_t trgm = read_trigram(s, j);
- invindex[trgm].add_docid(docid);
- }