+ hdr.filename_index_offset_bytes = ftell(outfp);
+ const size_t bytes_for_filename_index = corpus.filename_blocks.size() * sizeof(uint64_t);
+ fwrite(corpus.filename_blocks.data(), sizeof(uint64_t), corpus.filename_blocks.size(), outfp);
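+ // The block list has hit the disk and is no longer needed; release its
+ // memory before the memory-hungry steps below.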
+ corpus.filename_blocks.clear();
+ corpus.filename_blocks.shrink_to_fit();
+
+ // Finish up encoding the posting lists.
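+ // (Note: `trigrams` counts the total number of posting-list entries summed
+ // over all trigrams, while `num_trigrams` counts distinct trigrams; the
+ // stats printout below uses both.)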
+ size_t trigrams = 0, longest_posting_list = 0;
+ size_t bytes_for_posting_lists = 0;
+ for (unsigned trgm = 0; trgm < NUM_TRIGRAMS; ++trgm) {
+ 	if (!corpus.seen_trigram(trgm))
+ 		continue;
+ 	PostingListBuilder &pl_builder = corpus.get_pl_builder(trgm);
+ 	pl_builder.finish();
+ 	longest_posting_list = max(longest_posting_list, pl_builder.num_docids);
+ 	trigrams += pl_builder.num_docids;
+ 	bytes_for_posting_lists += pl_builder.encoded.size();
+ }
+ size_t num_trigrams = corpus.num_trigrams();
+ dprintf("%zu files, %zu different trigrams, %zu entries, avg len %.2f, longest %zu\n",
+ corpus.num_files, num_trigrams, trigrams, double(trigrams) / num_trigrams, longest_posting_list);
+ dprintf("%zu bytes used for posting lists (%.2f bits/entry)\n", bytes_for_posting_lists, 8 * bytes_for_posting_lists / double(trigrams));
+
+ dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+
+ // Find the used trigrams.
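+ // This compacts the sparse seen-trigram set over the full NUM_TRIGRAMS key
+ // space into a dense key list for create_hashtable() below.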
+ vector<uint32_t> all_trigrams;
+ for (unsigned trgm = 0; trgm < NUM_TRIGRAMS; ++trgm) {
+ 	if (corpus.seen_trigram(trgm)) {
+ 		all_trigrams.push_back(trgm);
+ 	}
+ }
+
+ // Create the hash table.
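+ // create_hashtable() can apparently fail (returning nullptr) when some
+ // trigram cannot be placed within the given number of overflow slots; if
+ // so, grow the table to the next prime roughly 5% larger and retry.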
+ unique_ptr<Trigram[]> hashtable;
+ uint32_t ht_size = next_prime(all_trigrams.size());
+ for (;;) {
+ 	hashtable = create_hashtable(corpus, all_trigrams, ht_size, num_overflow_slots);
+ 	if (hashtable == nullptr) {
+ 		dprintf("Failed to create hash table of size %u; increasing by 5%% and trying again.\n", ht_size);
+ 		ht_size = next_prime(ht_size * 1.05);
+ 	} else {
+ 		dprintf("Created hash table of size %u.\n\n", ht_size);
+ 		break;
+ 	}
+ }
+
+ // Find the offsets for each posting list.
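+ // The encoded posting lists will be laid out back-to-back directly after
+ // the hash table, so each slot's offset is an absolute byte position in
+ // the output file. Writing an offset even for empty slots (and for the
+ // extra "+1" slot at the end) presumably lets a reader recover any list's
+ // length as the difference of two consecutive offsets, roughly:
+ //
+ //     size_t len = hashtable[slot + 1].offset - hashtable[slot].offset;
+ //
+ // (A reader-side sketch only, assuming this layout.)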
+ size_t bytes_for_hashtable = (ht_size + num_overflow_slots + 1) * sizeof(Trigram);
+ uint64_t offset = ftell(outfp) + bytes_for_hashtable;
+ for (unsigned i = 0; i < ht_size + num_overflow_slots + 1; ++i) {
+ 	hashtable[i].offset = offset; // Needs to be there even for empty slots.
+ 	if (hashtable[i].num_docids == 0) {
+ 		continue;
+ 	}
+
+ 	const string &encoded = corpus.get_pl_builder(hashtable[i].trgm).encoded;
+ 	offset += encoded.size();