- Corpus corpus(outfp, block_size);
-
- read_mlocate(infile, &corpus);
- if (false) { // To read a plain text file.
- FILE *fp = fopen(infile, "r");
- while (!feof(fp)) {
- char buf[1024];
- if (fgets(buf, 1024, fp) == nullptr || feof(fp)) {
- break;
- }
- string s(buf);
- if (s.back() == '\n')
- s.pop_back();
- corpus.add_file(move(s));
- }
- fclose(fp);
- }
- corpus.flush_block();
- dprintf("Read %zu files from %s\n", corpus.num_files, infile);
-
- // Stick an empty block at the end as sentinel.
- corpus.filename_blocks.push_back(ftell(outfp));
- const size_t bytes_for_filenames = corpus.filename_blocks.back() - corpus.filename_blocks.front();
-
- // Write the offsets to the filenames.
- hdr.filename_index_offset_bytes = ftell(outfp);
- const size_t bytes_for_filename_index = corpus.filename_blocks.size() * sizeof(uint64_t);
- fwrite(corpus.filename_blocks.data(), corpus.filename_blocks.size(), sizeof(uint64_t), outfp);
- corpus.filename_blocks.clear();
- corpus.filename_blocks.shrink_to_fit();
-
- // Finish up encoding the posting lists.
- size_t trigrams = 0, longest_posting_list = 0;
- size_t bytes_for_posting_lists = 0;
- for (auto &[trigram, pl_builder] : corpus.invindex) {
- pl_builder.finish();
- longest_posting_list = max(longest_posting_list, pl_builder.num_docids);
- trigrams += pl_builder.num_docids;
- bytes_for_posting_lists += pl_builder.encoded.size();
- }
- dprintf("%zu files, %zu different trigrams, %zu entries, avg len %.2f, longest %zu\n",
- corpus.num_files, corpus.invindex.size(), trigrams, double(trigrams) / corpus.invindex.size(), longest_posting_list);
- dprintf("%zu bytes used for posting lists (%.2f bits/entry)\n", bytes_for_posting_lists, 8 * bytes_for_posting_lists / double(trigrams));
-
- dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(steady_clock::now() - start).count());
-
- // Sort the trigrams, mostly to get a consistent result every time
- // (the hash table will put things in random order anyway).
- vector<uint32_t> all_trigrams;
- for (auto &[trigram, pl_builder] : corpus.invindex) {
- all_trigrams.push_back(trigram);
- }
- sort(all_trigrams.begin(), all_trigrams.end());
-
- // Create the hash table.
- unique_ptr<Trigram[]> hashtable;
- uint32_t ht_size = next_prime(all_trigrams.size());