- //steady_clock::time_point start = steady_clock::now();
-
- vector<string> files;
- read_mlocate(infile, &files);
- if (false) { // To read a plain text file.
- FILE *fp = fopen(infile, "r");
- while (!feof(fp)) {
- char buf[1024];
- if (fgets(buf, 1024, fp) == nullptr || feof(fp)) {
- break;
- }
- string s(buf);
- if (s.back() == '\n') s.pop_back();
- files.push_back(move(s));
- }
- fclose(fp);
- }
- dprintf("Read %zu files from %s\n", files.size(), infile);
-
- unordered_map<uint32_t, string> pl;
- size_t trigrams = 0, longest_posting_list = 0;
- unordered_map<uint32_t, vector<uint32_t>> invindex;
- for (size_t i = 0; i < files.size(); ++i) {
- const string &s = files[i];
- if (s.size() >= 3) {
- for (size_t j = 0; j < s.size() - 2; ++j) {
- uint32_t trgm = read_trigram(s, j);
- invindex[trgm].push_back(i);
- }
- }
- }
- string buf;
- size_t bytes_used = 0;
- for (auto &[trigram, docids] : invindex) {
- auto last = unique(docids.begin(), docids.end());
- docids.erase(last, docids.end());
- longest_posting_list = max(longest_posting_list, docids.size());
- trigrams += docids.size();
-
- size_t bytes_needed = P4NENC_BOUND(docids.size());
- if (buf.size() < bytes_needed) buf.resize(bytes_needed);
- size_t bytes = p4nd1enc128v32(&docids[0], docids.size(), reinterpret_cast<unsigned char *>(&buf[0]));
- pl[trigram] = string(buf.data(), bytes);
- bytes_used += bytes;
- }
- dprintf("%zu files, %zu different trigrams, %zu entries, avg len %.2f, longest %zu\n",
- files.size(), invindex.size(), trigrams, double(trigrams) / invindex.size(), longest_posting_list);
-
- dprintf("%zu bytes used for posting lists (%.2f bits/entry)\n", bytes_used, 8 * bytes_used / double(trigrams));
- //steady_clock::time_point end = steady_clock::now();
- dprintf("Building posting lists took %.1f ms.\n\n", 1e3 * duration<float>(end - start).count());
-
- vector<uint32_t> all_trigrams;
- for (auto &[trigram, docids] : invindex) {
- all_trigrams.push_back(trigram);