- unique_sort(&trigrams);
- sort(trigrams.begin(), trigrams.end(),
- [&](const pair<Trigram, size_t> &a, const pair<Trigram, size_t> &b) {
- return a.first.num_docids < b.first.num_docids;
+
+ // Look them all up on disk.
+ for (auto &[trgm, trigram_groups] : trigrams_to_lookup) {
+ corpus.find_trigram(trgm, [trgm{ trgm }, trigram_groups{ &trigram_groups }](const Trigram *trgmptr, size_t len) {
+ if (trgmptr == nullptr) {
+ dprintf("trigram %s isn't found\n", print_trigram(trgm).c_str());
+ for (TrigramDisjunction *td : *trigram_groups) {
+ --td->remaining_trigrams_to_read;
+ if (td->remaining_trigrams_to_read == 0 && td->read_trigrams.empty()) {
+ dprintf("zero matches in %s, so we are done\n", print_td(*td).c_str());
+ if (only_count) {
+ printf("0\n");
+ }
+ exit(0);
+ }
+ }
+ return;
+ }
+ for (TrigramDisjunction *td : *trigram_groups) {
+ --td->remaining_trigrams_to_read;
+ td->max_num_docids += trgmptr->num_docids;
+ td->read_trigrams.emplace_back(*trgmptr, len);
+ }
+ });
+ }
+ engine.finish();
+ dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+
+ for (TrigramDisjunction &td : trigram_groups) {
+ // Reset for reads.
+ td.remaining_trigrams_to_read = td.read_trigrams.size();
+
+ if (ignore_case) { // If case-sensitive, they'll all be pretty obvious single-entry groups.
+ dprintf("OR group %u (max_num_docids=%u): %s\n", td.index, td.max_num_docids, print_td(td).c_str());
+ }
+ }
+
+ // TODO: For case-insensitive (ie. more than one alternative in each),
+ // prioritize the ones with fewer seeks?
+ sort(trigram_groups.begin(), trigram_groups.end(),
+ [&](const TrigramDisjunction &a, const TrigramDisjunction &b) {
+ return a.max_num_docids < b.max_num_docids;