+ const Header &hdr = corpus.get_hdr();
+ if (hdr.zstd_dictionary_length_bytes > 0) {
+ engine.submit_read(fd, hdr.zstd_dictionary_length_bytes, hdr.zstd_dictionary_offset_bytes, [](string_view s) {
+ ddict = ZSTD_createDDict(s.data(), s.size());
+ dprintf("Dictionary initialized after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+ });
+ }
+ }
+
+ // Look them all up on disk.
+ bool should_early_exit = false;
+ for (auto &[trgm, trigram_groups] : trigrams_to_lookup) {
+ corpus.find_trigram(trgm, [trgm{ trgm }, trigram_groups{ &trigram_groups }, &should_early_exit](const Trigram *trgmptr, size_t len) {
+ if (trgmptr == nullptr) {
+ dprintf("trigram %s isn't found\n", print_trigram(trgm).c_str());
+ for (TrigramDisjunction *td : *trigram_groups) {
+ --td->remaining_trigrams_to_read;
+
+ // If we now know this trigram group doesn't match anything at all,
+ // we can exit early; however, if we're in a forked child, that
+ // would confuse the parent process (since we wouldn't write our
+ // count to the pipe), so we wait until we're back in the
+ // regular (non-async) context. This is a fairly rare case anyway,
+ // and the gains from dropping the remaining trigram reads are limited.
+ if (td->remaining_trigrams_to_read == 0 && td->read_trigrams.empty()) {
+ if (in_forked_child) {
+ should_early_exit = true;
+ } else {
+ dprintf("zero matches in %s, so we are done\n", print_td(*td).c_str());
+ if (only_count) {
+ printf("0\n");
+ }
+ exit(1);
+ }
+ }
+ }
+ return;
+ }
+ for (TrigramDisjunction *td : *trigram_groups) {
+ --td->remaining_trigrams_to_read;
+ td->max_num_docids += trgmptr->num_docids;
+ td->read_trigrams.emplace_back(*trgmptr, len);
+ }
+ });
+ }
+ engine.finish();
+ dprintf("Hashtable lookups done after %.1f ms.\n", 1e3 * duration<float>(steady_clock::now() - start).count());
+
+ if (should_early_exit) {
+ close(fd);
+ return 0;
+ }
+
+ for (TrigramDisjunction &td : trigram_groups) {
+ // Reset for reads.
+ td.remaining_trigrams_to_read = td.read_trigrams.size();
+
+ if (ignore_case) { // If case-sensitive, they'll all be pretty obvious single-entry groups.
+ dprintf("OR group %u (max_num_docids=%u): %s\n", td.index, td.max_num_docids, print_td(td).c_str());
+ }