unordered_map<string, bool> *access_rx_cache, int seq,
Serializer *serializer)
{
unordered_map<string, bool> *access_rx_cache, int seq,
Serializer *serializer)
{
unsigned long long uncompressed_len = ZSTD_getFrameContentSize(compressed.data(), compressed.size());
if (uncompressed_len == ZSTD_CONTENTSIZE_UNKNOWN || uncompressed_len == ZSTD_CONTENTSIZE_ERROR) {
unsigned long long uncompressed_len = ZSTD_getFrameContentSize(compressed.data(), compressed.size());
if (uncompressed_len == ZSTD_CONTENTSIZE_UNKNOWN || uncompressed_len == ZSTD_CONTENTSIZE_ERROR) {
{
Serializer docids_in_order;
unordered_map<string, bool> access_rx_cache;
{
Serializer docids_in_order;
unordered_map<string, bool> access_rx_cache;
for (size_t i = 0; i < docids.size(); ++i) {
uint32_t docid = docids[i];
corpus.get_compressed_filename_block(docid, [i, &matched, &needles, &access_rx_cache, &docids_in_order](string compressed) {
for (size_t i = 0; i < docids.size(); ++i) {
uint32_t docid = docids[i];
corpus.get_compressed_filename_block(docid, [i, &matched, &needles, &access_rx_cache, &docids_in_order](string compressed) {
// We do this sequentially, as it's faster than scattering
// a lot of I/O through io_uring and hoping the kernel will
// coalesce it plus readahead for us.
// We do this sequentially, as it's faster than scattering
// a lot of I/O through io_uring and hoping the kernel will
// coalesce it plus readahead for us.
-void scan_all_docids(const vector<string> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
+uint64_t scan_all_docids(const vector<string> &needles, int fd, const Corpus &corpus, IOUringEngine *engine)
{
unordered_map<string, bool> access_rx_cache;
uint32_t num_blocks = corpus.get_num_filename_blocks();
unique_ptr<uint64_t[]> offsets(new uint64_t[num_blocks + 1]);
complete_pread(fd, offsets.get(), (num_blocks + 1) * sizeof(uint64_t), corpus.offset_for_block(0));
string compressed;
{
unordered_map<string, bool> access_rx_cache;
uint32_t num_blocks = corpus.get_num_filename_blocks();
unique_ptr<uint64_t[]> offsets(new uint64_t[num_blocks + 1]);
complete_pread(fd, offsets.get(), (num_blocks + 1) * sizeof(uint64_t), corpus.offset_for_block(0));
string compressed;
for (uint32_t io_docid = 0; io_docid < num_blocks; io_docid += 32) {
uint32_t last_docid = std::min(io_docid + 32, num_blocks);
size_t io_len = offsets[last_docid] - offsets[io_docid];
for (uint32_t io_docid = 0; io_docid < num_blocks; io_docid += 32) {
uint32_t last_docid = std::min(io_docid + 32, num_blocks);
size_t io_len = offsets[last_docid] - offsets[io_docid];
for (uint32_t docid = io_docid; docid < last_docid; ++docid) {
size_t relative_offset = offsets[docid] - offsets[io_docid];
size_t len = offsets[docid + 1] - offsets[docid];
for (uint32_t docid = io_docid; docid < last_docid; ++docid) {
size_t relative_offset = offsets[docid] - offsets[io_docid];
size_t len = offsets[docid + 1] - offsets[docid];
- scan_file_block(needles, { &compressed[relative_offset], len }, &access_rx_cache, 0, nullptr);
+ matched += scan_file_block(needles, { &compressed[relative_offset], len }, &access_rx_cache, 0, nullptr);
if (trgmptr == nullptr) {
dprintf("trigram '%c%c%c' isn't found, we abort the search\n",
trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff);
if (trgmptr == nullptr) {
dprintf("trigram '%c%c%c' isn't found, we abort the search\n",
trgm & 0xff, (trgm >> 8) & 0xff, (trgm >> 16) & 0xff);
// (We could have searched through all trigrams that matched
// the pattern and done a union of them, but that's a lot of
// work for fairly unclear gain.)
// (We could have searched through all trigrams that matched
// the pattern and done a union of them, but that's a lot of
// work for fairly unclear gain.)
- scan_all_docids(needles, fd, corpus, &engine);
+ uint64_t matched = scan_all_docids(needles, fd, corpus, &engine);
+ printf("%llu\n", static_cast<unsigned long long>(matched));  // uint64_t is not guaranteed to be size_t, so %zu would be a format mismatch
dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
1e3 * duration<float>(steady_clock::now() - start).count());
dprintf("Intersection done after %.1f ms. Doing final verification and printing:\n",
1e3 * duration<float>(steady_clock::now() - start).count());
- size_t matched __attribute__((unused)) = scan_docids(needles, in1, corpus, &engine);
+ uint64_t matched = scan_docids(needles, in1, corpus, &engine);
dprintf("Done in %.1f ms, found %zu matches.\n",
1e3 * duration<float>(steady_clock::now() - start).count(), matched);
dprintf("Done in %.1f ms, found %zu matches.\n",
1e3 * duration<float>(steady_clock::now() - start).count(), matched);
printf(" -d, --database DBPATH use DBPATH instead of default database (which is\n");
printf(" %s)\n", dbpath);
printf(" -h, --help print this help\n");
printf(" -d, --database DBPATH use DBPATH instead of default database (which is\n");
printf(" %s)\n", dbpath);
printf(" -h, --help print this help\n");
{
static const struct option long_options[] = {
{ "help", no_argument, 0, 'h' },
{
static const struct option long_options[] = {
{ "help", no_argument, 0, 'h' },
{ "database", required_argument, 0, 'd' },
{ "null", no_argument, 0, '0' },
{ 0, 0, 0, 0 }
{ "database", required_argument, 0, 'd' },
{ "null", no_argument, 0, '0' },
{ 0, 0, 0, 0 }
- int c = getopt_long(argc, argv, "d:h0", long_options, &option_index);
+ int c = getopt_long(argc, argv, "cd:h0", long_options, &option_index);