]> git.sesse.net Git - index/blob - lookup.cpp
Initial checkin for move to Git (no prior version history available).
[index] / lookup.cpp
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <vector>
4 #include "common.h"
5
6 using namespace std;
7
8 struct meta_index_entry {
9         unsigned pos;   // offset in its respective index shard
10         unsigned size;
11 };
12 meta_index_entry meta_index[1 << 24];
13
14 void read_meta_index()
15 {
16         fprintf(stderr, "Reading meta-index... ");
17
18         FILE *index_file = fopen("index/meta", "rb");
19         if (index_file == NULL) {
20                 perror("index/meta");
21                 exit(1);
22         }
23
24         if (fread(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
25                 perror("fread");
26                 exit(1);
27         }
28         
29         fprintf(stderr, "done.\n");
30 }
31         
32 vector<posting_list_entry> read_posting_list(unsigned trigram)
33 {
34         int shard_num = trigram / POSTING_LISTS_PER_SHARD;
35         char filename[256];
36         sprintf(filename, FILENAME_FORMAT_GLOBAL, shard_num);
37         FILE *index_shard = fopen(filename, "rb");
38         if (index_shard == NULL) {
39                 perror(filename);
40                 exit(1);
41         }
42
43         int pos = meta_index[trigram].pos * sizeof(posting_list_entry);
44         if (fseek(index_shard, pos, SEEK_SET) != 0) {
45                 perror("fseek");
46                 exit(1);
47         }
48
49         int size = meta_index[trigram].size;
50         vector<posting_list_entry> ret(size);
51         for (int i = 0; i < size; ++i) {
52                 if (fread(&ret[i], sizeof(posting_list_entry), 1, index_shard) != 1) {
53                         perror("fread");
54                         exit(1);
55                 }
56         }
57
58         fclose(index_shard);
59         return ret;
60 }
61         
62 vector<posting_list_entry> intersect_posting_lists(
63         const vector<posting_list_entry>& first,
64         const vector<posting_list_entry>& second,
65         int relative_pos)
66 {
67         vector<posting_list_entry> ret;
68         vector<posting_list_entry>::const_iterator first_it = first.begin();
69         vector<posting_list_entry>::const_iterator second_it = second.begin();
70
71         while (first_it != first.end() && second_it != second.end()) {
72                 if (first_it->file_num == second_it->file_num &&
73                     first_it->pos + relative_pos == second_it->pos) {
74                         ret.push_back(*first_it);
75                         ++first_it;
76                         ++second_it;
77                         continue;
78                 }
79                 if (first_it->file_num < second_it->file_num) {
80                         ++first_it;
81                         continue;
82                 }
83                 if (first_it->file_num > second_it->file_num) {
84                         ++second_it;
85                         continue;
86                 }
87                 if (first_it->pos + relative_pos < second_it->pos) {
88                         ++first_it;
89                 } else {
90                         ++second_it;
91                 }
92         }
93
94         return ret;
95 }
96
97 void print_posting_list(const vector<posting_list_entry>& pl)
98 {
99         for (int i = 0; i < pl.size(); ++i) {
100                 fprintf(stderr, "[file %u pos %u] ", pl[i].file_num, pl[i].pos);
101         }
102         fprintf(stderr, "\n");
103 }
104
105 void lookup(unsigned char *needle, unsigned needle_len)
106 {
107         // trigram and relative position
108         vector<pair<unsigned, unsigned> > trigrams;
109         for (int i = 0; i < needle_len - 2; i += 3) {
110                 unsigned trigram =
111                         (unsigned(needle[i]) << 16) |
112                         (unsigned(needle[i + 1]) << 8) |
113                          unsigned(needle[i + 2]);
114                 trigrams.push_back(make_pair(trigram, i));
115         }
116         if (needle_len % 3 != 0) {
117                 int i = needle_len - 3;
118                 unsigned trigram =
119                         (unsigned(needle[i]) << 16) |
120                         (unsigned(needle[i + 1]) << 8) |
121                          unsigned(needle[i + 2]);
122                 trigrams.push_back(make_pair(trigram, i));
123         }
124
125         fprintf(stderr, "Reading posting list for %06x... ", trigrams[0].first);
126         vector<posting_list_entry> curr_posting_list = read_posting_list(trigrams[0].first);
127         fprintf(stderr, "%u entries.\n", curr_posting_list.size());
128
129         for (unsigned i = 1; i < trigrams.size(); ++i) {
130                 fprintf(stderr, "Filtering by posting list for %06x (pos %u)... ", trigrams[i].first, trigrams[i].second);
131                 vector<posting_list_entry> filter_posting_list = read_posting_list(trigrams[i].first);
132                 curr_posting_list = intersect_posting_lists(curr_posting_list, filter_posting_list, trigrams[i].second);
133                 fprintf(stderr, "%u entries.\n", curr_posting_list.size());
134         }
135         
136         print_posting_list(curr_posting_list);
137 }
138
139 int main(int argc, char **argv)
140 {
141         read_meta_index();
142
143         unsigned char ch1[] = { 0x12, 0x9e, 0x10, 0x11, 0x35 };
144         lookup(ch1, 5);
145
146         unsigned char ch2[] = { 0x7c, 0x9e, 0x7a, 0x89, 0x9d, 0xbc, 0xda, 0x59, 0xb0, 0x03, 0x0a, 0xba, 0x4e, 0xbc };
147         lookup(ch2, 14);
148
149         unsigned char ch3[] = { 0x0d, 0x91, 0x1f, 0xae, 0x2a, 0xc1, 0x84, 0xac, 0x92, 0xfd, 0x69, 0x1f, 0x95, 0xcf };
150         lookup(ch3, 14);
151
152         unsigned char ch4[] = { 0x25, 0x00, 0xc6, 0x4f, 0x60, 0xcf, 0xc4 };
153         lookup(ch4, 7);
154 }