8 struct meta_index_entry {
9 unsigned pos; // offset in its respective index shard
12 meta_index_entry meta_index[1 << 24];
14 void read_meta_index()
16 fprintf(stderr, "Reading meta-index... ");
18 FILE *index_file = fopen("index/meta", "rb");
19 if (index_file == NULL) {
24 if (fread(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
29 fprintf(stderr, "done.\n");
32 vector<posting_list_entry> read_posting_list(unsigned trigram)
34 int shard_num = trigram / POSTING_LISTS_PER_SHARD;
36 sprintf(filename, FILENAME_FORMAT_GLOBAL, shard_num);
37 FILE *index_shard = fopen(filename, "rb");
38 if (index_shard == NULL) {
43 int pos = meta_index[trigram].pos * sizeof(posting_list_entry);
44 if (fseek(index_shard, pos, SEEK_SET) != 0) {
49 int size = meta_index[trigram].size;
50 vector<posting_list_entry> ret(size);
51 for (int i = 0; i < size; ++i) {
52 if (fread(&ret[i], sizeof(posting_list_entry), 1, index_shard) != 1) {
62 vector<posting_list_entry> intersect_posting_lists(
63 const vector<posting_list_entry>& first,
64 const vector<posting_list_entry>& second,
67 vector<posting_list_entry> ret;
68 vector<posting_list_entry>::const_iterator first_it = first.begin();
69 vector<posting_list_entry>::const_iterator second_it = second.begin();
71 while (first_it != first.end() && second_it != second.end()) {
72 if (first_it->file_num == second_it->file_num &&
73 first_it->pos + relative_pos == second_it->pos) {
74 ret.push_back(*first_it);
79 if (first_it->file_num < second_it->file_num) {
83 if (first_it->file_num > second_it->file_num) {
87 if (first_it->pos + relative_pos < second_it->pos) {
97 void print_posting_list(const vector<posting_list_entry>& pl)
99 for (int i = 0; i < pl.size(); ++i) {
100 fprintf(stderr, "[file %u pos %u] ", pl[i].file_num, pl[i].pos);
102 fprintf(stderr, "\n");
105 void lookup(unsigned char *needle, unsigned needle_len)
107 // trigram and relative position
108 vector<pair<unsigned, unsigned> > trigrams;
109 for (int i = 0; i < needle_len - 2; i += 3) {
111 (unsigned(needle[i]) << 16) |
112 (unsigned(needle[i + 1]) << 8) |
113 unsigned(needle[i + 2]);
114 trigrams.push_back(make_pair(trigram, i));
116 if (needle_len % 3 != 0) {
117 int i = needle_len - 3;
119 (unsigned(needle[i]) << 16) |
120 (unsigned(needle[i + 1]) << 8) |
121 unsigned(needle[i + 2]);
122 trigrams.push_back(make_pair(trigram, i));
125 fprintf(stderr, "Reading posting list for %06x... ", trigrams[0].first);
126 vector<posting_list_entry> curr_posting_list = read_posting_list(trigrams[0].first);
127 fprintf(stderr, "%u entries.\n", curr_posting_list.size());
129 for (unsigned i = 1; i < trigrams.size(); ++i) {
130 fprintf(stderr, "Filtering by posting list for %06x (pos %u)... ", trigrams[i].first, trigrams[i].second);
131 vector<posting_list_entry> filter_posting_list = read_posting_list(trigrams[i].first);
132 curr_posting_list = intersect_posting_lists(curr_posting_list, filter_posting_list, trigrams[i].second);
133 fprintf(stderr, "%u entries.\n", curr_posting_list.size());
136 print_posting_list(curr_posting_list);
139 int main(int argc, char **argv)
143 unsigned char ch1[] = { 0x12, 0x9e, 0x10, 0x11, 0x35 };
146 unsigned char ch2[] = { 0x7c, 0x9e, 0x7a, 0x89, 0x9d, 0xbc, 0xda, 0x59, 0xb0, 0x03, 0x0a, 0xba, 0x4e, 0xbc };
149 unsigned char ch3[] = { 0x0d, 0x91, 0x1f, 0xae, 0x2a, 0xc1, 0x84, 0xac, 0x92, 0xfd, 0x69, 0x1f, 0x95, 0xcf };
152 unsigned char ch4[] = { 0x25, 0x00, 0xc6, 0x4f, 0x60, 0xcf, 0xc4 };