]> git.sesse.net Git - index/blob - mkindex.cpp
Initial checkin for move to Git (no prior version history available).
[index] / mkindex.cpp
1 #include <stdio.h>
2 #include <string.h>
3 #include <stdlib.h>
4 #include <unistd.h>
5 #include <vector>
6 #include "common.h"
7
8 #define MEMORY_BUDGET_MB 2048
9
10 using namespace std;
11
12 struct meta_index_entry {
13         unsigned pos;   // offset in its respective index shard
14         unsigned size;
15 };
16 meta_index_entry meta_index[1 << 24];
17
18 struct posting_list_entry {
19         unsigned file_num;
20         unsigned pos;
21
22         bool operator< (const posting_list_entry& other) const {
23                 if (file_num != other.file_num)
24                         return file_num < other.file_num;
25                 return pos < other.pos;
26         }
27 };
28 vector<posting_list_entry> posting_lists[1 << 24];
29 unsigned long long num_entries_used = 0;
30
31 vector<posting_list_entry> union_posting_lists(
32         const vector<posting_list_entry>& first,
33         const vector<posting_list_entry>& second)
34 {
35         vector<posting_list_entry> ret(first.size() + second.size());
36         vector<posting_list_entry>::const_iterator first_it = first.begin();
37         vector<posting_list_entry>::const_iterator second_it = second.begin();
38
39         while (first_it != first.end() && second_it != second.end()) {
40                 if (first_it == first.end()) {
41                         ret.push_back(*second_it++);
42                 } else if (second_it == second.end()) {
43                         ret.push_back(*first_it++);
44                         continue;
45                 } else if (*first_it < *second_it) {
46                         ret.push_back(*first_it++);
47                 } else {
48                         ret.push_back(*second_it++);
49                 }
50         }
51
52         return ret;
53 }
54
55 void write_index()
56 {
57         for (int i = 0; i < NUM_SHARDS; ++i) {
58                 int pos = 0;
59
60                 char filename[256];
61                 sprintf(filename, FILENAME_FORMAT, i);
62                 fprintf(stderr, "%s...\r", filename);
63                 FILE *index_shard = fopen(filename, "wb");
64                 if (index_shard == NULL) {
65                         perror(filename);
66                         exit(1);
67                 }
68                 for (int j = 0; j < POSTING_LISTS_PER_SHARD; ++j) {
69                         unsigned trigram = (i * POSTING_LISTS_PER_SHARD) + j;
70                 
71                         for (int k = 0; k < posting_lists[trigram].size(); ++k) {
72                                 if (fwrite(&posting_lists[trigram][k], sizeof(posting_list_entry), 1, index_shard) != 1) {
73                                         perror("fwrite");
74                                         exit(1);
75                                 }
76                         }
77                         unsigned size = posting_lists[trigram].size();
78                         meta_index[trigram].pos = pos;
79                         meta_index[trigram].size = size;
80                         pos += size;
81                 }
82                 fclose(index_shard);
83
84         }
85
86         fprintf(stderr, "%s...\r", META_INDEX_NAME);
87
88         FILE *index_file = fopen("index/meta", "wb");
89         if (index_file == NULL) {
90                 perror("index/meta");
91                 exit(1);
92         }
93         if (fwrite(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
94                 perror("fwrite(meta_index)");
95                 exit(1);
96         }
97         fclose(index_file);
98
99         fprintf(stderr, "Index written.\n");
100 }
101
102 void merge_index()
103 {
104         for (int i = 0; i < NUM_SHARDS; ++i) {
105                 int pos = 0;
106
107                 char filename[256], temp_filename[256];
108                 sprintf(filename, FILENAME_FORMAT, i);
109                 strcpy(temp_filename, filename);
110                 strcat(temp_filename, ".temp");
111                 fprintf(stderr, "%s...\r", filename);
112                 FILE *old_index_shard = fopen(filename, "rb");
113                 if (old_index_shard == NULL) {
114                         perror(filename);
115                         exit(1);
116                 }
117                 FILE *index_shard = fopen(temp_filename, "wb");
118                 if (index_shard == NULL) {
119                         perror(filename);
120                         exit(1);
121                 }
122                 for (int j = 0; j < POSTING_LISTS_PER_SHARD; ++j) {
123                         unsigned trigram = (i * POSTING_LISTS_PER_SHARD) + j;
124
125                         // Read in the old posting list
126                         int old_size = meta_index[trigram].size;
127                         vector<posting_list_entry> old_posting_list(old_size);
128                         for (int k = 0; k < old_size; ++k) {
129                                 if (fread(&old_posting_list[k], sizeof(posting_list_entry), 1, old_index_shard) != 1) {
130                                         perror("fread(old posting list)");
131                                         exit(1);
132                                 }
133                         }
134
135                         vector<posting_list_entry> posting_list =
136                                 union_posting_lists(old_posting_list, posting_lists[trigram]);
137                         vector<posting_list_entry> empty;
138                         swap(posting_lists[trigram], empty);
139
140                         for (int k = 0; k < posting_list.size(); ++k) {
141                                 if (fwrite(&posting_list[k], sizeof(posting_list_entry), 1, index_shard) != 1) {
142                                         perror("fwrite");
143                                         exit(1);
144                                 }
145                         }
146                         unsigned size = posting_list.size();
147                         meta_index[trigram].pos = pos;
148                         meta_index[trigram].size = size;
149                         pos += size;
150                 }
151                 fclose(old_index_shard);
152                 fclose(index_shard);
153
154                 if (unlink(filename) == -1) {
155                         perror("unlink");
156                         exit(1);
157                 }
158                 if (rename(temp_filename, filename) == -1) {
159                         perror("rename");
160                         exit(1);
161                 }
162         }
163
164         fprintf(stderr, "%s...\r", META_INDEX_NAME);
165
166         FILE *index_file = fopen("index/meta", "wb");
167         if (index_file == NULL) {
168                 perror("index/meta");
169                 exit(1);
170         }
171         if (fwrite(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
172                 perror("fwrite(meta_index)");
173                 exit(1);
174         }
175         fclose(index_file);
176
177         fprintf(stderr, "New index written.\n");
178
179         num_entries_used = 0;
180 }
181
182 void index_file(const char *filename, int file_num)
183 {
184         fprintf(stderr, "%s... ", filename);
185
186         FILE *file = fopen(filename, "rb");
187         if (file == NULL) {
188                 perror(filename);
189                 exit(1);
190         }
191
192         unsigned trigram = 0;
193         unsigned pos = 0;
194         while (!feof(file)) {
195                 int ch = getc(file);
196                 ++pos;
197                 if (ch == -1) {
198                         break;
199                 }
200
201                 trigram = ((trigram << 8) | ch) & 0xffffff;
202
203                 posting_list_entry ple;
204                 ple.file_num = file_num;
205                 ple.pos = pos - 2;
206                 posting_lists[trigram].push_back(ple);
207                 ++num_entries_used;
208         }
209         
210         fprintf(stderr, "%u bytes. [%llu MB RAM used]\n", pos, num_entries_used / (1048576 / sizeof(posting_list_entry)));
211         fclose(file);
212         
213         if (num_entries_used >= (MEMORY_BUDGET_MB * 1048576ULL / sizeof(posting_list_entry))) {
214                 fprintf(stderr, "[need to flush index to disk]\n");
215                 merge_index();
216         }
217 }
218
219 int main(int argc, char **argv)
220 {
221         fprintf(stderr, "Writing empty index.\n");
222         write_index();
223
224         for (int i = 1; i < argc; ++i) {
225                 index_file(argv[i], i - 1);
226         }
227         merge_index();
228 }