8 #define MEMORY_BUDGET_MB 2048
12 struct meta_index_entry {
13 unsigned pos; // offset in its respective index shard
16 meta_index_entry meta_index[1 << 24];
18 struct posting_list_entry {
22 bool operator< (const posting_list_entry& other) const {
23 if (file_num != other.file_num)
24 return file_num < other.file_num;
25 return pos < other.pos;
28 vector<posting_list_entry> posting_lists[1 << 24];
29 unsigned long long num_entries_used = 0;
31 vector<posting_list_entry> union_posting_lists(
32 const vector<posting_list_entry>& first,
33 const vector<posting_list_entry>& second)
35 vector<posting_list_entry> ret(first.size() + second.size());
36 vector<posting_list_entry>::const_iterator first_it = first.begin();
37 vector<posting_list_entry>::const_iterator second_it = second.begin();
39 while (first_it != first.end() && second_it != second.end()) {
40 if (first_it == first.end()) {
41 ret.push_back(*second_it++);
42 } else if (second_it == second.end()) {
43 ret.push_back(*first_it++);
45 } else if (*first_it < *second_it) {
46 ret.push_back(*first_it++);
48 ret.push_back(*second_it++);
57 for (int i = 0; i < NUM_SHARDS; ++i) {
61 sprintf(filename, FILENAME_FORMAT, i);
62 fprintf(stderr, "%s...\r", filename);
63 FILE *index_shard = fopen(filename, "wb");
64 if (index_shard == NULL) {
68 for (int j = 0; j < POSTING_LISTS_PER_SHARD; ++j) {
69 unsigned trigram = (i * POSTING_LISTS_PER_SHARD) + j;
71 for (int k = 0; k < posting_lists[trigram].size(); ++k) {
72 if (fwrite(&posting_lists[trigram][k], sizeof(posting_list_entry), 1, index_shard) != 1) {
77 unsigned size = posting_lists[trigram].size();
78 meta_index[trigram].pos = pos;
79 meta_index[trigram].size = size;
86 fprintf(stderr, "%s...\r", META_INDEX_NAME);
88 FILE *index_file = fopen("index/meta", "wb");
89 if (index_file == NULL) {
93 if (fwrite(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
94 perror("fwrite(meta_index)");
99 fprintf(stderr, "Index written.\n");
104 for (int i = 0; i < NUM_SHARDS; ++i) {
107 char filename[256], temp_filename[256];
108 sprintf(filename, FILENAME_FORMAT, i);
109 strcpy(temp_filename, filename);
110 strcat(temp_filename, ".temp");
111 fprintf(stderr, "%s...\r", filename);
112 FILE *old_index_shard = fopen(filename, "rb");
113 if (old_index_shard == NULL) {
117 FILE *index_shard = fopen(temp_filename, "wb");
118 if (index_shard == NULL) {
122 for (int j = 0; j < POSTING_LISTS_PER_SHARD; ++j) {
123 unsigned trigram = (i * POSTING_LISTS_PER_SHARD) + j;
125 // Read in the old posting list
126 int old_size = meta_index[trigram].size;
127 vector<posting_list_entry> old_posting_list(old_size);
128 for (int k = 0; k < old_size; ++k) {
129 if (fread(&old_posting_list[k], sizeof(posting_list_entry), 1, old_index_shard) != 1) {
130 perror("fread(old posting list)");
135 vector<posting_list_entry> posting_list =
136 union_posting_lists(old_posting_list, posting_lists[trigram]);
137 vector<posting_list_entry> empty;
138 swap(posting_lists[trigram], empty);
140 for (int k = 0; k < posting_list.size(); ++k) {
141 if (fwrite(&posting_list[k], sizeof(posting_list_entry), 1, index_shard) != 1) {
146 unsigned size = posting_list.size();
147 meta_index[trigram].pos = pos;
148 meta_index[trigram].size = size;
151 fclose(old_index_shard);
154 if (unlink(filename) == -1) {
158 if (rename(temp_filename, filename) == -1) {
164 fprintf(stderr, "%s...\r", META_INDEX_NAME);
166 FILE *index_file = fopen("index/meta", "wb");
167 if (index_file == NULL) {
168 perror("index/meta");
171 if (fwrite(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
172 perror("fwrite(meta_index)");
177 fprintf(stderr, "New index written.\n");
179 num_entries_used = 0;
182 void index_file(const char *filename, int file_num)
184 fprintf(stderr, "%s... ", filename);
186 FILE *file = fopen(filename, "rb");
192 unsigned trigram = 0;
194 while (!feof(file)) {
201 trigram = ((trigram << 8) | ch) & 0xffffff;
203 posting_list_entry ple;
204 ple.file_num = file_num;
206 posting_lists[trigram].push_back(ple);
210 fprintf(stderr, "%u bytes. [%llu MB RAM used]\n", pos, num_entries_used / (1048576 / sizeof(posting_list_entry)));
213 if (num_entries_used >= (MEMORY_BUDGET_MB * 1048576ULL / sizeof(posting_list_entry))) {
214 fprintf(stderr, "[need to flush index to disk]\n");
219 int main(int argc, char **argv)
221 fprintf(stderr, "Writing empty index.\n");
224 for (int i = 1; i < argc; ++i) {
225 index_file(argv[i], i - 1);