]> git.sesse.net Git - index/blob - mkindex-sorter.cpp
Initial checkin for move to Git (no prior version history available).
[index] / mkindex-sorter.cpp
1 #include <stdio.h>
2 #include <string.h>
3 #include <stdlib.h>
4 #include <unistd.h>
5 #include <sys/stat.h>
6 #include <sys/types.h>
7 #include <assert.h>
8 #include <vector>
9 #include <algorithm>
10 #include "common.h"
11
12 #define MEMORY_BUDGET_MB 32
13
14 using namespace std;
15
16 struct meta_index_entry {
17         unsigned pos;   // offset in its respective index shard
18         unsigned size;
19 };
20 meta_index_entry meta_index[1 << 24];
21
22 struct sort_buf_entry {
23         unsigned trigram;
24         unsigned file_num;
25         unsigned pos;
26         
27         bool operator< (const sort_buf_entry& other) const {
28         //      if (trigram != other.trigram)
29                         return trigram < other.trigram;
30 #if 0
31                 if (file_num != other.file_num)
32                         return file_num < other.file_num;
33                 return pos < other.pos;
34 #endif
35         }
36 };
37 sort_buf_entry *sort_buf = NULL;
38
39 unsigned long long num_entries_used = 0;
40 unsigned long long num_entries_room;
41 unsigned index_num = 0;
42
43 void write_index()
44 {
45         fprintf(stderr, "[sorting");
46         stable_sort(sort_buf, sort_buf + num_entries_used);
47         fprintf(stderr, "]");
48
49         fprintf(stderr, "[writing ");
50         
51         char dirname[256];
52         sprintf(dirname, DIRNAME_FORMAT, index_num);
53         mkdir(dirname, 0700);  // no checking
54
55         int curr_shard_open = -1;
56         int last_trigram = -1;
57         int pos = 0;
58         int size = 0;
59         FILE *index_shard = NULL;
60         for (unsigned long long i = 0; i < num_entries_used; ++i) {
61                 const sort_buf_entry& entry = sort_buf[i];
62
63                 int shard = entry.trigram / POSTING_LISTS_PER_SHARD;
64                 if (entry.trigram != last_trigram) {
65                         if (last_trigram != -1) {
66                                 meta_index[last_trigram].pos = pos - size;
67                                 meta_index[last_trigram].size = size;
68                                 size = 0;
69                         }
70                         last_trigram = entry.trigram;
71                 }
72                 if (shard != curr_shard_open) {
73                         if (index_shard != NULL) {
74                                 fclose(index_shard);
75                         }
76                 
77                         char filename[256];
78                         sprintf(filename, FILENAME_FORMAT, index_num, shard);
79                         fprintf(stderr, " %s", filename);
80                         for (unsigned j = 0; j < strlen(filename) + 1; ++j) {
81                                 fprintf(stderr, "\b");
82                         }
83
84                         index_shard = fopen(filename, "wb");
85                         if (index_shard == NULL) {
86                                 perror(filename);
87                                 exit(1);
88                         }
89
90                         curr_shard_open = shard;
91                         pos = 0;
92                         assert(size == 0);
93                 }
94
95                 posting_list_entry ple;
96                 ple.file_num = entry.file_num;
97                 ple.pos = entry.pos;
98                 if (fwrite(&ple, sizeof(posting_list_entry), 1, index_shard) != 1) {
99                         perror("fwrite");
100                         exit(1);
101                 }
102                 ++pos, ++size;
103         }
104
105         if (last_trigram != -1) {
106                 meta_index[last_trigram].pos = pos - size;
107                 meta_index[last_trigram].size = size;
108         }
109         if (index_shard != NULL) {
110                 fclose(index_shard);
111         }
112
113         char filename[256];
114         sprintf(filename, META_INDEX_PATTERN, index_num);
115         fprintf(stderr, "%s]", filename);
116
117         FILE *index_file = fopen(filename, "wb");
118         if (index_file == NULL) {
119                 perror(filename);
120                 exit(1);
121         }
122         if (fwrite(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
123                 perror("fwrite(meta_index)");
124                 exit(1);
125         }
126         fclose(index_file);
127
128         ++index_num;
129
130         memset(meta_index, 0, sizeof(meta_index));
131         num_entries_used = 0;
132 }
133
134 void index_file(const char *filename, int file_num)
135 {
136         fprintf(stderr, "%s... ", filename);
137
138         FILE *file = fopen(filename, "rb");
139         if (file == NULL) {
140                 perror(filename);
141                 exit(1);
142         }
143
144         unsigned trigram = 0;
145         unsigned pos = 0;
146         while (!feof(file)) {
147                 int ch = getc(file);
148                 ++pos;
149                 if (ch == -1) {
150                         break;
151                 }
152
153                 trigram = ((trigram << 8) | ch) & 0xffffff;
154
155                 sort_buf[num_entries_used].trigram = trigram;
156                 sort_buf[num_entries_used].file_num = file_num;
157                 sort_buf[num_entries_used].pos = pos - 2;
158         
159                 if (++num_entries_used >= num_entries_room) {
160                         fprintf(stderr, "[index flush: ");
161                         write_index();
162                         fprintf(stderr, "] ");
163                 }
164         }
165         
166         fprintf(stderr, "%u bytes. [%llu MB RAM used]\n", pos, num_entries_used * sizeof(sort_buf_entry) / 1048576ULL);
167         fclose(file);
168 }
169
170 int main(int argc, char **argv)
171 {
172         num_entries_room = MEMORY_BUDGET_MB * 1048576ULL / sizeof(sort_buf_entry);
173         sort_buf = new sort_buf_entry[num_entries_room];
174
175         for (int i = 1; i < argc; ++i) {
176                 index_file(argv[i], i - 1);
177         }
178         write_index();
179 }