12 #define MEMORY_BUDGET_MB 32
16 struct meta_index_entry {
17 unsigned pos; // offset in its respective index shard
20 meta_index_entry meta_index[1 << 24];
22 struct sort_buf_entry {
27 bool operator< (const sort_buf_entry& other) const {
28 // if (trigram != other.trigram)
29 return trigram < other.trigram;
31 if (file_num != other.file_num)
32 return file_num < other.file_num;
33 return pos < other.pos;
37 sort_buf_entry *sort_buf = NULL;
39 unsigned long long num_entries_used = 0;
40 unsigned long long num_entries_room;
41 unsigned index_num = 0;
45 fprintf(stderr, "[sorting");
46 stable_sort(sort_buf, sort_buf + num_entries_used);
49 fprintf(stderr, "[writing ");
52 sprintf(dirname, DIRNAME_FORMAT, index_num);
53 mkdir(dirname, 0700); // no checking
55 int curr_shard_open = -1;
56 int last_trigram = -1;
59 FILE *index_shard = NULL;
60 for (unsigned long long i = 0; i < num_entries_used; ++i) {
61 const sort_buf_entry& entry = sort_buf[i];
63 int shard = entry.trigram / POSTING_LISTS_PER_SHARD;
64 if (entry.trigram != last_trigram) {
65 if (last_trigram != -1) {
66 meta_index[last_trigram].pos = pos - size;
67 meta_index[last_trigram].size = size;
70 last_trigram = entry.trigram;
72 if (shard != curr_shard_open) {
73 if (index_shard != NULL) {
78 sprintf(filename, FILENAME_FORMAT, index_num, shard);
79 fprintf(stderr, " %s", filename);
80 for (unsigned j = 0; j < strlen(filename) + 1; ++j) {
81 fprintf(stderr, "\b");
84 index_shard = fopen(filename, "wb");
85 if (index_shard == NULL) {
90 curr_shard_open = shard;
95 posting_list_entry ple;
96 ple.file_num = entry.file_num;
98 if (fwrite(&ple, sizeof(posting_list_entry), 1, index_shard) != 1) {
105 if (last_trigram != -1) {
106 meta_index[last_trigram].pos = pos - size;
107 meta_index[last_trigram].size = size;
109 if (index_shard != NULL) {
114 sprintf(filename, META_INDEX_PATTERN, index_num);
115 fprintf(stderr, "%s]", filename);
117 FILE *index_file = fopen(filename, "wb");
118 if (index_file == NULL) {
122 if (fwrite(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
123 perror("fwrite(meta_index)");
130 memset(meta_index, 0, sizeof(meta_index));
131 num_entries_used = 0;
134 void index_file(const char *filename, int file_num)
136 fprintf(stderr, "%s... ", filename);
138 FILE *file = fopen(filename, "rb");
144 unsigned trigram = 0;
146 while (!feof(file)) {
153 trigram = ((trigram << 8) | ch) & 0xffffff;
155 sort_buf[num_entries_used].trigram = trigram;
156 sort_buf[num_entries_used].file_num = file_num;
157 sort_buf[num_entries_used].pos = pos - 2;
159 if (++num_entries_used >= num_entries_room) {
160 fprintf(stderr, "[index flush: ");
162 fprintf(stderr, "] ");
166 fprintf(stderr, "%u bytes. [%llu MB RAM used]\n", pos, num_entries_used * sizeof(sort_buf_entry) / 1048576ULL);
170 int main(int argc, char **argv)
172 num_entries_room = MEMORY_BUDGET_MB * 1048576ULL / sizeof(sort_buf_entry);
173 sort_buf = new sort_buf_entry[num_entries_room];
175 for (int i = 1; i < argc; ++i) {
176 index_file(argv[i], i - 1);