+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <vector>
+#include "common.h"
+
+#define MEMORY_BUDGET_MB 2048
+
+using namespace std;
+
+struct meta_index_entry {
+ unsigned pos; // offset in its respective index shard
+ unsigned size;
+};
+meta_index_entry meta_index[1 << 24];
+
+struct posting_list_entry {
+ unsigned file_num;
+ unsigned pos;
+
+ bool operator< (const posting_list_entry& other) const {
+ if (file_num != other.file_num)
+ return file_num < other.file_num;
+ return pos < other.pos;
+ }
+};
+vector<posting_list_entry> posting_lists[1 << 24];
+unsigned long long num_entries_used = 0;
+
+vector<posting_list_entry> union_posting_lists(
+ const vector<posting_list_entry>& first,
+ const vector<posting_list_entry>& second)
+{
+ vector<posting_list_entry> ret(first.size() + second.size());
+ vector<posting_list_entry>::const_iterator first_it = first.begin();
+ vector<posting_list_entry>::const_iterator second_it = second.begin();
+
+ while (first_it != first.end() && second_it != second.end()) {
+ if (first_it == first.end()) {
+ ret.push_back(*second_it++);
+ } else if (second_it == second.end()) {
+ ret.push_back(*first_it++);
+ continue;
+ } else if (*first_it < *second_it) {
+ ret.push_back(*first_it++);
+ } else {
+ ret.push_back(*second_it++);
+ }
+ }
+
+ return ret;
+}
+
+void write_index()
+{
+ for (int i = 0; i < NUM_SHARDS; ++i) {
+ int pos = 0;
+
+ char filename[256];
+ sprintf(filename, FILENAME_FORMAT, i);
+ fprintf(stderr, "%s...\r", filename);
+ FILE *index_shard = fopen(filename, "wb");
+ if (index_shard == NULL) {
+ perror(filename);
+ exit(1);
+ }
+ for (int j = 0; j < POSTING_LISTS_PER_SHARD; ++j) {
+ unsigned trigram = (i * POSTING_LISTS_PER_SHARD) + j;
+
+ for (int k = 0; k < posting_lists[trigram].size(); ++k) {
+ if (fwrite(&posting_lists[trigram][k], sizeof(posting_list_entry), 1, index_shard) != 1) {
+ perror("fwrite");
+ exit(1);
+ }
+ }
+ unsigned size = posting_lists[trigram].size();
+ meta_index[trigram].pos = pos;
+ meta_index[trigram].size = size;
+ pos += size;
+ }
+ fclose(index_shard);
+
+ }
+
+ fprintf(stderr, "%s...\r", META_INDEX_NAME);
+
+ FILE *index_file = fopen("index/meta", "wb");
+ if (index_file == NULL) {
+ perror("index/meta");
+ exit(1);
+ }
+ if (fwrite(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
+ perror("fwrite(meta_index)");
+ exit(1);
+ }
+ fclose(index_file);
+
+ fprintf(stderr, "Index written.\n");
+}
+
+void merge_index()
+{
+ for (int i = 0; i < NUM_SHARDS; ++i) {
+ int pos = 0;
+
+ char filename[256], temp_filename[256];
+ sprintf(filename, FILENAME_FORMAT, i);
+ strcpy(temp_filename, filename);
+ strcat(temp_filename, ".temp");
+ fprintf(stderr, "%s...\r", filename);
+ FILE *old_index_shard = fopen(filename, "rb");
+ if (old_index_shard == NULL) {
+ perror(filename);
+ exit(1);
+ }
+ FILE *index_shard = fopen(temp_filename, "wb");
+ if (index_shard == NULL) {
+ perror(filename);
+ exit(1);
+ }
+ for (int j = 0; j < POSTING_LISTS_PER_SHARD; ++j) {
+ unsigned trigram = (i * POSTING_LISTS_PER_SHARD) + j;
+
+ // Read in the old posting list
+ int old_size = meta_index[trigram].size;
+ vector<posting_list_entry> old_posting_list(old_size);
+ for (int k = 0; k < old_size; ++k) {
+ if (fread(&old_posting_list[k], sizeof(posting_list_entry), 1, old_index_shard) != 1) {
+ perror("fread(old posting list)");
+ exit(1);
+ }
+ }
+
+ vector<posting_list_entry> posting_list =
+ union_posting_lists(old_posting_list, posting_lists[trigram]);
+ vector<posting_list_entry> empty;
+ swap(posting_lists[trigram], empty);
+
+ for (int k = 0; k < posting_list.size(); ++k) {
+ if (fwrite(&posting_list[k], sizeof(posting_list_entry), 1, index_shard) != 1) {
+ perror("fwrite");
+ exit(1);
+ }
+ }
+ unsigned size = posting_list.size();
+ meta_index[trigram].pos = pos;
+ meta_index[trigram].size = size;
+ pos += size;
+ }
+ fclose(old_index_shard);
+ fclose(index_shard);
+
+ if (unlink(filename) == -1) {
+ perror("unlink");
+ exit(1);
+ }
+ if (rename(temp_filename, filename) == -1) {
+ perror("rename");
+ exit(1);
+ }
+ }
+
+ fprintf(stderr, "%s...\r", META_INDEX_NAME);
+
+ FILE *index_file = fopen("index/meta", "wb");
+ if (index_file == NULL) {
+ perror("index/meta");
+ exit(1);
+ }
+ if (fwrite(meta_index, sizeof(meta_index_entry), 1 << 24, index_file) != (1 << 24)) {
+ perror("fwrite(meta_index)");
+ exit(1);
+ }
+ fclose(index_file);
+
+ fprintf(stderr, "New index written.\n");
+
+ num_entries_used = 0;
+}
+
+void index_file(const char *filename, int file_num)
+{
+ fprintf(stderr, "%s... ", filename);
+
+ FILE *file = fopen(filename, "rb");
+ if (file == NULL) {
+ perror(filename);
+ exit(1);
+ }
+
+ unsigned trigram = 0;
+ unsigned pos = 0;
+ while (!feof(file)) {
+ int ch = getc(file);
+ ++pos;
+ if (ch == -1) {
+ break;
+ }
+
+ trigram = ((trigram << 8) | ch) & 0xffffff;
+
+ posting_list_entry ple;
+ ple.file_num = file_num;
+ ple.pos = pos - 2;
+ posting_lists[trigram].push_back(ple);
+ ++num_entries_used;
+ }
+
+ fprintf(stderr, "%u bytes. [%llu MB RAM used]\n", pos, num_entries_used / (1048576 / sizeof(posting_list_entry)));
+ fclose(file);
+
+ if (num_entries_used >= (MEMORY_BUDGET_MB * 1048576ULL / sizeof(posting_list_entry))) {
+ fprintf(stderr, "[need to flush index to disk]\n");
+ merge_index();
+ }
+}
+
+int main(int argc, char **argv)
+{
+ fprintf(stderr, "Writing empty index.\n");
+ write_index();
+
+ for (int i = 1; i < argc; ++i) {
+ index_file(argv[i], i - 1);
+ }
+ merge_index();
+}