X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=binloader.cpp;h=98b29457e8a807eccb7806d0ab5d230ac6953468;hb=cb27055548a580a20c65a58cc1e1760a41245314;hp=e4b40a6f59a6c0d2ac028f6f0cd69422c5ee1cfd;hpb=2d13258e04f6800f1a5d29186a460994e442b4ff;p=remoteglot-book diff --git a/binloader.cpp b/binloader.cpp index e4b40a6..98b2945 100644 --- a/binloader.cpp +++ b/binloader.cpp @@ -1,4 +1,7 @@ //#define _GLIBCXX_PARALLEL + +// Usage: ./binloader IN1 IN2 IN3 ... OUT NUM_BUCKETS + #include #include #include @@ -8,6 +11,7 @@ #include #include #include "count.h" +#include "hash.h" using namespace std; @@ -24,9 +28,13 @@ struct Element { int main(int argc, char **argv) { - vector elems; + int num_buckets = atoi(argv[argc - 1]); + + vector> elems; + elems.resize(num_buckets); - for (int i = 1; i < argc; ++i) { + size_t num_elems = 0; + for (int i = 1; i < argc - 2; ++i) { FILE *fp = fopen(argv[i], "rb"); if (fp == NULL) { perror(argv[i]); @@ -69,45 +77,52 @@ int main(int argc, char **argv) //exit(1); break; } - elems.emplace_back(Element {move(bpfen_and_move), Result(r), opening_num, white_elo, black_elo}); + + int bucket = hash_key_to_bucket(bpfen_and_move.data(), bpfen_and_move.size(), num_buckets); + elems[bucket].emplace_back(Element {move(bpfen_and_move), Result(r), opening_num, white_elo, black_elo}); + ++num_elems; } fclose(fp); - printf("Read %ld elems\n", elems.size()); + printf("Read %ld elems\n", num_elems); } printf("Sorting...\n"); - sort(elems.begin(), elems.end()); + for (int i = 0; i < num_buckets; ++i) { + sort(elems[i].begin(), elems[i].end()); + } - printf("Writing SSTable...\n"); - mtbl_writer* mtbl = mtbl_writer_init("open.mtbl", NULL); - Count c; - int num_elo = 0; - double sum_white_elo = 0.0, sum_black_elo = 0.0; - for (int i = 0; i < elems.size(); ++i) { - if (elems[i].result == WHITE) { - ++c.white; - } else if (elems[i].result == DRAW) { - ++c.draw; - } else if (elems[i].result == BLACK) { - ++c.black; - } - c.opening_num = elems[i].opening_num; - if (elems[i].white_elo >= 100 && elems[i].black_elo >= 100) { - sum_white_elo += elems[i].white_elo; - sum_black_elo += elems[i].black_elo; - ++num_elo; - } - if (i == elems.size() - 1 || elems[i].bpfen_and_move != elems[i + 1].bpfen_and_move) { - c.avg_white_elo = sum_white_elo / num_elo; - c.avg_black_elo = sum_black_elo / num_elo; - mtbl_writer_add(mtbl, - (const uint8_t *)elems[i].bpfen_and_move.data(), elems[i].bpfen_and_move.size(), - (const uint8_t *)&c, sizeof(c)); - c = Count(); - num_elo = 0; - sum_white_elo = sum_black_elo = 0.0; + printf("Writing SSTables...\n"); + for (int i = 0; i < num_buckets; ++i) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s.part%04d", argv[argc - 2], i); + + mtbl_writer_options* wopt = mtbl_writer_options_init(); + mtbl_writer_options_set_compression(wopt, MTBL_COMPRESSION_SNAPPY); + mtbl_writer* mtbl = mtbl_writer_init(filename, wopt); + Count c; + for (size_t j = 0; j < elems[i].size(); ++j) { + const Element &e = elems[i][j]; + if (e.result == WHITE) { + ++c.white; + } else if (e.result == DRAW) { + ++c.draw; + } else if (e.result == BLACK) { + ++c.black; + } + c.opening_num = e.opening_num; + if (e.white_elo >= 100 && e.black_elo >= 100) { + c.sum_white_elo += e.white_elo; + c.sum_black_elo += e.black_elo; + ++c.num_elo; + } + if (j == elems[i].size() - 1 || e.bpfen_and_move != elems[i][j + 1].bpfen_and_move) { + mtbl_writer_add(mtbl, + (const uint8_t *)e.bpfen_and_move.data(), e.bpfen_and_move.size(), + (const uint8_t *)&c, sizeof(c)); + c = Count(); + } } + mtbl_writer_destroy(&mtbl); } - mtbl_writer_destroy(&mtbl); }