X-Git-Url: https://git.sesse.net/?p=remoteglot-book;a=blobdiff_plain;f=binloader.cpp;h=8660e8e299bd93f70f19767d671af0e6d8288f80;hp=e4b40a6f59a6c0d2ac028f6f0cd69422c5ee1cfd;hb=b94d3db60ee33f550078c313560af3718d304ee6;hpb=2d13258e04f6800f1a5d29186a460994e442b4ff diff --git a/binloader.cpp b/binloader.cpp index e4b40a6..8660e8e 100644 --- a/binloader.cpp +++ b/binloader.cpp @@ -1,4 +1,7 @@ //#define _GLIBCXX_PARALLEL + +// Usage: ./binloader IN1 IN2 IN3 ... OUT NUM_BUCKETS + #include #include #include @@ -6,27 +9,39 @@ #include #include #include +#include #include -#include "count.h" +#include "count.pb.h" +#include "hash.h" + +#define DUMMY_TIMESTAMP 32503680000 using namespace std; enum Result { WHITE = 0, DRAW, BLACK }; struct Element { - string bpfen_and_move; + string bpfen; + string move; Result result; int opening_num, white_elo, black_elo; + time_t timestamp; + int file_num; + long start_position; bool operator< (const Element& other) const { - return bpfen_and_move < other.bpfen_and_move; + return bpfen < other.bpfen; } }; int main(int argc, char **argv) { - vector<Element> elems; + int num_buckets = atoi(argv[argc - 1]); - for (int i = 1; i < argc; ++i) { + vector<vector<Element>> elems; + elems.resize(num_buckets); + + size_t num_elems = 0; + for (int i = 1; i < argc - 2; ++i) { FILE *fp = fopen(argv[i], "rb"); if (fp == NULL) { perror(argv[i]); @@ -38,9 +53,9 @@ int main(int argc, char **argv) break; } - string bpfen_and_move; - bpfen_and_move.resize(l); - if (fread(&bpfen_and_move[0], l, 1, fp) != 1) { + string bpfen; + bpfen.resize(l); + if (fread(&bpfen[0], l, 1, fp) != 1) { perror("fread()"); // exit(1); break; @@ -53,7 +68,9 @@ int main(int argc, char **argv) break; } - int opening_num, white_elo, black_elo; + int opening_num, white_elo, black_elo, file_num; + time_t timestamp; + long start_position; if (fread(&white_elo, sizeof(white_elo), 1, fp) != 1) { perror("fread()"); //exit(1); @@ -69,45 +86,95 @@ int main(int argc, char **argv) //exit(1); break; } - 
elems.emplace_back(Element {move(bpfen_and_move), Result(r), opening_num, white_elo, black_elo}); + if (fread(&timestamp, sizeof(timestamp), 1, fp) != 1) { + perror("fread()"); + //exit(1); + break; + } + if (fread(&file_num, sizeof(file_num), 1, fp) != 1) { + perror("fread()"); + //exit(1); + break; + } + if (fread(&start_position, sizeof(start_position), 1, fp) != 1) { + perror("fread()"); + //exit(1); + break; + } + + + l = getc(fp); + if (l == -1) { + break; + } + string move; + move.resize(l); + if (fread(&move[0], l, 1, fp) != 1) { + perror("fread()"); + // exit(1); + break; + } + + int bucket = hash_key_to_bucket(bpfen.data(), bpfen.size(), num_buckets); + elems[bucket].emplace_back(Element {std::move(bpfen), std::move(move), Result(r), opening_num, white_elo, black_elo, timestamp, file_num, start_position}); + ++num_elems; } fclose(fp); - printf("Read %ld elems\n", elems.size()); + printf("Read %ld elems\n", num_elems); } printf("Sorting...\n"); - sort(elems.begin(), elems.end()); - - printf("Writing SSTable...\n"); - mtbl_writer* mtbl = mtbl_writer_init("open.mtbl", NULL); - Count c; - int num_elo = 0; - double sum_white_elo = 0.0, sum_black_elo = 0.0; - for (int i = 0; i < elems.size(); ++i) { - if (elems[i].result == WHITE) { - ++c.white; - } else if (elems[i].result == DRAW) { - ++c.draw; - } else if (elems[i].result == BLACK) { - ++c.black; - } - c.opening_num = elems[i].opening_num; - if (elems[i].white_elo >= 100 && elems[i].black_elo >= 100) { - sum_white_elo += elems[i].white_elo; - sum_black_elo += elems[i].black_elo; - ++num_elo; - } - if (i == elems.size() - 1 || elems[i].bpfen_and_move != elems[i + 1].bpfen_and_move) { - c.avg_white_elo = sum_white_elo / num_elo; - c.avg_black_elo = sum_black_elo / num_elo; - mtbl_writer_add(mtbl, - (const uint8_t *)elems[i].bpfen_and_move.data(), elems[i].bpfen_and_move.size(), - (const uint8_t *)&c, sizeof(c)); - c = Count(); - num_elo = 0; - sum_white_elo = sum_black_elo = 0.0; + for (int i = 0; i < num_buckets; 
++i) { + sort(elems[i].begin(), elems[i].end()); + } + + printf("Writing SSTables...\n"); + string buf; // Keep allocated. + for (int i = 0; i < num_buckets; ++i) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s.part%04d", argv[argc - 2], i); + + mtbl_writer_options* wopt = mtbl_writer_options_init(); + mtbl_writer_options_set_compression(wopt, MTBL_COMPRESSION_SNAPPY); + mtbl_writer* mtbl = mtbl_writer_init(filename, wopt); + Count c; + unordered_set<string> moves; + for (size_t j = 0; j < elems[i].size(); ++j) { + const Element &e = elems[i][j]; + if (e.result == WHITE) { + c.set_white(c.white() + 1); + } else if (e.result == DRAW) { + c.set_draw(c.draw() + 1); + } else if (e.result == BLACK) { + c.set_black(c.black() + 1); + } + if (e.white_elo >= 100 && e.black_elo >= 100) { + c.set_sum_white_elo(c.sum_white_elo() + e.white_elo); + c.set_sum_black_elo(c.sum_black_elo() + e.black_elo); + c.set_num_elo(c.num_elo() + 1); + } + if (!c.has_first_timestamp() || e.timestamp < c.first_timestamp()) { + if (e.timestamp != DUMMY_TIMESTAMP) { + c.set_first_timestamp(e.timestamp); + } + c.set_opening_num(e.opening_num); + c.set_pgn_file_num(e.file_num); + c.set_pgn_start_position(e.start_position); + } + if (!moves.count(e.move)) { + moves.insert(e.move); + c.add_move(e.move); + } + if (j == elems[i].size() - 1 || e.bpfen != elems[i][j + 1].bpfen) { + c.SerializeToString(&buf); + mtbl_writer_add(mtbl, + (const uint8_t *)e.bpfen.data(), e.bpfen.size(), + (const uint8_t *)buf.data(), buf.size()); + c = Count(); + moves.clear(); + } } + mtbl_writer_destroy(&mtbl); } - mtbl_writer_destroy(&mtbl); }