X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=binloader.cpp;h=5bf2502c6867610fbb963c91d63aaf3f717312c8;hb=55481cd69d21b31bfdcc6e33853ea475ed16f622;hp=a56182917287695d6127169b093a169d2129dd7c;hpb=1e683bba44dd60c04eea0cd8b6d1a7e36ad4acc8;p=remoteglot-book diff --git a/binloader.cpp b/binloader.cpp index a561829..5bf2502 100644 --- a/binloader.cpp +++ b/binloader.cpp @@ -1,6 +1,6 @@ //#define _GLIBCXX_PARALLEL -// Usage: ./binloader IN1 IN2 IN3 ... OUT +// Usage: ./binloader IN1 IN2 IN3 ... OUT NUM_BUCKETS #include #include @@ -9,27 +9,38 @@ #include #include #include +#include #include -#include "count.h" +#include "count.pb.h" +#include "hash.h" + +#define DUMMY_TIMESTAMP 32503680000 using namespace std; enum Result { WHITE = 0, DRAW, BLACK }; struct Element { - string bpfen_and_move; + string bpfen; + string move; Result result; int opening_num, white_elo, black_elo; + time_t timestamp; + long start_position; bool operator< (const Element& other) const { - return bpfen_and_move < other.bpfen_and_move; + return bpfen < other.bpfen; } }; int main(int argc, char **argv) { - vector<Element> elems; + int num_buckets = atoi(argv[argc - 1]); + + vector<vector<Element>> elems; + elems.resize(num_buckets); - for (int i = 1; i < argc - 1; ++i) { + size_t num_elems = 0; + for (int i = 1; i < argc - 2; ++i) { FILE *fp = fopen(argv[i], "rb"); if (fp == NULL) { perror(argv[i]); @@ -41,9 +52,9 @@ int main(int argc, char **argv) break; } - string bpfen_and_move; - bpfen_and_move.resize(l); - if (fread(&bpfen_and_move[0], l, 1, fp) != 1) { + string bpfen; + bpfen.resize(l); + if (fread(&bpfen[0], l, 1, fp) != 1) { perror("fread()"); // exit(1); break; @@ -57,6 +68,8 @@ int main(int argc, char **argv) } int opening_num, white_elo, black_elo; + time_t timestamp; + long start_position; if (fread(&white_elo, sizeof(white_elo), 1, fp) != 1) { perror("fread()"); //exit(1); @@ -72,39 +85,89 @@ int main(int argc, char **argv) //exit(1); break; } - elems.emplace_back(Element {move(bpfen_and_move), Result(r), 
opening_num, white_elo, black_elo}); + if (fread(&timestamp, sizeof(timestamp), 1, fp) != 1) { + perror("fread()"); + //exit(1); + break; + } + if (fread(&start_position, sizeof(start_position), 1, fp) != 1) { + perror("fread()"); + //exit(1); + break; + } + + + l = getc(fp); + if (l == -1) { + break; + } + string move; + move.resize(l); + if (fread(&move[0], l, 1, fp) != 1) { + perror("fread()"); + // exit(1); + break; + } + + int bucket = hash_key_to_bucket(bpfen.data(), bpfen.size(), num_buckets); + elems[bucket].emplace_back(Element {std::move(bpfen), std::move(move), Result(r), opening_num, white_elo, black_elo, timestamp, start_position}); + ++num_elems; } fclose(fp); - printf("Read %ld elems\n", elems.size()); + printf("Read %ld elems\n", num_elems); } printf("Sorting...\n"); - sort(elems.begin(), elems.end()); - - printf("Writing SSTable...\n"); - mtbl_writer* mtbl = mtbl_writer_init(argv[argc - 1], NULL); - Count c; - for (int i = 0; i < elems.size(); ++i) { - if (elems[i].result == WHITE) { - ++c.white; - } else if (elems[i].result == DRAW) { - ++c.draw; - } else if (elems[i].result == BLACK) { - ++c.black; - } - c.opening_num = elems[i].opening_num; - if (elems[i].white_elo >= 100 && elems[i].black_elo >= 100) { - c.sum_white_elo += elems[i].white_elo; - c.sum_black_elo += elems[i].black_elo; - ++c.num_elo; - } - if (i == elems.size() - 1 || elems[i].bpfen_and_move != elems[i + 1].bpfen_and_move) { - mtbl_writer_add(mtbl, - (const uint8_t *)elems[i].bpfen_and_move.data(), elems[i].bpfen_and_move.size(), - (const uint8_t *)&c, sizeof(c)); - c = Count(); + for (int i = 0; i < num_buckets; ++i) { + sort(elems[i].begin(), elems[i].end()); + } + + printf("Writing SSTables...\n"); + string buf; // Keep allocated. 
+ for (int i = 0; i < num_buckets; ++i) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s.part%04d", argv[argc - 2], i); + + mtbl_writer_options* wopt = mtbl_writer_options_init(); + mtbl_writer_options_set_compression(wopt, MTBL_COMPRESSION_SNAPPY); + mtbl_writer* mtbl = mtbl_writer_init(filename, wopt); + Count c; + unordered_set<string> moves; + for (size_t j = 0; j < elems[i].size(); ++j) { + const Element &e = elems[i][j]; + if (e.result == WHITE) { + c.set_white(c.white() + 1); + } else if (e.result == DRAW) { + c.set_draw(c.draw() + 1); + } else if (e.result == BLACK) { + c.set_black(c.black() + 1); + } + if (e.white_elo >= 100 && e.black_elo >= 100) { + c.set_sum_white_elo(c.sum_white_elo() + e.white_elo); + c.set_sum_black_elo(c.sum_black_elo() + e.black_elo); + c.set_num_elo(c.num_elo() + 1); + } + if (!c.has_first_timestamp() || e.timestamp < c.first_timestamp()) { + if (e.timestamp != DUMMY_TIMESTAMP) { + c.set_first_timestamp(e.timestamp); + } + c.set_opening_num(e.opening_num); + c.set_pgn_start_position(e.start_position); + } + if (!moves.count(e.move)) { + moves.insert(e.move); + c.add_move(e.move); + } + if (j == elems[i].size() - 1 || e.bpfen != elems[i][j + 1].bpfen) { + c.SerializeToString(&buf); + mtbl_writer_add(mtbl, + (const uint8_t *)e.bpfen.data(), e.bpfen.size(), + (const uint8_t *)buf.data(), buf.size()); + c = Count(); + moves.clear(); + } } + mtbl_writer_destroy(&mtbl); } - mtbl_writer_destroy(&mtbl); }