From 2a322a4ad576d87535ce8a479f7a516da9457e80 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Thu, 11 Dec 2014 19:55:22 +0100 Subject: [PATCH] Partition the SSTable; somewhat less efficient space-wise, it seems, but we avoid the huge serialization step in the end. --- .gitignore | 3 +- Makefile | 4 ++- binloader.cpp | 78 ++++++++++++++++++++++++++------------------ binlookup.cpp | 34 +++++++++++-------- build-book.sh | 16 ++++++--- hash.cpp | 14 ++++++++ hash.h | 6 ++++ opening-stats.pl | 2 +- www/opening-stats.pl | 2 +- 9 files changed, 105 insertions(+), 54 deletions(-) create mode 100644 hash.cpp create mode 100644 hash.h diff --git a/.gitignore b/.gitignore index f7e2f2c..d378a6a 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ binlookup binmerger eco.pgn openings.txt -open.mtbl +open.mtbl.part???? +*.o diff --git a/Makefile b/Makefile index 1b20627..80d4c5e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,9 @@ CXXFLAGS=-std=gnu++11 -O2 -g -Wall -LDLIBS=-lmtbl +LDLIBS=-lmtbl -lfarmhash all: binloader binlookup binmerger +binloader: binloader.o hash.o + .PHONY: clean clean: $(RM) binloader binlookup binmerger diff --git a/binloader.cpp b/binloader.cpp index 9658c2e..98b2945 100644 --- a/binloader.cpp +++ b/binloader.cpp @@ -1,6 +1,6 @@ //#define _GLIBCXX_PARALLEL -// Usage: ./binloader IN1 IN2 IN3 ... OUT +// Usage: ./binloader IN1 IN2 IN3 ... OUT NUM_BUCKETS #include #include @@ -11,6 +11,7 @@ #include #include #include "count.h" +#include "hash.h" using namespace std; @@ -27,9 +28,13 @@ struct Element { int main(int argc, char **argv) { - vector elems; + int num_buckets = atoi(argv[argc - 1]); - for (int i = 1; i < argc - 1; ++i) { + vector> elems; + elems.resize(num_buckets); + + size_t num_elems = 0; + for (int i = 1; i < argc - 2; ++i) { FILE *fp = fopen(argv[i], "rb"); if (fp == NULL) { perror(argv[i]); @@ -72,41 +77,52 @@ int main(int argc, char **argv) //exit(1); break; } - elems.emplace_back(Element {move(bpfen_and_move), Result(r), opening_num, white_elo, black_elo}); + + int bucket = hash_key_to_bucket(bpfen_and_move.data(), bpfen_and_move.size(), num_buckets); + elems[bucket].emplace_back(Element {move(bpfen_and_move), Result(r), opening_num, white_elo, black_elo}); + ++num_elems; } fclose(fp); - printf("Read %ld elems\n", elems.size()); + printf("Read %ld elems\n", num_elems); } printf("Sorting...\n"); - sort(elems.begin(), elems.end()); + for (int i = 0; i < num_buckets; ++i) { + sort(elems[i].begin(), elems[i].end()); + } - printf("Writing SSTable...\n"); - mtbl_writer_options* wopt = mtbl_writer_options_init(); - mtbl_writer_options_set_compression(wopt, MTBL_COMPRESSION_SNAPPY); - mtbl_writer* mtbl = mtbl_writer_init(argv[argc - 1], wopt); - Count c; - for (int i = 0; i < elems.size(); ++i) { - if (elems[i].result == WHITE) { - ++c.white; - } else if (elems[i].result == DRAW) { - ++c.draw; - } else if (elems[i].result == BLACK) { - ++c.black; - } - c.opening_num = elems[i].opening_num; - if (elems[i].white_elo >= 100 && elems[i].black_elo >= 100) { - c.sum_white_elo += elems[i].white_elo; - c.sum_black_elo += elems[i].black_elo; - ++c.num_elo; - } - if (i == elems.size() - 1 || elems[i].bpfen_and_move != elems[i + 1].bpfen_and_move) { - mtbl_writer_add(mtbl, - (const uint8_t *)elems[i].bpfen_and_move.data(), elems[i].bpfen_and_move.size(), - (const uint8_t *)&c, sizeof(c)); - c = Count(); + printf("Writing SSTables...\n"); + for (int i = 0; i < num_buckets; ++i) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s.part%04d", argv[argc - 2], i); + + mtbl_writer_options* wopt = mtbl_writer_options_init(); + mtbl_writer_options_set_compression(wopt, MTBL_COMPRESSION_SNAPPY); + mtbl_writer* mtbl = mtbl_writer_init(filename, wopt); + Count c; + for (size_t j = 0; j < elems[i].size(); ++j) { + const Element &e = elems[i][j]; + if (e.result == WHITE) { + ++c.white; + } else if (e.result == DRAW) { + ++c.draw; + } else if (e.result == BLACK) { + ++c.black; + } + c.opening_num = e.opening_num; + if (e.white_elo >= 100 && e.black_elo >= 100) { + c.sum_white_elo += e.white_elo; + c.sum_black_elo += e.black_elo; + ++c.num_elo; + } + if (j == elems[i].size() - 1 || e.bpfen_and_move != elems[i][j + 1].bpfen_and_move) { + mtbl_writer_add(mtbl, + (const uint8_t *)e.bpfen_and_move.data(), e.bpfen_and_move.size(), + (const uint8_t *)&c, sizeof(c)); + c = Count(); + } } + mtbl_writer_destroy(&mtbl); } - mtbl_writer_destroy(&mtbl); } diff --git a/binlookup.cpp b/binlookup.cpp index 488a8e3..083941d 100644 --- a/binlookup.cpp +++ b/binlookup.cpp @@ -12,7 +12,8 @@ using namespace std; int main(int argc, char **argv) { - const char *hex_prefix = argv[2]; + int num_buckets = atoi(argv[2]); + const char *hex_prefix = argv[3]; const int prefix_len = strlen(hex_prefix) / 2; uint8_t *prefix = new uint8_t[prefix_len]; @@ -26,20 +27,25 @@ int main(int argc, char **argv) prefix[i] = k; } - mtbl_reader* mtbl = mtbl_reader_init(argv[1], NULL); - const mtbl_source *src = mtbl_reader_source(mtbl); - mtbl_iter *it = mtbl_source_get_prefix(src, prefix, prefix_len); + for (int i = 0; i < num_buckets; ++i) { + char filename[256]; + snprintf(filename, sizeof(filename), "%s.part%04d", argv[1], i); - const uint8_t *key, *val; - size_t len_key, len_val; + mtbl_reader* mtbl = mtbl_reader_init(filename, NULL); + const mtbl_source *src = mtbl_reader_source(mtbl); + mtbl_iter *it = mtbl_source_get_prefix(src, prefix, prefix_len); - while (mtbl_iter_next(it, &key, &len_key, &val, &len_val)) { - string move((char *)(key + prefix_len), len_key - prefix_len); - const Count* c = (Count *)val; - printf("%s %d %d %d %u %f %f %d\n", move.c_str(), - c->white, c->draw, c->black, c->opening_num, - float(c->sum_white_elo) / c->num_elo, - float(c->sum_black_elo) / c->num_elo, - c->num_elo); + const uint8_t *key, *val; + size_t len_key, len_val; + + while (mtbl_iter_next(it, &key, &len_key, &val, &len_val)) { + string move((char *)(key + prefix_len), len_key - prefix_len); + const Count* c = (Count *)val; + printf("%s %d %d %d %u %f %f %d\n", move.c_str(), + c->white, c->draw, c->black, c->opening_num, + float(c->sum_white_elo) / c->num_elo, + float(c->sum_black_elo) / c->num_elo, + c->num_elo); + } } } diff --git a/build-book.sh b/build-book.sh index 1d5e3b5..01c51c7 100755 --- a/build-book.sh +++ b/build-book.sh @@ -2,7 +2,7 @@ set -e -rm -f part-*.bin part-*.mtbl open.mtbl.new 2>/dev/null +rm -f part-*.bin part-*.mtbl part-*.mtbl.part???? open.mtbl.new open.mtbl.part???? open.mtbl.part????.new 2>/dev/null for FILE in $@; do date | tr -d "\n" @@ -12,12 +12,18 @@ done date for FILE in part-*.bin; do - ( ./binloader $FILE ${FILE/bin/mtbl} ) & + ( ./binloader $FILE ${FILE/bin/mtbl} 40 ) & done wait rm -f part-*.bin -./binmerger part-*.mtbl open.mtbl.new -mv open.mtbl.new open.mtbl -rm -f part-*.mtbl +for X in $( seq 0 39 ); do + ( ./binmerger part-*.mtbl.part$( printf %04d $X ) open.mtbl.part$( printf %04d $X ).new ) & +done +wait + +for X in $( seq 0 39 ); do + mv open.mtbl.part$( printf %04d $X ).new open.mtbl.part$( printf %04d $X) +done +rm -f part-*.mtbl.part???? diff --git a/hash.cpp b/hash.cpp new file mode 100644 index 0000000..72ed743 --- /dev/null +++ b/hash.cpp @@ -0,0 +1,14 @@ +#include +#include +#include "hash.h" + +using namespace std; + +int hash_key_to_bucket(const char* s, size_t len, int num_buckets) +{ + // We hash only the first 10 bytes; it should be enough to get a + // reasonable spread, but also mostly miss the move, so that + // same position + different move usually land in the same bucket. + len = max(len, 10); + return util::Fingerprint32(s, len) % num_buckets; +} diff --git a/hash.h b/hash.h new file mode 100644 index 0000000..9b9550a --- /dev/null +++ b/hash.h @@ -0,0 +1,6 @@ +#ifndef _HASH_H +#define _HASH_H 1 + +int hash_key_to_bucket(const char* s, size_t len, int num_buckets); + +#endif // !defined(_HASH_H) diff --git a/opening-stats.pl b/opening-stats.pl index 3bbcc88..1c17f50 100755 --- a/opening-stats.pl +++ b/opening-stats.pl @@ -10,5 +10,5 @@ my $cgi = CGI->new; my $fen = $ARGV[0]; my $pos = Position->from_fen($fen); my $hex = unpack('H*', $pos->bitpacked_fen); -system("./binlookup", "./open.mtbl", $hex); +system("./binlookup", "./open.mtbl", "40", $hex); diff --git a/www/opening-stats.pl b/www/opening-stats.pl index 9b68e91..33d3c08 100755 --- a/www/opening-stats.pl +++ b/www/opening-stats.pl @@ -13,7 +13,7 @@ my $cgi = CGI->new; my $fen = $cgi->param('fen'); my $pos = Position->from_fen($fen); my $hex = unpack('H*', $pos->bitpacked_fen); -open my $fh, "-|", "../binlookup", "../open.mtbl", $hex +open my $fh, "-|", "../binlookup", "../open.mtbl", "40", $hex or die "../binlookup: $!"; my $opening; -- 2.39.2