]> git.sesse.net Git - remoteglot-book/blob - binloader.cpp
Set parallel merges to a value different from parallel loads.
[remoteglot-book] / binloader.cpp
1 //#define _GLIBCXX_PARALLEL
2
3 // Usage: ./binloader IN1 IN2 IN3 ... OUT NUM_BUCKETS
4
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <algorithm>
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

#include <mtbl.h>

#include "count.pb.h"
#include "hash.h"
16
// Sentinel timestamp value. When an element's timestamp equals this value,
// main() never records it as a position's first_timestamp.
// NOTE(review): if timestamps are Unix seconds, this is 3000-01-01 00:00:00 UTC;
// presumably it stands for "date unknown" -- confirm against the program that
// writes the input files.
#define DUMMY_TIMESTAMP 32503680000
18
19 using namespace std;
20
// Game result attached to every element. The numeric values matter: the
// result byte read from the input files is converted directly to this type.
enum Result {
        WHITE = 0,
        DRAW = 1,
        BLACK = 2
};
22 struct Element {
23         string bpfen;
24         string move;
25         Result result;
26         int opening_num, white_elo, black_elo;
27         time_t timestamp;
28         int file_num;
29         long start_position;
30
31         bool operator< (const Element& other) const {
32                 return bpfen < other.bpfen;
33         }
34 };
35
36 int main(int argc, char **argv)
37 {
38         int num_buckets = atoi(argv[argc - 1]);
39
40         vector<vector<Element>> elems;
41         elems.resize(num_buckets);
42
43         size_t num_elems = 0;
44         for (int i = 1; i < argc - 2; ++i) {
45                 FILE *fp = fopen(argv[i], "rb");
46                 if (fp == NULL) {
47                         perror(argv[i]);
48                         exit(1);
49                 }
50                 for ( ;; ) {
51                         int l = getc(fp);
52                         if (l == -1) {
53                                 break;
54                         }
55                 
56                         string bpfen;
57                         bpfen.resize(l);
58                         if (fread(&bpfen[0], l, 1, fp) != 1) {
59                                 perror("fread()");
60                 //              exit(1);
61                                 break;
62                         }
63
64                         int r = getc(fp);
65                         if (r == -1) {
66                                 perror("getc()");
67                                 //exit(1);
68                                 break;
69                         }
70
71                         int opening_num, white_elo, black_elo, file_num;
72                         time_t timestamp;
73                         long start_position;
74                         if (fread(&white_elo, sizeof(white_elo), 1, fp) != 1) {
75                                 perror("fread()");
76                                 //exit(1);
77                                 break;
78                         }
79                         if (fread(&black_elo, sizeof(black_elo), 1, fp) != 1) {
80                                 perror("fread()");
81                                 //exit(1);
82                                 break;
83                         }
84                         if (fread(&opening_num, sizeof(opening_num), 1, fp) != 1) {
85                                 perror("fread()");
86                                 //exit(1);
87                                 break;
88                         }
89                         if (fread(&timestamp, sizeof(timestamp), 1, fp) != 1) {
90                                 perror("fread()");
91                                 //exit(1);
92                                 break;
93                         }
94                         if (fread(&file_num, sizeof(file_num), 1, fp) != 1) {
95                                 perror("fread()");
96                                 //exit(1);
97                                 break;
98                         }
99                         if (fread(&start_position, sizeof(start_position), 1, fp) != 1) {
100                                 perror("fread()");
101                                 //exit(1);
102                                 break;
103                         }
104
105
106                         l = getc(fp);
107                         if (l == -1) {
108                                 break;
109                         }
110                         string move;
111                         move.resize(l);
112                         if (fread(&move[0], l, 1, fp) != 1) {
113                                 perror("fread()");
114                 //              exit(1);
115                                 break;
116                         }
117
118                         int bucket = hash_key_to_bucket(bpfen.data(), bpfen.size(), num_buckets);
119                         elems[bucket].emplace_back(Element {std::move(bpfen), std::move(move), Result(r), opening_num, white_elo, black_elo, timestamp, file_num, start_position});
120                         ++num_elems;
121                 }
122                 fclose(fp);
123
124                 printf("Read %ld elems\n", num_elems);
125         }
126
127         printf("Sorting...\n");
128         for (int i = 0; i < num_buckets; ++i) {
129                 sort(elems[i].begin(), elems[i].end());
130         }
131
132         printf("Writing SSTables...\n");
133         string buf;  // Keep allocated.
134         for (int i = 0; i < num_buckets; ++i) {
135                 char filename[256];
136                 snprintf(filename, sizeof(filename), "%s.part%04d", argv[argc - 2], i);
137
138                 mtbl_writer_options* wopt = mtbl_writer_options_init();
139                 mtbl_writer_options_set_compression(wopt, MTBL_COMPRESSION_SNAPPY);
140                 mtbl_writer* mtbl = mtbl_writer_init(filename, wopt);
141                 Count c;
142                 unordered_set<string> moves;
143                 for (size_t j = 0; j < elems[i].size(); ++j) {
144                         const Element &e = elems[i][j];
145                         if (e.result == WHITE) {
146                                 c.set_white(c.white() + 1);
147                         } else if (e.result == DRAW) {
148                                 c.set_draw(c.draw() + 1);
149                         } else if (e.result == BLACK) {
150                                 c.set_black(c.black() + 1);
151                         }
152                         if (e.white_elo >= 100 && e.black_elo >= 100) {
153                                 c.set_sum_white_elo(c.sum_white_elo() + e.white_elo);
154                                 c.set_sum_black_elo(c.sum_black_elo() + e.black_elo);
155                                 c.set_num_elo(c.num_elo() + 1);
156                         }
157                         if (!c.has_first_timestamp() || e.timestamp < c.first_timestamp()) {
158                                 if (e.timestamp != DUMMY_TIMESTAMP) {
159                                         c.set_first_timestamp(e.timestamp);
160                                 }
161                                 c.set_opening_num(e.opening_num);
162                                 c.set_pgn_file_num(e.file_num);
163                                 c.set_pgn_start_position(e.start_position);
164                         }
165                         if (!moves.count(e.move)) {
166                                 moves.insert(e.move);
167                                 c.add_move(e.move);
168                         }
169                         if (j == elems[i].size() - 1 || e.bpfen != elems[i][j + 1].bpfen) {
170                                 c.SerializeToString(&buf);
171                                 mtbl_writer_add(mtbl,
172                                         (const uint8_t *)e.bpfen.data(), e.bpfen.size(),
173                                         (const uint8_t *)buf.data(), buf.size());
174                                 c = Count();
175                                 moves.clear();
176                         }
177                 }
178                 mtbl_writer_destroy(&mtbl);
179         }
180 }