git.sesse.net Git - remoteglot-book/blob - binloader.cpp
Fixes for filenames with spaces.
[remoteglot-book] / binloader.cpp
1 //#define _GLIBCXX_PARALLEL
2
3 // Usage: ./binloader IN1 IN2 IN3 ... OUT NUM_BUCKETS
4
5 #include <stdio.h>
6 #include <vector>
7 #include <mtbl.h>
8 #include <algorithm>
9 #include <utility>
10 #include <memory>
11 #include <string>
12 #include <unordered_set>
13 #include <string.h>
14 #include "count.pb.h"
15 #include "hash.h"
16
// Unix timestamp of 3000-01-01 00:00:00 UTC. main() never stores this value
// as a position's first_timestamp; presumably the input files use it as a
// placeholder for games with no known date -- confirm against their producer.
17 #define DUMMY_TIMESTAMP 32503680000
18
19 using namespace std;
20
// Outcome of a game. main() builds these directly from a single byte of the
// input record, so the numeric values must stay as they are.
enum Result {
        WHITE = 0,
        DRAW = 1,
        BLACK = 2
};
22 struct Element {
23         string bpfen;
24         string move;
25         Result result;
26         int opening_num, white_elo, black_elo;
27         time_t timestamp;
28
29         bool operator< (const Element& other) const {
30                 return bpfen < other.bpfen;
31         }
32 };
33
34 int main(int argc, char **argv)
35 {
36         int num_buckets = atoi(argv[argc - 1]);
37
38         vector<vector<Element>> elems;
39         elems.resize(num_buckets);
40
41         size_t num_elems = 0;
42         for (int i = 1; i < argc - 2; ++i) {
43                 FILE *fp = fopen(argv[i], "rb");
44                 if (fp == NULL) {
45                         perror(argv[i]);
46                         exit(1);
47                 }
48                 for ( ;; ) {
49                         int l = getc(fp);
50                         if (l == -1) {
51                                 break;
52                         }
53                 
54                         string bpfen;
55                         bpfen.resize(l);
56                         if (fread(&bpfen[0], l, 1, fp) != 1) {
57                                 perror("fread()");
58                 //              exit(1);
59                                 break;
60                         }
61
62                         int r = getc(fp);
63                         if (r == -1) {
64                                 perror("getc()");
65                                 //exit(1);
66                                 break;
67                         }
68
69                         int opening_num, white_elo, black_elo;
70                         time_t timestamp;
71                         if (fread(&white_elo, sizeof(white_elo), 1, fp) != 1) {
72                                 perror("fread()");
73                                 //exit(1);
74                                 break;
75                         }
76                         if (fread(&black_elo, sizeof(black_elo), 1, fp) != 1) {
77                                 perror("fread()");
78                                 //exit(1);
79                                 break;
80                         }
81                         if (fread(&opening_num, sizeof(opening_num), 1, fp) != 1) {
82                                 perror("fread()");
83                                 //exit(1);
84                                 break;
85                         }
86                         if (fread(&timestamp, sizeof(timestamp), 1, fp) != 1) {
87                                 perror("fread()");
88                                 //exit(1);
89                                 break;
90                         }
91
92                         l = getc(fp);
93                         if (l == -1) {
94                                 break;
95                         }
96                         string move;
97                         move.resize(l);
98                         if (fread(&move[0], l, 1, fp) != 1) {
99                                 perror("fread()");
100                 //              exit(1);
101                                 break;
102                         }
103
104                         int bucket = hash_key_to_bucket(bpfen.data(), bpfen.size(), num_buckets);
105                         elems[bucket].emplace_back(Element {std::move(bpfen), std::move(move), Result(r), opening_num, white_elo, black_elo, timestamp});
106                         ++num_elems;
107                 }
108                 fclose(fp);
109
110                 printf("Read %ld elems\n", num_elems);
111         }
112
113         printf("Sorting...\n");
114         for (int i = 0; i < num_buckets; ++i) {
115                 sort(elems[i].begin(), elems[i].end());
116         }
117
118         printf("Writing SSTables...\n");
119         string buf;  // Keep allocated.
120         for (int i = 0; i < num_buckets; ++i) {
121                 char filename[256];
122                 snprintf(filename, sizeof(filename), "%s.part%04d", argv[argc - 2], i);
123
124                 mtbl_writer_options* wopt = mtbl_writer_options_init();
125                 mtbl_writer_options_set_compression(wopt, MTBL_COMPRESSION_SNAPPY);
126                 mtbl_writer* mtbl = mtbl_writer_init(filename, wopt);
127                 Count c;
128                 unordered_set<string> moves;
129                 for (size_t j = 0; j < elems[i].size(); ++j) {
130                         const Element &e = elems[i][j];
131                         if (e.result == WHITE) {
132                                 c.set_white(c.white() + 1);
133                         } else if (e.result == DRAW) {
134                                 c.set_draw(c.draw() + 1);
135                         } else if (e.result == BLACK) {
136                                 c.set_black(c.black() + 1);
137                         }
138                         if (e.white_elo >= 100 && e.black_elo >= 100) {
139                                 c.set_sum_white_elo(c.sum_white_elo() + e.white_elo);
140                                 c.set_sum_black_elo(c.sum_black_elo() + e.black_elo);
141                                 c.set_num_elo(c.num_elo() + 1);
142                         }
143                         if (!c.has_first_timestamp() || e.timestamp < c.first_timestamp()) {
144                                 if (e.timestamp != DUMMY_TIMESTAMP) {
145                                         c.set_first_timestamp(e.timestamp);
146                                 }
147                                 c.set_opening_num(e.opening_num);
148                         }
149                         if (!moves.count(e.move)) {
150                                 moves.insert(e.move);
151                                 c.add_move(e.move);
152                         }
153                         if (j == elems[i].size() - 1 || e.bpfen != elems[i][j + 1].bpfen) {
154                                 c.SerializeToString(&buf);
155                                 mtbl_writer_add(mtbl,
156                                         (const uint8_t *)e.bpfen.data(), e.bpfen.size(),
157                                         (const uint8_t *)buf.data(), buf.size());
158                                 c = Count();
159                                 moves.clear();
160                         }
161                 }
162                 mtbl_writer_destroy(&mtbl);
163         }
164 }