From: Steinar H. Gunderson Date: Wed, 29 Aug 2012 21:26:41 +0000 (+0200) Subject: Move most of the train.pl logic into C++. Faster, and less crazy swappy. X-Git-Url: https://git.sesse.net/?p=wloh;a=commitdiff_plain;h=e7674d31d1c284487aa4021f66f068c0e9035ccc Move most of the train.pl logic into C++. Faster, and less crazy swappy. --- diff --git a/bayeswf.cpp b/bayeswf.cpp index cc59183..7ae502d 100644 --- a/bayeswf.cpp +++ b/bayeswf.cpp @@ -17,11 +17,32 @@ using namespace Eigen; #define PRIOR_WEIGHT 1.0 #define MAX_PLAYERS 4096 #define DUMP_RAW 0 +#define USE_DB 1 + +#if USE_DB +#include +#include +#include +#endif float mu[MAX_PLAYERS]; float mu_stddev[MAX_PLAYERS]; -float global_sigma = 70.0f; -float prior_sigma = 70.0f; +float global_sigma; +float prior_sigma; + +// Data waiting for insertion into the database. + +struct RatingDBTuple { + string player; + float mu, mu_stddev; +}; +struct CovarianceDBTuple { + string player1, player2; + float covariance; +}; +vector rating_db_tuples; +vector covariance_db_tuples; +map, float> aux_params; #define EPSILON 1e-3 @@ -46,16 +67,11 @@ vector all_matches; void dump_scores(const vector &players, const float *mu, const float *mu_stddev, int num_players) { -#if 0 - for (int i = 0; i < num_players; ++i) { - printf("%s=[%5.1f, %4.1f] ", players[i].c_str(), mu[i], sigma[i]); - } - printf("\n"); -#elif 0 +#if USE_DB for (int i = 0; i < num_players; ++i) { - printf("%5.1f ", mu[i]); + RatingDBTuple tuple = { players[i], mu[i], mu_stddev[i] }; + rating_db_tuples.push_back(tuple); } - printf("\n"); #else for (int i = 0; i < num_players; ++i) { printf("%f %f %s\n", mu[i], mu_stddev[i], players[i].c_str()); @@ -235,6 +251,17 @@ void compute_mu_uncertainty(const float *mu, const vector &players) mu_stddev[i] = sqrt(ih(i, i)); } +#if USE_DB + for (unsigned i = 0; i < players.size(); ++i) { + for (unsigned j = 0; j < players.size(); ++j) { + CovarianceDBTuple tuple; + tuple.player1 = players[i]; + tuple.player2 = players[j]; + tuple.covariance = ih(i, j); + covariance_db_tuples.push_back(tuple); + } + } +#else for (unsigned i = 0; i < players.size(); ++i) { for (unsigned j = 0; j < players.size(); ++j) { printf("covariance %s %s %f\n", @@ -243,12 +270,32 @@ void compute_mu_uncertainty(const float *mu, const vector &players) ih(i, j)); } } +#endif } -int main(int argc, char **argv) +void process_file(const char *filename) { + printf("%s...\n", filename); + + global_sigma = 70.0f; + prior_sigma = 70.0f; + matches_for_player.clear(); + all_matches.clear(); + + FILE *fp = fopen(filename, "r"); + if (fp == NULL) { + perror(filename); + exit(1); + } + + char locale[256]; + if (fscanf(fp, "%s", locale) != 1) { + fprintf(stderr, "Could't read number of players\n"); + exit(1); + } + int num_players; - if (scanf("%d", &num_players) != 1) { + if (fscanf(fp,"%d", &num_players) != 1) { fprintf(stderr, "Could't read number of players\n"); exit(1); } @@ -263,7 +310,7 @@ int main(int argc, char **argv) for (int i = 0; i < num_players; ++i) { char buf[256]; - if (scanf("%s", buf) != 1) { + if (fscanf(fp, "%s", buf) != 1) { fprintf(stderr, "Couldn't read player %d\n", i); exit(1); } @@ -278,7 +325,7 @@ int main(int argc, char **argv) int score1, score2; float weight; - if (scanf("%s %s %d %d %f", pl1, pl2, &score1, &score2, &weight) != 5) { + if (fscanf(fp, "%s %s %d %d %f", pl1, pl2, &score1, &score2, &weight) != 5) { //fprintf(stderr, "Read %d matches.\n", num_matches); break; } @@ -310,6 +357,8 @@ int main(int argc, char **argv) all_matches.push_back(m1); } + + fclose(fp); float mu[MAX_PLAYERS]; @@ -317,6 +366,7 @@ int main(int argc, char **argv) mu[i] = PRIOR_MU; } + int num_iterations = -1; for (int j = 0; j < 1000; ++j) { float old_mu[MAX_PLAYERS]; float old_global_sigma = global_sigma; @@ -340,22 +390,100 @@ int main(int argc, char **argv) sumdiff += (global_sigma - old_global_sigma) * (global_sigma - old_global_sigma); if (sumdiff < EPSILON) { //fprintf(stderr, "Converged after %d iterations. Stopping.\n", j); - printf("aux_param num_iterations %d\n", j + 1); + num_iterations = j + 1; break; } } -#if DUMP_RAW - dump_raw(mu, num_players); -#else construct_hessian(mu, num_players); + aux_params[make_pair(locale, "num_iterations")] = num_iterations; + aux_params[make_pair(locale, "score_stddev")] = global_sigma / sqrt(2.0f); + aux_params[make_pair(locale, "rating_prior_stddev")] = prior_sigma; + aux_params[make_pair(locale, "total_log_likelihood")] = compute_total_logl(mu, num_players); + compute_mu_uncertainty(mu, players); dump_scores(players, mu, mu_stddev, num_players); - //fprintf(stderr, "Optimal sigma: %f (two-player: %f)\n", sigma[0], sigma[0] * sqrt(2.0f)); - printf("aux_param score_stddev %f\n", global_sigma / sqrt(2.0f)); - printf("aux_param rating_prior_stddev %f\n", prior_sigma); +} + +int main(int argc, char **argv) +{ +#if USE_DB + pqxx::connection conn("dbname=wloh_dev host=127.0.0.1 user=wloh password=oto4iCh5"); +#endif + + for (int i = 1; i < argc; ++i) { + process_file(argv[i]); + } + +#if DUMP_RAW + dump_raw(mu, num_players); + return 0; +#endif + +#if USE_DB + pqxx::work txn(conn); + txn.exec("SET client_min_messages TO WARNING"); - float total_logl = compute_total_logl(mu, num_players); - printf("aux_param total_log_likelihood %f\n", total_logl); + // Dump ratings. + { + txn.exec("TRUNCATE ratings"); + pqxx::tablewriter writer(txn, "ratings"); + for (unsigned i = 0; i < rating_db_tuples.size(); ++i) { + char mu_str[128], mu_stddev_str[128]; + snprintf(mu_str, 128, "%f", rating_db_tuples[i].mu); + snprintf(mu_stddev_str, 128, "%f", rating_db_tuples[i].mu_stddev); + + vector tuple; + tuple.push_back(rating_db_tuples[i].player); + tuple.push_back(mu_str); + tuple.push_back(mu_stddev_str); + writer.push_back(tuple); + } + writer.complete(); + } + + // Create a table new_covariance, and dump covariance into it. + { + txn.exec("CREATE TABLE new_covariance ( player1 smallint NOT NULL, player2 smallint NOT NULL, cov float NOT NULL )"); + pqxx::tablewriter writer(txn, "new_covariance"); + for (unsigned i = 0; i < covariance_db_tuples.size(); ++i) { + char cov_str[128]; + snprintf(cov_str, 128, "%f", covariance_db_tuples[i].covariance); + + vector tuple; + tuple.push_back(covariance_db_tuples[i].player1); + tuple.push_back(covariance_db_tuples[i].player2); + tuple.push_back(cov_str); + writer.push_back(tuple); + } + writer.complete(); + } + + // Create index, and rename new_covariance on top of covariance. + txn.exec("ALTER TABLE new_covariance ADD PRIMARY KEY ( player1, player2 );"); + txn.exec("DROP TABLE IF EXISTS covariance"); + txn.exec("ALTER TABLE new_covariance RENAME TO covariance"); + + // Dump aux_params. + { + txn.exec("TRUNCATE aux_params"); + pqxx::tablewriter writer(txn, "aux_params"); + for (map, float>::const_iterator it = aux_params.begin(); it != aux_params.end(); ++it) { + char str[128]; + snprintf(str, 128, "%f", it->second); + + vector tuple; + tuple.push_back(it->first.first); // locale + tuple.push_back(it->first.second); // parameter name + tuple.push_back(str); + writer.push_back(tuple); + } + writer.complete(); + } +#else + //fprintf(stderr, "Optimal sigma: %f (two-player: %f)\n", sigma[0], sigma[0] * sqrt(2.0f)); + for (map, float>::const_iterator it = aux_params.begin(); it != aux_params.end(); ++it) { + printf("%s: aux_param %s %f\n", it->first->first, it->first->second, it->second); + } #endif } diff --git a/train.pl b/train.pl index 84335e1..97c33d3 100755 --- a/train.pl +++ b/train.pl @@ -42,12 +42,13 @@ WHERE deltager1.Nr > deltager2.nr AND kultur=? } sub output_to_file { - my ($games, $ids) = @_; + my ($locale, $games, $ids) = @_; my $tmpnam = POSIX::tmpnam(); open DATA, ">", $tmpnam or die "$tmpnam: $!"; + printf DATA "%s\n", $locale; printf DATA "%d\n", scalar keys %$ids; for my $id (keys %$ids) { printf DATA "%d\n", $id; @@ -60,66 +61,22 @@ sub output_to_file { return $tmpnam; } -sub train_model { - my ($filename, $locale, $ratings, $covariances, $aux_params) = @_; - - open RATINGS, "$config::base_dir/bayeswf < $filename |" - or die "bayeswf: $!"; - while () { - chomp; - my @x = split; - if ($x[0] eq 'covariance') { - push @$covariances, (join("\t", @x[1..3])); - } elsif ($x[0] eq 'aux_param') { - push @$aux_params, ($locale . "\t" . $x[1] . "\t" . $x[2]); - } else { - push @$ratings, ($x[2] . "\t" . $x[0] . "\t" . $x[1]); - } - } - - close RATINGS; -} - my $dbh = DBI->connect($config::local_connstr, $config::local_username, $config::local_password) or die "connect: " . $DBI::errstr; -$dbh->{AutoCommit} = 0; -$dbh->{RaiseError} = 1; - -$dbh->do('SET client_min_messages TO WARNING'); +$dbh->{AutoCommit} = 1; my @locales = wloh_common::find_all_locales($dbh); - -my @ratings = (); -my @covariances = (); -my @aux_params = (); +my @filenames = (); for my $locale (@locales) { my $last_season = find_last_season($dbh, $locale); my @games = (); my %ids = (); fetch_games($dbh, $locale, $last_season, \@games, \%ids); - my $tmpnam = output_to_file(\@games, \%ids); - - train_model($tmpnam, $locale, \@ratings, \@covariances, \@aux_params); - unlink($tmpnam); + my $tmpnam = output_to_file($locale, \@games, \%ids); + push @filenames, $tmpnam; } -$dbh->do('CREATE TABLE new_covariance ( player1 smallint NOT NULL, player2 smallint NOT NULL, cov float NOT NULL )'); -$dbh->do('COPY new_covariance ( player1, player2, cov ) FROM STDIN'); -$dbh->pg_putcopydata(join("\n", @covariances)); -$dbh->pg_putcopyend(); -$dbh->do('ALTER TABLE new_covariance ADD PRIMARY KEY ( player1, player2 );'); -$dbh->do('DROP TABLE IF EXISTS covariance'); -$dbh->do('ALTER TABLE new_covariance RENAME TO covariance'); - -$dbh->do('TRUNCATE aux_params'); -$dbh->do('COPY aux_params ( kultur, id, value ) FROM STDIN'); -$dbh->pg_putcopydata(join("\n", @aux_params)); -$dbh->pg_putcopyend(); - -$dbh->do('TRUNCATE ratings'); -$dbh->do('COPY ratings ( id, rating, rating_stddev ) FROM STDIN'); -$dbh->pg_putcopydata(join("\n", @ratings)); -$dbh->pg_putcopyend(); +$dbh->disconnect; -$dbh->commit; +system("$config::base_dir/bayeswf", @filenames);