return current_score;
}
-void find_optimal_stream_assignment(int base)
+double find_inv_sum(const SymbolStats &stats)
{
- double inv_sum[64];
- for (unsigned i = 0; i < 64; ++i) {
- double s = 0.0;
- for (unsigned k = 0; k < 256; ++k) {
- s += stats[i + base].freqs[k] + 0.5;
- }
- inv_sum[i] = 1.0 / s;
- }
-
- for (unsigned i = 0; i < 64; ++i) {
- for (unsigned j = 0; j < 64; ++j) {
- double d = 0.0;
- for (unsigned k = 0; k < 256; ++k) {
- double p1 = (stats[i + base].freqs[k] + 0.5) * inv_sum[i];
- double p2 = (stats[j + base].freqs[k] + 0.5) * inv_sum[j];
-
- // K-L divergence is asymmetric; this is a hack.
- d += p1 * log(p1 / p2);
- d += p2 * log(p2 / p1);
- }
- kl_dist[i][j] = d;
- //printf("%.3f ", d);
- }
- //printf("\n");
+ double s = 0.0;
+ for (unsigned j = 0; j < NUM_SYMS; ++j) {
+ s += stats.freqs[j] + 0.5;
}
+ return 1.0 / s;
+}
- // k-medoids init
- int medoids[64]; // only the first NUM_CLUSTERS matter
- bool is_medoid[64] = { false };
- std::iota(medoids, medoids + 64, 0);
+void find_optimal_stream_assignment(int base)
+{
+ // k-means init; make random assignments
std::random_device rd;
std::mt19937 g(rd());
- std::shuffle(medoids, medoids + 64, g);
- for (int i = 0; i < NUM_CLUSTERS; ++i) {
- printf("%d ", medoids[i]);
- is_medoid[medoids[i]] = true;
+ std::uniform_int_distribution<> u(0, NUM_CLUSTERS - 1);
+ int assignment[64];
+ for (unsigned i = 0; i < 64; ++i) {
+ assignment[i] = u(g);
+ }
+ double inv_sum_coeffs[64];
+ for (unsigned i = 0; i < 64; ++i) {
+ inv_sum_coeffs[i] = find_inv_sum(stats[i + base]);
}
- printf("\n");
- // assign each data point to the closest medoid
- int assignment[64];
- double current_score = find_best_assignment(medoids, assignment);
+ for (unsigned iter = 0; iter < 1000; ++iter) {
+ // make new clusters based on the current assignments
+ SymbolStats clusters[NUM_CLUSTERS];
+ for (unsigned i = 0; i < NUM_CLUSTERS; ++i) {
+ clusters[i].clear();
+ }
+ for (unsigned i = 0; i < 64; ++i) {
+ for (unsigned j = 0; j < NUM_SYMS; ++j) {
+ clusters[assignment[i]].freqs[j] += stats[i + base].freqs[j];
+ }
+ }
- for (int i = 0; i < 1000; ++i) {
- printf("iter %d\n", i);
+ double inv_sum_clusters[NUM_CLUSTERS];
+ for (unsigned i = 0; i < NUM_CLUSTERS; ++i) {
+ inv_sum_clusters[i] = find_inv_sum(clusters[i]);
+ }
+
+ // find new assignments based on distance to the clusters
bool any_changed = false;
- for (int m = 0; m < NUM_CLUSTERS; ++m) {
- for (int o = 0; o < 64; ++o) {
- if (is_medoid[o]) continue;
- int old_medoid = medoids[m];
- medoids[m] = o;
-
- int new_assignment[64];
- double candidate_score = find_best_assignment(medoids, new_assignment);
-
- if (candidate_score < current_score) {
- current_score = candidate_score;
- memcpy(assignment, new_assignment, sizeof(assignment));
-
- is_medoid[old_medoid] = false;
- is_medoid[medoids[m]] = true;
- printf("%f: ", current_score);
- for (int i = 0; i < 64; ++i) {
- printf("%d ", assignment[i]);
- }
- printf("\n");
- any_changed = true;
- } else {
- medoids[m] = old_medoid;
+ double total_d = 0.0;
+ for (unsigned i = 0; i < 64; ++i) {
+ int best_assignment = -1;
+ double best_distance = HUGE_VAL;
+ for (unsigned j = 0; j < NUM_CLUSTERS; ++j) {
+ double d = 0.0;
+ for (unsigned k = 0; k < NUM_SYMS; ++k) {
+ double p1 = (clusters[j].freqs[k] + 0.5) * inv_sum_clusters[j];
+ double p2 = (stats[i + base].freqs[k] + 0.5) * inv_sum_coeffs[i];
+
+ // K-L divergence is asymmetric; this is a hack.
+ d += p1 * log(p1 / p2);
+ d += p2 * log(p2 / p1);
+ }
+ if (d < best_distance) {
+ best_assignment = j;
+ best_distance = d;
}
}
+ if (assignment[i] != best_assignment) {
+ any_changed = true;
+ }
+ assignment[i] = best_assignment;
+ total_d += best_distance;
}
+ printf("iter %u: %.3f\n", iter, total_d);
if (!any_changed) break;
}
printf("\n");