- for (int m = 0; m < NUM_CLUSTERS; ++m) {
- for (int o = 0; o < 64; ++o) {
- if (is_medoid[o]) continue;
- int old_medoid = medoids[m];
- medoids[m] = o;
-
- int new_assignment[64];
- double candidate_score = find_best_assignment(medoids, new_assignment);
-
- if (candidate_score < current_score) {
- current_score = candidate_score;
- memcpy(assignment, new_assignment, sizeof(assignment));
-
- is_medoid[old_medoid] = false;
- is_medoid[medoids[m]] = true;
- printf("%f: ", current_score);
- for (int i = 0; i < 64; ++i) {
- printf("%d ", assignment[i]);
- }
- printf("\n");
- any_changed = true;
- } else {
- medoids[m] = old_medoid;
+ double total_d = 0.0;
+ for (unsigned i = 0; i < 64; ++i) {
+ int best_assignment = -1;
+ double best_distance = HUGE_VAL;
+ for (unsigned j = 0; j < NUM_CLUSTERS; ++j) {
+ double d = 0.0;
+ for (unsigned k = 0; k < NUM_SYMS; ++k) {
+ double p1 = (clusters[j].freqs[k] + 0.5) * inv_sum_clusters[j];
+ double p2 = (stats[i + base].freqs[k] + 0.5) * inv_sum_coeffs[i];
+
+ // K-L divergence is asymmetric; this is a hack.
+ d += p1 * log(p1 / p2);
+ d += p2 * log(p2 / p1);
+ }
+ if (d < best_distance) {
+ best_assignment = j;
+ best_distance = d;