]> git.sesse.net Git - narabu/blobdiff - qdc.cpp
Revert "k-means instead of k-medoids; doesn't work as well, so just keep it here...
[narabu] / qdc.cpp
diff --git a/qdc.cpp b/qdc.cpp
index c4e1ba4e73f2615addb13f6ca09551e8bb9970e4..b5594b76856918d9b14bee3f55dd5bcec24fdd7a 100644 (file)
--- a/qdc.cpp
+++ b/qdc.cpp
@@ -374,75 +374,80 @@ double find_best_assignment(const int *medoids, int *assignment)
        return current_score;
 }
 
-double find_inv_sum(const SymbolStats &stats)
-{
-       double s = 0.0;
-       for (unsigned j = 0; j < NUM_SYMS; ++j) {
-               s += stats.freqs[j] + 0.5;
-       }
-       return 1.0 / s;
-}
-
 void find_optimal_stream_assignment(int base)
 {
-       // k-means init; make random assignments
-       std::random_device rd;
-       std::mt19937 g(rd());
-       std::uniform_int_distribution<> u(0, NUM_CLUSTERS - 1);
-       int assignment[64];
-       for (unsigned i = 0; i < 64; ++i) {
-               assignment[i] = u(g);
-       }       
-       double inv_sum_coeffs[64];
+       double inv_sum[64];
        for (unsigned i = 0; i < 64; ++i) {
-               inv_sum_coeffs[i] = find_inv_sum(stats[i + base]);
+               double s = 0.0;
+               for (unsigned k = 0; k < 256; ++k) {
+                       s += stats[i + base].freqs[k] + 0.5;
+               }
+               inv_sum[i] = 1.0 / s;
        }
 
-       for (unsigned iter = 0; iter < 1000; ++iter) {
-               // make new clusters based on the current assignments
-               SymbolStats clusters[NUM_CLUSTERS];
-               for (unsigned i = 0; i < NUM_CLUSTERS; ++i) {
-                       clusters[i].clear();
-               }
-               for (unsigned i = 0; i < 64; ++i) {
-                       for (unsigned j = 0; j < NUM_SYMS; ++j) {
-                               clusters[assignment[i]].freqs[j] += stats[i + base].freqs[j];
+       for (unsigned i = 0; i < 64; ++i) {
+               for (unsigned j = 0; j < 64; ++j) {
+                       double d = 0.0;
+                       for (unsigned k = 0; k < 256; ++k) {
+                               double p1 = (stats[i + base].freqs[k] + 0.5) * inv_sum[i];
+                               double p2 = (stats[j + base].freqs[k] + 0.5) * inv_sum[j];
+
+                               // K-L divergence is asymmetric; this is a hack.
+                               d += p1 * log(p1 / p2);
+                               d += p2 * log(p2 / p1);
                        }
+                       kl_dist[i][j] = d;
+                       //printf("%.3f ", d);
                }
+               //printf("\n");
+       }
 
-               double inv_sum_clusters[NUM_CLUSTERS];
-               for (unsigned i = 0; i < NUM_CLUSTERS; ++i) {
-                       inv_sum_clusters[i] = find_inv_sum(clusters[i]);
-               }
-               
-               // find new assignments based on distance to the clusters
+       // k-medoids init
+       int medoids[64];  // only the first NUM_CLUSTERS matter
+       bool is_medoid[64] = { false };
+       std::iota(medoids, medoids + 64, 0);
+       std::random_device rd;
+       std::mt19937 g(rd());
+       std::shuffle(medoids, medoids + 64, g);
+       for (int i = 0; i < NUM_CLUSTERS; ++i) {
+               printf("%d ", medoids[i]);
+               is_medoid[medoids[i]] = true;
+       }
+       printf("\n");
+
+       // assign each data point to the closest medoid
+       int assignment[64];
+       double current_score = find_best_assignment(medoids, assignment);
+
+       for (int i = 0; i < 1000; ++i) {
+               printf("iter %d\n", i);
                bool any_changed = false;
-               double total_d = 0.0;
-               for (unsigned i = 0; i < 64; ++i) {
-                       int best_assignment = -1;
-                       double best_distance = HUGE_VAL;
-                       for (unsigned j = 0; j < NUM_CLUSTERS; ++j) {
-                               double d = 0.0;
-                               for (unsigned k = 0; k < NUM_SYMS; ++k) {
-                                       double p1 = (clusters[j].freqs[k] + 0.5) * inv_sum_clusters[j];
-                                       double p2 = (stats[i + base].freqs[k] + 0.5) * inv_sum_coeffs[i];
-
-                                       // K-L divergence is asymmetric; this is a hack.
-                                       d += p1 * log(p1 / p2);
-                                       d += p2 * log(p2 / p1);
-                               }
-                               if (d < best_distance) {
-                                       best_assignment = j;
-                                       best_distance = d;
+               for (int m = 0; m < NUM_CLUSTERS; ++m) {
+                       for (int o = 0; o < 64; ++o) {
+                               if (is_medoid[o]) continue;
+                               int old_medoid = medoids[m];
+                               medoids[m] = o;
+
+                               int new_assignment[64];
+                               double candidate_score = find_best_assignment(medoids, new_assignment);
+
+                               if (candidate_score < current_score) {
+                                       current_score = candidate_score;
+                                       memcpy(assignment, new_assignment, sizeof(assignment));
+
+                                       is_medoid[old_medoid] = false;
+                                       is_medoid[medoids[m]] = true;
+                                       printf("%f: ", current_score);
+                                       for (int i = 0; i < 64; ++i) {
+                                               printf("%d ", assignment[i]);
+                                       }
+                                       printf("\n");
+                                       any_changed = true;
+                               } else {
+                                       medoids[m] = old_medoid;
                                }
                        }
-                       if (assignment[i] != best_assignment) {
-                               any_changed = true;
-                       }
-                       assignment[i] = best_assignment;
-                       total_d += best_distance;
                }
-               printf("iter %u: %.3f\n", iter, total_d);
                if (!any_changed) break;
        }
        printf("\n");