]> git.sesse.net Git - narabu/commitdiff
k-means instead of k-medoids; doesn't work as well, so just keep it here to be immedi...
authorSteinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 17 Sep 2017 09:41:36 +0000 (11:41 +0200)
committerSteinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 17 Sep 2017 09:41:36 +0000 (11:41 +0200)
qdc.cpp

diff --git a/qdc.cpp b/qdc.cpp
index b5594b76856918d9b14bee3f55dd5bcec24fdd7a..c4e1ba4e73f2615addb13f6ca09551e8bb9970e4 100644 (file)
--- a/qdc.cpp
+++ b/qdc.cpp
@@ -374,80 +374,75 @@ double find_best_assignment(const int *medoids, int *assignment)
        return current_score;
 }
 
-void find_optimal_stream_assignment(int base)
+double find_inv_sum(const SymbolStats &stats)
 {
-       double inv_sum[64];
-       for (unsigned i = 0; i < 64; ++i) {
-               double s = 0.0;
-               for (unsigned k = 0; k < 256; ++k) {
-                       s += stats[i + base].freqs[k] + 0.5;
-               }
-               inv_sum[i] = 1.0 / s;
-       }
-
-       for (unsigned i = 0; i < 64; ++i) {
-               for (unsigned j = 0; j < 64; ++j) {
-                       double d = 0.0;
-                       for (unsigned k = 0; k < 256; ++k) {
-                               double p1 = (stats[i + base].freqs[k] + 0.5) * inv_sum[i];
-                               double p2 = (stats[j + base].freqs[k] + 0.5) * inv_sum[j];
-
-                               // K-L divergence is asymmetric; this is a hack.
-                               d += p1 * log(p1 / p2);
-                               d += p2 * log(p2 / p1);
-                       }
-                       kl_dist[i][j] = d;
-                       //printf("%.3f ", d);
-               }
-               //printf("\n");
+       double s = 0.0;
+       for (unsigned j = 0; j < NUM_SYMS; ++j) {
+               s += stats.freqs[j] + 0.5;
        }
+       return 1.0 / s;
+}
 
-       // k-medoids init
-       int medoids[64];  // only the first NUM_CLUSTERS matter
-       bool is_medoid[64] = { false };
-       std::iota(medoids, medoids + 64, 0);
+void find_optimal_stream_assignment(int base)
+{
+       // k-means init; make random assignments
        std::random_device rd;
        std::mt19937 g(rd());
-       std::shuffle(medoids, medoids + 64, g);
-       for (int i = 0; i < NUM_CLUSTERS; ++i) {
-               printf("%d ", medoids[i]);
-               is_medoid[medoids[i]] = true;
+       std::uniform_int_distribution<> u(0, NUM_CLUSTERS - 1);
+       int assignment[64];
+       for (unsigned i = 0; i < 64; ++i) {
+               assignment[i] = u(g);
+       }       
+       double inv_sum_coeffs[64];
+       for (unsigned i = 0; i < 64; ++i) {
+               inv_sum_coeffs[i] = find_inv_sum(stats[i + base]);
        }
-       printf("\n");
 
-       // assign each data point to the closest medoid
-       int assignment[64];
-       double current_score = find_best_assignment(medoids, assignment);
+       for (unsigned iter = 0; iter < 1000; ++iter) {
+               // make new clusters based on the current assignments
+               SymbolStats clusters[NUM_CLUSTERS];
+               for (unsigned i = 0; i < NUM_CLUSTERS; ++i) {
+                       clusters[i].clear();
+               }
+               for (unsigned i = 0; i < 64; ++i) {
+                       for (unsigned j = 0; j < NUM_SYMS; ++j) {
+                               clusters[assignment[i]].freqs[j] += stats[i + base].freqs[j];
+                       }
+               }
 
-       for (int i = 0; i < 1000; ++i) {
-               printf("iter %d\n", i);
+               double inv_sum_clusters[NUM_CLUSTERS];
+               for (unsigned i = 0; i < NUM_CLUSTERS; ++i) {
+                       inv_sum_clusters[i] = find_inv_sum(clusters[i]);
+               }
+               
+               // find new assignments based on distance to the clusters
                bool any_changed = false;
-               for (int m = 0; m < NUM_CLUSTERS; ++m) {
-                       for (int o = 0; o < 64; ++o) {
-                               if (is_medoid[o]) continue;
-                               int old_medoid = medoids[m];
-                               medoids[m] = o;
-
-                               int new_assignment[64];
-                               double candidate_score = find_best_assignment(medoids, new_assignment);
-
-                               if (candidate_score < current_score) {
-                                       current_score = candidate_score;
-                                       memcpy(assignment, new_assignment, sizeof(assignment));
-
-                                       is_medoid[old_medoid] = false;
-                                       is_medoid[medoids[m]] = true;
-                                       printf("%f: ", current_score);
-                                       for (int i = 0; i < 64; ++i) {
-                                               printf("%d ", assignment[i]);
-                                       }
-                                       printf("\n");
-                                       any_changed = true;
-                               } else {
-                                       medoids[m] = old_medoid;
+               double total_d = 0.0;
+               for (unsigned i = 0; i < 64; ++i) {
+                       int best_assignment = -1;
+                       double best_distance = HUGE_VAL;
+                       for (unsigned j = 0; j < NUM_CLUSTERS; ++j) {
+                               double d = 0.0;
+                               for (unsigned k = 0; k < NUM_SYMS; ++k) {
+                                       double p1 = (clusters[j].freqs[k] + 0.5) * inv_sum_clusters[j];
+                                       double p2 = (stats[i + base].freqs[k] + 0.5) * inv_sum_coeffs[i];
+
+                                       // K-L divergence is asymmetric; this is a hack.
+                                       d += p1 * log(p1 / p2);
+                                       d += p2 * log(p2 / p1);
+                               }
+                               if (d < best_distance) {
+                                       best_assignment = j;
+                                       best_distance = d;
                                }
                        }
+                       if (assignment[i] != best_assignment) {
+                               any_changed = true;
+                       }
+                       assignment[i] = best_assignment;
+                       total_d += best_distance;
                }
+               printf("iter %u: %.3f\n", iter, total_d);
                if (!any_changed) break;
        }
        printf("\n");