From: Steinar H. Gunderson <sgunderson@bigfoot.com>
Date: Thu, 11 Dec 2014 19:51:38 +0000 (+0100)
Subject: Change to even shorter prefix length; down from 3.8 to 3.1 GB (single partition is 3.0 GB).
X-Git-Url: https://git.sesse.net/?p=remoteglot-book;a=commitdiff_plain;h=dd46a39c59caf1ed3c3c79073a4acbd73c1496d0

Change to even shorter prefix length; down from 3.8 to 3.1 GB (single partition is 3.0 GB).
---

diff --git a/hash.cpp b/hash.cpp
index af56263..cc15148 100644
--- a/hash.cpp
+++ b/hash.cpp
@@ -6,9 +6,6 @@ using namespace std;
 
 int hash_key_to_bucket(const char* s, size_t len, int num_buckets)
 {
-	// We hash only the first 10 bytes; it should be enough to get a
-	// reasonable spread, but also mostly miss the move, so that
-	// same position + different move usually land in the same bucket.
-	len = min<size_t>(len, 10);
+	len = min<size_t>(len, HASH_PREFIX_BYTES);
 	return util::Fingerprint32(s, len) % num_buckets;
 }
diff --git a/hash.h b/hash.h
index 9b9550a..39f01e0 100644
--- a/hash.h
+++ b/hash.h
@@ -1,6 +1,12 @@
 #ifndef _HASH_H
 #define _HASH_H 1
 
+// Hashing more or fewer bytes is a tradeoff between more even partitions
+// and total size (key/prefix compression seemingly works better with a
+// shorter hashed prefix). This value seems to be very close to optimal with
+// respect to size, and has partition imbalances smaller than 2:1.
+#define HASH_PREFIX_BYTES 4
+
 int hash_key_to_bucket(const char* s, size_t len, int num_buckets);
 
 #endif  // !defined(_HASH_H)