Change to even shorter prefix length; down from 3.8 to 3.1 GB (single partition is...

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 11 Dec 2014 19:51:38 +0000 (20:51 +0100)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 11 Dec 2014 19:51:38 +0000 (20:51 +0100)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 11 Dec 2014 19:51:38 +0000 (20:51 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 11 Dec 2014 19:51:38 +0000 (20:51 +0100)
diff --git a/hash.cpp b/hash.cpp

index af56263ec5f71c71c161fd956572201eab6791e6..cc15148cdd0d1f40a33add1d8ad398d18f7d7966 100644 (file)
--- a/hash.cpp
+++ b/hash.cpp
@@ -6,9 +6,6 @@ using namespace std;
  
  int hash_key_to_bucket(const char* s, size_t len, int num_buckets)
  {
-       // We hash only the first 10 bytes; it should be enough to get a
-       // reasonable spread, but also mostly miss the move, so that
-       // same position + different move usually land in the same bucket.
-       len = min<size_t>(len, 10);
+       len = min<size_t>(len, HASH_PREFIX_BYTES);
         return util::Fingerprint32(s, len) % num_buckets;
  }
diff --git a/hash.h b/hash.h

index 9b9550afec5574228e9b4a5dd3b82013ddd2b72d..39f01e08109712450c6381bab7772b43f140b0a8 100644 (file)
--- a/hash.h
+++ b/hash.h
@@ -1,6 +1,12 @@
  #ifndef _HASH_H
  #define _HASH_H 1
  
+// Hashing more or fewer bytes is a tradeoff between more even partitions
+// and total size (since seemingly key/prefix compression works better with
+// smaller values). This value seems to be very close to optimal wrt. size,
+// and has imbalances smaller than 2:1.
+#define HASH_PREFIX_BYTES 4
+
  int hash_key_to_bucket(const char* s, size_t len, int num_buckets);
  
  #endif  // !defined(_HASH_H)
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 11 Dec 2014 19:51:38 +0000 (20:51 +0100)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 11 Dec 2014 19:51:38 +0000 (20:51 +0100)