From dd46a39c59caf1ed3c3c79073a4acbd73c1496d0 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Thu, 11 Dec 2014 20:51:38 +0100
Subject: [PATCH] Change to even shorter prefix length; down from 3.8 to 3.1 GB (single partition is 3.0 GB).

---
 hash.cpp | 5 +----
 hash.h   | 6 ++++++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/hash.cpp b/hash.cpp
index af56263..cc15148 100644
--- a/hash.cpp
+++ b/hash.cpp
@@ -6,9 +6,6 @@ using namespace std;
 
 int hash_key_to_bucket(const char* s, size_t len, int num_buckets)
 {
-	// We hash only the first 10 bytes; it should be enough to get a
-	// reasonable spread, but also mostly miss the move, so that
-	// same position + different move usually land in the same bucket.
-	len = min<size_t>(len, 10);
+	len = min<size_t>(len, HASH_PREFIX_BYTES);
 	return util::Fingerprint32(s, len) % num_buckets;
 }
diff --git a/hash.h b/hash.h
index 9b9550a..39f01e0 100644
--- a/hash.h
+++ b/hash.h
@@ -1,6 +1,12 @@
 #ifndef _HASH_H
 #define _HASH_H 1
 
+// Hashing more or fewer bytes is a tradeoff between more even partitions
+// and total size (since seemingly key/prefix compression works better with
+// smaller values). This value seems to be very close to optimal wrt. size,
+// and has imbalances smaller than 2:1.
+#define HASH_PREFIX_BYTES 4
+
 int hash_key_to_bucket(const char* s, size_t len, int num_buckets);
 
 #endif  // !defined(_HASH_H)
-- 
2.39.2