/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>

#define HASH_DEFAULT_SIZE	64UL
#define HASH_MIN_SIZE		4U
#define BUCKET_LOCKS_PER_CPU	32UL

static u32 head_hashfn(struct rhashtable *ht,
		       const struct bucket_table *tbl,
		       const struct rhash_head *he)
{
	return rht_head_hashfn(ht, tbl, he, ht->p);
}

static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
			      gfp_t gfp)
{
	unsigned int i, size;
	unsigned int nr_pcpus = num_possible_cpus();

	nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);

	/* Never allocate more than 0.5 locks per bucket */
	size = min_t(unsigned int, size, tbl->size >> 1);

	if (sizeof(spinlock_t) != 0) {
		if (gfp != GFP_KERNEL)
			gfp |= __GFP_NOWARN | __GFP_NORETRY;

		tbl->locks = kmalloc_array(size, sizeof(spinlock_t), gfp);
		if (!tbl->locks)
			return -ENOMEM;
		for (i = 0; i < size; i++)
			spin_lock_init(&tbl->locks[i]);
	}
	tbl->locks_mask = size - 1;

	return 0;
}

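/*
 * Sizing sketch (illustrative numbers, not from the original code): with
 * 8 possible CPUs and the default locks_mul of BUCKET_LOCKS_PER_CPU (32),
 * the request is roundup_pow_of_two(8 * 32) = 256 locks; a 64-bucket
 * table then caps this at 64 >> 1 = 32 locks, i.e. one lock per two
 * buckets, and locks_mask becomes 31.
 */
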
static void bucket_table_free(struct bucket_table *tbl)
{
	if (tbl)
		kvfree(tbl->locks);

	kvfree(tbl);
}

static void bucket_table_free_rcu(struct rcu_head *head)
{
	bucket_table_free(container_of(head, struct bucket_table, rcu));
}

static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
					       size_t nbuckets, gfp_t gfp)
{
	struct bucket_table *tbl = NULL;
	size_t size;
	int i;

	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
	    gfp != GFP_KERNEL)
		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
	if (tbl == NULL && gfp == GFP_KERNEL)
		tbl = vzalloc(size);
	if (tbl == NULL)
		return NULL;

	tbl->size = nbuckets;

	if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
		bucket_table_free(tbl);
		return NULL;
	}

	INIT_LIST_HEAD(&tbl->walkers);

	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));

	for (i = 0; i < nbuckets; i++)
		INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);

	return tbl;
}

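/*
 * Added note: INIT_RHT_NULLS_HEAD() terminates each chain with a "nulls"
 * marker (an odd pointer value encoding the bucket) rather than NULL;
 * rht_is_a_nulls() keys off the low bit. The encoded value is there so
 * that lookups which must tolerate objects moving between chains (in the
 * style of SLAB_DESTROY_BY_RCU users) can detect that a walk strayed off
 * its original bucket.
 */
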
static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
						  struct bucket_table *tbl)
{
	struct bucket_table *new_tbl;

	do {
		new_tbl = tbl;
		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	} while (tbl);

	return new_tbl;
}

static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	struct bucket_table *new_tbl = rhashtable_last_table(ht,
		rht_dereference_rcu(old_tbl->future_tbl, ht));
	struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
	int err = -ENOENT;
	struct rhash_head *head, *next, *entry;
	spinlock_t *new_bucket_lock;
	unsigned int new_hash;

	/* Walk to the last entry of the old chain; entries are moved
	 * one at a time, starting from the tail.
	 */
	rht_for_each(entry, old_tbl, old_hash) {
		err = 0;
		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);

		if (rht_is_a_nulls(next))
			break;

		pprev = &entry->next;
	}

	if (err)
		goto out;

	new_hash = head_hashfn(ht, new_tbl, entry);

	new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);

	spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
	head = rht_dereference_bucket(new_tbl->buckets[new_hash],
				      new_tbl, new_hash);

	RCU_INIT_POINTER(entry->next, head);

	rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
	spin_unlock(new_bucket_lock);

	rcu_assign_pointer(*pprev, next);

out:
	return err;
}

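/*
 * Added note on ordering: the entry is published to the new bucket with
 * rcu_assign_pointer() *before* it is unlinked from the old chain, so a
 * concurrent RCU reader always finds it in at least one of the two
 * tables; seeing it briefly in both is harmless for lookups.
 */
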
static void rhashtable_rehash_chain(struct rhashtable *ht,
				    unsigned int old_hash)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	spinlock_t *old_bucket_lock;

	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);

	spin_lock_bh(old_bucket_lock);
	while (!rhashtable_rehash_one(ht, old_hash))
		;

	spin_unlock_bh(old_bucket_lock);
}

static int rhashtable_rehash_attach(struct rhashtable *ht,
				    struct bucket_table *old_tbl,
				    struct bucket_table *new_tbl)
{
	/* Protect future_tbl using the first bucket lock. */
	spin_lock_bh(old_tbl->locks);

	/* Did somebody beat us to it? */
	if (rcu_access_pointer(old_tbl->future_tbl)) {
		spin_unlock_bh(old_tbl->locks);
		return -EEXIST;
	}

	/* Make insertions go into the new, empty table right away. Deletions
	 * and lookups will be attempted in both tables until we synchronize.
	 */
	rcu_assign_pointer(old_tbl->future_tbl, new_tbl);

	spin_unlock_bh(old_tbl->locks);

	return 0;
}

static int rhashtable_rehash_table(struct rhashtable *ht)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	struct bucket_table *new_tbl;
	struct rhashtable_walker *walker;
	unsigned int old_hash;

	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
	if (!new_tbl)
		return 0;

	for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
		rhashtable_rehash_chain(ht, old_hash);

	/* Publish the new table pointer. */
	rcu_assign_pointer(ht->tbl, new_tbl);

	spin_lock(&ht->lock);
	list_for_each_entry(walker, &old_tbl->walkers, list)
		walker->tbl = NULL;
	spin_unlock(&ht->lock);

	/* Wait for readers. All new readers will see the new
	 * table, and thus no references to the old table will
	 * remain.
	 */
	call_rcu(&old_tbl->rcu, bucket_table_free_rcu);

	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}

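/*
 * Added note: the -EAGAIN above signals that yet another future_tbl was
 * attached while this rehash ran (e.g. by rhashtable_insert_rehash());
 * rht_deferred_worker() reacts by rescheduling itself, so chained
 * rehashes complete one table at a time.
 */
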
static int rhashtable_expand(struct rhashtable *ht)
{
	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
	int err;

	old_tbl = rhashtable_last_table(ht, old_tbl);

	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
	if (new_tbl == NULL)
		return -ENOMEM;

	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
	if (err)
		bucket_table_free(new_tbl);

	return err;
}

static int rhashtable_shrink(struct rhashtable *ht)
{
	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
	unsigned int nelems = atomic_read(&ht->nelems);
	unsigned int size = 0;
	int err;

	if (nelems)
		size = roundup_pow_of_two(nelems * 3 / 2);
	if (size < ht->p.min_size)
		size = ht->p.min_size;

	if (old_tbl->size <= size)
		return 0;

	if (rht_dereference(old_tbl->future_tbl, ht))
		return -EEXIST;

	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
	if (new_tbl == NULL)
		return -ENOMEM;

	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
	if (err)
		bucket_table_free(new_tbl);

	return err;
}

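/*
 * Illustrative sizing (example numbers, not from the original code): with
 * nelems = 20, the target is roundup_pow_of_two(20 * 3 / 2) =
 * roundup_pow_of_two(30) = 32 buckets, so a freshly shrunk table is at
 * most about two-thirds full.
 */
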
static void rht_deferred_worker(struct work_struct *work)
{
	struct rhashtable *ht;
	struct bucket_table *tbl;
	int err = 0;

	ht = container_of(work, struct rhashtable, run_work);
	mutex_lock(&ht->mutex);

	tbl = rht_dereference(ht->tbl, ht);
	tbl = rhashtable_last_table(ht, tbl);

	if (rht_grow_above_75(ht, tbl))
		rhashtable_expand(ht);
	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
		rhashtable_shrink(ht);

	err = rhashtable_rehash_table(ht);

	mutex_unlock(&ht->mutex);

	if (err)
		schedule_work(&ht->run_work);
}

static bool rhashtable_check_elasticity(struct rhashtable *ht,
					struct bucket_table *tbl,
					unsigned int hash)
{
	unsigned int elasticity = ht->elasticity;
	struct rhash_head *head;

	rht_for_each(head, tbl, hash)
		if (!--elasticity)
			return true;

	return false;
}

int rhashtable_insert_rehash(struct rhashtable *ht,
			     struct bucket_table *tbl)
{
	struct bucket_table *old_tbl;
	struct bucket_table *new_tbl;
	unsigned int size;
	int err;

	old_tbl = rht_dereference_rcu(ht->tbl, ht);

	size = tbl->size;

	err = -EBUSY;

	if (rht_grow_above_75(ht, tbl))
		size *= 2;
	/* Do not schedule more than one rehash */
	else if (old_tbl != tbl)
		goto fail;

	err = -ENOMEM;

	new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
	if (new_tbl == NULL)
		goto fail;

	err = rhashtable_rehash_attach(ht, tbl, new_tbl);
	if (err) {
		bucket_table_free(new_tbl);
		if (err == -EEXIST)
			err = 0;
	} else
		schedule_work(&ht->run_work);

	return err;

fail:
	/* Do not fail the insert if someone else did a rehash. */
	if (likely(rcu_dereference_raw(tbl->future_tbl)))
		return 0;

	/* Schedule async rehash to retry allocation in process context. */
	if (err == -ENOMEM)
		schedule_work(&ht->run_work);

	return err;
}

struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
					    const void *key,
					    struct rhash_head *obj,
					    struct bucket_table *tbl)
{
	struct rhash_head *head;
	unsigned int hash;
	int err;

	tbl = rhashtable_last_table(ht, tbl);
	hash = head_hashfn(ht, tbl, obj);
	spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);

	err = -EEXIST;
	if (key && rhashtable_lookup_fast(ht, key, ht->p))
		goto exit;

	err = -E2BIG;
	if (unlikely(rht_grow_above_max(ht, tbl)))
		goto exit;

	err = -EAGAIN;
	if (rhashtable_check_elasticity(ht, tbl, hash) ||
	    rht_grow_above_100(ht, tbl))
		goto exit;

	err = 0;

	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);

	RCU_INIT_POINTER(obj->next, head);

	rcu_assign_pointer(tbl->buckets[hash], obj);

	atomic_inc(&ht->nelems);

exit:
	spin_unlock(rht_bucket_lock(tbl, hash));

	if (err == 0)
		return NULL;
	else if (err == -EAGAIN)
		return tbl;
	else
		return ERR_PTR(err);
}

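/*
 * Added note on the return contract: NULL means the object was inserted;
 * a plain table pointer (the -EAGAIN case) tells the fast path to kick a
 * rehash of that table and retry; ERR_PTR() carries hard failures such
 * as -EEXIST or -E2BIG.
 */
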
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
	return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
		   (unsigned long)params->min_size);
}

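/*
 * Example (illustrative numbers): nelem_hint = 100 yields
 * max(roundup_pow_of_two(100 * 4 / 3), min_size) =
 * max(roundup_pow_of_two(133), min_size) = 256 buckets for the default
 * min_size, i.e. roughly 40% occupancy at the hinted element count.
 */
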
/* Wrapper giving jhash2() the signature expected for ht->p.hashfn; note
 * that "length" is in units of u32 words here, which is why
 * rhashtable_init() below divides key_len by sizeof(u32) when selecting
 * this function.
 */
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
{
	return jhash2(key, length, seed);
}

int rhashtable_init(struct rhashtable *ht,
		    const struct rhashtable_params *params)
{
	struct bucket_table *tbl;
	size_t size;

	size = HASH_DEFAULT_SIZE;

	if ((!params->key_len && !params->obj_hashfn) ||
	    (params->obj_hashfn && !params->obj_cmpfn))
		return -EINVAL;

	if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
		return -EINVAL;

	memset(ht, 0, sizeof(*ht));
	mutex_init(&ht->mutex);
	spin_lock_init(&ht->lock);
	memcpy(&ht->p, params, sizeof(*params));

	if (params->min_size)
		ht->p.min_size = roundup_pow_of_two(params->min_size);

	if (params->max_size)
		ht->p.max_size = rounddown_pow_of_two(params->max_size);

	if (params->insecure_max_entries)
		ht->p.insecure_max_entries =
			rounddown_pow_of_two(params->insecure_max_entries);
	else
		ht->p.insecure_max_entries = ht->p.max_size * 2;

	ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);

	if (params->nelem_hint)
		size = rounded_hashtable_size(&ht->p);

	/* The maximum (not average) chain length grows with the
	 * size of the hash table, at a rate of (log N)/(log log N).
	 * The value of 16 is selected so that even if the hash
	 * table grew to 2^32 you would not expect the maximum
	 * chain length to exceed it unless we are under attack
	 * (or extremely unlucky).
	 *
	 * As this limit is only to detect attacks, we don't need
	 * to set it to a lower value as you'd need the chain
	 * length to vastly exceed 16 to have any real effect
	 * on the system.
	 */
	if (!params->insecure_elasticity)
		ht->elasticity = 16;

	if (params->locks_mul)
		ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
	else
		ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;

	ht->key_len = ht->p.key_len;
	if (!params->hashfn) {
		ht->p.hashfn = jhash;

		if (!(ht->key_len & (sizeof(u32) - 1))) {
			ht->key_len /= sizeof(u32);
			ht->p.hashfn = rhashtable_jhash2;
		}
	}

	tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
	if (tbl == NULL)
		return -ENOMEM;

	atomic_set(&ht->nelems, 0);

	RCU_INIT_POINTER(ht->tbl, tbl);

	INIT_WORK(&ht->run_work, rht_deferred_worker);

	return 0;
}

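/*
 * Usage sketch (illustrative only; the struct, params and variable names
 * below are hypothetical, not part of this file):
 *
 *	struct test_obj {
 *		int			key;
 *		struct rhash_head	node;
 *	};
 *
 *	static const struct rhashtable_params test_params = {
 *		.head_offset = offsetof(struct test_obj, node),
 *		.key_offset  = offsetof(struct test_obj, key),
 *		.key_len     = sizeof(int),
 *	};
 *
 *	err = rhashtable_init(&ht, &test_params);
 *
 * Because no .hashfn is given, jhash is selected above, and since
 * sizeof(int) is a multiple of sizeof(u32), key_len is rescaled to u32
 * words and rhashtable_jhash2 is used instead.
 */
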
void rhashtable_destroy(struct rhashtable *ht)
{
	struct bucket_table *tbl;

	cancel_work_sync(&ht->run_work);

	mutex_lock(&ht->mutex);
	tbl = rht_dereference(ht->tbl, ht);
	bucket_table_free(tbl);
	mutex_unlock(&ht->mutex);
}
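
/*
 * Added note: rhashtable_destroy() frees only the bucket table and its
 * locks. Callers must first quiesce all concurrent access (readers,
 * inserts, walkers) and remain responsible for freeing their own
 * objects, e.g. by walking the table before calling this function.
 */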