/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>

#define HASH_DEFAULT_SIZE	64UL
#define HASH_MIN_SIZE		4U
#define BUCKET_LOCKS_PER_CPU	32UL

static u32 head_hashfn(struct rhashtable *ht,
		       const struct bucket_table *tbl,
		       const struct rhash_head *he)
{
	return rht_head_hashfn(ht, tbl, he, ht->p);
}

static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl,
			      gfp_t gfp)
{
	unsigned int i, size;
	unsigned int nr_pcpus = num_possible_cpus();

	nr_pcpus = min_t(unsigned int, nr_pcpus, 64UL);
	size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul);

	/* Never allocate more than 0.5 locks per bucket */
	size = min_t(unsigned int, size, tbl->size >> 1);

	if (sizeof(spinlock_t) != 0) {
		if (gfp != GFP_KERNEL)
			gfp |= __GFP_NOWARN | __GFP_NORETRY;

		tbl->locks = kmalloc_array(size, sizeof(spinlock_t), gfp);
		if (!tbl->locks)
			return -ENOMEM;
		for (i = 0; i < size; i++)
			spin_lock_init(&tbl->locks[i]);
	}
	tbl->locks_mask = size - 1;

	return 0;
}

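/*
 * Sizing sketch (illustrative numbers, not from the original code): with
 * 8 possible CPUs and the default locks_mul of BUCKET_LOCKS_PER_CPU (32),
 * the request is roundup_pow_of_two(8 * 32) = 256 locks; a 64-bucket
 * table then caps this at 64 >> 1 = 32 locks, i.e. one lock per two
 * buckets, and locks_mask becomes 31.
 */
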
static void bucket_table_free(struct bucket_table *tbl)
{
	if (tbl)
		kvfree(tbl->locks);

	kvfree(tbl);
}

static void bucket_table_free_rcu(struct rcu_head *head)
{
	bucket_table_free(container_of(head, struct bucket_table, rcu));
}

static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
					       size_t nbuckets, gfp_t gfp)
{
	struct bucket_table *tbl = NULL;
	size_t size;
	int i;

	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
	    gfp != GFP_KERNEL)
		tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
	if (tbl == NULL && gfp == GFP_KERNEL)
		tbl = vzalloc(size);
	if (tbl == NULL)
		return NULL;

	tbl->size = nbuckets;

	if (alloc_bucket_locks(ht, tbl, gfp) < 0) {
		bucket_table_free(tbl);
		return NULL;
	}

	INIT_LIST_HEAD(&tbl->walkers);

	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));

	for (i = 0; i < nbuckets; i++)
		INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i);

	return tbl;
}

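/*
 * Added note: INIT_RHT_NULLS_HEAD() terminates each chain with a "nulls"
 * marker (an odd pointer value encoding the bucket) rather than NULL;
 * rht_is_a_nulls() keys off the low bit. The encoded value is there so
 * that lookups which must tolerate objects moving between chains (in the
 * style of SLAB_DESTROY_BY_RCU users) can detect that a walk strayed off
 * its original bucket.
 */
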
static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
						  struct bucket_table *tbl)
{
	struct bucket_table *new_tbl;

	do {
		new_tbl = tbl;
		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	} while (tbl);

	return new_tbl;
}

static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	struct bucket_table *new_tbl = rhashtable_last_table(ht,
		rht_dereference_rcu(old_tbl->future_tbl, ht));
	struct rhash_head __rcu **pprev = &old_tbl->buckets[old_hash];
	int err = -ENOENT;
	struct rhash_head *head, *next, *entry;
	spinlock_t *new_bucket_lock;
	unsigned int new_hash;

	/* Walk to the last entry of the old chain; entries are moved
	 * one at a time, starting from the tail.
	 */
	rht_for_each(entry, old_tbl, old_hash) {
		err = 0;
		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);

		if (rht_is_a_nulls(next))
			break;

		pprev = &entry->next;
	}

	if (err)
		goto out;

	new_hash = head_hashfn(ht, new_tbl, entry);

	new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);

	spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
	head = rht_dereference_bucket(new_tbl->buckets[new_hash],
				      new_tbl, new_hash);

	RCU_INIT_POINTER(entry->next, head);

	rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
	spin_unlock(new_bucket_lock);

	rcu_assign_pointer(*pprev, next);

out:
	return err;
}

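/*
 * Added note on ordering: the entry is published to the new bucket with
 * rcu_assign_pointer() *before* it is unlinked from the old chain, so a
 * concurrent RCU reader always finds it in at least one of the two
 * tables; seeing it briefly in both is harmless for lookups.
 */
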
static void rhashtable_rehash_chain(struct rhashtable *ht,
				    unsigned int old_hash)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	spinlock_t *old_bucket_lock;

	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);

	spin_lock_bh(old_bucket_lock);
	while (!rhashtable_rehash_one(ht, old_hash))
		;

	spin_unlock_bh(old_bucket_lock);
}

static int rhashtable_rehash_attach(struct rhashtable *ht,
				    struct bucket_table *old_tbl,
				    struct bucket_table *new_tbl)
{
	/* Protect future_tbl using the first bucket lock. */
	spin_lock_bh(old_tbl->locks);

	/* Did somebody beat us to it? */
	if (rcu_access_pointer(old_tbl->future_tbl)) {
		spin_unlock_bh(old_tbl->locks);
		return -EEXIST;
	}

	/* Make insertions go into the new, empty table right away. Deletions
	 * and lookups will be attempted in both tables until we synchronize.
	 */
	rcu_assign_pointer(old_tbl->future_tbl, new_tbl);

	spin_unlock_bh(old_tbl->locks);

	return 0;
}

static int rhashtable_rehash_table(struct rhashtable *ht)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	struct bucket_table *new_tbl;
	struct rhashtable_walker *walker;
	unsigned int old_hash;

	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
	if (!new_tbl)
		return 0;

	for (old_hash = 0; old_hash < old_tbl->size; old_hash++)
		rhashtable_rehash_chain(ht, old_hash);

	/* Publish the new table pointer. */
	rcu_assign_pointer(ht->tbl, new_tbl);

	spin_lock(&ht->lock);
	list_for_each_entry(walker, &old_tbl->walkers, list)
		walker->tbl = NULL;
	spin_unlock(&ht->lock);

	/* Wait for readers. All new readers will see the new
	 * table, and thus no references to the old table will
	 * remain.
	 */
	call_rcu(&old_tbl->rcu, bucket_table_free_rcu);

	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}

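/*
 * Added note: the -EAGAIN above signals that yet another future_tbl was
 * attached while this rehash ran (e.g. by rhashtable_insert_rehash());
 * rht_deferred_worker() reacts by rescheduling itself, so chained
 * rehashes complete one table at a time.
 */
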
static int rhashtable_expand(struct rhashtable *ht)
{
	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
	int err;

	old_tbl = rhashtable_last_table(ht, old_tbl);

	new_tbl = bucket_table_alloc(ht, old_tbl->size * 2, GFP_KERNEL);
	if (new_tbl == NULL)
		return -ENOMEM;

	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
	if (err)
		bucket_table_free(new_tbl);

	return err;
}

static int rhashtable_shrink(struct rhashtable *ht)
{
	struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht);
	unsigned int nelems = atomic_read(&ht->nelems);
	unsigned int size = 0;
	int err;

	if (nelems)
		size = roundup_pow_of_two(nelems * 3 / 2);
	if (size < ht->p.min_size)
		size = ht->p.min_size;

	if (old_tbl->size <= size)
		return 0;

	if (rht_dereference(old_tbl->future_tbl, ht))
		return -EEXIST;

	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
	if (new_tbl == NULL)
		return -ENOMEM;

	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
	if (err)
		bucket_table_free(new_tbl);

	return err;
}

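/*
 * Illustrative sizing (example numbers, not from the original code): with
 * nelems = 20, the target is roundup_pow_of_two(20 * 3 / 2) =
 * roundup_pow_of_two(30) = 32 buckets, so a freshly shrunk table is at
 * most about two-thirds full.
 */
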
static void rht_deferred_worker(struct work_struct *work)
{
	struct rhashtable *ht;
	struct bucket_table *tbl;
	int err = 0;

	ht = container_of(work, struct rhashtable, run_work);
	mutex_lock(&ht->mutex);

	tbl = rht_dereference(ht->tbl, ht);
	tbl = rhashtable_last_table(ht, tbl);

	if (rht_grow_above_75(ht, tbl))
		rhashtable_expand(ht);
	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
		rhashtable_shrink(ht);

	err = rhashtable_rehash_table(ht);

	mutex_unlock(&ht->mutex);

	if (err)
		schedule_work(&ht->run_work);
}

static bool rhashtable_check_elasticity(struct rhashtable *ht,
					struct bucket_table *tbl,
					unsigned int hash)
{
	unsigned int elasticity = ht->elasticity;
	struct rhash_head *head;

	rht_for_each(head, tbl, hash)
		if (!--elasticity)
			return true;

	return false;
}

int rhashtable_insert_rehash(struct rhashtable *ht,
			     struct bucket_table *tbl)
{
	struct bucket_table *old_tbl;
	struct bucket_table *new_tbl;
	unsigned int size;
	int err;

	old_tbl = rht_dereference_rcu(ht->tbl, ht);

	size = tbl->size;

	err = -EBUSY;

	if (rht_grow_above_75(ht, tbl))
		size *= 2;
	/* Do not schedule more than one rehash */
	else if (old_tbl != tbl)
		goto fail;

	err = -ENOMEM;

	new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC);
	if (new_tbl == NULL)
		goto fail;

	err = rhashtable_rehash_attach(ht, tbl, new_tbl);
	if (err) {
		bucket_table_free(new_tbl);
		if (err == -EEXIST)
			err = 0;
	} else
		schedule_work(&ht->run_work);

	return err;

fail:
	/* Do not fail the insert if someone else did a rehash. */
	if (likely(rcu_dereference_raw(tbl->future_tbl)))
		return 0;

	/* Schedule async rehash to retry allocation in process context. */
	if (err == -ENOMEM)
		schedule_work(&ht->run_work);

	return err;
}

struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht,
					    const void *key,
					    struct rhash_head *obj,
					    struct bucket_table *tbl)
{
	struct rhash_head *head;
	unsigned int hash;
	int err;

	tbl = rhashtable_last_table(ht, tbl);
	hash = head_hashfn(ht, tbl, obj);
	spin_lock_nested(rht_bucket_lock(tbl, hash), SINGLE_DEPTH_NESTING);

	err = -EEXIST;
	if (key && rhashtable_lookup_fast(ht, key, ht->p))
		goto exit;

	err = -E2BIG;
	if (unlikely(rht_grow_above_max(ht, tbl)))
		goto exit;

	err = -EAGAIN;
	if (rhashtable_check_elasticity(ht, tbl, hash) ||
	    rht_grow_above_100(ht, tbl))
		goto exit;

	err = 0;

	head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);

	RCU_INIT_POINTER(obj->next, head);

	rcu_assign_pointer(tbl->buckets[hash], obj);

	atomic_inc(&ht->nelems);

exit:
	spin_unlock(rht_bucket_lock(tbl, hash));

	if (err == 0)
		return NULL;
	else if (err == -EAGAIN)
		return tbl;
	else
		return ERR_PTR(err);
}

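/*
 * Added note on the return contract: NULL means the object was inserted;
 * a plain table pointer (the -EAGAIN case) tells the fast path to kick a
 * rehash of that table and retry; ERR_PTR() carries hard failures such
 * as -EEXIST or -E2BIG.
 */
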
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
	return max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
		   (unsigned long)params->min_size);
}

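/*
 * Example (illustrative numbers): nelem_hint = 100 yields
 * max(roundup_pow_of_two(100 * 4 / 3), min_size) =
 * max(roundup_pow_of_two(133), min_size) = 256 buckets for the default
 * min_size, i.e. roughly 40% occupancy at the hinted element count.
 */
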
/* Wrapper giving jhash2() the signature expected for ht->p.hashfn; note
 * that "length" is in units of u32 words here, which is why
 * rhashtable_init() below divides key_len by sizeof(u32) when selecting
 * this function.
 */
static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
{
	return jhash2(key, length, seed);
}

int rhashtable_init(struct rhashtable *ht,
		    const struct rhashtable_params *params)
{
	struct bucket_table *tbl;
	size_t size;

	size = HASH_DEFAULT_SIZE;

	if ((!params->key_len && !params->obj_hashfn) ||
	    (params->obj_hashfn && !params->obj_cmpfn))
		return -EINVAL;

	if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT))
		return -EINVAL;

	memset(ht, 0, sizeof(*ht));
	mutex_init(&ht->mutex);
	spin_lock_init(&ht->lock);
	memcpy(&ht->p, params, sizeof(*params));

	if (params->min_size)
		ht->p.min_size = roundup_pow_of_two(params->min_size);

	if (params->max_size)
		ht->p.max_size = rounddown_pow_of_two(params->max_size);

	if (params->insecure_max_entries)
		ht->p.insecure_max_entries =
			rounddown_pow_of_two(params->insecure_max_entries);
	else
		ht->p.insecure_max_entries = ht->p.max_size * 2;

	ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);

	if (params->nelem_hint)
		size = rounded_hashtable_size(&ht->p);

	/* The maximum (not average) chain length grows with the
	 * size of the hash table, at a rate of (log N)/(log log N).
	 * The value of 16 is selected so that even if the hash
	 * table grew to 2^32 you would not expect the maximum
	 * chain length to exceed it unless we are under attack
	 * (or extremely unlucky).
	 *
	 * As this limit is only to detect attacks, we don't need
	 * to set it to a lower value as you'd need the chain
	 * length to vastly exceed 16 to have any real effect
	 * on the system.
	 */
	if (!params->insecure_elasticity)
		ht->elasticity = 16;

	if (params->locks_mul)
		ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
	else
		ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;

	ht->key_len = ht->p.key_len;
	if (!params->hashfn) {
		ht->p.hashfn = jhash;

		if (!(ht->key_len & (sizeof(u32) - 1))) {
			ht->key_len /= sizeof(u32);
			ht->p.hashfn = rhashtable_jhash2;
		}
	}

	tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
	if (tbl == NULL)
		return -ENOMEM;

	atomic_set(&ht->nelems, 0);

	RCU_INIT_POINTER(ht->tbl, tbl);

	INIT_WORK(&ht->run_work, rht_deferred_worker);

	return 0;
}

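/*
 * Usage sketch (illustrative only; the struct, params and variable names
 * below are hypothetical, not part of this file):
 *
 *	struct test_obj {
 *		int			key;
 *		struct rhash_head	node;
 *	};
 *
 *	static const struct rhashtable_params test_params = {
 *		.head_offset = offsetof(struct test_obj, node),
 *		.key_offset  = offsetof(struct test_obj, key),
 *		.key_len     = sizeof(int),
 *	};
 *
 *	err = rhashtable_init(&ht, &test_params);
 *
 * Because no .hashfn is given, jhash is selected above, and since
 * sizeof(int) is a multiple of sizeof(u32), key_len is rescaled to u32
 * words and rhashtable_jhash2 is used instead.
 */
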
void rhashtable_destroy(struct rhashtable *ht)
{
	struct bucket_table *tbl;

	cancel_work_sync(&ht->run_work);

	mutex_lock(&ht->mutex);
	tbl = rht_dereference(ht->tbl, ht);
	bucket_table_free(tbl);
	mutex_unlock(&ht->mutex);
}
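
/*
 * Added note: rhashtable_destroy() frees only the bucket table and its
 * locks. Callers must first quiesce all concurrent access (readers,
 * inserts, walkers) and remain responsible for freeing their own
 * objects, e.g. by walking the table before calling this function.
 */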