// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"

#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>

static struct kmem_cache *bch2_key_cache;
static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return cmp_int(ck->key.btree_id, key->btree_id) ?:
		bpos_cmp(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
};
__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}
static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_write(&ck->c.lock);
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}
static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

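/*
 * A freed bkey_cached isn't returned to the allocator right away: it's parked
 * on bc->freed behind an SRCU barrier, and only freed by the shrinker once all
 * btree transactions that might still see it have finished.
 */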
static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_move_tail(&ck->list, &bc->freed);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

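/*
 * Fast-path free: stash the entry on this CPU's percpu freelist if there's
 * room, otherwise take bc->lock, spill half the percpu list to bc->freed and
 * put the entry there.
 */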
static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct btree_key_cache_freelist *f;
	bool freed = false;

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	preempt_disable();
	f = this_cpu_ptr(bc->pcpu_freed);

	if (f->nr < ARRAY_SIZE(f->objs)) {
		f->objs[f->nr++] = ck;
		freed = true;
	}
	preempt_enable();

	if (!freed) {
		mutex_lock(&bc->lock);
		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		while (f->nr > ARRAY_SIZE(f->objs) / 2) {
			struct bkey_cached *ck2 = f->objs[--f->nr];

			list_move_tail(&ck2->list, &bc->freed);
		}
		preempt_enable();

		list_move_tail(&ck->list, &bc->freed);
		mutex_unlock(&bc->lock);
	}

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

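/*
 * Allocate a bkey_cached: first try the percpu freelist, then refill it from
 * the shared freed list, and finally fall back to the slab. The entry is
 * returned with intent and write locks held.
 */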
static struct bkey_cached *
bkey_cached_alloc(struct btree_key_cache *c)
{
	struct bkey_cached *ck = NULL;
	struct btree_key_cache_freelist *f;

	preempt_disable();
	f = this_cpu_ptr(c->pcpu_freed);
	if (f->nr)
		ck = f->objs[--f->nr];
	preempt_enable();

	if (!ck) {
		mutex_lock(&c->lock);
		preempt_disable();
		f = this_cpu_ptr(c->pcpu_freed);

		while (!list_empty(&c->freed) &&
		       f->nr < ARRAY_SIZE(f->objs) / 2) {
			ck = list_last_entry(&c->freed, struct bkey_cached, list);
			list_del_init(&ck->list);
			f->objs[f->nr++] = ck;
		}

		ck = f->nr ? f->objs[--f->nr] : NULL;
		preempt_enable();
		mutex_unlock(&c->lock);
	}

	if (ck) {
		six_lock_intent(&ck->c.lock, NULL, NULL);
		six_lock_write(&ck->c.lock, NULL, NULL);
		return ck;
	}

	ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
	if (likely(ck)) {
		INIT_LIST_HEAD(&ck->list);
		six_lock_init(&ck->c.lock);
		BUG_ON(!six_trylock_intent(&ck->c.lock));
		BUG_ON(!six_trylock_write(&ck->c.lock));
		return ck;
	}

	return NULL;
}

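/*
 * Allocation failed - as a last resort, walk the hash table and steal a clean,
 * unlocked entry so the key cache can't grow without bound under memory
 * pressure.
 */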
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				rcu_read_unlock();
				return ck;
			}
		}
	rcu_read_unlock();

	return NULL;
}

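/*
 * Allocate (or steal) an entry for @btree_id/@pos and insert it into the hash
 * table. Returns NULL if we raced with another thread inserting the same key,
 * in which case the caller retries the lookup.
 */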
static struct bkey_cached *
btree_key_cache_create(struct bch_fs *c,
		       enum btree_id btree_id,
		       struct bpos pos)
{
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = true;

	ck = bkey_cached_alloc(bc);

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_ids[btree_id]);
			return ERR_PTR(-ENOMEM);
		}

		was_new = false;
	} else {
		if (btree_id == BTREE_ID_subvolumes)
			six_lock_pcpu_alloc(&ck->c.lock);
		else
			six_lock_pcpu_free(&ck->c.lock);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= btree_id;
	ck->key.btree_id	= btree_id;
	ck->key.pos		= pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
						   &ck->hash,
						   bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

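/*
 * Fill a key cache entry from the btree proper: read the key at ck->key.pos,
 * size the entry's buffer with some slack so the commit path rarely needs to
 * reallocate, and copy the key in under the node write lock.
 */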
static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_path *path;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	struct bkey u;
	int ret;

	path = bch2_path_get(trans, ck->key.btree_id,
			     ck->key.pos, 0, 0, 0, _THIS_IP_);
	ret = bch2_btree_path_traverse(trans, path, 0);
	if (ret)
		goto err;

	k = bch2_btree_path_peek_slot(path, &u);

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_trans_restart_relock_key_cache_fill(trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
		if (!new_k) {
			bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
				bch2_btree_ids[ck->key.btree_id], new_u64s);
			ret = -ENOMEM;
			goto err;
		}
	}

	/*
	 * XXX: not allowed to be holding read locks when we take a write lock,
	 * currently
	 */
	bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	path->preserve = false;
err:
	bch2_path_put(trans, path, 0);
	return ret;
}

static int bkey_cached_check_fn(struct six_lock *lock, void *p)
{
	struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
	const struct btree_path *path = p;

	if (ck->key.btree_id != path->btree_id ||
	    bpos_cmp(ck->key.pos, path->pos))
		return BCH_ERR_lock_fail_node_reused;
	return 0;
}

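/*
 * Traverse a path that points into the key cache: look up (or create) the
 * bkey_cached for path->pos, take the appropriate six lock, and fill the entry
 * from the btree if it isn't valid yet.
 */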
int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);
	path->l[1].b = NULL;

	if (bch2_btree_node_relock(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(c, path->btree_id, path->pos);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, path->pos, 0,
				      lock_want,
				      bkey_cached_check_fn, path, _THIS_IP_);
		if (ret) {
			if (bch2_err_matches(ret, BCH_ERR_lock_fail_node_reused))
				goto retry;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				goto err;
			BUG();
		}

		if (ck->key.btree_id != path->btree_id ||
		    bpos_cmp(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0, lock_want);
	}

	path->l[0].lock_seq	= ck->c.lock.state.seq;
	path->l[0].b		= (void *) ck;
fill:
	if (!ck->valid) {
		/*
		 * Using the underscore version because we haven't set
		 * path->uptodate yet:
		 */
		if (!path->locks_want &&
		    !__bch2_btree_path_upgrade(trans, path, 1)) {
			trace_transaction_restart_key_cache_upgrade(trans, _THIS_IP_);
			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
			goto err;
		}

		ret = btree_key_cache_fill(trans, path, ck);
		if (ret)
			goto err;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
err:
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

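/*
 * Write a dirty key cache entry back to the btree it belongs to, then
 * optionally evict it from the cache; called from journal reclaim and from
 * bch2_btree_key_cache_flush().
 */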
static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_SLOTS|
			     BTREE_ITER_INTENT|
			     BTREE_ITER_ALL_SNAPSHOTS);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_CACHED|
			     BTREE_ITER_INTENT);
	b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) c_iter.path->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	/*
	 * Since journal reclaim depends on us making progress here, and the
	 * allocator/copygc depend on journal reclaim making progress, we need
	 * to be using alloc reserves:
	 */
	ret   = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				  BTREE_TRIGGER_NORUN) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BTREE_INSERT_NOCHECK_RW|
				  BTREE_INSERT_NOFAIL|
				  BTREE_INSERT_USE_RESERVE|
				  (ck->journal.seq == journal_last_seq(j)
				   ? JOURNAL_WATERMARK_reserved
				   : 0)|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "error flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);
	bch2_journal_preres_put(j, &ck->res);

	BUG_ON(!btree_node_locked(c_iter.path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
evict:
		BUG_ON(!btree_node_intent_locked(c_iter.path, 0));

		mark_btree_node_unlocked(c_iter.path, 0);
		c_iter.path->l[0].b = NULL;

		six_lock_write(&ck->c.lock, NULL, NULL);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

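/*
 * Journal pin flush callback: journal reclaim calls this when the entry
 * pinning sequence number @seq must be flushed so the journal can make forward
 * progress.
 */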
int bch2_btree_key_cache_journal_flush(struct journal *j,
				struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	int ret = 0;

	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);

	six_lock_read(&ck->c.lock, NULL, NULL);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = bch2_trans_do(c, NULL, NULL, 0,
		btree_key_cache_flush_pos(&trans, key, seq,
				BTREE_INSERT_JOURNAL_RECLAIM, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	return ret;
}

/*
 * Flush and evict a key from the key cache:
 */
int bch2_btree_key_cache_flush(struct btree_trans *trans,
			       enum btree_id id, struct bpos pos)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached_key key = { id, pos };

	/* Fastpath - assume it won't be found: */
	if (!bch2_btree_key_cache_find(c, id, pos))
		return 0;

	return btree_key_cache_flush_pos(trans, key, 0, 0, true);
}

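/*
 * Update a key cache entry from the transaction commit path: copy in the new
 * key, transfer journal prereservation from the transaction to the entry, mark
 * it dirty and pin the journal sequence number it was written at.
 */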
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  struct btree_path *path,
				  struct bkey_i *insert)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;
	bool kick_reclaim = false;

	BUG_ON(insert->u64s > ck->u64s);

	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
		int difference;

		BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s);

		difference = jset_u64s(insert->u64s) - ck->res.u64s;
		if (difference > 0) {
			trans->journal_preres.u64s	-= difference;
			ck->res.u64s			+= difference;
		}
	}

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
				&ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);

	return true;
}

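/*
 * The key was just updated in the btree, bypassing the key cache: the cached
 * copy is now stale, so mark it invalid (it must not be dirty at this point).
 */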
void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bkey_cached *ck = (void *) path->l[0].b;

	ck->valid = false;

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
}

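/*
 * Shrinker scan: free entries on bc->freed whose SRCU grace period has
 * elapsed, then walk the hash table evicting clean entries that haven't been
 * accessed since the previous scan (a second-chance/clock style policy).
 */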
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					       struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_key_cache.shrink);
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_FS)
		mutex_lock(&bc->lock);
	else if (!mutex_trylock(&bc->lock))
		return -1;

	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	list_for_each_entry_safe(ck, t, &bc->freed, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		scanned++;
		freed++;
	}

	if (scanned >= nr)
		goto out;

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
				goto next;

			if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
			else if (bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
			}

			scanned++;
			if (scanned >= nr)
				break;
next:
			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
out:
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
						struct shrink_control *sc)
{
	struct bch_fs *c = container_of(shrink, struct bch_fs,
					btree_key_cache.shrink);
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		  atomic_long_read(&bc->nr_dirty);

	return max(0L, nr);
}

void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	unsigned i;
	int cpu;

	if (bc->shrink.list.next)
		unregister_shrinker(&bc->shrink);

	mutex_lock(&bc->lock);

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (tbl)
		for (i = 0; i < tbl->size; i++)
			rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
				bkey_cached_evict(bc, ck);
				list_add(&ck->list, &bc->freed);
			}
	rcu_read_unlock();

	for_each_possible_cpu(cpu) {
		struct btree_key_cache_freelist *f =
			per_cpu_ptr(bc->pcpu_freed, cpu);

		for (i = 0; i < f->nr; i++) {
			ck = f->objs[i];
			list_add(&ck->list, &bc->freed);
		}
	}

	list_for_each_entry_safe(ck, n, &bc->freed, list) {
		cond_resched();

		bch2_journal_pin_drop(&c->journal, &ck->journal);
		bch2_journal_preres_put(&c->journal, &ck->res);

		list_del(&ck->list);
		kfree(ck->k);
		kmem_cache_free(bch2_key_cache, ck);
	}

	BUG_ON(atomic_long_read(&bc->nr_dirty) &&
	       !bch2_journal_error(&c->journal) &&
	       test_bit(BCH_FS_WAS_RW, &c->flags));
	BUG_ON(atomic_long_read(&bc->nr_keys));

	mutex_unlock(&bc->lock);

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed);
}

static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink)
{
	struct btree_key_cache *bc =
		container_of(shrink, struct btree_key_cache, shrink);

	bch2_btree_key_cache_to_text(out, bc);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
{
	int ret;

	c->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!c->pcpu_freed)
		return -ENOMEM;

	ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
	if (ret)
		return ret;

	c->table_init_done = true;

	c->shrink.count_objects	= bch2_btree_key_cache_count;
	c->shrink.scan_objects	= bch2_btree_key_cache_scan;
	c->shrink.to_text	= bch2_btree_key_cache_shrinker_to_text;
	return register_shrinker(&c->shrink);
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
	prt_printf(out, "nr_freed:\t%lu\n",	atomic_long_read(&c->nr_freed));
	prt_printf(out, "nr_keys:\t%lu\n",	atomic_long_read(&c->nr_keys));
	prt_printf(out, "nr_dirty:\t%lu\n",	atomic_long_read(&c->nr_dirty));
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}