git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/btree_key_cache.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include "bcachefs.h"
   4 #include "btree_cache.h"
   5 #include "btree_iter.h"
   6 #include "btree_key_cache.h"
   7 #include "btree_locking.h"
   8 #include "btree_update.h"
   9 #include "errcode.h"
  10 #include "error.h"
  11 #include "journal.h"
  12 #include "journal_reclaim.h"
  13 #include "trace.h"
  14
  15 #include <linux/sched/mm.h>
  16 #include <linux/seq_buf.h>
  17
  18 static inline bool btree_uses_pcpu_readers(enum btree_id id)
  19 {
  20         return id == BTREE_ID_subvolumes;
  21 }
  22
/* Slab cache that struct bkey_cached objects are allocated from in this file. */
static struct kmem_cache *bch2_key_cache;
  24
  25 static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
  26                                        const void *obj)
  27 {
  28         const struct bkey_cached *ck = obj;
  29         const struct bkey_cached_key *key = arg->key;
  30
  31         return ck->key.btree_id != key->btree_id ||
  32                 !bpos_eq(ck->key.pos, key->pos);
  33 }
  34
  35 static const struct rhashtable_params bch2_btree_key_cache_params = {
  36         .head_offset    = offsetof(struct bkey_cached, hash),
  37         .key_offset     = offsetof(struct bkey_cached, key),
  38         .key_len        = sizeof(struct bkey_cached_key),
  39         .obj_cmpfn      = bch2_btree_key_cache_cmp_fn,
  40 };
  41
  42 __flatten
  43 inline struct bkey_cached *
  44 bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
  45 {
  46         struct bkey_cached_key key = {
  47                 .btree_id       = btree_id,
  48                 .pos            = pos,
  49         };
  50
  51         return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
  52                                       bch2_btree_key_cache_params);
  53 }
  54
  55 static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
  56 {
  57         if (!six_trylock_intent(&ck->c.lock))
  58                 return false;
  59
  60         if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
  61                 six_unlock_intent(&ck->c.lock);
  62                 return false;
  63         }
  64
  65         if (!six_trylock_write(&ck->c.lock)) {
  66                 six_unlock_intent(&ck->c.lock);
  67                 return false;
  68         }
  69
  70         return true;
  71 }
  72
  73 static void bkey_cached_evict(struct btree_key_cache *c,
  74                               struct bkey_cached *ck)
  75 {
  76         BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
  77                                       bch2_btree_key_cache_params));
  78         memset(&ck->key, ~0, sizeof(ck->key));
  79
  80         atomic_long_dec(&c->nr_keys);
  81 }
  82
  83 static void bkey_cached_free(struct btree_key_cache *bc,
  84                              struct bkey_cached *ck)
  85 {
  86         struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
  87
  88         BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
  89
  90         ck->btree_trans_barrier_seq =
  91                 start_poll_synchronize_srcu(&c->btree_trans_barrier);
  92
  93         if (ck->c.lock.readers) {
  94                 list_move_tail(&ck->list, &bc->freed_pcpu);
  95                 bc->nr_freed_pcpu++;
  96         } else {
  97                 list_move_tail(&ck->list, &bc->freed_nonpcpu);
  98                 bc->nr_freed_nonpcpu++;
  99         }
 100         atomic_long_inc(&bc->nr_freed);
 101
 102         kfree(ck->k);
 103         ck->k           = NULL;
 104         ck->u64s        = 0;
 105
 106         six_unlock_write(&ck->c.lock);
 107         six_unlock_intent(&ck->c.lock);
 108 }
 109
 110 #ifdef __KERNEL__
 111 static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
 112                                                    struct bkey_cached *ck)
 113 {
 114         struct bkey_cached *pos;
 115
 116         bc->nr_freed_nonpcpu++;
 117
 118         list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
 119                 if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
 120                                  pos->btree_trans_barrier_seq)) {
 121                         list_move(&ck->list, &pos->list);
 122                         return;
 123                 }
 124         }
 125
 126         list_move(&ck->list, &bc->freed_nonpcpu);
 127 }
 128 #endif
 129
 130 static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
 131                                          struct bkey_cached *ck)
 132 {
 133         BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
 134
 135         if (!ck->c.lock.readers) {
 136 #ifdef __KERNEL__
 137                 struct btree_key_cache_freelist *f;
 138                 bool freed = false;
 139
 140                 preempt_disable();
 141                 f = this_cpu_ptr(bc->pcpu_freed);
 142
 143                 if (f->nr < ARRAY_SIZE(f->objs)) {
 144                         f->objs[f->nr++] = ck;
 145                         freed = true;
 146                 }
 147                 preempt_enable();
 148
 149                 if (!freed) {
 150                         mutex_lock(&bc->lock);
 151                         preempt_disable();
 152                         f = this_cpu_ptr(bc->pcpu_freed);
 153
 154                         while (f->nr > ARRAY_SIZE(f->objs) / 2) {
 155                                 struct bkey_cached *ck2 = f->objs[--f->nr];
 156
 157                                 __bkey_cached_move_to_freelist_ordered(bc, ck2);
 158                         }
 159                         preempt_enable();
 160
 161                         __bkey_cached_move_to_freelist_ordered(bc, ck);
 162                         mutex_unlock(&bc->lock);
 163                 }
 164 #else
 165                 mutex_lock(&bc->lock);
 166                 list_move_tail(&ck->list, &bc->freed_nonpcpu);
 167                 bc->nr_freed_nonpcpu++;
 168                 mutex_unlock(&bc->lock);
 169 #endif
 170         } else {
 171                 mutex_lock(&bc->lock);
 172                 list_move_tail(&ck->list, &bc->freed_pcpu);
 173                 mutex_unlock(&bc->lock);
 174         }
 175 }
 176
 177 static void bkey_cached_free_fast(struct btree_key_cache *bc,
 178                                   struct bkey_cached *ck)
 179 {
 180         struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 181
 182         ck->btree_trans_barrier_seq =
 183                 start_poll_synchronize_srcu(&c->btree_trans_barrier);
 184
 185         list_del_init(&ck->list);
 186         atomic_long_inc(&bc->nr_freed);
 187
 188         kfree(ck->k);
 189         ck->k           = NULL;
 190         ck->u64s        = 0;
 191
 192         bkey_cached_move_to_freelist(bc, ck);
 193
 194         six_unlock_write(&ck->c.lock);
 195         six_unlock_intent(&ck->c.lock);
 196 }
 197
 198 static struct bkey_cached *
 199 bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 200                   bool *was_new)
 201 {
 202         struct bch_fs *c = trans->c;
 203         struct btree_key_cache *bc = &c->btree_key_cache;
 204         struct bkey_cached *ck = NULL;
 205         bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
 206         int ret;
 207
 208         if (!pcpu_readers) {
 209 #ifdef __KERNEL__
 210                 struct btree_key_cache_freelist *f;
 211
 212                 preempt_disable();
 213                 f = this_cpu_ptr(bc->pcpu_freed);
 214                 if (f->nr)
 215                         ck = f->objs[--f->nr];
 216                 preempt_enable();
 217
 218                 if (!ck) {
 219                         mutex_lock(&bc->lock);
 220                         preempt_disable();
 221                         f = this_cpu_ptr(bc->pcpu_freed);
 222
 223                         while (!list_empty(&bc->freed_nonpcpu) &&
 224                                f->nr < ARRAY_SIZE(f->objs) / 2) {
 225                                 ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
 226                                 list_del_init(&ck->list);
 227                                 bc->nr_freed_nonpcpu--;
 228                                 f->objs[f->nr++] = ck;
 229                         }
 230
 231                         ck = f->nr ? f->objs[--f->nr] : NULL;
 232                         preempt_enable();
 233                         mutex_unlock(&bc->lock);
 234                 }
 235 #else
 236                 mutex_lock(&bc->lock);
 237                 if (!list_empty(&bc->freed_nonpcpu)) {
 238                         ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
 239                         list_del_init(&ck->list);
 240                         bc->nr_freed_nonpcpu--;
 241                 }
 242                 mutex_unlock(&bc->lock);
 243 #endif
 244         } else {
 245                 mutex_lock(&bc->lock);
 246                 if (!list_empty(&bc->freed_pcpu)) {
 247                         ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
 248                         list_del_init(&ck->list);
 249                 }
 250                 mutex_unlock(&bc->lock);
 251         }
 252
 253         if (ck) {
 254                 ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
 255                 if (unlikely(ret)) {
 256                         bkey_cached_move_to_freelist(bc, ck);
 257                         return ERR_PTR(ret);
 258                 }
 259
 260                 path->l[0].b = (void *) ck;
 261                 path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
 262                 mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 263
 264                 ret = bch2_btree_node_lock_write(trans, path, &ck->c);
 265                 if (unlikely(ret)) {
 266                         btree_node_unlock(trans, path, 0);
 267                         bkey_cached_move_to_freelist(bc, ck);
 268                         return ERR_PTR(ret);
 269                 }
 270
 271                 return ck;
 272         }
 273
 274         ck = allocate_dropping_locks(trans, ret,
 275                         kmem_cache_zalloc(bch2_key_cache, _gfp));
 276         if (ret) {
 277                 kmem_cache_free(bch2_key_cache, ck);
 278                 return ERR_PTR(ret);
 279         }
 280
 281         if (!ck)
 282                 return NULL;
 283
 284         INIT_LIST_HEAD(&ck->list);
 285         bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
 286
 287         ck->c.cached = true;
 288         BUG_ON(!six_trylock_intent(&ck->c.lock));
 289         BUG_ON(!six_trylock_write(&ck->c.lock));
 290         *was_new = true;
 291         return ck;
 292 }
 293
 294 static struct bkey_cached *
 295 bkey_cached_reuse(struct btree_key_cache *c)
 296 {
 297         struct bucket_table *tbl;
 298         struct rhash_head *pos;
 299         struct bkey_cached *ck;
 300         unsigned i;
 301
 302         mutex_lock(&c->lock);
 303         rcu_read_lock();
 304         tbl = rht_dereference_rcu(c->table.tbl, &c->table);
 305         for (i = 0; i < tbl->size; i++)
 306                 rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
 307                         if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
 308                             bkey_cached_lock_for_evict(ck)) {
 309                                 bkey_cached_evict(c, ck);
 310                                 goto out;
 311                         }
 312                 }
 313         ck = NULL;
 314 out:
 315         rcu_read_unlock();
 316         mutex_unlock(&c->lock);
 317         return ck;
 318 }
 319
 320 static struct bkey_cached *
 321 btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
 322 {
 323         struct bch_fs *c = trans->c;
 324         struct btree_key_cache *bc = &c->btree_key_cache;
 325         struct bkey_cached *ck;
 326         bool was_new = false;
 327
 328         ck = bkey_cached_alloc(trans, path, &was_new);
 329         if (IS_ERR(ck))
 330                 return ck;
 331
 332         if (unlikely(!ck)) {
 333                 ck = bkey_cached_reuse(bc);
 334                 if (unlikely(!ck)) {
 335                         bch_err(c, "error allocating memory for key cache item, btree %s",
 336                                 bch2_btree_id_str(path->btree_id));
 337                         return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
 338                 }
 339
 340                 mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 341         }
 342
 343         ck->c.level             = 0;
 344         ck->c.btree_id          = path->btree_id;
 345         ck->key.btree_id        = path->btree_id;
 346         ck->key.pos             = path->pos;
 347         ck->valid               = false;
 348         ck->flags               = 1U << BKEY_CACHED_ACCESSED;
 349
 350         if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
 351                                           &ck->hash,
 352                                           bch2_btree_key_cache_params))) {
 353                 /* We raced with another fill: */
 354
 355                 if (likely(was_new)) {
 356                         six_unlock_write(&ck->c.lock);
 357                         six_unlock_intent(&ck->c.lock);
 358                         kfree(ck);
 359                 } else {
 360                         bkey_cached_free_fast(bc, ck);
 361                 }
 362
 363                 mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
 364                 return NULL;
 365         }
 366
 367         atomic_long_inc(&bc->nr_keys);
 368
 369         six_unlock_write(&ck->c.lock);
 370
 371         return ck;
 372 }
 373
 374 static int btree_key_cache_fill(struct btree_trans *trans,
 375                                 struct btree_path *ck_path,
 376                                 struct bkey_cached *ck)
 377 {
 378         struct btree_iter iter;
 379         struct bkey_s_c k;
 380         unsigned new_u64s = 0;
 381         struct bkey_i *new_k = NULL;
 382         int ret;
 383
 384         k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
 385                                BTREE_ITER_KEY_CACHE_FILL|
 386                                BTREE_ITER_CACHED_NOFILL);
 387         ret = bkey_err(k);
 388         if (ret)
 389                 goto err;
 390
 391         if (!bch2_btree_node_relock(trans, ck_path, 0)) {
 392                 trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
 393                 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
 394                 goto err;
 395         }
 396
 397         /*
 398          * bch2_varint_decode can read past the end of the buffer by at
 399          * most 7 bytes (it won't be used):
 400          */
 401         new_u64s = k.k->u64s + 1;
 402
 403         /*
 404          * Allocate some extra space so that the transaction commit path is less
 405          * likely to have to reallocate, since that requires a transaction
 406          * restart:
 407          */
 408         new_u64s = min(256U, (new_u64s * 3) / 2);
 409
 410         if (new_u64s > ck->u64s) {
 411                 new_u64s = roundup_pow_of_two(new_u64s);
 412                 new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
 413                 if (!new_k) {
 414                         bch2_trans_unlock(trans);
 415
 416                         new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
 417                         if (!new_k) {
 418                                 bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
 419                                         bch2_btree_id_str(ck->key.btree_id), new_u64s);
 420                                 ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
 421                                 goto err;
 422                         }
 423
 424                         if (!bch2_btree_node_relock(trans, ck_path, 0)) {
 425                                 kfree(new_k);
 426                                 trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
 427                                 ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
 428                                 goto err;
 429                         }
 430
 431                         ret = bch2_trans_relock(trans);
 432                         if (ret) {
 433                                 kfree(new_k);
 434                                 goto err;
 435                         }
 436                 }
 437         }
 438
 439         ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
 440         if (ret) {
 441                 kfree(new_k);
 442                 goto err;
 443         }
 444
 445         if (new_k) {
 446                 kfree(ck->k);
 447                 ck->u64s = new_u64s;
 448                 ck->k = new_k;
 449         }
 450
 451         bkey_reassemble(ck->k, k);
 452         ck->valid = true;
 453         bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
 454
 455         /* We're not likely to need this iterator again: */
 456         set_btree_iter_dontneed(&iter);
 457 err:
 458         bch2_trans_iter_exit(trans, &iter);
 459         return ret;
 460 }
 461
 462 static noinline int
 463 bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
 464                                          unsigned flags)
 465 {
 466         struct bch_fs *c = trans->c;
 467         struct bkey_cached *ck;
 468         int ret = 0;
 469
 470         BUG_ON(path->level);
 471
 472         path->l[1].b = NULL;
 473
 474         if (bch2_btree_node_relock_notrace(trans, path, 0)) {
 475                 ck = (void *) path->l[0].b;
 476                 goto fill;
 477         }
 478 retry:
 479         ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
 480         if (!ck) {
 481                 ck = btree_key_cache_create(trans, path);
 482                 ret = PTR_ERR_OR_ZERO(ck);
 483                 if (ret)
 484                         goto err;
 485                 if (!ck)
 486                         goto retry;
 487
 488                 mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 489                 path->locks_want = 1;
 490         } else {
 491                 enum six_lock_type lock_want = __btree_lock_want(path, 0);
 492
 493                 ret = btree_node_lock(trans, path, (void *) ck, 0,
 494                                       lock_want, _THIS_IP_);
 495                 if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 496                         goto err;
 497
 498                 BUG_ON(ret);
 499
 500                 if (ck->key.btree_id != path->btree_id ||
 501                     !bpos_eq(ck->key.pos, path->pos)) {
 502                         six_unlock_type(&ck->c.lock, lock_want);
 503                         goto retry;
 504                 }
 505
 506                 mark_btree_node_locked(trans, path, 0,
 507                                        (enum btree_node_locked_type) lock_want);
 508         }
 509
 510         path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
 511         path->l[0].b            = (void *) ck;
 512 fill:
 513         path->uptodate = BTREE_ITER_UPTODATE;
 514
 515         if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
 516                 /*
 517                  * Using the underscore version because we haven't set
 518                  * path->uptodate yet:
 519                  */
 520                 if (!path->locks_want &&
 521                     !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
 522                         trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
 523                         ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
 524                         goto err;
 525                 }
 526
 527                 ret = btree_key_cache_fill(trans, path, ck);
 528                 if (ret)
 529                         goto err;
 530
 531                 ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
 532                 if (ret)
 533                         goto err;
 534
 535                 path->uptodate = BTREE_ITER_UPTODATE;
 536         }
 537
 538         if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
 539                 set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 540
 541         BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
 542         BUG_ON(path->uptodate);
 543
 544         return ret;
 545 err:
 546         path->uptodate = BTREE_ITER_NEED_TRAVERSE;
 547         if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
 548                 btree_node_unlock(trans, path, 0);
 549                 path->l[0].b = ERR_PTR(ret);
 550         }
 551         return ret;
 552 }
 553
 554 int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
 555                                     unsigned flags)
 556 {
 557         struct bch_fs *c = trans->c;
 558         struct bkey_cached *ck;
 559         int ret = 0;
 560
 561         EBUG_ON(path->level);
 562
 563         path->l[1].b = NULL;
 564
 565         if (bch2_btree_node_relock_notrace(trans, path, 0)) {
 566                 ck = (void *) path->l[0].b;
 567                 goto fill;
 568         }
 569 retry:
 570         ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
 571         if (!ck) {
 572                 return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
 573         } else {
 574                 enum six_lock_type lock_want = __btree_lock_want(path, 0);
 575
 576                 ret = btree_node_lock(trans, path, (void *) ck, 0,
 577                                       lock_want, _THIS_IP_);
 578                 EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));
 579
 580                 if (ret)
 581                         return ret;
 582
 583                 if (ck->key.btree_id != path->btree_id ||
 584                     !bpos_eq(ck->key.pos, path->pos)) {
 585                         six_unlock_type(&ck->c.lock, lock_want);
 586                         goto retry;
 587                 }
 588
 589                 mark_btree_node_locked(trans, path, 0,
 590                                        (enum btree_node_locked_type) lock_want);
 591         }
 592
 593         path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
 594         path->l[0].b            = (void *) ck;
 595 fill:
 596         if (!ck->valid)
 597                 return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
 598
 599         if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
 600                 set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 601
 602         path->uptodate = BTREE_ITER_UPTODATE;
 603         EBUG_ON(!ck->valid);
 604         EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
 605
 606         return ret;
 607 }
 608
 609 static int btree_key_cache_flush_pos(struct btree_trans *trans,
 610                                      struct bkey_cached_key key,
 611                                      u64 journal_seq,
 612                                      unsigned commit_flags,
 613                                      bool evict)
 614 {
 615         struct bch_fs *c = trans->c;
 616         struct journal *j = &c->journal;
 617         struct btree_iter c_iter, b_iter;
 618         struct bkey_cached *ck = NULL;
 619         int ret;
 620
 621         bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
 622                              BTREE_ITER_SLOTS|
 623                              BTREE_ITER_INTENT|
 624                              BTREE_ITER_ALL_SNAPSHOTS);
 625         bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
 626                              BTREE_ITER_CACHED|
 627                              BTREE_ITER_INTENT);
 628         b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
 629
 630         ret = bch2_btree_iter_traverse(&c_iter);
 631         if (ret)
 632                 goto out;
 633
 634         ck = (void *) c_iter.path->l[0].b;
 635         if (!ck)
 636                 goto out;
 637
 638         if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 639                 if (evict)
 640                         goto evict;
 641                 goto out;
 642         }
 643
 644         BUG_ON(!ck->valid);
 645
 646         if (journal_seq && ck->journal.seq != journal_seq)
 647                 goto out;
 648
 649         trans->journal_res.seq = ck->journal.seq;
 650
 651         /*
 652          * If we're at the end of the journal, we really want to free up space
 653          * in the journal right away - we don't want to pin that old journal
 654          * sequence number with a new btree node write, we want to re-journal
 655          * the update
 656          */
 657         if (ck->journal.seq == journal_last_seq(j))
 658                 commit_flags |= BCH_WATERMARK_reclaim;
 659         else
 660                 commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
 661
 662         ret   = bch2_btree_iter_traverse(&b_iter) ?:
 663                 bch2_trans_update(trans, &b_iter, ck->k,
 664                                   BTREE_UPDATE_KEY_CACHE_RECLAIM|
 665                                   BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
 666                                   BTREE_TRIGGER_NORUN) ?:
 667                 bch2_trans_commit(trans, NULL, NULL,
 668                                   BCH_TRANS_COMMIT_no_check_rw|
 669                                   BCH_TRANS_COMMIT_no_enospc|
 670                                   commit_flags);
 671
 672         bch2_fs_fatal_err_on(ret &&
 673                              !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
 674                              !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
 675                              !bch2_journal_error(j), c,
 676                              "error flushing key cache: %s", bch2_err_str(ret));
 677         if (ret)
 678                 goto out;
 679
 680         bch2_journal_pin_drop(j, &ck->journal);
 681
 682         BUG_ON(!btree_node_locked(c_iter.path, 0));
 683
 684         if (!evict) {
 685                 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 686                         clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 687                         atomic_long_dec(&c->btree_key_cache.nr_dirty);
 688                 }
 689         } else {
 690                 struct btree_path *path2;
 691 evict:
 692                 trans_for_each_path(trans, path2)
 693                         if (path2 != c_iter.path)
 694                                 __bch2_btree_path_unlock(trans, path2);
 695
 696                 bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
 697
 698                 if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 699                         clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 700                         atomic_long_dec(&c->btree_key_cache.nr_dirty);
 701                 }
 702
 703                 mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
 704                 bkey_cached_evict(&c->btree_key_cache, ck);
 705                 bkey_cached_free_fast(&c->btree_key_cache, ck);
 706         }
 707 out:
 708         bch2_trans_iter_exit(trans, &b_iter);
 709         bch2_trans_iter_exit(trans, &c_iter);
 710         return ret;
 711 }
 712
 713 int bch2_btree_key_cache_journal_flush(struct journal *j,
 714                                 struct journal_entry_pin *pin, u64 seq)
 715 {
 716         struct bch_fs *c = container_of(j, struct bch_fs, journal);
 717         struct bkey_cached *ck =
 718                 container_of(pin, struct bkey_cached, journal);
 719         struct bkey_cached_key key;
 720         struct btree_trans *trans = bch2_trans_get(c);
 721         int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 722         int ret = 0;
 723
 724         btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
 725         key = ck->key;
 726
 727         if (ck->journal.seq != seq ||
 728             !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 729                 six_unlock_read(&ck->c.lock);
 730                 goto unlock;
 731         }
 732
 733         if (ck->seq != seq) {
 734                 bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
 735                                         bch2_btree_key_cache_journal_flush);
 736                 six_unlock_read(&ck->c.lock);
 737                 goto unlock;
 738         }
 739         six_unlock_read(&ck->c.lock);
 740
 741         ret = commit_do(trans, NULL, NULL, 0,
 742                 btree_key_cache_flush_pos(trans, key, seq,
 743                                 BCH_TRANS_COMMIT_journal_reclaim, false));
 744 unlock:
 745         srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 746
 747         bch2_trans_put(trans);
 748         return ret;
 749 }
 750
 751 /*
 752  * Flush and evict a key from the key cache:
 753  */
 754 int bch2_btree_key_cache_flush(struct btree_trans *trans,
 755                                enum btree_id id, struct bpos pos)
 756 {
 757         struct bch_fs *c = trans->c;
 758         struct bkey_cached_key key = { id, pos };
 759
 760         /* Fastpath - assume it won't be found: */
 761         if (!bch2_btree_key_cache_find(c, id, pos))
 762                 return 0;
 763
 764         return btree_key_cache_flush_pos(trans, key, 0, 0, true);
 765 }
 766
 767 bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 768                                   unsigned flags,
 769                                   struct btree_insert_entry *insert_entry)
 770 {
 771         struct bch_fs *c = trans->c;
 772         struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
 773         struct bkey_i *insert = insert_entry->k;
 774         bool kick_reclaim = false;
 775
 776         BUG_ON(insert->k.u64s > ck->u64s);
 777
 778         bkey_copy(ck->k, insert);
 779         ck->valid = true;
 780
 781         if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 782                 EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
 783                 set_bit(BKEY_CACHED_DIRTY, &ck->flags);
 784                 atomic_long_inc(&c->btree_key_cache.nr_dirty);
 785
 786                 if (bch2_nr_btree_keys_need_flush(c))
 787                         kick_reclaim = true;
 788         }
 789
 790         /*
 791          * To minimize lock contention, we only add the journal pin here and
 792          * defer pin updates to the flush callback via ->seq. Be careful not to
 793          * update ->seq on nojournal commits because we don't want to update the
 794          * pin to a seq that doesn't include journal updates on disk. Otherwise
 795          * we risk losing the update after a crash.
 796          *
 797          * The only exception is if the pin is not active in the first place. We
 798          * have to add the pin because journal reclaim drives key cache
 799          * flushing. The flush callback will not proceed unless ->seq matches
 800          * the latest pin, so make sure it starts with a consistent value.
 801          */
 802         if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
 803             !journal_pin_active(&ck->journal)) {
 804                 ck->seq = trans->journal_res.seq;
 805         }
 806         bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
 807                              &ck->journal, bch2_btree_key_cache_journal_flush);
 808
 809         if (kick_reclaim)
 810                 journal_reclaim_kick(&c->journal);
 811         return true;
 812 }
 813
 814 void bch2_btree_key_cache_drop(struct btree_trans *trans,
 815                                struct btree_path *path)
 816 {
 817         struct bch_fs *c = trans->c;
 818         struct bkey_cached *ck = (void *) path->l[0].b;
 819
 820         BUG_ON(!ck->valid);
 821
 822         /*
 823          * We just did an update to the btree, bypassing the key cache: the key
 824          * cache key is now stale and must be dropped, even if dirty:
 825          */
 826         if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
 827                 clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
 828                 atomic_long_dec(&c->btree_key_cache.nr_dirty);
 829                 bch2_journal_pin_drop(&c->journal, &ck->journal);
 830         }
 831
 832         ck->valid = false;
 833 }
 834
 835 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 836                                            struct shrink_control *sc)
 837 {
 838         struct bch_fs *c = shrink->private_data;
 839         struct btree_key_cache *bc = &c->btree_key_cache;
 840         struct bucket_table *tbl;
 841         struct bkey_cached *ck, *t;
 842         size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
 843         unsigned start, flags;
 844         int srcu_idx;
 845
 846         mutex_lock(&bc->lock);
 847         srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 848         flags = memalloc_nofs_save();
 849
 850         /*
 851          * Newest freed entries are at the end of the list - once we hit one
 852          * that's too new to be freed, we can bail out:
 853          */
 854         scanned += bc->nr_freed_nonpcpu;
 855
 856         list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
 857                 if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
 858                                                  ck->btree_trans_barrier_seq))
 859                         break;
 860
 861                 list_del(&ck->list);
 862                 six_lock_exit(&ck->c.lock);
 863                 kmem_cache_free(bch2_key_cache, ck);
 864                 atomic_long_dec(&bc->nr_freed);
 865                 freed++;
 866                 bc->nr_freed_nonpcpu--;
 867         }
 868
 869         if (scanned >= nr)
 870                 goto out;
 871
 872         scanned += bc->nr_freed_pcpu;
 873
 874         list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
 875                 if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
 876                                                  ck->btree_trans_barrier_seq))
 877                         break;
 878
 879                 list_del(&ck->list);
 880                 six_lock_exit(&ck->c.lock);
 881                 kmem_cache_free(bch2_key_cache, ck);
 882                 atomic_long_dec(&bc->nr_freed);
 883                 freed++;
 884                 bc->nr_freed_pcpu--;
 885         }
 886
 887         if (scanned >= nr)
 888                 goto out;
 889
 890         rcu_read_lock();
 891         tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
 892         if (bc->shrink_iter >= tbl->size)
 893                 bc->shrink_iter = 0;
 894         start = bc->shrink_iter;
 895
 896         do {
 897                 struct rhash_head *pos, *next;
 898
 899                 pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
 900
 901                 while (!rht_is_a_nulls(pos)) {
 902                         next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
 903                         ck = container_of(pos, struct bkey_cached, hash);
 904
 905                         if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
 906                                 goto next;
 907
 908                         if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
 909                                 clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 910                         else if (bkey_cached_lock_for_evict(ck)) {
 911                                 bkey_cached_evict(bc, ck);
 912                                 bkey_cached_free(bc, ck);
 913                         }
 914
 915                         scanned++;
 916                         if (scanned >= nr)
 917                                 break;
 918 next:
 919                         pos = next;
 920                 }
 921
 922                 bc->shrink_iter++;
 923                 if (bc->shrink_iter >= tbl->size)
 924                         bc->shrink_iter = 0;
 925         } while (scanned < nr && bc->shrink_iter != start);
 926
 927         rcu_read_unlock();
 928 out:
 929         memalloc_nofs_restore(flags);
 930         srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 931         mutex_unlock(&bc->lock);
 932
 933         return freed;
 934 }
 935
 936 static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
 937                                             struct shrink_control *sc)
 938 {
 939         struct bch_fs *c = shrink->private_data;
 940         struct btree_key_cache *bc = &c->btree_key_cache;
 941         long nr = atomic_long_read(&bc->nr_keys) -
 942                 atomic_long_read(&bc->nr_dirty);
 943
 944         return max(0L, nr);
 945 }
 946
 947 void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 948 {
 949         struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 950         struct bucket_table *tbl;
 951         struct bkey_cached *ck, *n;
 952         struct rhash_head *pos;
 953         LIST_HEAD(items);
 954         unsigned i;
 955 #ifdef __KERNEL__
 956         int cpu;
 957 #endif
 958
 959         shrinker_free(bc->shrink);
 960
 961         mutex_lock(&bc->lock);
 962
 963         /*
 964          * The loop is needed to guard against racing with rehash:
 965          */
 966         while (atomic_long_read(&bc->nr_keys)) {
 967                 rcu_read_lock();
 968                 tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
 969                 if (tbl)
 970                         for (i = 0; i < tbl->size; i++)
 971                                 rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
 972                                         bkey_cached_evict(bc, ck);
 973                                         list_add(&ck->list, &items);
 974                                 }
 975                 rcu_read_unlock();
 976         }
 977
 978 #ifdef __KERNEL__
 979         for_each_possible_cpu(cpu) {
 980                 struct btree_key_cache_freelist *f =
 981                         per_cpu_ptr(bc->pcpu_freed, cpu);
 982
 983                 for (i = 0; i < f->nr; i++) {
 984                         ck = f->objs[i];
 985                         list_add(&ck->list, &items);
 986                 }
 987         }
 988 #endif
 989
 990         BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
 991         BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
 992
 993         list_splice(&bc->freed_pcpu,    &items);
 994         list_splice(&bc->freed_nonpcpu, &items);
 995
 996         mutex_unlock(&bc->lock);
 997
 998         list_for_each_entry_safe(ck, n, &items, list) {
 999                 cond_resched();
1000
1001                 bch2_journal_pin_drop(&c->journal, &ck->journal);
1002
1003                 list_del(&ck->list);
1004                 kfree(ck->k);
1005                 six_lock_exit(&ck->c.lock);
1006                 kmem_cache_free(bch2_key_cache, ck);
1007         }
1008
1009         if (atomic_long_read(&bc->nr_dirty) &&
1010             !bch2_journal_error(&c->journal) &&
1011             test_bit(BCH_FS_WAS_RW, &c->flags))
1012                 panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
1013                       atomic_long_read(&bc->nr_dirty));
1014
1015         if (atomic_long_read(&bc->nr_keys))
1016                 panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
1017                       atomic_long_read(&bc->nr_keys));
1018
1019         if (bc->table_init_done)
1020                 rhashtable_destroy(&bc->table);
1021
1022         free_percpu(bc->pcpu_freed);
1023 }
1024
1025 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
1026 {
1027         mutex_init(&c->lock);
1028         INIT_LIST_HEAD(&c->freed_pcpu);
1029         INIT_LIST_HEAD(&c->freed_nonpcpu);
1030 }
1031
1032 static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
1033 {
1034         struct bch_fs *c = shrink->private_data;
1035         struct btree_key_cache *bc = &c->btree_key_cache;
1036         char *cbuf;
1037         size_t buflen = seq_buf_get_buf(s, &cbuf);
1038         struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
1039
1040         bch2_btree_key_cache_to_text(&out, bc);
1041         seq_buf_commit(s, out.pos);
1042 }
1043
1044 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
1045 {
1046         struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
1047         struct shrinker *shrink;
1048
1049 #ifdef __KERNEL__
1050         bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
1051         if (!bc->pcpu_freed)
1052                 return -BCH_ERR_ENOMEM_fs_btree_cache_init;
1053 #endif
1054
1055         if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
1056                 return -BCH_ERR_ENOMEM_fs_btree_cache_init;
1057
1058         bc->table_init_done = true;
1059
1060         shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
1061         if (!shrink)
1062                 return -BCH_ERR_ENOMEM_fs_btree_cache_init;
1063         bc->shrink = shrink;
1064         shrink->seeks           = 0;
1065         shrink->count_objects   = bch2_btree_key_cache_count;
1066         shrink->scan_objects    = bch2_btree_key_cache_scan;
1067         shrink->to_text         = bch2_btree_key_cache_shrinker_to_text;
1068         shrink->private_data    = c;
1069         shrinker_register(shrink);
1070         return 0;
1071 }
1072
1073 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
1074 {
1075         prt_printf(out, "nr_freed:\t%lu",       atomic_long_read(&c->nr_freed));
1076         prt_newline(out);
1077         prt_printf(out, "nr_keys:\t%lu",        atomic_long_read(&c->nr_keys));
1078         prt_newline(out);
1079         prt_printf(out, "nr_dirty:\t%lu",       atomic_long_read(&c->nr_dirty));
1080         prt_newline(out);
1081 }
1082
/*
 * Module-level teardown: destroys the slab cache that
 * bch2_btree_key_cache_init() created for struct bkey_cached objects.
 */
void bch2_btree_key_cache_exit(void)
{
        kmem_cache_destroy(bch2_key_cache);
}
1087
1088 int __init bch2_btree_key_cache_init(void)
1089 {
1090         bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
1091         if (!bch2_key_cache)
1092                 return -ENOMEM;
1093
1094         return 0;
1095 }