X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_key_cache.c;h=19f98c26cfaaac8a353440216e22ec746e4d2c18;hb=2252eecec7e072dfdc66cfea6da0ee6ed648a858;hp=a5b0a956e70568125d2a41f47bd475562066060f;hpb=bad0c8c50758b4447d529f61017c1a8c85976a3e;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index a5b0a95..19f98c2 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_cache.h" @@ -5,12 +6,19 @@ #include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update.h" +#include "errcode.h" #include "error.h" #include "journal.h" #include "journal_reclaim.h" +#include "trace.h" #include -#include +#include + +static inline bool btree_uses_pcpu_readers(enum btree_id id) +{ + return id == BTREE_ID_subvolumes; +} static struct kmem_cache *bch2_key_cache; @@ -20,8 +28,8 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct bkey_cached *ck = obj; const struct bkey_cached_key *key = arg->key; - return cmp_int(ck->key.btree_id, key->btree_id) ?: - bpos_cmp(ck->key.pos, key->pos); + return ck->key.btree_id != key->btree_id || + !bpos_eq(ck->key.pos, key->pos); } static const struct rhashtable_params bch2_btree_key_cache_params = { @@ -49,13 +57,12 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) if (!six_trylock_intent(&ck->c.lock)) return false; - if (!six_trylock_write(&ck->c.lock)) { + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { six_unlock_intent(&ck->c.lock); return false; } - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - six_unlock_write(&ck->c.lock); + if (!six_trylock_write(&ck->c.lock)) { six_unlock_intent(&ck->c.lock); return false; } @@ -83,7 +90,10 @@ static void bkey_cached_free(struct btree_key_cache *bc, ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); - list_move_tail(&ck->list, &bc->freed); + if (ck->c.lock.readers) + list_move_tail(&ck->list, &bc->freed_pcpu); + else + list_move_tail(&ck->list, &bc->freed_nonpcpu); atomic_long_inc(&bc->nr_freed); kfree(ck->k); @@ -94,14 +104,74 @@ static void bkey_cached_free(struct btree_key_cache *bc, six_unlock_intent(&ck->c.lock); } +#ifdef __KERNEL__ +static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bkey_cached *pos; + + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { + if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, + pos->btree_trans_barrier_seq)) { + list_move(&ck->list, &pos->list); + return; + } + } + + list_move(&ck->list, &bc->freed_nonpcpu); +} +#endif + +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + if (!ck->c.lock.readers) { +#ifdef __KERNEL__ + struct btree_key_cache_freelist *f; + bool freed = false; + + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + if (f->nr < ARRAY_SIZE(f->objs)) { + f->objs[f->nr++] = ck; + freed = true; + } + preempt_enable(); + + if (!freed) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (f->nr > ARRAY_SIZE(f->objs) / 2) { + struct bkey_cached *ck2 = f->objs[--f->nr]; + + __bkey_cached_move_to_freelist_ordered(bc, ck2); + } + preempt_enable(); + + __bkey_cached_move_to_freelist_ordered(bc, ck); + mutex_unlock(&bc->lock); + } +#else + mutex_lock(&bc->lock); + list_move_tail(&ck->list, 
&bc->freed_nonpcpu); + mutex_unlock(&bc->lock); +#endif + } else { + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_pcpu); + mutex_unlock(&bc->lock); + } +} + static void bkey_cached_free_fast(struct btree_key_cache *bc, struct bkey_cached *ck) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - struct btree_key_cache_freelist *f; - bool freed = false; - - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); ck->btree_trans_barrier_seq = start_poll_synchronize_srcu(&c->btree_trans_barrier); @@ -113,80 +183,104 @@ static void bkey_cached_free_fast(struct btree_key_cache *bc, ck->k = NULL; ck->u64s = 0; - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - if (f->nr < ARRAY_SIZE(f->objs)) { - f->objs[f->nr++] = ck; - freed = true; - } - preempt_enable(); - - if (!freed) { - mutex_lock(&bc->lock); - preempt_disable(); - f = this_cpu_ptr(bc->pcpu_freed); - - while (f->nr > ARRAY_SIZE(f->objs) / 2) { - struct bkey_cached *ck2 = f->objs[--f->nr]; - - list_move_tail(&ck2->list, &bc->freed); - } - preempt_enable(); - - list_move_tail(&ck->list, &bc->freed); - mutex_unlock(&bc->lock); - } + bkey_cached_move_to_freelist(bc, ck); six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); } static struct bkey_cached * -bkey_cached_alloc(struct btree_key_cache *c) +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, + bool *was_new) { + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = NULL; - struct btree_key_cache_freelist *f; + bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); + int ret; - preempt_disable(); - f = this_cpu_ptr(c->pcpu_freed); - if (f->nr) - ck = f->objs[--f->nr]; - preempt_enable(); + if (!pcpu_readers) { +#ifdef __KERNEL__ + struct btree_key_cache_freelist *f; - if (!ck) { - mutex_lock(&c->lock); preempt_disable(); - f = this_cpu_ptr(c->pcpu_freed); + f = this_cpu_ptr(bc->pcpu_freed); + if (f->nr) + ck = f->objs[--f->nr]; + preempt_enable(); + + if (!ck) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (!list_empty(&bc->freed_nonpcpu) && + f->nr < ARRAY_SIZE(f->objs) / 2) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + f->objs[f->nr++] = ck; + } - while (!list_empty(&c->freed) && - f->nr < ARRAY_SIZE(f->objs) / 2) { - ck = list_last_entry(&c->freed, struct bkey_cached, list); + ck = f->nr ? f->objs[--f->nr] : NULL; + preempt_enable(); + mutex_unlock(&bc->lock); + } +#else + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_nonpcpu)) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); list_del_init(&ck->list); - f->objs[f->nr++] = ck; } - - ck = f->nr ? 
f->objs[--f->nr] : NULL; - preempt_enable(); - mutex_unlock(&c->lock); + mutex_unlock(&bc->lock); +#endif + } else { + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_pcpu)) { + ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); + list_del_init(&ck->list); + } + mutex_unlock(&bc->lock); } if (ck) { - six_lock_intent(&ck->c.lock, NULL, NULL); - six_lock_write(&ck->c.lock, NULL, NULL); + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); + if (unlikely(ret)) { + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + + path->l[0].b = (void *) ck; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); + + ret = bch2_btree_node_lock_write(trans, path, &ck->c); + if (unlikely(ret)) { + btree_node_unlock(trans, path, 0); + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + return ck; } - ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); - if (likely(ck)) { - INIT_LIST_HEAD(&ck->list); - six_lock_init(&ck->c.lock); - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); - return ck; + ck = allocate_dropping_locks(trans, ret, + kmem_cache_zalloc(bch2_key_cache, _gfp)); + if (ret) { + kmem_cache_free(bch2_key_cache, ck); + return ERR_PTR(ret); } - return NULL; + if (!ck) + return NULL; + + INIT_LIST_HEAD(&ck->list); + bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); + + ck->c.cached = true; + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + *was_new = true; + return ck; } static struct bkey_cached * @@ -197,6 +291,7 @@ bkey_cached_reuse(struct btree_key_cache *c) struct bkey_cached *ck; unsigned i; + mutex_lock(&c->lock); rcu_read_lock(); tbl = rht_dereference_rcu(c->table.tbl, &c->table); for (i = 0; i < tbl->size; i++) @@ -204,46 +299,43 @@ bkey_cached_reuse(struct btree_key_cache *c) if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { bkey_cached_evict(c, ck); - rcu_read_unlock(); - return ck; + goto out; } } + ck = NULL; +out: rcu_read_unlock(); - - return NULL; + mutex_unlock(&c->lock); + return ck; } static struct bkey_cached * -btree_key_cache_create(struct bch_fs *c, - enum btree_id btree_id, - struct bpos pos) +btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) { + struct bch_fs *c = trans->c; struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; - bool was_new = true; + bool was_new = false; - ck = bkey_cached_alloc(bc); + ck = bkey_cached_alloc(trans, path, &was_new); + if (IS_ERR(ck)) + return ck; if (unlikely(!ck)) { ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_ids[btree_id]); - return ERR_PTR(-ENOMEM); + bch2_btree_id_str(path->btree_id)); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); } - was_new = false; - } else { - if (btree_id == BTREE_ID_subvolumes) - six_lock_pcpu_alloc(&ck->c.lock); - else - six_lock_pcpu_free(&ck->c.lock); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); } ck->c.level = 0; - ck->c.btree_id = btree_id; - ck->key.btree_id = btree_id; - ck->key.pos = pos; + ck->c.btree_id = path->btree_id; + ck->key.btree_id = path->btree_id; + ck->key.pos = path->pos; ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; @@ -260,6 +352,7 @@ btree_key_cache_create(struct bch_fs *c, bkey_cached_free_fast(bc, ck); } + mark_btree_node_locked(trans, path, 0, 
BTREE_NODE_UNLOCKED); return NULL; } @@ -274,25 +367,22 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_path *path; + struct btree_iter iter; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; - struct bkey u; int ret; - path = bch2_path_get(trans, ck->key.btree_id, - ck->key.pos, 0, 0, 0, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, path, 0); + k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + ret = bkey_err(k); if (ret) goto err; - k = bch2_btree_path_peek_slot(path, &u); - if (!bch2_btree_node_relock(trans, ck_path, 0)) { - trace_trans_restart_relock_key_cache_fill(trans->fn, - _THIS_IP_, ck_path->btree_id, &ck_path->pos); - ret = btree_trans_restart(trans); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } @@ -311,20 +401,39 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (new_u64s > ck->u64s) { new_u64s = roundup_pow_of_two(new_u64s); - new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[ck->key.btree_id], new_u64s); - ret = -ENOMEM; - goto err; + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(ck->key.btree_id), new_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + goto err; + } + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + kfree(new_k); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); + goto err; + } + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_k); + goto err; + } } } - /* - * XXX: not allowed to be holding read locks when we take a write lock, - * currently - */ - bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); + ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); + if (ret) { + kfree(new_k); + goto err; + } + if (new_k) { kfree(ck->k); ck->u64s = new_u64s; @@ -336,24 +445,15 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - path->preserve = false; + set_btree_iter_dontneed(&iter); err: - bch2_path_put(trans, path, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } -static int bkey_cached_check_fn(struct six_lock *lock, void *p) -{ - struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); - const struct btree_path *path = p; - - return ck->key.btree_id == path->btree_id && - !bpos_cmp(ck->key.pos, path->pos) ? 
0 : -1; -} - -__flatten -int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, - unsigned flags) +static noinline int +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_cached *ck; @@ -363,80 +463,141 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path path->l[1].b = NULL; - if (bch2_btree_node_relock(trans, path, 0)) { + if (bch2_btree_node_relock_notrace(trans, path, 0)) { ck = (void *) path->l[0].b; goto fill; } retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) { - if (flags & BTREE_ITER_CACHED_NOCREATE) { - path->l[0].b = NULL; - return 0; - } - - ck = btree_key_cache_create(c, path->btree_id, path->pos); + ck = btree_key_cache_create(trans, path); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; if (!ck) goto retry; - mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED); path->locks_want = 1; } else { enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0, - lock_want, - bkey_cached_check_fn, path, _THIS_IP_)) { - if (!trans->restarted) - goto retry; - - ret = -EINTR; + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto err; - } + + BUG_ON(ret); if (ck->key.btree_id != path->btree_id || - bpos_cmp(ck->key.pos, path->pos)) { + !bpos_eq(ck->key.pos, path->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } - mark_btree_node_locked(trans, path, 0, lock_want); + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); } - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); path->l[0].b = (void *) ck; fill: + path->uptodate = BTREE_ITER_UPTODATE; + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + /* + * Using the underscore version because we haven't set + * path->uptodate yet: + */ if (!path->locks_want && !__bch2_btree_path_upgrade(trans, path, 1)) { - trace_transaction_restart_ip(trans->fn, _THIS_IP_); - ret = btree_trans_restart(trans); + trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); goto err; } ret = btree_key_cache_fill(trans, path, ck); if (ret) goto err; + + ret = bch2_btree_path_relock(trans, path, _THIS_IP_); + if (ret) + goto err; + + path->uptodate = BTREE_ITER_UPTODATE; } if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - path->uptodate = BTREE_ITER_UPTODATE; BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + BUG_ON(path->uptodate); return ret; err: - if (ret != -EINTR) { - btree_node_unlock(path, 0); - path->l[0].b = BTREE_ITER_NO_NODE_ERROR; + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); } return ret; } +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + + EBUG_ON(path->level); + + path->l[1].b = NULL; + + if (bch2_btree_node_relock_notrace(trans, path, 0)) { + ck = (void *) path->l[0].b; + goto fill; + } +retry: + ck = 
bch2_btree_key_cache_find(c, path->btree_id, path->pos); + if (!ck) { + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); + + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret) + return ret; + + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + + mark_btree_node_locked(trans, path, 0, + (enum btree_node_locked_type) lock_want); + } + + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; +fill: + if (!ck->valid) + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + + path->uptodate = BTREE_ITER_UPTODATE; + EBUG_ON(!ck->valid); + EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + + return ret; +} + static int btree_key_cache_flush_pos(struct btree_trans *trans, struct bkey_cached_key key, u64 journal_seq, @@ -455,8 +616,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_ALL_SNAPSHOTS); bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_CACHED_NOCREATE| BTREE_ITER_INTENT); b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; @@ -483,7 +642,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, * Since journal reclaim depends on us making progress here, and the * allocator/copygc depend on journal reclaim making progress, we need * to be using alloc reserves: - * */ + */ ret = bch2_btree_iter_traverse(&b_iter) ?: bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_KEY_CACHE_RECLAIM| @@ -492,18 +651,18 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) - ? JOURNAL_WATERMARK_reserved + ? 
BCH_WATERMARK_reclaim : 0)| commit_flags); - if (ret) { - bch2_fs_fatal_err_on(ret != -EINTR && - ret != -EAGAIN && - !bch2_journal_error(j), c, - "error flushing key cache: %i", ret); + + bch2_fs_fatal_err_on(ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && + !bch2_journal_error(j), c, + "error flushing key cache: %s", bch2_err_str(ret)); + if (ret) goto out; - } bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); @@ -516,21 +675,21 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, atomic_long_dec(&c->btree_key_cache.nr_dirty); } } else { + struct btree_path *path2; evict: - BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); - - mark_btree_node_unlocked(c_iter.path, 0); - c_iter.path->l[0].b = NULL; + trans_for_each_path(trans, path2) + if (path2 != c_iter.path) + __bch2_btree_path_unlock(trans, path2); - six_lock_write(&ck->c.lock, NULL, NULL); + bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_dec(&c->btree_key_cache.nr_dirty); } + mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); bkey_cached_evict(&c->btree_key_cache, ck); - bkey_cached_free_fast(&c->btree_key_cache, ck); } out: @@ -546,11 +705,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; - int ret = 0; - + struct btree_trans *trans = bch2_trans_get(c); int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + int ret = 0; - six_lock_read(&ck->c.lock, NULL, NULL); + btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); key = ck->key; if (ck->journal.seq != seq || @@ -558,14 +717,22 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, six_unlock_read(&ck->c.lock); goto unlock; } + + if (ck->seq != seq) { + bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, + bch2_btree_key_cache_journal_flush); + six_unlock_read(&ck->c.lock); + goto unlock; + } six_unlock_read(&ck->c.lock); - ret = bch2_trans_do(c, NULL, NULL, 0, - btree_key_cache_flush_pos(&trans, key, seq, + ret = commit_do(trans, NULL, NULL, 0, + btree_key_cache_flush_pos(trans, key, seq, BTREE_INSERT_JOURNAL_RECLAIM, false)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + bch2_trans_put(trans); return ret; } @@ -586,21 +753,22 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, } bool bch2_btree_insert_key_cached(struct btree_trans *trans, - struct btree_path *path, - struct bkey_i *insert) + unsigned flags, + struct btree_insert_entry *insert_entry) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_i *insert = insert_entry->k; bool kick_reclaim = false; - BUG_ON(insert->u64s > ck->u64s); + BUG_ON(insert->k.u64s > ck->u64s); - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { int difference; - BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); + BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); - difference = jset_u64s(insert->u64s) - ck->res.u64s; + difference = jset_u64s(insert->k.u64s) - ck->res.u64s; if (difference > 0) { trans->journal_preres.u64s -= difference; ck->res.u64s += difference; @@ -611,6 +779,7 @@ bool bch2_btree_insert_key_cached(struct 
btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); set_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_inc(&c->btree_key_cache.nr_dirty); @@ -618,8 +787,24 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, kick_reclaim = true; } - bch2_journal_pin_update(&c->journal, trans->journal_res.seq, - &ck->journal, bch2_btree_key_cache_journal_flush); + /* + * To minimize lock contention, we only add the journal pin here and + * defer pin updates to the flush callback via ->seq. Be careful not to + * update ->seq on nojournal commits because we don't want to update the + * pin to a seq that doesn't include journal updates on disk. Otherwise + * we risk losing the update after a crash. + * + * The only exception is if the pin is not active in the first place. We + * have to add the pin because journal reclaim drives key cache + * flushing. The flush callback will not proceed unless ->seq matches + * the latest pin, so make sure it starts with a consistent value. + */ + if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + !journal_pin_active(&ck->journal)) { + ck->seq = trans->journal_res.seq; + } + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + &ck->journal, bch2_btree_key_cache_journal_flush); if (kick_reclaim) journal_reclaim_kick(&c->journal); @@ -629,11 +814,22 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, void bch2_btree_key_cache_drop(struct btree_trans *trans, struct btree_path *path) { + struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; - ck->valid = false; + BUG_ON(!ck->valid); - BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + /* + * We just did an update to the btree, bypassing the key cache: the key + * cache key is now stale and must be dropped, even if dirty: + */ + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + bch2_journal_pin_drop(&c->journal, &ck->journal); + } + + ck->valid = false; } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, @@ -648,12 +844,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, unsigned start, flags; int srcu_idx; - /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) - return -1; - + mutex_lock(&bc->lock); srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); @@ -661,12 +852,29 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, * Newest freed entries are at the end of the list - once we hit one * that's too new to be freed, we can bail out: */ - list_for_each_entry_safe(ck, t, &bc->freed, list) { + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + six_lock_exit(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + atomic_long_dec(&bc->nr_freed); + scanned++; + freed++; + } + + if (scanned >= nr) + goto out; + + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ck->btree_trans_barrier_seq)) break; list_del(&ck->list); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); scanned++; @@ -740,35 +948,49 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) 
struct bucket_table *tbl; struct bkey_cached *ck, *n; struct rhash_head *pos; + LIST_HEAD(items); unsigned i; +#ifdef __KERNEL__ int cpu; +#endif - if (bc->shrink.list.next) - unregister_shrinker(&bc->shrink); + unregister_shrinker(&bc->shrink); mutex_lock(&bc->lock); - rcu_read_lock(); - tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) - for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed); - } - rcu_read_unlock(); + /* + * The loop is needed to guard against racing with rehash: + */ + while (atomic_long_read(&bc->nr_keys)) { + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (tbl) + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &items); + } + rcu_read_unlock(); + } +#ifdef __KERNEL__ for_each_possible_cpu(cpu) { struct btree_key_cache_freelist *f = per_cpu_ptr(bc->pcpu_freed, cpu); for (i = 0; i < f->nr; i++) { ck = f->objs[i]; - list_add(&ck->list, &bc->freed); + list_add(&ck->list, &items); } } +#endif + + list_splice(&bc->freed_pcpu, &items); + list_splice(&bc->freed_nonpcpu, &items); + + mutex_unlock(&bc->lock); - list_for_each_entry_safe(ck, n, &bc->freed, list) { + list_for_each_entry_safe(ck, n, &items, list) { cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); @@ -776,15 +998,19 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_del(&ck->list); kfree(ck->k); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); } - BUG_ON(atomic_long_read(&bc->nr_dirty) && - !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_WAS_RW, &c->flags)); - BUG_ON(atomic_long_read(&bc->nr_keys)); + if (atomic_long_read(&bc->nr_dirty) && + !bch2_journal_error(&c->journal) && + test_bit(BCH_FS_WAS_RW, &c->flags)) + panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", + atomic_long_read(&bc->nr_dirty)); - mutex_unlock(&bc->lock); + if (atomic_long_read(&bc->nr_keys)) + panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", + atomic_long_read(&bc->nr_keys)); if (bc->table_init_done) rhashtable_destroy(&bc->table); @@ -795,54 +1021,64 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) { mutex_init(&c->lock); - INIT_LIST_HEAD(&c->freed); + INIT_LIST_HEAD(&c->freed_pcpu); + INIT_LIST_HEAD(&c->freed_nonpcpu); } -static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) +static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) { struct btree_key_cache *bc = container_of(shrink, struct btree_key_cache, shrink); + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); - bch2_btree_key_cache_to_text(out, bc); + bch2_btree_key_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); } -int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) +int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { - int ret; + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - c->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); - if (!c->pcpu_freed) - return -ENOMEM; +#ifdef __KERNEL__ + bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); + if (!bc->pcpu_freed) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; +#endif - ret = rhashtable_init(&c->table, &bch2_btree_key_cache_params); - if 
(ret) - return ret; + if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; - c->table_init_done = true; + bc->table_init_done = true; - c->shrink.seeks = 1; - c->shrink.count_objects = bch2_btree_key_cache_count; - c->shrink.scan_objects = bch2_btree_key_cache_scan; - c->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; - return register_shrinker(&c->shrink); + bc->shrink.seeks = 0; + bc->shrink.count_objects = bch2_btree_key_cache_count; + bc->shrink.scan_objects = bch2_btree_key_cache_scan; + bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; + if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return 0; } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) { - prt_printf(out, "nr_freed:\t%zu\n", atomic_long_read(&c->nr_freed)); - prt_printf(out, "nr_keys:\t%lu\n", atomic_long_read(&c->nr_keys)); - prt_printf(out, "nr_dirty:\t%lu\n", atomic_long_read(&c->nr_dirty)); + prt_printf(out, "nr_freed:\t%lu", atomic_long_read(&c->nr_freed)); + prt_newline(out); + prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); + prt_newline(out); + prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); + prt_newline(out); } void bch2_btree_key_cache_exit(void) { - if (bch2_key_cache) - kmem_cache_destroy(bch2_key_cache); + kmem_cache_destroy(bch2_key_cache); } int __init bch2_btree_key_cache_init(void) { - bch2_key_cache = KMEM_CACHE(bkey_cached, 0); + bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); if (!bch2_key_cache) return -ENOMEM;
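
For readers following the freelist rework earlier in this patch (bkey_cached_free_fast(), bkey_cached_move_to_freelist() and bkey_cached_alloc()): frees first land in a small per-CPU array, and only when that array is full does the code take bc->lock and spill half of it onto the shared freed_nonpcpu list; allocation runs the same path in reverse, refilling the per-CPU array from the shared list under the lock. The sketch below is an illustrative userspace analogue, not bcachefs code: a __thread array stands in for this_cpu_ptr(bc->pcpu_freed), a pthread mutex for bc->lock plus preempt_disable(), and the SRCU grace-period ordering (btree_trans_barrier_seq) and the separate freed_pcpu list are omitted.

/*
 * Illustrative sketch only -- a userspace analogue of the per-CPU freelist
 * pattern above: free into a small per-thread array when there is room,
 * otherwise take the shared lock and spill half of the array.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	struct obj	*next;
	int		id;
};

#define PCPU_NR	16

struct pcpu_freelist {
	struct obj	*objs[PCPU_NR];
	unsigned	nr;
};

static struct obj *shared_free;			/* stand-in for bc->freed_nonpcpu */
static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;
static __thread struct pcpu_freelist pcpu;	/* stand-in for this_cpu_ptr(bc->pcpu_freed) */

static void obj_free_fast(struct obj *o)
{
	if (pcpu.nr < PCPU_NR) {		/* fast path: no lock taken */
		pcpu.objs[pcpu.nr++] = o;
		return;
	}

	/* slow path: spill half of the per-thread array, then free o itself */
	pthread_mutex_lock(&shared_lock);
	while (pcpu.nr > PCPU_NR / 2) {
		struct obj *spill = pcpu.objs[--pcpu.nr];

		spill->next = shared_free;
		shared_free = spill;
	}
	o->next = shared_free;
	shared_free = o;
	pthread_mutex_unlock(&shared_lock);
}

static struct obj *obj_alloc(void)
{
	struct obj *o;

	if (pcpu.nr)				/* fast path: per-thread cache */
		return pcpu.objs[--pcpu.nr];

	/* slow path: refill the per-thread array from the shared list */
	pthread_mutex_lock(&shared_lock);
	while (shared_free && pcpu.nr < PCPU_NR / 2) {
		o = shared_free;
		shared_free = o->next;
		pcpu.objs[pcpu.nr++] = o;
	}
	o = pcpu.nr ? pcpu.objs[--pcpu.nr] : NULL;
	pthread_mutex_unlock(&shared_lock);

	return o ? o : calloc(1, sizeof(*o));	/* fall back to the allocator */
}

int main(void)
{
	struct obj *o = obj_alloc();

	if (!o)
		return 1;

	o->id = 42;
	obj_free_fast(o);
	printf("cached %u object(s) on the per-thread freelist\n", pcpu.nr);
	return 0;
}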