X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_cache.c;h=1aacd271021b475433ddc5b546dc93744395b1bf;hb=86bd5c622c93509e204b6fb5de6910879fc87a70;hp=0c737f35f430aa21af52a5832936279da97be716;hpb=b74c93d43cb5e02c169569b4d4ca312548b964ba;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 0c737f3..1aacd27 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -1,22 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" +#include "error.h" #include #include #include -const char * const bch2_btree_ids[] = { -#define x(kwd, val, name) name, - BCH_BTREE_IDS() -#undef x - NULL -}; +struct lock_class_key bch2_btree_node_lock_key; void bch2_recalc_btree_reserve(struct bch_fs *c) { @@ -28,7 +25,7 @@ void bch2_recalc_btree_reserve(struct bch_fs *c) for (i = 0; i < BTREE_ID_NR; i++) if (c->btree_roots[i].b) reserve += min_t(unsigned, 1, - c->btree_roots[i].b->level) * 8; + c->btree_roots[i].b->c.level) * 8; c->btree_cache.reserve = reserve; } @@ -38,20 +35,21 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc) return max_t(int, 0, bc->used - bc->reserve); } -static void __btree_node_data_free(struct bch_fs *c, struct btree *b) +static void btree_node_data_free(struct bch_fs *c, struct btree *b) { + struct btree_cache *bc = &c->btree_cache; + EBUG_ON(btree_node_write_in_flight(b)); kvpfree(b->data, btree_bytes(c)); b->data = NULL; - bch2_btree_keys_free(b); -} - -static void btree_node_data_free(struct bch_fs *c, struct btree *b) -{ - struct btree_cache *bc = &c->btree_cache; +#ifdef __KERNEL__ + vfree(b->aux_data); +#else + munmap(b->aux_data, btree_aux_data_bytes(b)); +#endif + b->aux_data = NULL; - __btree_node_data_free(c, b); bc->used--; list_move(&b->list, &bc->freed); } @@ -62,49 +60,68 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct btree *b = obj; const u64 *v = arg->key; - return PTR_HASH(&b->key) == *v ? 0 : 1; + return b->hash_val == *v ? 
0 : 1; } static const struct rhashtable_params bch_btree_cache_params = { .head_offset = offsetof(struct btree, hash), - .key_offset = offsetof(struct btree, key.v), - .key_len = sizeof(struct bch_extent_ptr), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), .obj_cmpfn = bch2_btree_cache_cmp_fn, }; -static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { - struct btree_cache *bc = &c->btree_cache; + BUG_ON(b->data || b->aux_data); b->data = kvpmalloc(btree_bytes(c), gfp); if (!b->data) - goto err; - - if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) - goto err; + return -ENOMEM; +#ifdef __KERNEL__ + b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); +#else + b->aux_data = mmap(NULL, btree_aux_data_bytes(b), + PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); +#endif + if (!b->aux_data) { + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; + return -ENOMEM; + } - bc->used++; - list_move(&b->list, &bc->freeable); - return; -err: - kvpfree(b->data, btree_bytes(c)); - b->data = NULL; - list_move(&b->list, &bc->freed); + return 0; } -static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) +static struct btree *__btree_node_mem_alloc(struct bch_fs *c) { - struct btree *b = kzalloc(sizeof(struct btree), gfp); + struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); if (!b) return NULL; bkey_btree_ptr_init(&b->key); - six_lock_init(&b->lock); + __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); + b->byte_order = ilog2(btree_bytes(c)); + return b; +} - btree_node_data_alloc(c, b, gfp); - return b->data ? 
b : NULL; +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b = __btree_node_mem_alloc(c); + if (!b) + return NULL; + + if (btree_node_data_alloc(c, b, GFP_KERNEL)) { + kfree(b); + return NULL; + } + + bc->used++; + list_add(&b->list, &bc->freeable); + return b; } /* Btree in memory cache - hash table */ @@ -114,11 +131,16 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); /* Cause future lookups for this node to fail: */ - PTR_HASH(&b->key) = 0; + b->hash_val = 0; + + six_lock_wakeup_all(&b->c.lock); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) { + BUG_ON(b->hash_val); + b->hash_val = btree_ptr_hash_val(&b->key); + return rhashtable_lookup_insert_fast(&bc->table, &b->hash, bch_btree_cache_params); } @@ -128,8 +150,13 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, { int ret; - b->level = level; - b->btree_id = id; + b->c.level = level; + b->c.btree_id = id; + + if (level) + six_lock_pcpu_alloc(&b->c.lock); + else + six_lock_pcpu_free_rcu(&b->c.lock); mutex_lock(&bc->lock); ret = __bch2_btree_node_hash_insert(bc, b); @@ -144,8 +171,9 @@ __flatten static inline struct btree *btree_cache_find(struct btree_cache *bc, const struct bkey_i *k) { - return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k), - bch_btree_cache_params); + u64 v = btree_ptr_hash_val(k); + + return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); } /* @@ -158,54 +186,67 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); +wait_on_io: + if (b->flags & ((1U << BTREE_NODE_dirty)| + (1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + return -ENOMEM; + + /* XXX: waiting on IO with btree cache lock held */ + bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } - if (!six_trylock_intent(&b->lock)) + if (!six_trylock_intent(&b->c.lock)) return -ENOMEM; - if (!six_trylock_write(&b->lock)) + if (!six_trylock_write(&b->c.lock)) goto out_unlock_intent; + /* recheck under lock */ + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) + goto out_unlock; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + if (btree_node_noevict(b)) goto out_unlock; if (!btree_node_may_write(b)) goto out_unlock; - if (btree_node_dirty(b) && - test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) - goto out_unlock; - - if (btree_node_dirty(b) || - btree_node_write_in_flight(b) || - btree_node_read_in_flight(b)) { - if (!flush) + if (btree_node_dirty(b)) { + if (!flush || + test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) goto out_unlock; - - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); - /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted * - unless btree verify mode is enabled, since it runs out of * the post write cleanup: */ - if (verify_btree_ondisk(c)) + if (bch2_verify_btree_ondisk) bch2_btree_node_write(c, b, SIX_LOCK_intent); else - __bch2_btree_node_write(c, b, SIX_LOCK_read); + __bch2_btree_node_write(c, b, false); - /* wait for any in flight btree write */ - btree_node_wait_on_io(b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; } out: - if 
(PTR_HASH(&b->key) && !ret) + if (b->hash_val && !ret) trace_btree_node_reap(c, b); return ret; out_unlock: - six_unlock_write(&b->lock); + six_unlock_write(&b->c.lock); out_unlock_intent: - six_unlock_intent(&b->lock); + six_unlock_intent(&b->c.lock); ret = -ENOMEM; goto out; } @@ -231,17 +272,19 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long can_free; unsigned long touched = 0; unsigned long freed = 0; - unsigned i; + unsigned i, flags; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) mutex_lock(&bc->lock); else if (!mutex_trylock(&bc->lock)) return -1; + flags = memalloc_nofs_save(); + /* * It's _really_ critical that we don't free too many btree nodes - we * have to always leave ourselves a reserve. The reserve is how we @@ -263,8 +306,8 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (++i > 3 && !btree_node_reclaim(c, b)) { btree_node_data_free(c, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); freed++; } } @@ -290,13 +333,13 @@ restart: mutex_unlock(&bc->lock); bch2_btree_node_hash_remove(bc, b); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); if (freed >= nr) goto out; - if (sc->gfp_mask & __GFP_IO) + if (sc->gfp_mask & __GFP_FS) mutex_lock(&bc->lock); else if (!mutex_trylock(&bc->lock)) goto out; @@ -307,6 +350,7 @@ restart: mutex_unlock(&bc->lock); out: + memalloc_nofs_restore(flags); return (unsigned long) freed * btree_pages(c); } @@ -317,7 +361,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, btree_cache.shrink); struct btree_cache *bc = &c->btree_cache; - if (btree_shrinker_disabled(c)) + if (bch2_btree_shrinker_disabled) return 0; return btree_cache_can_free(bc) * btree_pages(c); @@ -327,19 +371,19 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - unsigned i; + unsigned i, flags; if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); + /* vfree() can allocate memory: */ + flags = memalloc_nofs_save(); mutex_lock(&bc->lock); -#ifdef CONFIG_BCACHEFS_DEBUG if (c->verify_data) list_move(&c->verify_data->list, &bc->live); kvpfree(c->verify_ondisk, btree_bytes(c)); -#endif for (i = 0; i < BTREE_ID_NR; i++) if (c->btree_roots[i].b) @@ -355,18 +399,22 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (btree_node_dirty(b)) bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty(b); + clear_btree_node_dirty(c, b); btree_node_data_free(c, b); } + BUG_ON(atomic_read(&c->btree_cache.dirty)); + while (!list_empty(&bc->freed)) { b = list_first_entry(&bc->freed, struct btree, list); list_del(&b->list); + six_lock_pcpu_free(&b->c.lock); kfree(b); } mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); if (bc->table_init_done) rhashtable_destroy(&bc->table); @@ -389,36 +437,20 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); for (i = 0; i < bc->reserve; i++) - if (!btree_node_mem_alloc(c, GFP_KERNEL)) { + if (!__bch2_btree_node_mem_alloc(c)) { ret = -ENOMEM; goto out; } list_splice_init(&bc->live, &bc->freeable); -#ifdef CONFIG_BCACHEFS_DEBUG mutex_init(&c->verify_lock); - c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); - if (!c->verify_ondisk) { - ret = 
-ENOMEM; - goto out; - } - - c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); - if (!c->verify_data) { - ret = -ENOMEM; - goto out; - } - - list_del_init(&c->verify_data->list); -#endif - bc->shrink.count_objects = bch2_btree_cache_count; bc->shrink.scan_objects = bch2_btree_cache_scan; bc->shrink.seeks = 4; bc->shrink.batch = btree_pages(c) * 2; - register_shrinker(&bc->shrink); + ret = register_shrinker(&bc->shrink); out: pr_verbose_init(c->opts, "ret %i", ret); return ret; @@ -520,36 +552,43 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) */ list_for_each_entry(b, &bc->freeable, list) if (!btree_node_reclaim(c, b)) - goto out_unlock; + goto got_node; /* * We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, &bc->freed, list) - if (!btree_node_reclaim(c, b)) { - btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO); - if (b->data) - goto out_unlock; + if (!btree_node_reclaim(c, b)) + goto got_node; - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + b = NULL; +got_node: + if (b) + list_del_init(&b->list); + mutex_unlock(&bc->lock); + + if (!b) { + b = __btree_node_mem_alloc(c); + if (!b) goto err; - } - b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO); - if (!b) - goto err; + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); + } + + if (!b->data) { + if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + goto err; + + mutex_lock(&bc->lock); + bc->used++; + mutex_unlock(&bc->lock); + } - BUG_ON(!six_trylock_intent(&b->lock)); - BUG_ON(!six_trylock_write(&b->lock)); -out_unlock: BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_write_in_flight(b)); - - list_del_init(&b->list); - mutex_unlock(&bc->lock); - memalloc_nofs_restore(flags); out: b->flags = 0; b->written = 0; @@ -557,13 +596,23 @@ out: b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; - bch2_btree_keys_init(b, &c->expensive_debug_checks); + bch2_btree_keys_init(b); + set_btree_node_accessed(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); + memalloc_nofs_restore(flags); return b; err: + mutex_lock(&bc->lock); + + if (b) { + list_add(&b->list, &bc->freed); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } + /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { b = btree_node_cannibalize(c); @@ -577,6 +626,7 @@ err: } mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); return ERR_PTR(-ENOMEM); } @@ -584,67 +634,117 @@ err: static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, + enum btree_id btree_id, unsigned level, enum six_lock_type lock_type, bool sync) { struct btree_cache *bc = &c->btree_cache; struct btree *b; + u32 seq; + BUG_ON(level + 1 >= BTREE_MAX_DEPTH); /* * Parent node must be locked, else we could read in a btree node that's * been freed: */ - BUG_ON(!btree_node_locked(iter, level + 1)); - BUG_ON(level >= BTREE_MAX_DEPTH); + if (iter && !bch2_btree_node_relock(iter, level + 1)) + return ERR_PTR(-EINTR); b = bch2_btree_node_mem_alloc(c); if (IS_ERR(b)) return b; bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) { + if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ /* mark as unhashed... 
*/ - PTR_HASH(&b->key) = 0; + b->hash_val = 0; mutex_lock(&bc->lock); list_add(&b->list, &bc->freeable); mutex_unlock(&bc->lock); - six_unlock_write(&b->lock); - six_unlock_intent(&b->lock); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); return NULL; } - /* - * If the btree node wasn't cached, we can't drop our lock on - * the parent until after it's added to the cache - because - * otherwise we could race with a btree_split() freeing the node - * we're trying to lock. - * - * But the deadlock described below doesn't exist in this case, - * so it's safe to not drop the parent lock until here: - */ - if (btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); + set_btree_node_read_in_flight(b); - bch2_btree_node_read(c, b, sync); + six_unlock_write(&b->c.lock); + seq = b->c.lock.state.seq; + six_unlock_intent(&b->c.lock); + + /* Unlock before doing IO: */ + if (iter && sync) + bch2_trans_unlock(iter->trans); - six_unlock_write(&b->lock); + bch2_btree_node_read(c, b, sync); - if (!sync) { - six_unlock_intent(&b->lock); + if (!sync) return NULL; - } - if (lock_type == SIX_LOCK_read) - six_lock_downgrade(&b->lock); + /* + * XXX: this will probably always fail because btree_iter_relock() + * currently fails for iterators that aren't pointed at a valid btree + * node + */ + if (iter && + (!bch2_trans_relock(iter->trans) || + !bch2_btree_iter_relock(iter, _THIS_IP_))) + return ERR_PTR(-EINTR); + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + return ERR_PTR(-EINTR); return b; } +static int lock_node_check_fn(struct six_lock *lock, void *p) +{ + struct btree *b = container_of(lock, struct btree, c.lock); + const struct bkey_i *k = p; + + return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; +} + +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) +{ + char buf1[100], buf2[100], buf3[100], buf4[100]; + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; + + bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key + : POS_MIN); + bch2_bpos_to_text(&PBUF(buf2), b->data->min_key); + + bch2_bpos_to_text(&PBUF(buf3), b->key.k.p); + bch2_bpos_to_text(&PBUF(buf4), b->data->max_key); + bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n" + "btree: ptr %u header %llu\n" + "level: ptr %u header %llu\n" + "min ptr %s node header %s\n" + "max ptr %s node header %s", + b->c.btree_id, BTREE_NODE_ID(b->data), + b->c.level, BTREE_NODE_LEVEL(b->data), + buf1, buf2, buf3, buf4); +} + +static inline void btree_check_header(struct bch_fs *c, struct btree *b) +{ + if (b->c.btree_id != BTREE_NODE_ID(b->data) || + b->c.level != BTREE_NODE_LEVEL(b->data) || + bpos_cmp(b->data->max_key, b->key.k.p) || + (b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bpos_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) + btree_bad_header(c, b); +} + /** * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. 
@@ -656,22 +756,18 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, */ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type) + enum six_lock_type lock_type, + unsigned long trace_ip) { struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; - /* - * XXX: locking optimization - * - * we can make the locking looser here - caller can drop lock on parent - * node before locking child node (and potentially blocking): we just - * have to have bch2_btree_node_fill() call relock on the parent and - * return -EINTR if that fails - */ - EBUG_ON(!btree_node_locked(iter, level + 1)); EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -680,7 +776,8 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, iter, k, level, lock_type, true); + b = bch2_btree_node_fill(c, iter, k, iter->btree_id, + level, lock_type, true); /* We raced and found the btree node in the cache */ if (!b) @@ -689,6 +786,7 @@ retry: if (IS_ERR(b)) return b; } else { +lock_node: /* * There's a potential deadlock with splits and insertions into * interior nodes we have to avoid: @@ -710,7 +808,7 @@ retry: * free it: * * To guard against this, btree nodes are evicted from the cache - * when they're freed - and PTR_HASH() is zeroed out, which we + * when they're freed - and b->hash_val is zeroed out, which we * check for after we lock the node. * * Then, bch2_btree_node_relock() on the parent will fail - because @@ -720,23 +818,49 @@ retry: if (btree_node_read_locked(iter, level + 1)) btree_node_unlock(iter, level + 1); - if (!btree_node_lock(b, k->k.p, level, iter, lock_type)) + if (!btree_node_lock(b, k->k.p, level, iter, lock_type, + lock_node_check_fn, (void *) k, trace_ip)) { + if (b->hash_val != btree_ptr_hash_val(k)) + goto retry; return ERR_PTR(-EINTR); + } - if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) || - b->level != level || + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.level != level || race_fault())) { - six_unlock_type(&b->lock, lock_type); + six_unlock_type(&b->c.lock, lock_type); if (bch2_btree_node_relock(iter, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(iter->trans->ip); + trace_trans_restart_btree_node_reused(iter->trans->ip, + trace_ip, + iter->btree_id, + &iter->real_pos); return ERR_PTR(-EINTR); } } - wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, - TASK_UNINTERRUPTIBLE); + if (unlikely(btree_node_read_in_flight(b))) { + u32 seq = b->c.lock.state.seq; + + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(iter->trans); + + bch2_btree_node_wait_on_read(b); + + /* + * XXX: check if this always fails - btree_iter_relock() + * currently fails for iterators that aren't pointed at a valid + * btree node + */ + if (iter && + (!bch2_trans_relock(iter->trans) || + !bch2_btree_iter_relock(iter, _THIS_IP_))) + return ERR_PTR(-EINTR); + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + goto retry; + } prefetch(b->aux_data); @@ -749,136 +873,154 @@ retry: } /* avoid atomic set bit if it's not needed: */ - if (btree_node_accessed(b)) + if (!btree_node_accessed(b)) set_btree_node_accessed(b); if (unlikely(btree_node_read_error(b))) { - six_unlock_type(&b->lock, lock_type); + six_unlock_type(&b->c.lock, lock_type); return ERR_PTR(-EIO); } - EBUG_ON(b->btree_id != iter->btree_id || 
- BTREE_NODE_LEVEL(b->data) != level || - bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); return b; } -struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, - struct btree_iter *iter, - struct btree *b, - enum btree_node_sibling sib) +struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, + const struct bkey_i *k, + enum btree_id btree_id, + unsigned level, + bool nofill) { - struct btree_trans *trans = iter->trans; - struct btree *parent; - struct btree_node_iter node_iter; - struct bkey_packed *k; - BKEY_PADDED(k) tmp; - struct btree *ret = NULL; - unsigned level = b->level; - - parent = btree_iter_node(iter, level + 1); - if (!parent) - return NULL; - - if (!bch2_btree_node_relock(iter, level + 1)) { - ret = ERR_PTR(-EINTR); - goto out; - } - - node_iter = iter->l[parent->level].iter; - - k = bch2_btree_node_iter_peek_all(&node_iter, parent); - BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); - - k = sib == btree_prev_sib - ? bch2_btree_node_iter_prev(&node_iter, parent) - : (bch2_btree_node_iter_advance(&node_iter, parent), - bch2_btree_node_iter_peek(&node_iter, parent)); - if (!k) - goto out; - - bch2_bkey_unpack(parent, &tmp.k, k); - - ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; + int ret; - if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { - struct btree_iter *linked; + EBUG_ON(level >= BTREE_MAX_DEPTH); - if (!bch2_btree_node_relock(iter, level + 1)) + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { + if (nofill) goto out; - /* - * We might have got -EINTR because trylock failed, and we're - * holding other locks that would cause us to deadlock: - */ - trans_for_each_iter(trans, linked) - if (btree_iter_cmp(iter, linked) < 0) - __bch2_btree_iter_unlock(linked); - - if (sib == btree_prev_sib) - btree_node_unlock(iter, level); + b = bch2_btree_node_fill(c, NULL, k, btree_id, + level, SIX_LOCK_read, true); - ret = bch2_btree_node_get(c, iter, &tmp.k, level, - SIX_LOCK_intent); + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; - /* - * before btree_iter_relock() calls btree_iter_verify_locks(): - */ - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); + if (IS_ERR(b) && + !bch2_btree_cache_cannibalize_lock(c, NULL)) + goto retry; - if (!bch2_btree_node_relock(iter, level)) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + if (IS_ERR(b)) + goto out; + } else { +lock_node: + ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); + if (ret) + goto retry; - if (!IS_ERR(ret)) { - six_unlock_intent(&ret->lock); - ret = ERR_PTR(-EINTR); - } + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.btree_id != btree_id || + b->c.level != level)) { + six_unlock_read(&b->c.lock); + goto retry; } - - bch2_trans_relock(trans); } -out: - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); - - if (PTR_ERR_OR_ZERO(ret) == -EINTR) - bch2_btree_iter_upgrade(iter, level + 2); - BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); + /* XXX: waiting on IO with btree locks held: */ + __bch2_btree_node_wait_on_read(b); - if (!IS_ERR_OR_NULL(ret)) { - struct btree *n1 = ret, *n2 = b; + prefetch(b->aux_data); - if (sib != btree_prev_sib) - swap(n1, n2); + 
for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; - BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id, - n1->key.k.p), - n2->data->min_key)); + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); } - bch2_btree_trans_verify_locks(trans); + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); - return ret; + if (unlikely(btree_node_read_error(b))) { + six_unlock_read(&b->c.lock); + b = ERR_PTR(-EIO); + goto out; + } + + EBUG_ON(b->c.btree_id != btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); +out: + bch2_btree_cache_cannibalize_unlock(c); + return b; } void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, - const struct bkey_i *k, unsigned level) + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(!btree_node_locked(iter, level + 1)); + BUG_ON(iter && !btree_node_locked(iter, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); if (b) return; - bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false); + bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); +} + +void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + b = btree_cache_find(bc, k); + if (!b) + return; +wait_on_io: + /* not allowed to wait on io with btree locks held: */ + + /* XXX we're called from btree_gc which will be holding other btree + * nodes locked + * */ + __bch2_btree_node_wait_on_read(b); + __bch2_btree_node_wait_on_write(b); + + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); + + if (btree_node_dirty(b)) { + __bch2_btree_node_write(c, b, false); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + + BUG_ON(btree_node_dirty(b)); + + mutex_lock(&bc->lock); + btree_node_data_free(c, b); + bch2_btree_node_hash_remove(bc, b); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, @@ -891,20 +1033,19 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, bch2_btree_keys_stats(b, &stats); - pr_buf(out, - "l %u %llu:%llu - %llu:%llu:\n" - " ptrs: ", - b->level, - b->data->min_key.inode, - b->data->min_key.offset, - b->data->max_key.inode, - b->data->max_key.offset); + pr_buf(out, "l %u ", b->c.level); + bch2_bpos_to_text(out, b->data->min_key); + pr_buf(out, " - "); + bch2_bpos_to_text(out, b->data->max_key); + pr_buf(out, ":\n" + " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + pr_buf(out, "\n" " format: u64s %u fields %u %u %u %u %u\n" " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %zu)\n" + " sib u64s: %u, %u (merge threshold %u)\n" " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" @@ -921,9 +1062,16 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], - BTREE_FOREGROUND_MERGE_THRESHOLD(c), + c->btree_foreground_merge_threshold, b->nr.packed_keys, b->nr.unpacked_keys, stats.floats, stats.failed); } + +void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) +{ + pr_buf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); + pr_buf(out, "nr 
dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); + pr_buf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); +}
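
A note on the hash-table keying change in the hunks above: the cache formerly keyed its rhashtable on the raw extent pointer embedded in the node key (key_offset = offsetof(struct btree, key.v), compared via PTR_HASH(&b->key)); the patch switches to a precomputed 64-bit value stored in b->hash_val, set from btree_ptr_hash_val(&b->key) in __bch2_btree_node_hash_insert() and zeroed in bch2_btree_node_hash_remove() so that stale lookups fail, with btree_cache_find() now looking up by that u64. The sketch below is only an illustrative userspace analogue of that pattern, not bcachefs code; struct node, cache_insert(), cache_find(), cache_remove() and TABLE_SIZE are hypothetical names invented for the example.

        #include <stdint.h>
        #include <stddef.h>
        #include <stdio.h>

        /*
         * Illustrative analogue only: a tiny fixed-size table keyed on a
         * precomputed 64-bit value, mirroring how the patch keys struct btree
         * on b->hash_val instead of the raw pointer bytes of the key.
         */
        struct node {
                uint64_t        hash_val;       /* 0 means "not hashed": lookups must fail */
                const char      *payload;
        };

        #define TABLE_SIZE 64
        static struct node *table[TABLE_SIZE];

        static int cache_insert(struct node *n, uint64_t key)
        {
                size_t slot = key % TABLE_SIZE;

                if (table[slot])
                        return -1;      /* raced with another fill; caller handles it */
                n->hash_val = key;      /* store the lookup key on the object itself */
                table[slot] = n;
                return 0;
        }

        static struct node *cache_find(uint64_t key)
        {
                struct node *n = table[key % TABLE_SIZE];

                /* compare against the stored value, as bch2_btree_cache_cmp_fn() does */
                return n && n->hash_val == key ? n : NULL;
        }

        static void cache_remove(struct node *n, uint64_t key)
        {
                /* cause future lookups for this node to fail */
                n->hash_val = 0;
                table[key % TABLE_SIZE] = NULL;
        }

        int main(void)
        {
                struct node a = { .payload = "btree node" };

                cache_insert(&a, 0x1234);
                printf("%s\n", cache_find(0x1234) ? "hit" : "miss");
                cache_remove(&a, 0x1234);
                printf("%s\n", cache_find(0x1234) ? "hit" : "miss");
                return 0;
        }

The analogue deliberately omits collision handling and locking; in the patch those concerns are covered by rhashtable itself and by the six-lock sequence/relock dance (six_relock_type() against a saved b->c.lock.state.seq) visible in bch2_btree_node_get() and bch2_btree_node_fill().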