X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_cache.c;h=9b7ea1227069e6d73d53ef15fa0d1ee3afaadd5e;hb=f3f005c76eb5636542a8f5b137bd1904d57e8f86;hp=2ca0a9d8226b1610f3b842f21a1d035f61685728;hpb=c1f55a60c41ca5ab8ed7a0893c3d29f8006da82a;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 2ca0a9d..9b7ea12 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -9,16 +9,11 @@ #include "debug.h" #include "errcode.h" #include "error.h" +#include "journal.h" +#include "trace.h" #include #include -#include - -#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ -do { \ - if (shrinker_counter) \ - bc->not_freed_##counter++; \ -} while (0) const char * const bch2_btree_node_flags[] = { #define x(f) #f, @@ -31,13 +26,15 @@ void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; - if (!c->btree_roots[0].b) + if (!c->btree_roots_known[0].b) reserve += 8; - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].b) - reserve += min_t(unsigned, 1, - c->btree_roots[i].b->c.level) * 8; + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->b) + reserve += min_t(unsigned, 1, r->b->c.level) * 8; + } c->btree_cache.reserve = reserve; } @@ -61,10 +58,12 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) EBUG_ON(btree_node_write_in_flight(b)); - kvpfree(b->data, btree_bytes(c)); + clear_btree_node_just_written(b); + + kvfree(b->data); b->data = NULL; #ifdef __KERNEL__ - vfree(b->aux_data); + kvfree(b->aux_data); #else munmap(b->aux_data, btree_aux_data_bytes(b)); #endif @@ -95,11 +94,11 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_bytes(c), gfp); + b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ - b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); + b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); #else b->aux_data = mmap(NULL, btree_aux_data_bytes(b), PROT_READ|PROT_WRITE|PROT_EXEC, @@ -108,9 +107,9 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_bytes(c)); + kvfree(b->data); b->data = NULL; - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } return 0; @@ -125,13 +124,9 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) return NULL; bkey_btree_ptr_init(&b->key); - __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - lockdep_set_no_check_recursion(&b->c.lock.dep_map); -#endif INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); - b->byte_order = ilog2(btree_bytes(c)); + b->byte_order = ilog2(c->opts.btree_node_size); return b; } @@ -149,6 +144,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } + bch2_btree_lock_init(&b->c, 0); + bc->used++; list_add(&b->list, &bc->freeable); return b; @@ -205,7 +202,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, * this version is for btree nodes that have already been freed (we're not * reaping a real btree node) */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) { struct btree_cache *bc = &c->btree_cache; int ret = 0; @@ -215,64 +212,38 @@ wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_dirty(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); - else if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); - else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); - return -ENOMEM; - } + if (!flush) + return -BCH_ERR_ENOMEM_btree_node_reclaim; /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } - if (!six_trylock_intent(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); - return -ENOMEM; - } + if (!six_trylock_intent(&b->c.lock)) + return -BCH_ERR_ENOMEM_btree_node_reclaim; - if (!six_trylock_write(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); + if (!six_trylock_write(&b->c.lock)) goto out_unlock_intent; - } /* recheck under lock */ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); - else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + if (!flush) goto out_unlock; - } six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; } - if (btree_node_noevict(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(noevict); + if (btree_node_noevict(b) || + btree_node_write_blocked(b) || + btree_node_will_make_reachable(b)) goto out_unlock; - } - if (btree_node_write_blocked(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); - goto out_unlock; - } - if (btree_node_will_make_reachable(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); - goto out_unlock; - } if (btree_node_dirty(b)) { - if (!flush) { - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + if (!flush) goto out_unlock; - } /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -298,25 +269,24 @@ out_unlock: six_unlock_write(&b->c.lock); out_unlock_intent: six_unlock_intent(&b->c.lock); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_btree_node_reclaim; goto out; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, false, shrinker_counter); + return __btree_node_reclaim(c, b, false); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true, false); + return __btree_node_reclaim(c, b, true); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = container_of(shrink, struct bch_fs, - btree_cache.shrink); + struct bch_fs *c = shrink->private_data; struct btree_cache *bc = &c->btree_cache; struct btree *b, *t; unsigned long nr = sc->nr_to_scan; @@ -358,12 +328,11 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if (!btree_node_reclaim(c, b, true)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; - bc->freed++; } } restart: @@ -372,11 +341,9 @@ restart: if (btree_node_accessed(b)) { clear_btree_node_accessed(b); - bc->not_freed_access_bit++; - } else if (!btree_node_reclaim(c, b, true)) { + } else if (!btree_node_reclaim(c, b)) { freed++; btree_node_data_free(c, b); - bc->freed++; bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); @@ -417,8 +384,7 @@ out_nounlock: static unsigned long bch2_btree_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct bch_fs *c = container_of(shrink, struct bch_fs, - btree_cache.shrink); + struct bch_fs *c = shrink->private_data; struct btree_cache *bc = &c->btree_cache; if (bch2_btree_shrinker_disabled) @@ -427,22 +393,13 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, return btree_cache_can_free(bc); } -static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) -{ - struct bch_fs *c = container_of(shrink, struct bch_fs, - btree_cache.shrink); - - bch2_btree_cache_to_text(out, &c->btree_cache); -} - void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; struct btree *b; unsigned i, flags; - if (bc->shrink.list.next) - unregister_shrinker(&bc->shrink); + shrinker_free(bc->shrink); /* vfree() can allocate memory: */ flags = memalloc_nofs_save(); @@ -451,11 +408,14 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, btree_bytes(c)); + kvfree(c->verify_ondisk); + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].b) - list_add(&c->btree_roots[i].b->list, &bc->live); + if (r->b) + list_add(&r->b->list, &bc->live); + } list_splice(&bc->freeable, &bc->live); @@ -465,21 +425,18 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) BUG_ON(btree_node_read_in_flight(b) || btree_node_write_in_flight(b)); - if (btree_node_dirty(b)) - bch2_btree_complete_write(c, b, btree_current_write(b)); - clear_btree_node_dirty_acct(c, b); - btree_node_data_free(c, b); } - BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(!bch2_journal_error(&c->journal) && + atomic_read(&c->btree_cache.dirty)); list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); while (!list_empty(&bc->freed_nonpcpu)) { b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); list_del(&b->list); - six_lock_pcpu_free(&b->c.lock); + six_lock_exit(&b->c.lock); kfree(b); } @@ -493,37 +450,39 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) int bch2_fs_btree_cache_init(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; + struct shrinker *shrink; unsigned i; int ret = 0; - pr_verbose_init(c->opts, ""); - ret = rhashtable_init(&bc->table, &bch_btree_cache_params); if (ret) - goto out; + goto err; bc->table_init_done = true; bch2_recalc_btree_reserve(c); for (i = 0; i < bc->reserve; i++) - if (!__bch2_btree_node_mem_alloc(c)) { - ret = -ENOMEM; - goto out; - } + if (!__bch2_btree_node_mem_alloc(c)) + goto err; list_splice_init(&bc->live, &bc->freeable); mutex_init(&c->verify_lock); - bc->shrink.count_objects = bch2_btree_cache_count; - bc->shrink.scan_objects = bch2_btree_cache_scan; - bc->shrink.to_text = bch2_btree_cache_shrinker_to_text; - bc->shrink.seeks = 4; - ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); -out: - pr_verbose_init(c->opts, "ret %i", ret); - return ret; + shrink = shrinker_alloc(0, "%s-btree_cache", c->name); + if (!shrink) + goto err; + bc->shrink = shrink; + shrink->count_objects = bch2_btree_cache_count; + shrink->scan_objects = bch2_btree_cache_scan; + shrink->seeks = 4; + shrink->private_data = c; + shrinker_register(shrink); + + return 0; +err: + return -BCH_ERR_ENOMEM_fs_btree_cache_init; } void bch2_fs_btree_cache_init_early(struct btree_cache *bc) @@ -541,19 +500,21 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc) * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) +void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; if (bc->alloc_lock == current) { - trace_and_count(c, btree_cache_cannibalize_unlock, c); + trace_and_count(c, btree_cache_cannibalize_unlock, trans); bc->alloc_lock = NULL; closure_wake_up(&bc->alloc_wait); } } -int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) +int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct task_struct *old; @@ -562,8 +523,8 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; if (!cl) { - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); - return -ENOMEM; + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); + return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; } closure_wait(&bc->alloc_wait, cl); @@ -576,11 +537,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) goto success; } - trace_and_count(c, btree_cache_cannibalize_lock_fail, c); - return -EAGAIN; + trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); + return -BCH_ERR_btree_cache_cannibalize_lock_blocked; success: - trace_and_count(c, btree_cache_cannibalize_lock, c); + trace_and_count(c, btree_cache_cannibalize_lock, trans); return 0; } @@ -590,7 +551,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree *b; list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b, false)) + if (!btree_node_reclaim(c, b)) return b; while (1) { @@ -607,8 +568,9 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) } } -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct list_head *freed = pcpu_read_locks ? &bc->freed_pcpu @@ -625,22 +587,22 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b, false)) { + if (!btree_node_reclaim(c, b)) { list_del_init(&b->list); goto got_node; } - b = __btree_node_mem_alloc(c, __GFP_NOWARN); + b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); if (!b) { mutex_unlock(&bc->lock); + bch2_trans_unlock(trans); b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; mutex_lock(&bc->lock); } - if (pcpu_read_locks) - six_lock_pcpu_alloc(&b->c.lock); + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); @@ -651,7 +613,7 @@ got_node: * the list. Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2, false)) { + if (!btree_node_reclaim(c, b2)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); btree_node_to_freedlist(bc, b2); @@ -662,8 +624,11 @@ got_node: mutex_unlock(&bc->lock); - if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) - goto err; + if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { + bch2_trans_unlock(trans); + if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) + goto err; + } mutex_lock(&bc->lock); bc->used++; @@ -683,7 +648,7 @@ out: bch2_btree_keys_init(b); set_btree_node_accessed(b); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); memalloc_nofs_restore(flags); @@ -694,6 +659,7 @@ err: /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); + clear_btree_node_just_written(b2); bch2_btree_node_hash_remove(bc, b2); if (b) { @@ -709,18 +675,17 @@ err: mutex_unlock(&bc->lock); - trace_and_count(c, btree_cache_cannibalize, c); + trace_and_count(c, btree_cache_cannibalize, trans); goto out; } mutex_unlock(&bc->lock); memalloc_nofs_restore(flags); - return ERR_PTR(-ENOMEM); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); } /* Slowpath, don't want it inlined into btree_iter_traverse() */ -static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, - struct btree_trans *trans, +static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, @@ -728,6 +693,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, enum six_lock_type lock_type, bool sync) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; u32 seq; @@ -737,14 +703,17 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * Parent node must be locked, else we could read in a btree node that's * been freed: */ - if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + if (path && !bch2_btree_node_relock(trans, path, level + 1)) { trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); } - b = bch2_btree_node_mem_alloc(c, level != 0); + b = bch2_btree_node_mem_alloc(trans, level != 0); + + if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { + if (!path) + return b; - if (trans && b == ERR_PTR(-ENOMEM)) { trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); @@ -772,19 +741,19 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, set_btree_node_read_in_flight(b); six_unlock_write(&b->c.lock); - seq = b->c.lock.state.seq; + seq = six_lock_seq(&b->c.lock); six_unlock_intent(&b->c.lock); /* Unlock before doing IO: */ - if (trans && sync) - bch2_trans_unlock(trans); + if (path && sync) + bch2_trans_unlock_noassert(trans); - bch2_btree_node_read(c, b, sync); + bch2_btree_node_read(trans, b, sync); if (!sync) return NULL; - if (trans) { + if (path) { int ret = bch2_trans_relock(trans) ?: bch2_btree_path_relock_intent(trans, path); if (ret) { @@ -794,8 +763,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - if (trans) - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); + BUG_ON(!path); + + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -806,19 +776,19 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, "btree node header doesn't match ptr\n" "btree %s level %u\n" "ptr: ", - bch2_btree_ids[b->c.btree_id], b->c.level); + bch2_btree_id_str(b->c.btree_id), b->c.level); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_printf(&buf, "\nheader: btree %s level %llu\n" "min ", - bch2_btree_ids[BTREE_NODE_ID(b->data)], + bch2_btree_id_str(BTREE_NODE_ID(b->data)), BTREE_NODE_LEVEL(b->data)); bch2_bpos_to_text(&buf, b->data->min_key); @@ -833,46 +803,26 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b) { if (b->c.btree_id != BTREE_NODE_ID(b->data) || b->c.level != BTREE_NODE_LEVEL(b->data) || - bpos_cmp(b->data->max_key, b->key.k.p) || + !bpos_eq(b->data->max_key, b->key.k.p) || (b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bpos_cmp(b->data->min_key, + !bpos_eq(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) btree_bad_header(c, b); } -/** - * bch_btree_node_get - find a btree node in the cache and lock it, reading it - * in from disk if necessary. - * - * If IO is necessary and running under generic_make_request, returns -EAGAIN. - * - * The btree node will have either a read or a write lock held, depending on - * the @write parameter. - */ -struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - unsigned long trace_ip) +static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; + bool need_relock = false; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); - - b = btree_node_mem_ptr(k); - - /* - * Check b->hash_val _before_ calling btree_node_lock() - this might not - * be the node we want anymore, and trying to lock the wrong node could - * cause an unneccessary transaction restart: - */ - if (likely(c->opts.btree_node_mem_ptr_optimization && - b && - b->hash_val == btree_ptr_hash_val(k))) - goto lock_node; retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -881,8 +831,9 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, trans, path, k, path->btree_id, + b = bch2_btree_node_fill(trans, path, k, path->btree_id, level, lock_type, true); + need_relock = true; /* We raced and found the btree node in the cache */ if (!b) @@ -891,35 +842,6 @@ retry: if (IS_ERR(b)) return b; } else { -lock_node: - /* - * There's a potential deadlock with splits and insertions into - * interior nodes we have to avoid: - * - * The other thread might be holding an intent lock on the node - * we want, and they want to update its parent node so they're - * going to upgrade their intent lock on the parent node to a - * write lock. - * - * But if we're holding a read lock on the parent, and we're - * trying to get the intent lock they're holding, we deadlock. - * - * So to avoid this we drop the read locks on parent nodes when - * we're starting to take intent locks - and handle the race. - * - * The race is that they might be about to free the node we - * want, and dropping our read lock on the parent node lets them - * update the parent marking the node we want as freed, and then - * free it: - * - * To guard against this, btree nodes are evicted from the cache - * when they're freed - and b->hash_val is zeroed out, which we - * check for after we lock the node. - * - * Then, bch2_btree_node_relock() on the parent will fail - because - * the parent was modified, when the pointer to the node we want - * was removed - and we'll bail out: - */ if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); @@ -939,13 +861,18 @@ lock_node: trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); } if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = b->c.lock.state.seq; + u32 seq = six_lock_seq(&b->c.lock); six_unlock_type(&b->c.lock, lock_type); bch2_trans_unlock(trans); + need_relock = true; bch2_btree_node_wait_on_read(b); @@ -953,19 +880,106 @@ lock_node: * should_be_locked is not set on this path yet, so we need to * relock it specifically: */ - if (trans) { - int ret = bch2_trans_relock(trans) ?: - bch2_btree_path_relock_intent(trans, path); - if (ret) { - BUG_ON(!trans->restarted); - return ERR_PTR(ret); - } - } - if (!six_relock_type(&b->c.lock, lock_type, seq)) goto retry; } + if (unlikely(need_relock)) { + ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(ret); + } + } + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->c.btree_id != path->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); + + return b; +} + +/** + * bch2_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * @trans: btree transaction object + * @path: btree_path being traversed + * @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2) + * @level: level of btree node being looked up (0 == leaf node) + * @lock_type: SIX_LOCK_read or SIX_LOCK_intent + * @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek()) + * + * The btree node will have either a read or a write lock held, depending on + * the @write parameter. + * + * Returns: btree node or ERR_PTR() + */ +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bset_tree *t; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + + /* + * Check b->hash_val _before_ calling btree_node_lock() - this might not + * be the node we want anymore, and trying to lock the wrong node could + * cause an unneccessary transaction restart: + */ + if (unlikely(!c->opts.btree_node_mem_ptr_optimization || + !b || + b->hash_val != btree_ptr_hash_val(k))) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(trans, path, level + 1); + + ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.level != level || + race_fault())) { + six_unlock_type(&b->c.lock, lock_type); + if (bch2_btree_node_relock(trans, path, level + 1)) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); + } + + if (unlikely(btree_node_read_in_flight(b))) { + six_unlock_type(&b->c.lock, lock_type); + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + } + prefetch(b->aux_data); for_each_bset(b, t) { @@ -1017,7 +1031,7 @@ retry: if (nofill) goto out; - b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id, + b = bch2_btree_node_fill(trans, NULL, k, btree_id, level, SIX_LOCK_read, true); /* We raced and found the btree node in the cache */ @@ -1025,14 +1039,14 @@ retry: goto retry; if (IS_ERR(b) && - !bch2_btree_cache_cannibalize_lock(c, NULL)) + !bch2_btree_cache_cannibalize_lock(trans, NULL)) goto retry; if (IS_ERR(b)) goto out; } else { lock_node: - ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read); + ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ERR_PTR(ret); @@ -1073,27 +1087,27 @@ lock_node: EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); out: - bch2_btree_cache_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(trans); return b; } -int bch2_btree_node_prefetch(struct bch_fs *c, - struct btree_trans *trans, +int bch2_btree_node_prefetch(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, unsigned level) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); if (b) return 0; - b = bch2_btree_node_fill(c, trans, path, k, btree_id, + b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); return PTR_ERR_OR_ZERO(b); } @@ -1137,10 +1151,22 @@ wait_on_io: six_unlock_intent(&b->c.lock); } -void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, - struct btree *b) +const char *bch2_btree_id_str(enum btree_id btree) +{ + return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)"; +} + +void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) +{ + prt_printf(out, "%s level %u/%u\n ", + bch2_btree_id_str(b->c.btree_id), + b->c.level, + bch2_btree_id_root(c, b->c.btree_id)->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); +} + +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { - const struct bkey_format *f = &b->format; struct bset_stats stats; memset(&stats, 0, sizeof(stats)); @@ -1154,9 +1180,13 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, ":\n" " ptrs: "); bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + prt_newline(out); - prt_printf(out, "\n" - " format: u64s %u fields %u %u %u %u %u\n" + prt_printf(out, + " format: "); + bch2_bkey_format_to_text(out, &b->format); + + prt_printf(out, " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" " sib u64s: %u, %u (merge threshold %u)\n" @@ -1164,15 +1194,9 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " nr unpacked keys %u\n" " floats %zu\n" " failed unpacked %zu\n", - f->key_u64s, - f->bits_per_field[0], - f->bits_per_field[1], - f->bits_per_field[2], - f->bits_per_field[3], - f->bits_per_field[4], b->unpack_fn_len, b->nr.live_u64s * sizeof(u64), - btree_bytes(c) - sizeof(struct btree_node), + btree_buf_bytes(b) - sizeof(struct btree_node), b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], @@ -1183,21 +1207,9 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc) +void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) { - prt_printf(out, "nr nodes:\t\t%u\n", bc->used); - prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); - - prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed); - prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty); - prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight); - prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight); - prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent); - prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write); - prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit); - prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict); - prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked); - prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable); - + prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); + prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); + prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); }