X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_iter.c;h=cd903fdc426c8baac3a8e94bb7d8a6c63135fd5c;hb=e66011cd2c8aa1c1e78085704a9a2514c6b7ccd1;hp=25d254ee9eaca15545a9456546d722e131ec68ea;hpb=e240b4ae86adb022e3266220ce9807dad8b51beb;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 25d254e..cd903fd 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -16,6 +16,7 @@ #include "replicas.h" #include "subvolume.h" +#include #include #include @@ -46,7 +47,7 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans) if (need_resched() || race_fault()) { bch2_trans_unlock(trans); schedule(); - return bch2_trans_relock(trans) ? 0 : -EINTR; + return bch2_trans_relock(trans); } else { return 0; } @@ -99,12 +100,6 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos return p; } -static inline bool is_btree_node(struct btree_path *path, unsigned l) -{ - return l < BTREE_MAX_DEPTH && - (unsigned long) path->l[l].b >= 128; -} - static inline struct bpos btree_iter_search_key(struct btree_iter *iter) { struct bpos pos = iter->pos; @@ -135,438 +130,6 @@ static inline bool btree_path_pos_in_node(struct btree_path *path, !btree_path_pos_after_node(path, b); } -/* Btree node locking: */ - -void bch2_btree_node_unlock_write(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - bch2_btree_node_unlock_write_inlined(trans, path, b); -} - -void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) -{ - struct btree_path *linked; - unsigned readers = 0; - - trans_for_each_path(trans, linked) - if (linked->l[b->c.level].b == b && - btree_node_read_locked(linked, b->c.level)) - readers++; - - /* - * Must drop our read locks before calling six_lock_write() - - * six_unlock() won't do wakeups until the reader count - * goes to 0, and it's safe because we have the node intent - * locked: - */ - if (!b->c.lock.readers) - atomic64_sub(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); - else - this_cpu_sub(*b->c.lock.readers, readers); - - six_lock_write(&b->c.lock, NULL, NULL); - - if (!b->c.lock.readers) - atomic64_add(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); - else - this_cpu_add(*b->c.lock.readers, readers); -} - -bool __bch2_btree_node_relock(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = btree_path_node(path, level); - int want = __btree_lock_want(path, level); - - if (!is_btree_node(path, level)) - goto fail; - - if (race_fault()) - goto fail; - - if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || - (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, want))) { - mark_btree_node_locked(trans, path, level, want); - return true; - } -fail: - trace_btree_node_relock_fail(trans->fn, _RET_IP_, - path->btree_id, - &path->pos, - (unsigned long) b, - path->l[level].lock_seq, - is_btree_node(path, level) ? 
b->c.lock.state.seq : 0); - return false; -} - -bool bch2_btree_node_upgrade(struct btree_trans *trans, - struct btree_path *path, unsigned level) -{ - struct btree *b = path->l[level].b; - - if (!is_btree_node(path, level)) - return false; - - switch (btree_lock_want(path, level)) { - case BTREE_NODE_UNLOCKED: - BUG_ON(btree_node_locked(path, level)); - return true; - case BTREE_NODE_READ_LOCKED: - BUG_ON(btree_node_intent_locked(path, level)); - return bch2_btree_node_relock(trans, path, level); - case BTREE_NODE_INTENT_LOCKED: - break; - } - - if (btree_node_intent_locked(path, level)) - return true; - - if (race_fault()) - return false; - - if (btree_node_locked(path, level) - ? six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) - goto success; - - if (btree_node_lock_seq_matches(path, b, level) && - btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(path, level); - goto success; - } - - return false; -success: - mark_btree_node_intent_locked(trans, path, level); - return true; -} - -static inline bool btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade) -{ - unsigned l = path->level; - int fail_idx = -1; - - do { - if (!btree_path_node(path, l)) - break; - - if (!(upgrade - ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) - fail_idx = l; - - l++; - } while (l < path->locks_want); - - /* - * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_path_traverse has to walk back up to - * the node that we failed to relock: - */ - if (fail_idx >= 0) { - __bch2_btree_path_unlock(path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - - do { - path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; - --fail_idx; - } while (fail_idx >= 0); - } - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; - - bch2_trans_verify_locks(trans); - - return path->uptodate < BTREE_ITER_NEED_RELOCK; -} - -static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, - bool cached) -{ - return !cached - ? 
container_of(_b, struct btree, c)->key.k.p - : container_of(_b, struct bkey_cached, c)->key.pos; -} - -/* Slowpath: */ -bool __bch2_btree_node_lock(struct btree_trans *trans, - struct btree_path *path, - struct btree *b, - struct bpos pos, unsigned level, - enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - struct btree_path *linked; - unsigned reason; - - /* Check if it's safe to block: */ - trans_for_each_path(trans, linked) { - if (!linked->nodes_locked) - continue; - - /* - * Can't block taking an intent lock if we have _any_ nodes read - * locked: - * - * - Our read lock blocks another thread with an intent lock on - * the same node from getting a write lock, and thus from - * dropping its intent lock - * - * - And the other thread may have multiple nodes intent locked: - * both the node we want to intent lock, and the node we - * already have read locked - deadlock: - */ - if (type == SIX_LOCK_intent && - linked->nodes_locked != linked->nodes_intent_locked) { - reason = 1; - goto deadlock; - } - - if (linked->btree_id != path->btree_id) { - if (linked->btree_id < path->btree_id) - continue; - - reason = 3; - goto deadlock; - } - - /* - * Within the same btree, non-cached paths come before cached - * paths: - */ - if (linked->cached != path->cached) { - if (!linked->cached) - continue; - - reason = 4; - goto deadlock; - } - - /* - * Interior nodes must be locked before their descendants: if - * another path has possible descendants locked of the node - * we're about to lock, it must have the ancestors locked too: - */ - if (level > __fls(linked->nodes_locked)) { - reason = 5; - goto deadlock; - } - - /* Must lock btree nodes in key order: */ - if (btree_node_locked(linked, level) && - bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, - linked->cached)) <= 0) { - BUG_ON(trans->in_traverse_all); - reason = 7; - goto deadlock; - } - } - - return btree_node_lock_type(trans, path, b, pos, level, - type, should_sleep_fn, p); -deadlock: - trace_trans_restart_would_deadlock(trans->fn, ip, - trans->in_traverse_all, reason, - linked->btree_id, - linked->cached, - &linked->pos, - path->btree_id, - path->cached, - &pos); - btree_trans_restart(trans); - return false; -} - -/* Btree iterator locking: */ - -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_btree_path_verify_locks(struct btree_path *path) -{ - unsigned l; - - if (!path->nodes_locked) { - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level)); - return; - } - - for (l = 0; btree_path_node(path, l); l++) - BUG_ON(btree_lock_want(path, l) != - btree_node_locked_type(path, l)); -} - -void bch2_trans_verify_locks(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - bch2_btree_path_verify_locks(path); -} -#else -static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} -#endif - -/* Btree path locking: */ - -/* - * Only for btree_cache.c - only relocks intent locks - */ -bool bch2_btree_path_relock_intent(struct btree_trans *trans, - struct btree_path *path) -{ - unsigned l; - - for (l = path->level; - l < path->locks_want && btree_path_node(path, l); - l++) { - if (!bch2_btree_node_relock(trans, path, l)) { - __bch2_btree_path_unlock(path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, - path->btree_id, &path->pos); - btree_trans_restart(trans); - return false; - } - } - - return true; -} - -__flatten -static bool 
bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - bool ret = btree_path_get_locks(trans, path, false); - - if (!ret) { - trace_trans_restart_relock_path(trans->fn, trace_ip, - path->btree_id, &path->pos); - btree_trans_restart(trans); - } - return ret; -} - -bool __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want) -{ - struct btree_path *linked; - - EBUG_ON(path->locks_want >= new_locks_want); - - path->locks_want = new_locks_want; - - if (btree_path_get_locks(trans, path, true)) - return true; - - /* - * XXX: this is ugly - we'd prefer to not be mucking with other - * iterators in the btree_trans here. - * - * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_path_traverse() - * get the locks we want on transaction restart. - * - * But if this iterator was a clone, on transaction restart what we did - * to this iterator isn't going to be preserved. - * - * Possibly we could add an iterator field for the parent iterator when - * an iterator is a copy - for now, we'll just upgrade any other - * iterators with the same btree id. - * - * The code below used to be needed to ensure ancestor nodes get locked - * before interior nodes - now that's handled by - * bch2_btree_path_traverse_all(). - */ - if (!path->cached && !trans->in_traverse_all) - trans_for_each_path(trans, linked) - if (linked != path && - linked->cached == path->cached && - linked->btree_id == path->btree_id && - linked->locks_want < new_locks_want) { - linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true); - } - - return false; -} - -void __bch2_btree_path_downgrade(struct btree_path *path, - unsigned new_locks_want) -{ - unsigned l; - - EBUG_ON(path->locks_want < new_locks_want); - - path->locks_want = new_locks_want; - - while (path->nodes_locked && - (l = __fls(path->nodes_locked)) >= path->locks_want) { - if (l > path->level) { - btree_node_unlock(path, l); - } else { - if (btree_node_intent_locked(path, l)) { - six_lock_downgrade(&path->l[l].b->c.lock); - path->nodes_intent_locked ^= 1 << l; - } - break; - } - } - - bch2_btree_path_verify_locks(path); -} - -void bch2_trans_downgrade(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - bch2_btree_path_downgrade(path); -} - -/* Btree transaction locking: */ - -bool bch2_trans_relock(struct btree_trans *trans) -{ - struct btree_path *path; - - if (unlikely(trans->restarted)) - return false; - - trans_for_each_path(trans, path) - if (path->should_be_locked && - !bch2_btree_path_relock(trans, path, _RET_IP_)) { - trace_trans_restart_relock(trans->fn, _RET_IP_, - path->btree_id, &path->pos); - BUG_ON(!trans->restarted); - return false; - } - return true; -} - -void bch2_trans_unlock(struct btree_trans *trans) -{ - struct btree_path *path; - - trans_for_each_path(trans, path) - __bch2_btree_path_unlock(path); - - /* - * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking - * btree nodes, it implements its own walking: - */ - BUG_ON(!trans->is_initial_gc && - lock_class_is_held(&bch2_btree_node_lock_key)); -} - /* Btree iterator: */ #ifdef CONFIG_BCACHEFS_DEBUG @@ -585,7 +148,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, bkey_cmp(ck->key.pos, path->pos)); if (!locked) - btree_node_unlock(path, 0); + btree_node_unlock(trans, path, 0); } static void bch2_btree_path_verify_level(struct 
btree_trans *trans, @@ -616,7 +179,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, if (!btree_path_node(path, level)) return; - if (!bch2_btree_node_relock(trans, path, level)) + if (!bch2_btree_node_relock_notrace(trans, path, level)) return; BUG_ON(!btree_path_pos_in_node(path, l->b)); @@ -642,7 +205,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, } if (!locked) - btree_node_unlock(path, level); + btree_node_unlock(trans, path, level); return; err: bch2_bpos_to_text(&buf1, path->pos); @@ -651,14 +214,14 @@ err: struct bkey uk = bkey_unpack_key(l->b, p); bch2_bkey_to_text(&buf2, &uk); } else { - pr_buf(&buf2, "(none)"); + prt_printf(&buf2, "(none)"); } if (k) { struct bkey uk = bkey_unpack_key(l->b, k); bch2_bkey_to_text(&buf3, &uk); } else { - pr_buf(&buf3, "(none)"); + prt_printf(&buf3, "(none)"); } panic("path should be %s key at level %u:\n" @@ -795,7 +358,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, if (cmp < 0) continue; - if (!(path->nodes_locked & 1) || + if (!btree_node_locked(path, 0) || !path->should_be_locked) continue; @@ -1019,27 +582,29 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, +static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(c, l, u, + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, bch2_btree_node_iter_peek(&l->iter, l->b)); path->pos = k.k ? k.k->p : l->b->key.k.p; + bch2_btree_path_verify_level(trans, path, l - path->l); return k; } -static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c, +static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(c, l, u, + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, bch2_btree_node_iter_prev(&l->iter, l->b)); path->pos = k.k ? 
k.k->p : l->b->data->min_key; + bch2_btree_path_verify_level(trans, path, l - path->l); return k; } @@ -1062,61 +627,6 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path, return true; } -/* - * Verify that iterator for parent node points to child node: - */ -static void btree_path_verify_new_node(struct btree_trans *trans, - struct btree_path *path, struct btree *b) -{ - struct bch_fs *c = trans->c; - struct btree_path_level *l; - unsigned plevel; - bool parent_locked; - struct bkey_packed *k; - - if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - return; - - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - return; - - plevel = b->c.level + 1; - if (!btree_path_node(path, plevel)) - return; - - parent_locked = btree_node_locked(path, plevel); - - if (!bch2_btree_node_relock(trans, path, plevel)) - return; - - l = &path->l[plevel]; - k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (!k || - bkey_deleted(k) || - bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - struct printbuf buf3 = PRINTBUF; - struct printbuf buf4 = PRINTBUF; - struct bkey uk = bkey_unpack_key(b, k); - - bch2_dump_btree_node(c, l->b); - bch2_bpos_to_text(&buf1, path->pos); - bch2_bkey_to_text(&buf2, &uk); - bch2_bpos_to_text(&buf3, b->data->min_key); - bch2_bpos_to_text(&buf3, b->data->max_key); - panic("parent iter doesn't point to new node:\n" - "iter pos %s %s\n" - "iter key %s\n" - "new node %s-%s\n", - bch2_btree_ids[path->btree_id], - buf1.buf, buf2.buf, buf3.buf, buf4.buf); - } - - if (!parent_locked) - btree_node_unlock(path, plevel); -} - static inline void __btree_path_level_init(struct btree_path *path, unsigned level) { @@ -1132,14 +642,12 @@ static inline void __btree_path_level_init(struct btree_path *path, bch2_btree_node_iter_peek(&l->iter, l->b); } -static inline void btree_path_level_init(struct btree_trans *trans, - struct btree_path *path, - struct btree *b) +inline void bch2_btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { BUG_ON(path->cached); - btree_path_verify_new_node(trans, path, b); - EBUG_ON(!btree_path_pos_in_node(path, b)); EBUG_ON(b->c.lock.state.seq & 1); @@ -1159,19 +667,19 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) struct btree_path *path; trans_for_each_path(trans, path) - if (!path->cached && + if (path->uptodate == BTREE_ITER_UPTODATE && + !path->cached && btree_path_pos_in_node(path, b)) { enum btree_node_locked_type t = btree_lock_want(path, b->c.level); - if (path->nodes_locked && - t != BTREE_NODE_UNLOCKED) { - btree_node_unlock(path, b->c.level); + if (t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(trans, path, b->c.level); six_lock_increment(&b->c.lock, t); mark_btree_node_locked(trans, path, b->c.level, t); } - btree_path_level_init(trans, path, b); + bch2_btree_path_level_init(trans, path, b); } } @@ -1189,14 +697,6 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) /* Btree path: traverse, set_pos: */ -static int lock_root_check_fn(struct six_lock *lock, void *p) -{ - struct btree *b = container_of(lock, struct btree, c.lock); - struct btree **rootp = p; - - return b == *rootp ? 
0 : -1; -} - static inline int btree_path_lock_root(struct btree_trans *trans, struct btree_path *path, unsigned depth_want, @@ -1206,6 +706,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; enum six_lock_type lock_type; unsigned i; + int ret; EBUG_ON(path->nodes_locked); @@ -1227,26 +728,27 @@ static inline int btree_path_lock_root(struct btree_trans *trans, } lock_type = __btree_lock_want(path, path->level); - if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX, - path->level, lock_type, - lock_root_check_fn, rootp, - trace_ip))) { - if (trans->restarted) - return -EINTR; - continue; + ret = btree_node_lock(trans, path, &b->c, + path->level, lock_type, trace_ip); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) + continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + BUG(); } if (likely(b == READ_ONCE(*rootp) && b->c.level == path->level && !race_fault())) { for (i = 0; i < path->level; i++) - path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); path->l[path->level].b = b; for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) path->l[i].b = NULL; mark_btree_node_locked(trans, path, path->level, lock_type); - btree_path_level_init(trans, path, b); + bch2_btree_path_level_init(trans, path, b); return 0; } @@ -1285,7 +787,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat } if (!was_locked) - btree_node_unlock(path, path->level); + btree_node_unlock(trans, path, path->level); bch2_bkey_buf_exit(&tmp, c); return ret; @@ -1320,7 +822,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p } if (!was_locked) - btree_node_unlock(path, path->level); + btree_node_unlock(trans, path, path->level); bch2_bkey_buf_exit(&tmp, c); return ret; @@ -1345,7 +847,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, bp->mem_ptr = (unsigned long)b; if (!locked) - btree_node_unlock(path, plevel); + btree_node_unlock(trans, path, plevel); } static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, @@ -1410,16 +912,16 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (unlikely(ret)) goto err; - mark_btree_node_locked(trans, path, level, lock_type); - btree_path_level_init(trans, path, b); - if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && unlikely(b != btree_node_mem_ptr(tmp.k))) btree_node_mem_ptr_set(trans, path, level + 1, b); if (btree_node_read_locked(path, level + 1)) - btree_node_unlock(path, level + 1); + btree_node_unlock(trans, path, level + 1); + + mark_btree_node_locked(trans, path, level, lock_type); path->level = level; + bch2_btree_path_level_init(trans, path, b); bch2_btree_path_verify_locks(path); err: @@ -1435,14 +937,14 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) struct bch_fs *c = trans->c; struct btree_path *path; unsigned long trace_ip = _RET_IP_; - int i, ret = 0; + int ret = 0; if (trans->in_traverse_all) - return -EINTR; + return -BCH_ERR_transaction_restart_in_traverse_all; trans->in_traverse_all = true; retry_all: - trans->restarted = false; + trans->restarted = 0; trans->traverse_all_idx = U8_MAX; trans_for_each_path(trans, path) @@ -1450,17 +952,6 @@ retry_all: btree_trans_verify_sorted(trans); - for (i = trans->nr_sorted - 2; i >= 0; --i) { - struct btree_path *path1 = trans->paths + trans->sorted[i]; - 
struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; - - if (path1->btree_id == path2->btree_id && - path1->locks_want < path2->locks_want) - __bch2_btree_path_upgrade(trans, path1, path2->locks_want); - else if (!path1->locks_want && path2->locks_want) - __bch2_btree_path_upgrade(trans, path1, 1); - } - bch2_trans_unlock(trans); cond_resched(); @@ -1486,7 +977,8 @@ retry_all: */ if (path->uptodate) { ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); - if (ret == -EINTR || ret == -ENOMEM) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == -ENOMEM) goto retry_all; if (ret) goto err; @@ -1508,7 +1000,7 @@ err: trans->in_traverse_all = false; - trace_trans_traverse_all(trans->fn, trace_ip); + trace_and_count(c, trans_traverse_all, trans, trace_ip); return ret; } @@ -1527,29 +1019,41 @@ static inline bool btree_path_good_node(struct btree_trans *trans, return true; } +static void btree_path_set_level_down(struct btree_trans *trans, + struct btree_path *path, + unsigned new_level) +{ + unsigned l; + + path->level = new_level; + + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(trans, path, l); + + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_path_verify(trans, path); +} + static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, struct btree_path *path, int check_pos) { unsigned i, l = path->level; - +again: while (btree_path_node(path, l) && - !btree_path_good_node(trans, path, l, check_pos)) { - btree_node_unlock(path, l); - path->l[l].b = BTREE_ITER_NO_NODE_UP; - l++; - } + !btree_path_good_node(trans, path, l, check_pos)) + __btree_path_set_level_up(trans, path, l++); /* If we need intent locks, take them too: */ for (i = l + 1; i < path->locks_want && btree_path_node(path, i); i++) - if (!bch2_btree_node_relock(trans, path, i)) - while (l <= i) { - btree_node_unlock(path, l); - path->l[l].b = BTREE_ITER_NO_NODE_UP; - l++; - } + if (!bch2_btree_node_relock(trans, path, i)) { + while (l <= i) + __btree_path_set_level_up(trans, path, l++); + goto again; + } return l; } @@ -1569,19 +1073,17 @@ static int btree_path_traverse_one(struct btree_trans *trans, unsigned long trace_ip) { unsigned depth_want = path->level; - int ret = 0; + int ret = trans->restarted; - if (unlikely(trans->restarted)) { - ret = -EINTR; + if (unlikely(ret)) goto out; - } /* * Ensure we obey path->should_be_locked: if it's set, we can't unlock * and re-traverse the path without a transaction restart: */ if (path->should_be_locked) { - ret = bch2_btree_path_relock(trans, path, trace_ip) ? 
0 : -EINTR; + ret = bch2_btree_path_relock(trans, path, trace_ip); goto out; } @@ -1595,6 +1097,9 @@ static int btree_path_traverse_one(struct btree_trans *trans, path->level = btree_path_up_until_good_node(trans, path, 0); + EBUG_ON(btree_path_node(path, path->level) && + !btree_node_locked(path, path->level)); + /* * Note: path->nodes[path->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, @@ -1615,22 +1120,16 @@ static int btree_path_traverse_one(struct btree_trans *trans, goto out; } - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); path->level = depth_want; - - if (ret == -EIO) - path->l[path->level].b = - BTREE_ITER_NO_NODE_ERROR; - else - path->l[path->level].b = - BTREE_ITER_NO_NODE_DOWN; + path->l[path->level].b = ERR_PTR(ret); goto out; } } path->uptodate = BTREE_ITER_UPTODATE; out: - BUG_ON((ret == -EINTR) != !!trans->restarted); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_btree_path_verify(trans, path); return ret; } @@ -1638,6 +1137,16 @@ out: int __must_check bch2_btree_path_traverse(struct btree_trans *trans, struct btree_path *path, unsigned flags) { + if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); + u64 mask = ~(~0ULL << restart_probability_bits); + + if ((prandom_u32() & mask) == mask) { + trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); + } + } + if (path->uptodate < BTREE_ITER_NEED_RELOCK) return 0; @@ -1648,10 +1157,11 @@ int __must_check bch2_btree_path_traverse(struct btree_trans *trans, static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, struct btree_path *src) { - unsigned i; + unsigned i, offset = offsetof(struct btree_path, pos); - memcpy(&dst->pos, &src->pos, - sizeof(struct btree_path) - offsetof(struct btree_path, pos)); + memcpy((void *) dst + offset, + (void *) src + offset, + sizeof(struct btree_path) - offset); for (i = 0; i < BTREE_MAX_DEPTH; i++) if (btree_node_locked(dst, i)) @@ -1711,8 +1221,8 @@ bch2_btree_path_set_pos(struct btree_trans *trans, bch2_btree_path_check_sort(trans, path, cmp); if (unlikely(path->cached)) { - btree_node_unlock(path, 0); - path->l[0].b = BTREE_ITER_NO_NODE_CACHED; + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); goto out; } @@ -1734,7 +1244,7 @@ bch2_btree_path_set_pos(struct btree_trans *trans, if (l != path->level) { btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); } out: bch2_btree_path_verify(trans, path); @@ -1745,37 +1255,37 @@ out: static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) { - struct btree_path *next; + struct btree_path *sib; - next = prev_btree_path(trans, path); - if (next && !btree_path_cmp(next, path)) - return next; + sib = prev_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; - next = next_btree_path(trans, path); - if (next && !btree_path_cmp(next, path)) - return next; + sib = next_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; return NULL; } static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) { - struct btree_path *next; + struct btree_path *sib; - next 
= prev_btree_path(trans, path); - if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) - return next; + sib = prev_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; - next = next_btree_path(trans, path); - if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) - return next; + sib = next_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; return NULL; } static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) { - __bch2_btree_path_unlock(path); + __bch2_btree_path_unlock(trans, path); btree_path_list_remove(trans, path); trans->paths_allocated &= ~(1ULL << path->idx); } @@ -1790,26 +1300,37 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte if (!__btree_path_put(path, intent)) return; - /* - * Perhaps instead we should check for duplicate paths in traverse_all: - */ - if (path->preserve && - (dup = have_path_at_pos(trans, path))) { - dup->preserve = true; - path->preserve = false; - goto free; - } + dup = path->preserve + ? have_path_at_pos(trans, path) + : have_node_at_pos(trans, path); + + if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) + return; - if (!path->preserve && - (dup = have_node_at_pos(trans, path))) - goto free; - return; -free: if (path->should_be_locked && - !btree_node_locked(dup, path->level)) + !trans->restarted && + (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) + return; + + if (dup) { + dup->preserve |= path->preserve; + dup->should_be_locked |= path->should_be_locked; + } + + __bch2_path_free(trans, path); +} + +static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, + bool intent) +{ + struct btree_path *dup; + + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) return; - dup->should_be_locked |= path->should_be_locked; __bch2_path_free(trans, path); } @@ -1817,29 +1338,30 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { struct btree_insert_entry *i; - pr_buf(buf, "transaction updates for %s journal seq %llu", + prt_printf(buf, "transaction updates for %s journal seq %llu", trans->fn, trans->journal_res.seq); - pr_newline(buf); - pr_indent_push(buf, 2); + prt_newline(buf); + printbuf_indent_add(buf, 2); trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - pr_buf(buf, "update: btree %s %pS", + prt_printf(buf, "update: btree=%s cached=%u %pS", bch2_btree_ids[i->btree_id], + i->cached, (void *) i->ip_allocated); - pr_newline(buf); + prt_newline(buf); - pr_buf(buf, " old "); + prt_printf(buf, " old "); bch2_bkey_val_to_text(buf, trans->c, old); - pr_newline(buf); + prt_newline(buf); - pr_buf(buf, " new "); + prt_printf(buf, " new "); bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); - pr_newline(buf); + prt_newline(buf); } - pr_indent_pop(buf, 2); + printbuf_indent_sub(buf, 2); } noinline __cold @@ -1848,65 +1370,97 @@ void bch2_dump_trans_updates(struct btree_trans *trans) struct printbuf buf = PRINTBUF; bch2_trans_updates_to_text(&buf, trans); - bch_err(trans->c, "%s", buf.buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } +void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) +{ + prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", + path->idx, path->ref, 
path->intent_ref, + path->preserve ? 'P' : ' ', + path->should_be_locked ? 'S' : ' ', + bch2_btree_ids[path->btree_id], + path->level); + bch2_bpos_to_text(out, path->pos); + + prt_printf(out, " locks %u", path->nodes_locked); +#ifdef CONFIG_BCACHEFS_DEBUG + prt_printf(out, " %pS", (void *) path->ip_allocated); +#endif + prt_newline(out); +} + +void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) +{ + struct btree_path *path; + unsigned idx; + + trans_for_each_path_inorder(trans, path, idx) + bch2_btree_path_to_text(out, path); +} + noinline __cold void bch2_dump_trans_paths_updates(struct btree_trans *trans) { - struct btree_path *path; struct printbuf buf = PRINTBUF; - unsigned idx; - trans_for_each_path_inorder(trans, path, idx) { - printbuf_reset(&buf); + bch2_trans_paths_to_text(&buf, trans); + bch2_trans_updates_to_text(&buf, trans); - bch2_bpos_to_text(&buf, path->pos); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} - printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree=%s l=%u pos %s locks %u %pS\n", - path->idx, path->ref, path->intent_ref, - path->should_be_locked ? " S" : "", - path->preserve ? " P" : "", - bch2_btree_ids[path->btree_id], - path->level, - buf.buf, - path->nodes_locked, -#ifdef CONFIG_BCACHEFS_DEBUG - (void *) path->ip_allocated -#else - NULL -#endif - ); +noinline +static void bch2_trans_update_max_paths(struct btree_trans *trans) +{ + struct btree_transaction_stats *s = btree_trans_stats(trans); + struct printbuf buf = PRINTBUF; + + bch2_trans_paths_to_text(&buf, trans); + + if (!buf.allocation_failure) { + mutex_lock(&s->lock); + if (s->nr_max_paths < hweight64(trans->paths_allocated)) { + s->nr_max_paths = trans->nr_max_paths = + hweight64(trans->paths_allocated); + swap(s->max_paths_text, buf.buf); + } + mutex_unlock(&s->lock); } printbuf_exit(&buf); +} - bch2_dump_trans_updates(trans); +static noinline void btree_path_overflow(struct btree_trans *trans) +{ + bch2_dump_trans_paths_updates(trans); + panic("trans path oveflow\n"); } -static struct btree_path *btree_path_alloc(struct btree_trans *trans, - struct btree_path *pos) +static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *pos) { struct btree_path *path; unsigned idx; if (unlikely(trans->paths_allocated == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { - bch2_dump_trans_paths_updates(trans); - panic("trans path oveflow\n"); - } + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) + btree_path_overflow(trans); idx = __ffs64(~trans->paths_allocated); trans->paths_allocated |= 1ULL << idx; + if (unlikely(idx > trans->nr_max_paths)) + bch2_trans_update_max_paths(trans); + path = &trans->paths[idx]; path->idx = idx; path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; - path->nodes_intent_locked = 0; btree_path_list_add(trans, pos, path); return path; @@ -1956,9 +1510,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->level = level; path->locks_want = locks_want; path->nodes_locked = 0; - path->nodes_intent_locked = 0; for (i = 0; i < ARRAY_SIZE(path->l); i++) - path->l[i].b = BTREE_ITER_NO_NODE_INIT; + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef CONFIG_BCACHEFS_DEBUG path->ip_allocated = ip; #endif @@ -1980,10 +1533,8 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, */ locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > path->locks_want) { - path->locks_want = locks_want; - btree_path_get_locks(trans, path, true); - } + if (locks_want > 
path->locks_want) + bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); return path; } @@ -1993,12 +1544,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct struct bkey_s_c k; + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(!btree_node_locked(path, path->level)); + if (!path->cached) { struct btree_path_level *l = path_l(path); struct bkey_packed *_k; - EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); - _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; @@ -2012,12 +1564,7 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct EBUG_ON(ck && (path->btree_id != ck->key.btree_id || bkey_cmp(path->pos, ck->key.pos))); - - /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */ - if (unlikely(!ck || !ck->valid)) - return bkey_s_c_null; - - EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(!ck || !ck->valid); *u = ck->k->k; k = bkey_i_to_s_c(ck->k); @@ -2052,7 +1599,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - iter->path->should_be_locked = true; + btree_path_set_should_be_locked(iter->path); return 0; } @@ -2083,8 +1630,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - iter->path->should_be_locked = true; - BUG_ON(iter->path->uptodate); + btree_path_set_should_be_locked(iter->path); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2100,7 +1646,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) struct btree_trans *trans = iter->trans; struct btree_path *path = iter->path; struct btree *b = NULL; - unsigned l; int ret; BUG_ON(trans->restarted); @@ -2113,31 +1658,24 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* got to end? 
*/ if (!btree_path_node(path, path->level + 1)) { - btree_node_unlock(path, path->level); - path->l[path->level].b = BTREE_ITER_NO_NODE_UP; - path->level++; - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_level_up(trans, path); return NULL; } if (!bch2_btree_node_relock(trans, path, path->level + 1)) { - __bch2_btree_path_unlock(path); - path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; - path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + __bch2_btree_path_unlock(trans, path); + path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); + path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, - path->btree_id, &path->pos); - btree_trans_restart(trans); - ret = -EINTR; + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); goto err; } b = btree_path_node(path, path->level + 1); if (!bpos_cmp(iter->pos, b->key.k.p)) { - btree_node_unlock(path, path->level); - path->l[path->level].b = BTREE_ITER_NO_NODE_UP; - path->level++; + __btree_path_set_level_up(trans, path, path->level++); } else { /* * Haven't gotten to the end of the parent node: go back down to @@ -2148,14 +1686,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - path->level = iter->min_depth; - - for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) - if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(path, l); - - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - bch2_btree_iter_verify(iter); + btree_path_set_level_down(trans, path, iter->min_depth); ret = bch2_btree_path_traverse(trans, path, iter->flags); if (ret) @@ -2170,7 +1701,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - iter->path->should_be_locked = true; + btree_path_set_should_be_locked(iter->path); BUG_ON(iter->path->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); @@ -2186,15 +1717,23 @@ err: inline bool bch2_btree_iter_advance(struct btree_iter *iter) { - struct bpos pos = iter->k.p; - bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_cmp(pos, SPOS_MAX) - : bkey_cmp(pos, SPOS_MAX)) != 0; + if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { + struct bpos pos = iter->k.p; + bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? 
bpos_cmp(pos, SPOS_MAX) + : bkey_cmp(pos, SPOS_MAX)) != 0; - if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) - pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); - return ret; + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; + } else { + if (!btree_path_node(iter->path, iter->path->level)) + return true; + + iter->advanced = true; + return false; + } } inline bool bch2_btree_iter_rewind(struct btree_iter *iter) @@ -2215,16 +1754,47 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, struct bpos pos) { struct btree_insert_entry *i; + struct bkey_i *ret = NULL; - trans_for_each_update(trans, i) - if ((cmp_int(btree_id, i->btree_id) ?: - bpos_cmp(pos, i->k->k.p)) <= 0) { - if (btree_id == i->btree_id) - return i->k; + trans_for_each_update(trans, i) { + if (i->btree_id < btree_id) + continue; + if (i->btree_id > btree_id) break; - } + if (bpos_cmp(i->k->k.p, pos) < 0) + continue; + if (i->key_cache_already_flushed) + continue; + if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) + ret = i->k; + } - return NULL; + return ret; +} + +struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos start_pos, + struct bpos end_pos) +{ + struct bkey_i *k; + + if (bpos_cmp(start_pos, iter->journal_pos) < 0) + iter->journal_idx = 0; + + k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, 0, + start_pos, end_pos, + &iter->journal_idx); + + iter->journal_pos = k ? k->k.p : end_pos; + return k; +} + +struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_btree_journal_peek(trans, iter, pos, pos); } static noinline @@ -2233,12 +1803,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i *next_journal = - bch2_journal_keys_peek(trans->c, iter->btree_id, 0, - iter->path->pos); + bch2_btree_journal_peek(trans, iter, iter->path->pos, + k.k ? k.k->p : iter->path->l[0].b->key.k.p); - if (next_journal && - bpos_cmp(next_journal->k.p, - k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + if (next_journal) { iter->k = next_journal->k; k = bkey_i_to_s_c(next_journal); } @@ -2251,7 +1819,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) { struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; @@ -2275,11 +1843,20 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos if (unlikely(ret)) return bkey_s_c_err(ret); - iter->key_cache_path->should_be_locked = true; + btree_path_set_should_be_locked(iter->key_cache_path); return bch2_btree_path_peek_slot(iter->key_cache_path, &u); } +static noinline +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +{ + struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos); + int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_); + + return err ? 
bkey_s_c_err(err) : ret; +} + static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; @@ -2287,10 +1864,12 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp struct bkey_s_c k, k2; int ret; - EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); while (1) { + struct btree_path_level *l; + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -2303,22 +1882,28 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - iter->path->should_be_locked = true; + l = path_l(iter->path); + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; + } + + btree_path_set_should_be_locked(iter->path); - k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + k = btree_path_level_peek_all(trans->c, l, &iter->k); if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { - ret = bkey_err(k2); + k = k2; + ret = bkey_err(k); if (ret) { - k = k2; bch2_btree_iter_set_pos(iter, iter->pos); goto out; } - - k = k2; - iter->k = *k.k; } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) @@ -2329,7 +1914,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp : NULL; if (next_update && bpos_cmp(next_update->k.p, - k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + k.k ? k.k->p : l->b->key.k.p) <= 0) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); } @@ -2350,9 +1935,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (likely(k.k)) { break; - } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ - search_key = bpos_successor(iter->path->l[0].b->key.k.p); + search_key = bpos_successor(l->b->key.k.p); } else { /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); @@ -2378,9 +1963,11 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + if (iter->update_path) { - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); iter->update_path = NULL; } @@ -2389,7 +1976,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e while (1) { k = __bch2_btree_iter_peek(iter, search_key); if (!k.k || bkey_err(k)) - goto out; + goto out_no_locked; /* * iter->pos should be mononotically increasing, and always be @@ -2406,13 +1993,13 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (bkey_cmp(iter_pos, end) > 0) { bch2_btree_iter_set_pos(iter, end); k = bkey_s_c_null; - goto out; + goto out_no_locked; } if (iter->update_path && bkey_cmp(iter->update_path->pos, k.k->p)) { - bch2_path_put(trans, iter->update_path, - iter->flags & BTREE_ITER_INTENT); + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); iter->update_path = NULL; } @@ -2468,18 +2055,16 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->path = bch2_btree_path_set_pos(trans, iter->path, 
k.k->p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - BUG_ON(!iter->path->nodes_locked); -out: + + btree_path_set_should_be_locked(iter->path); +out_no_locked: if (iter->update_path) { if (iter->update_path->uptodate && - !bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_)) { - k = bkey_s_c_err(-EINTR); - } else { - BUG_ON(!(iter->update_path->nodes_locked & 1)); - iter->update_path->should_be_locked = true; - } + (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) + k = bkey_s_c_err(ret); + else + btree_path_set_should_be_locked(iter->update_path); } - iter->path->should_be_locked = true; if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) iter->pos.snapshot = iter->snapshot; @@ -2495,6 +2080,100 @@ out: return k; } +/** + * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal + * to iterator's current position, returning keys from every level of the btree. + * For keys at different levels of the btree that compare equal, the key from + * the lower level (leaf) is returned first. + */ +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bkey_s_c k; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + BUG_ON(iter->path->level < iter->min_depth); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out_no_locked; + } + + /* Already at end? 
*/ + if (!btree_path_node(iter->path, iter->path->level)) { + k = bkey_s_c_null; + goto out_no_locked; + } + + k = btree_path_level_peek_all(trans->c, + &iter->path->l[iter->path->level], &iter->k); + + /* Check if we should go up to the parent node: */ + if (!k.k || + (iter->advanced && + !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { + iter->pos = path_l(iter->path)->b->key.k.p; + btree_path_set_level_up(trans, iter->path); + iter->advanced = false; + continue; + } + + /* + * Check if we should go back down to a leaf: + * If we're not in a leaf node, we only return the current key + * if it exactly matches iter->pos - otherwise we first have to + * go back to the leaf: + */ + if (iter->path->level != iter->min_depth && + (iter->advanced || + !k.k || + bpos_cmp(iter->pos, k.k->p))) { + btree_path_set_level_down(trans, iter->path, iter->min_depth); + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + /* Check if we should go to the next key: */ + if (iter->path->level == iter->min_depth && + iter->advanced && + k.k && + !bpos_cmp(iter->pos, k.k->p)) { + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + if (iter->advanced && + iter->path->level == iter->min_depth && + bpos_cmp(k.k->p, iter->pos)) + iter->advanced = false; + + BUG_ON(iter->advanced); + BUG_ON(!k.k); + break; + } + + iter->pos = k.k->p; + btree_path_set_should_be_locked(iter->path); +out_no_locked: + bch2_btree_iter_verify(iter); + + return k; +} + /** * bch2_btree_iter_next: returns first key greater than iterator's current * position @@ -2543,16 +2222,16 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out; + goto out_no_locked; } - k = btree_path_level_peek(trans->c, iter->path, + k = btree_path_level_peek(trans, iter->path, &iter->path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) ? 
bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 : bpos_cmp(k.k->p, search_key) > 0)) - k = btree_path_level_prev(trans->c, iter->path, + k = btree_path_level_prev(trans, iter->path, &iter->path->l[0], &iter->k); bch2_btree_path_check_sort(trans, iter->path, 0); @@ -2568,7 +2247,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) * that candidate */ if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { - bch2_path_put(trans, iter->path, + bch2_path_put_nokeep(trans, iter->path, iter->flags & BTREE_ITER_INTENT); iter->path = saved_path; saved_path = NULL; @@ -2581,7 +2260,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) iter->snapshot, k.k->p.snapshot)) { if (saved_path) - bch2_path_put(trans, saved_path, + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_INTENT); @@ -2609,7 +2288,7 @@ got_key: /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); k = bkey_s_c_null; - goto out; + goto out_no_locked; } } @@ -2621,10 +2300,11 @@ got_key: if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) iter->pos.snapshot = iter->snapshot; -out: + + btree_path_set_should_be_locked(iter->path); +out_no_locked: if (saved_path) - bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); - iter->path->should_be_locked = true; + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2651,9 +2331,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - EBUG_ON(iter->path->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && @@ -2670,8 +2351,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) btree_iter_ip_allocated(iter)); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); - if (unlikely(ret)) - return bkey_s_c_err(ret); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } if ((iter->flags & BTREE_ITER_CACHED) || !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { @@ -2687,25 +2370,27 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && - (next_update = bch2_journal_keys_peek(trans->c, iter->btree_id, - 0, iter->pos)) && - !bpos_cmp(next_update->k.p, iter->pos)) { + (next_update = bch2_btree_journal_peek_slot(trans, + iter, iter->pos))) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); goto out; } if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && - (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; - goto out; + /* We're not returning a key from iter->path: */ + goto out_no_locked; } k = bch2_btree_path_peek_slot(iter->path, &iter->k); } else { struct bpos next; + EBUG_ON(iter->path->level); + if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; struct bpos end = iter->pos; @@ -2725,11 +2410,14 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bpos pos = iter->pos; k = bch2_btree_iter_peek(iter); - iter->pos = pos; + if (unlikely(bkey_err(k))) + bch2_btree_iter_set_pos(iter, pos); + else + iter->pos = pos; 
} if (unlikely(bkey_err(k))) - return k; + goto out_no_locked; next = k.k ? bkey_start_pos(k.k) : POS_MAX; @@ -2751,8 +2439,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - iter->path->should_be_locked = true; - + btree_path_set_should_be_locked(iter->path); +out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); ret = bch2_btree_iter_verify_ret(iter, k); @@ -2804,6 +2492,9 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) struct btree_path *path, *prev = NULL; unsigned i; + if (!bch2_debug_check_iterators) + return; + trans_for_each_path_inorder(trans, path, i) { if (prev && btree_path_cmp(prev, path) > 0) { bch2_dump_trans_paths_updates(trans); @@ -2901,7 +2592,7 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) bch2_path_put(trans, iter->path, iter->flags & BTREE_ITER_INTENT); if (iter->update_path) - bch2_path_put(trans, iter->update_path, + bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, @@ -2911,15 +2602,21 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->key_cache_path = NULL; } -static void __bch2_trans_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags, - unsigned long ip) +static inline void __bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags, + unsigned long ip) { - EBUG_ON(trans->restarted); + if (trans->restarted) + panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); + + if (flags & BTREE_ITER_ALL_LEVELS) + flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && btree_node_type_is_extents(btree_id)) @@ -2936,12 +2633,6 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) flags |= BTREE_ITER_WITH_JOURNAL; - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; - iter->trans = trans; iter->path = NULL; iter->update_path = NULL; @@ -2954,6 +2645,8 @@ static void __bch2_trans_iter_init(struct btree_trans *trans, iter->k.type = KEY_TYPE_deleted; iter->k.p = pos; iter->k.size = 0; + iter->journal_idx = 0; + iter->journal_pos = POS_MIN; #ifdef CONFIG_BCACHEFS_DEBUG iter->ip_allocated = ip; #endif @@ -2967,6 +2660,12 @@ void bch2_trans_iter_init(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags) { + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + __bch2_trans_iter_init(trans, iter, btree_id, pos, 0, 0, flags, _RET_IP_); } @@ -2999,36 +2698,34 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->key_cache_path = NULL; } -void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - size_t new_top = trans->mem_top + size; + unsigned new_top = trans->mem_top + size; 
@@ -2999,36 +2698,34 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
 	dst->key_cache_path = NULL;
 }
 
-void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 {
-	size_t new_top = trans->mem_top + size;
+	unsigned new_top = trans->mem_top + size;
+	size_t old_bytes = trans->mem_bytes;
+	size_t new_bytes = roundup_pow_of_two(new_top);
+	void *new_mem;
 	void *p;
 
-	if (new_top > trans->mem_bytes) {
-		size_t old_bytes = trans->mem_bytes;
-		size_t new_bytes = roundup_pow_of_two(new_top);
-		void *new_mem;
+	trans->mem_max = max(trans->mem_max, new_top);
 
-		WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
 
-		new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
-		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
-			new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
-			new_bytes = BTREE_TRANS_MEM_MAX;
-			kfree(trans->mem);
-		}
+	new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+	if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+		new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+		new_bytes = BTREE_TRANS_MEM_MAX;
+		kfree(trans->mem);
+	}
 
-		if (!new_mem)
-			return ERR_PTR(-ENOMEM);
+	if (!new_mem)
+		return ERR_PTR(-ENOMEM);
 
-		trans->mem = new_mem;
-		trans->mem_bytes = new_bytes;
+	trans->mem = new_mem;
+	trans->mem_bytes = new_bytes;
 
-		if (old_bytes) {
-			trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes);
-			btree_trans_restart(trans);
-			return ERR_PTR(-EINTR);
-		}
+	if (old_bytes) {
+		trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
 	}
 
 	p = trans->mem + trans->mem_top;
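Not part of the patch: __bch2_trans_kmalloc() above is the slow path of the transaction's bump allocator; allocations are carved out of trans->mem at mem_top and only reclaimed when the transaction is reset or torn down. A hedged caller-side sketch, assuming the inlined bch2_trans_kmalloc() wrapper that fronts this slow path; the helper name is illustrative:

	/* Illustrative only -- not part of the patch. */
	static int alloc_key_sketch(struct btree_trans *trans, struct bkey_i **k_out)
	{
		/* Lives until the transaction is reset; no explicit free. */
		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));

		/*
		 * If trans->mem had to be reallocated, this is
		 * -BCH_ERR_transaction_restart_mem_realloced and the whole
		 * transaction must be retried from bch2_trans_begin().
		 */
		if (IS_ERR(k))
			return PTR_ERR(k);

		bkey_init(&k->k);
		*k_out = k;
		return 0;
	}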
@@ -3041,29 +2738,23 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 * bch2_trans_begin() - reset a transaction after a interrupted attempt
 * @trans: transaction to reset
 *
- * While iterating over nodes or updating nodes a attempt to lock a btree
- * node may return EINTR when the trylock fails. When this occurs
- * bch2_trans_begin() should be called and the transaction retried.
+ * While iterating over nodes or updating nodes a attempt to lock a btree node
+ * may return BCH_ERR_transaction_restart when the trylock fails. When this
+ * occurs bch2_trans_begin() should be called and the transaction retried.
 */
-void bch2_trans_begin(struct btree_trans *trans)
+u32 bch2_trans_begin(struct btree_trans *trans)
 {
-	struct btree_insert_entry *i;
 	struct btree_path *path;
 
-	trans_for_each_update(trans, i)
-		__btree_path_put(i->path, true);
+	bch2_trans_reset_updates(trans);
 
-	memset(&trans->journal_res, 0, sizeof(trans->journal_res));
-	trans->extra_journal_res	= 0;
-	trans->nr_updates		= 0;
+	trans->restart_count++;
 	trans->mem_top			= 0;
 
-	trans->hooks			= NULL;
-	trans->extra_journal_entries.nr = 0;
-
 	if (trans->fs_usage_deltas) {
 		trans->fs_usage_deltas->used = 0;
-		memset(&trans->fs_usage_deltas->memset_start, 0,
+		memset((void *) trans->fs_usage_deltas +
+		       offsetof(struct replicas_delta_list, memset_start), 0,
 		       (void *) &trans->fs_usage_deltas->memset_end -
 		       (void *) &trans->fs_usage_deltas->memset_start);
 	}
@@ -3090,12 +2781,28 @@ void bch2_trans_begin(struct btree_trans *trans)
 			path->preserve = false;
 	}
 
-	bch2_trans_cond_resched(trans);
+	if (!trans->restarted &&
+	    (need_resched() ||
+	     ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+		bch2_trans_unlock(trans);
+		cond_resched();
+		bch2_trans_relock(trans);
+	}
 
+	trans->last_restarted_ip = _RET_IP_;
 	if (trans->restarted)
 		bch2_btree_path_traverse_all(trans);
 
-	trans->restarted = false;
+	trans->last_begin_time = ktime_get_ns();
+	return trans->restart_count;
+}
+
+void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count)
+{
+	if (trans_was_restarted(trans, restart_count))
+		panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
+		      trans->restart_count, restart_count,
+		      (void *) trans->last_restarted_ip);
 }
 
 static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
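Not part of the patch: bch2_trans_begin() now returns the transaction's restart count, which pairs with the new bch2_trans_verify_not_restarted() above. A sketch of the retry idiom the kernel-doc comment describes, under the assumption that the work is committed with an ordinary bch2_trans_commit() call; update_fn and the helper name are illustrative:

	/* Illustrative only -- not part of the patch. */
	static int commit_with_retry_sketch(struct btree_trans *trans,
					    int (*update_fn)(struct btree_trans *))
	{
		int ret;

		do {
			/* Resets updates/allocations, returns the current restart count: */
			u32 restart_count = bch2_trans_begin(trans);

			ret = update_fn(trans) ?:
			      bch2_trans_commit(trans, NULL, NULL, 0);

			/* On success, assert that a restart wasn't silently swallowed: */
			if (!ret)
				bch2_trans_verify_not_restarted(trans, restart_count);
		} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));

		return ret;
	}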
@@ -3116,35 +2823,67 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
 	trans->updates		= p; p += updates_bytes;
 }
 
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
-		       unsigned expected_nr_iters,
-		       size_t expected_mem_bytes,
-		       const char *fn)
+static inline unsigned bch2_trans_get_fn_idx(struct btree_trans *trans, struct bch_fs *c,
+					     const char *fn)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(c->btree_transaction_fns); i++)
+		if (!c->btree_transaction_fns[i] ||
+		    c->btree_transaction_fns[i] == fn) {
+			c->btree_transaction_fns[i] = fn;
+			return i;
+		}
+
+	pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
+	return i;
+}
+
+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *fn)
 	__acquires(&c->btree_trans_barrier)
 {
+	struct btree_transaction_stats *s;
+	struct btree_trans *pos;
+
 	BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
 
 	memset(trans, 0, sizeof(*trans));
 	trans->c		= c;
 	trans->fn		= fn;
+	trans->last_begin_time	= ktime_get_ns();
+	trans->fn_idx		= bch2_trans_get_fn_idx(trans, c, fn);
+	trans->locking_wait.task = current;
+	closure_init_stack(&trans->ref);
 
 	bch2_trans_alloc_paths(trans, c);
 
-	if (expected_mem_bytes) {
-		trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
-		trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+	s = btree_trans_stats(trans);
+	if (s) {
+		unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+		trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
 
 		if (!unlikely(trans->mem)) {
 			trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
 			trans->mem_bytes = BTREE_TRANS_MEM_MAX;
+		} else {
+			trans->mem_bytes = expected_mem_bytes;
 		}
+
+		trans->nr_max_paths = s->nr_max_paths;
 	}
 
 	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 
-	trans->pid = current->pid;
 	mutex_lock(&c->btree_trans_lock);
-	list_add(&trans->list, &c->btree_trans_list);
+	list_for_each_entry(pos, &c->btree_trans_list, list) {
+		if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+			list_add_tail(&trans->list, &pos->list);
+			goto list_add_done;
+		}
+	}
+	list_add_tail(&trans->list, &c->btree_trans_list);
+list_add_done:
 	mutex_unlock(&c->btree_trans_lock);
 }
 
@@ -3175,9 +2914,15 @@ void bch2_trans_exit(struct btree_trans *trans)
 {
 	struct btree_insert_entry *i;
 	struct bch_fs *c = trans->c;
+	struct btree_transaction_stats *s = btree_trans_stats(trans);
 
 	bch2_trans_unlock(trans);
 
+	closure_sync(&trans->ref);
+
+	if (s)
+		s->max_mem = max(s->max_mem, trans->mem_max);
+
 	trans_for_each_update(trans, i)
 		__btree_path_put(i->path, true);
 	trans->nr_updates	= 0;
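Not part of the patch: with the hunks above, a transaction's lifetime also feeds the per-function stats (s->max_mem, s->nr_max_paths) and the pid-sorted c->btree_trans_list. A rough lifecycle sketch calling the new __bch2_trans_init() signature directly (callers normally go through the bch2_trans_init() wrapper); it reuses the retry helper sketched earlier and is illustrative only:

	/* Illustrative only -- not part of the patch. */
	static int run_trans_sketch(struct bch_fs *c,
				    int (*update_fn)(struct btree_trans *))
	{
		struct btree_trans trans;
		int ret;

		/* Registers on c->btree_trans_list and picks up per-fn stats by name: */
		__bch2_trans_init(&trans, c, __func__);

		ret = commit_with_retry_sketch(&trans, update_fn);

		/* Feeds trans.mem_max back into the stats and unregisters: */
		bch2_trans_exit(&trans);
		return ret;
	}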
@@ -3223,82 +2968,73 @@ void bch2_trans_exit(struct btree_trans *trans)
 }
 
 static void __maybe_unused
-bch2_btree_path_node_to_text(struct printbuf *out,
-			     struct btree_bkey_cached_common *_b,
-			     bool cached)
+bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
+				      struct btree_bkey_cached_common *b)
 {
-	pr_buf(out, "    l=%u %s:",
-	       _b->level, bch2_btree_ids[_b->btree_id]);
-	bch2_bpos_to_text(out, btree_node_pos(_b, cached));
-}
+	struct six_lock_count c = six_lock_counts(&b->lock);
+	struct task_struct *owner;
+	pid_t pid;
 
-static bool trans_has_locks(struct btree_trans *trans)
-{
-	struct btree_path *path;
+	rcu_read_lock();
+	owner = READ_ONCE(b->lock.owner);
+	pid = owner ? owner->pid : 0;;
+	rcu_read_unlock();
 
-	trans_for_each_path(trans, path)
-		if (path->nodes_locked)
-			return true;
-	return false;
+	prt_tab(out);
+	prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
+		   b->level, bch2_btree_ids[b->btree_id]);
+	bch2_bpos_to_text(out, btree_node_pos(b));
+
+	prt_tab(out);
+	prt_printf(out, " locks %u:%u:%u held by pid %u",
+		   c.n[0], c.n[1], c.n[2], pid);
 }
 
-void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 {
-	struct btree_trans *trans;
 	struct btree_path *path;
-	struct btree *b;
+	struct btree_bkey_cached_common *b;
 	static char lock_types[] = { 'r', 'i', 'w' };
 	unsigned l;
 
-	mutex_lock(&c->btree_trans_lock);
-	list_for_each_entry(trans, &c->btree_trans_list, list) {
-		if (!trans_has_locks(trans))
-			continue;
+	if (!out->nr_tabstops) {
+		printbuf_tabstop_push(out, 16);
+		printbuf_tabstop_push(out, 32);
+	}
 
-		pr_buf(out, "%i %s\n", trans->pid, trans->fn);
+	prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
 
-		trans_for_each_path(trans, path) {
-			if (!path->nodes_locked)
-				continue;
+	trans_for_each_path(trans, path) {
+		if (!path->nodes_locked)
+			continue;
 
-			pr_buf(out, "  path %u %c l=%u %s:",
-			       path->idx,
-			       path->cached ? 'c' : 'b',
-			       path->level,
-			       bch2_btree_ids[path->btree_id]);
-			bch2_bpos_to_text(out, path->pos);
-			pr_buf(out, "\n");
-
-			for (l = 0; l < BTREE_MAX_DEPTH; l++) {
-				if (btree_node_locked(path, l)) {
-					pr_buf(out, "    %s l=%u ",
-					       btree_node_intent_locked(path, l) ? "i" : "r", l);
-					bch2_btree_path_node_to_text(out,
-							(void *) path->l[l].b,
-							path->cached);
-					pr_buf(out, "\n");
-				}
+		prt_printf(out, "  path %u %c l=%u %s:",
+			   path->idx,
+			   path->cached ? 'c' : 'b',
+			   path->level,
+			   bch2_btree_ids[path->btree_id]);
+		bch2_bpos_to_text(out, path->pos);
+		prt_newline(out);
+
+		for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+			if (btree_node_locked(path, l) &&
+			    !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) {
+				prt_printf(out, "    %c l=%u ",
+					   lock_types[btree_node_locked_type(path, l)], l);
+				bch2_btree_bkey_cached_common_to_text(out, b);
+				prt_newline(out);
 			}
 		}
+	}
 
-		b = READ_ONCE(trans->locking);
-		if (b) {
-			path = &trans->paths[trans->locking_path_idx];
-			pr_buf(out, "  locking path %u %c l=%u %c %s:",
-			       trans->locking_path_idx,
-			       path->cached ? 'c' : 'b',
-			       trans->locking_level,
-			       lock_types[trans->locking_lock_type],
-			       bch2_btree_ids[trans->locking_btree_id]);
-			bch2_bpos_to_text(out, trans->locking_pos);
-
-			pr_buf(out, " node ");
-			bch2_btree_path_node_to_text(out,
-					(void *) b, path->cached);
-			pr_buf(out, "\n");
-		}
+	b = READ_ONCE(trans->locking);
+	if (b) {
+		prt_str(out, "  want");
+		prt_newline(out);
+		prt_printf(out, "    %c", lock_types[trans->locking_wait.lock_want]);
+		bch2_btree_bkey_cached_common_to_text(out, b);
+		prt_newline(out);
 	}
-	mutex_unlock(&c->btree_trans_lock);
 }
 
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
@@ -3311,9 +3047,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
 {
-	unsigned nr = BTREE_ITER_MAX;
+	unsigned i, nr = BTREE_ITER_MAX;
 	int ret;
 
+	for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++)
+		mutex_init(&c->btree_transaction_stats[i].lock);
+
 	INIT_LIST_HEAD(&c->btree_trans_list);
 	mutex_init(&c->btree_trans_lock);