Update bcachefs sources to eb83f1f842bb mean and variance: Promote to lib/math
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 365794dc4dcd66c4b18a1519987e01ed6628d25d..f430ca83720673043b135128aa75b272a73dfaff 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -5,6 +5,7 @@
 #include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_iter.h"
+#include "btree_journal_iter.h"
 #include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update.h"
@@ -12,9 +13,8 @@
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
-#include "recovery.h"
 #include "replicas.h"
-#include "subvolume.h"
+#include "snapshot.h"
 #include "trace.h"
 
 #include <linux/random.h>
@@ -35,21 +35,6 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
 
 static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
 
-/*
- * Unlocks before scheduling
- * Note: does not revalidate iterator
- */
-static inline int bch2_trans_cond_resched(struct btree_trans *trans)
-{
-       if (need_resched() || race_fault()) {
-               bch2_trans_unlock(trans);
-               schedule();
-               return bch2_trans_relock(trans);
-       } else {
-               return 0;
-       }
-}
-
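The open-coded unlock/reschedule/relock helper removed here is now expressed with the drop_locks_do() macro, which this same diff uses later in bch2_trans_begin(). A minimal sketch of the equivalent, assuming drop_locks_do() from btree_locking.h unlocks the transaction, evaluates its expression argument, then relocks and propagates any restart error:

    /* sketch: the removed helper, rewritten in the new idiom */
    static inline int trans_cond_resched_sketch(struct btree_trans *trans)
    {
            return need_resched()
                    ? drop_locks_do(trans, (cond_resched(), 0))
                    : 0;
    }
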
 static inline int __btree_path_cmp(const struct btree_path *l,
                                   enum btree_id        r_btree_id,
                                   bool                 r_cached,
@@ -241,7 +226,7 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
        for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
                if (!path->l[i].b) {
                        BUG_ON(!path->cached &&
-                              c->btree_roots[path->btree_id].b->c.level > i);
+                              bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
                        break;
                }
 
@@ -272,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
 
        BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
               (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
-              !btree_type_has_snapshots(iter->btree_id));
+              !btree_type_has_snapshot_field(iter->btree_id));
 
        if (iter->update_path)
                bch2_btree_path_verify(trans, iter->update_path);
@@ -377,7 +362,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
        bch2_bpos_to_text(&buf, pos);
 
        panic("not locked: %s %s%s\n",
-             bch2_btree_ids[id], buf.buf,
+             bch2_btree_id_str(id), buf.buf,
              key_cache ? " cached" : "");
 }
 
@@ -503,7 +488,6 @@ fixup_done:
        if (!bch2_btree_node_iter_end(node_iter) &&
            iter_current_key_modified &&
            b->c.level) {
-               struct bset_tree *t;
                struct bkey_packed *k, *k2, *p;
 
                k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -652,9 +636,8 @@ void bch2_btree_path_level_init(struct btree_trans *trans,
        BUG_ON(path->cached);
 
        EBUG_ON(!btree_path_pos_in_node(path, b));
-       EBUG_ON(b->c.lock.state.seq & 1);
 
-       path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+       path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
        path->l[b->c.level].b = b;
        __btree_path_level_init(path, b->c.level);
 }
@@ -704,7 +687,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
 
                        if (t != BTREE_NODE_UNLOCKED) {
                                btree_node_unlock(trans, path, b->c.level);
-                               six_lock_increment(&b->c.lock, t);
+                               six_lock_increment(&b->c.lock, (enum six_lock_type) t);
                                mark_btree_node_locked(trans, path, b->c.level, t);
                        }
 
@@ -736,7 +719,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
                                       unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
+       struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
        enum six_lock_type lock_type;
        unsigned i;
        int ret;
@@ -780,7 +763,8 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
                        for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
                                path->l[i].b = NULL;
 
-                       mark_btree_node_locked(trans, path, path->level, lock_type);
+                       mark_btree_node_locked(trans, path, path->level,
+                                              (enum btree_node_locked_type) lock_type);
                        bch2_btree_path_level_init(trans, path, b);
                        return 0;
                }
@@ -797,7 +781,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
        struct btree_node_iter node_iter = l->iter;
        struct bkey_packed *k;
        struct bkey_buf tmp;
-       unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+       unsigned nr = test_bit(BCH_FS_started, &c->flags)
                ? (path->level > 1 ? 0 :  2)
                : (path->level > 1 ? 1 : 16);
        bool was_locked = btree_node_locked(path, path->level);
@@ -832,7 +816,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
        struct bch_fs *c = trans->c;
        struct bkey_s_c k;
        struct bkey_buf tmp;
-       unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+       unsigned nr = test_bit(BCH_FS_started, &c->flags)
                ? (path->level > 1 ? 0 :  2)
                : (path->level > 1 ? 1 : 16);
        bool was_locked = btree_node_locked(path, path->level);
@@ -952,7 +936,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
        if (btree_node_read_locked(path, level + 1))
                btree_node_unlock(trans, path, level + 1);
 
-       mark_btree_node_locked(trans, path, level, lock_type);
+       mark_btree_node_locked(trans, path, level,
+                              (enum btree_node_locked_type) lock_type);
        path->level = level;
        bch2_btree_path_level_init(trans, path, b);
 
@@ -1024,7 +1009,7 @@ retry_all:
        /*
         * We used to assert that all paths had been traversed here
         * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
-        * path->Should_be_locked is not set yet, we we might have unlocked and
+        * path->should_be_locked is not set yet, we might have unlocked and
         * then failed to relock a path - that's fine.
         */
 err:
@@ -1124,6 +1109,9 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
        if (unlikely(ret))
                goto out;
 
+       if (unlikely(!trans->srcu_held))
+               bch2_trans_srcu_lock(trans);
+
        /*
         * Ensure we obey path->should_be_locked: if it's set, we can't unlock
         * and re-traverse the path without a transaction restart:
@@ -1226,8 +1214,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
                   struct btree_path *path, struct bpos new_pos,
                   bool intent, unsigned long ip, int cmp)
 {
-       unsigned level = path->level;
-
        bch2_trans_verify_not_in_restart(trans);
        EBUG_ON(!path->ref);
 
@@ -1243,7 +1229,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
                goto out;
        }
 
-       level = btree_path_up_until_good_node(trans, path, cmp);
+       unsigned level = btree_path_up_until_good_node(trans, path, cmp);
 
        if (btree_path_node(path, level)) {
                struct btree_path_level *l = &path->l[level];
@@ -1357,14 +1343,14 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p
        __bch2_path_free(trans, path);
 }
 
-void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
 {
        panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
              trans->restart_count, restart_count,
              (void *) trans->last_begin_ip);
 }
 
-void bch2_trans_in_restart_error(struct btree_trans *trans)
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
 {
        panic("in transaction restart: %s, last restarted by %pS\n",
              bch2_err_str(trans->restarted),
@@ -1386,7 +1372,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
                struct bkey_s_c old = { &i->old_k, i->old_v };
 
                prt_printf(buf, "update: btree=%s cached=%u %pS",
-                      bch2_btree_ids[i->btree_id],
+                      bch2_btree_id_str(i->btree_id),
                       i->cached,
                       (void *) i->ip_allocated);
                prt_newline(buf);
@@ -1402,7 +1388,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
 
        trans_for_each_wb_update(trans, wb) {
                prt_printf(buf, "update: btree=%s wb=1 %pS",
-                      bch2_btree_ids[wb->btree],
+                      bch2_btree_id_str(wb->btree),
                       (void *) i->ip_allocated);
                prt_newline(buf);
 
@@ -1431,7 +1417,7 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
                   path->idx, path->ref, path->intent_ref,
                   path->preserve ? 'P' : ' ',
                   path->should_be_locked ? 'S' : ' ',
-                  bch2_btree_ids[path->btree_id],
+                  bch2_btree_id_str(path->btree_id),
                   path->level);
        bch2_bpos_to_text(out, path->pos);
 
@@ -1442,7 +1428,7 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
        prt_newline(out);
 }
 
-noinline __cold
+static noinline __cold
 void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
                                bool nosort)
 {
@@ -1462,7 +1448,7 @@ void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans)
        __bch2_trans_paths_to_text(out, trans, false);
 }
 
-noinline __cold
+static noinline __cold
 void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
 {
        struct printbuf buf = PRINTBUF;
@@ -1506,10 +1492,26 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
        trans->nr_max_paths = hweight64(trans->paths_allocated);
 }
 
+noinline __cold
+int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
+{
+       if (trace_trans_restart_too_many_iters_enabled()) {
+               struct printbuf buf = PRINTBUF;
+
+               bch2_trans_paths_to_text(&buf, trans);
+               trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
+               printbuf_exit(&buf);
+       }
+
+       count_event(trans->c, trans_restart_too_many_iters);
+
+       return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
+}
+
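__bch2_btree_trans_too_many_iters() is the new out-of-line slow path: it dumps the path list to the tracepoint, bumps the event counter, and restarts the transaction. The fast-path check stays inline in the header; roughly (a sketch, assuming the inline wrapper in btree_iter.h keeps a small reserve below BTREE_ITER_MAX):

    static inline int btree_trans_too_many_iters(struct btree_trans *trans)
    {
            if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8)
                    return __bch2_btree_trans_too_many_iters(trans);

            return 0;
    }
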
 static noinline void btree_path_overflow(struct btree_trans *trans)
 {
        bch2_dump_trans_paths_updates(trans);
-       panic("trans path oveflow\n");
+       panic("trans path overflow\n");
 }
 
 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
@@ -1538,6 +1540,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
        path->ref               = 0;
        path->intent_ref        = 0;
        path->nodes_locked      = 0;
+       path->alloc_seq++;
 
        btree_path_list_add(trans, pos, path);
        trans->paths_sorted = false;
@@ -1613,7 +1616,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
 
        locks_want = min(locks_want, BTREE_MAX_DEPTH);
        if (locks_want > path->locks_want)
-               bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
+               bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
 
        return path;
 }
@@ -1810,23 +1813,15 @@ err:
 
 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
 {
-       if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
-               struct bpos pos = iter->k.p;
-               bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
-                            ? bpos_eq(pos, SPOS_MAX)
-                            : bkey_eq(pos, SPOS_MAX));
-
-               if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-                       pos = bkey_successor(iter, pos);
-               bch2_btree_iter_set_pos(iter, pos);
-               return ret;
-       } else {
-               if (!btree_path_node(iter->path, iter->path->level))
-                       return true;
+       struct bpos pos = iter->k.p;
+       bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+                    ? bpos_eq(pos, SPOS_MAX)
+                    : bkey_eq(pos, SPOS_MAX));
 
-               iter->advanced = true;
-               return false;
-       }
+       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+               pos = bkey_successor(iter, pos);
+       bch2_btree_iter_set_pos(iter, pos);
+       return ret;
 }
 
 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
@@ -1871,23 +1866,15 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
                : NULL;
 }
 
-struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
-                                      struct btree_iter *iter,
-                                      struct bpos end_pos)
+static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
+                                             struct btree_iter *iter,
+                                             struct bpos end_pos)
 {
-       struct bkey_i *k;
-
-       if (bpos_lt(iter->path->pos, iter->journal_pos))
-               iter->journal_idx = 0;
-
-       k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
-                                       iter->path->level,
-                                       iter->path->pos,
-                                       end_pos,
-                                       &iter->journal_idx);
-
-       iter->journal_pos = k ? k->k.p : end_pos;
-       return k;
+       return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+                                          iter->path->level,
+                                          iter->path->pos,
+                                          end_pos,
+                                          &iter->journal_idx);
 }
 
 static noinline
@@ -2062,8 +2049,12 @@ out:
 }
 
 /**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
+ * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter:      iterator to peek from
+ * @end:       search limit: returns keys less than or equal to @end
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
 {
@@ -2073,7 +2064,6 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
        struct bpos iter_pos;
        int ret;
 
-       EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
        EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
 
        if (iter->update_path) {
@@ -2200,102 +2190,11 @@ end:
 }
 
 /**
- * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
- * to iterator's current position, returning keys from every level of the btree.
- * For keys at different levels of the btree that compare equal, the key from
- * the lower level (leaf) is returned first.
- */
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
-{
-       struct btree_trans *trans = iter->trans;
-       struct bkey_s_c k;
-       int ret;
-
-       EBUG_ON(iter->path->cached);
-       bch2_btree_iter_verify(iter);
-       BUG_ON(iter->path->level < iter->min_depth);
-       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
-       EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
-
-       while (1) {
-               iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
-                                       iter->flags & BTREE_ITER_INTENT,
-                                       btree_iter_ip_allocated(iter));
-
-               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-               if (unlikely(ret)) {
-                       /* ensure that iter->k is consistent with iter->pos: */
-                       bch2_btree_iter_set_pos(iter, iter->pos);
-                       k = bkey_s_c_err(ret);
-                       goto out_no_locked;
-               }
-
-               /* Already at end? */
-               if (!btree_path_node(iter->path, iter->path->level)) {
-                       k = bkey_s_c_null;
-                       goto out_no_locked;
-               }
-
-               k = btree_path_level_peek_all(trans->c,
-                               &iter->path->l[iter->path->level], &iter->k);
-
-               /* Check if we should go up to the parent node: */
-               if (!k.k ||
-                   (iter->advanced &&
-                    bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
-                       iter->pos = path_l(iter->path)->b->key.k.p;
-                       btree_path_set_level_up(trans, iter->path);
-                       iter->advanced = false;
-                       continue;
-               }
-
-               /*
-                * Check if we should go back down to a leaf:
-                * If we're not in a leaf node, we only return the current key
-                * if it exactly matches iter->pos - otherwise we first have to
-                * go back to the leaf:
-                */
-               if (iter->path->level != iter->min_depth &&
-                   (iter->advanced ||
-                    !k.k ||
-                    !bpos_eq(iter->pos, k.k->p))) {
-                       btree_path_set_level_down(trans, iter->path, iter->min_depth);
-                       iter->pos = bpos_successor(iter->pos);
-                       iter->advanced = false;
-                       continue;
-               }
-
-               /* Check if we should go to the next key: */
-               if (iter->path->level == iter->min_depth &&
-                   iter->advanced &&
-                   k.k &&
-                   bpos_eq(iter->pos, k.k->p)) {
-                       iter->pos = bpos_successor(iter->pos);
-                       iter->advanced = false;
-                       continue;
-               }
-
-               if (iter->advanced &&
-                   iter->path->level == iter->min_depth &&
-                   !bpos_eq(k.k->p, iter->pos))
-                       iter->advanced = false;
-
-               BUG_ON(iter->advanced);
-               BUG_ON(!k.k);
-               break;
-       }
-
-       iter->pos = k.k->p;
-       btree_path_set_should_be_locked(iter->path);
-out_no_locked:
-       bch2_btree_iter_verify(iter);
-
-       return k;
-}
-
-/**
- * bch2_btree_iter_next: returns first key greater than iterator's current
+ * bch2_btree_iter_next() - returns first key greater than iterator's current
  * position
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
 {
@@ -2306,8 +2205,11 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
 }
 
 /**
- * bch2_btree_iter_peek_prev: returns first key less than or equal to
+ * bch2_btree_iter_peek_prev() - returns first key less than or equal to
  * iterator's current position
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 {
@@ -2430,8 +2332,11 @@ out_no_locked:
 }
 
 /**
- * bch2_btree_iter_prev: returns first key less than iterator's current
+ * bch2_btree_iter_prev() - returns first key less than iterator's current
  * position
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
 {
@@ -2450,7 +2355,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
-       EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
        EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
 
        /* extents can't span inode numbers: */
@@ -2736,19 +2640,9 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
        iter->key_cache_path = NULL;
 }
 
-static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans,
-                         struct btree_iter *iter,
-                         unsigned btree_id, struct bpos pos,
-                         unsigned flags)
-{
-       bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
-                              bch2_btree_iter_flags(trans, btree_id, flags),
-                              _RET_IP_);
-}
-
 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
                          struct btree_iter *iter,
-                         unsigned btree_id, struct bpos pos,
+                         enum btree_id btree_id, struct bpos pos,
                          unsigned flags)
 {
        bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
@@ -2764,9 +2658,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
                               unsigned depth,
                               unsigned flags)
 {
-       flags |= BTREE_ITER_NOT_EXTENTS;
-       flags |= __BTREE_ITER_ALL_SNAPSHOTS;
-       flags |= BTREE_ITER_ALL_SNAPSHOTS;
+       flags |= BTREE_ITER_NOT_EXTENTS;
+       flags |= __BTREE_ITER_ALL_SNAPSHOTS;
+       flags |= BTREE_ITER_ALL_SNAPSHOTS;
 
        bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
                               __bch2_btree_iter_flags(trans, btree_id, flags),
@@ -2794,6 +2688,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
        unsigned new_top = trans->mem_top + size;
        size_t old_bytes = trans->mem_bytes;
        size_t new_bytes = roundup_pow_of_two(new_top);
+       int ret;
        void *new_mem;
        void *p;
 
@@ -2801,15 +2696,27 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 
        WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
 
-       new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
-       if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
-               new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
-               new_bytes = BTREE_TRANS_MEM_MAX;
-               kfree(trans->mem);
-       }
+       new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+       if (unlikely(!new_mem)) {
+               bch2_trans_unlock(trans);
 
-       if (!new_mem)
-               return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+               new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+               if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+                       new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+                       new_bytes = BTREE_TRANS_MEM_MAX;
+                       kfree(trans->mem);
+               }
+
+               if (!new_mem)
+                       return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+               trans->mem = new_mem;
+               trans->mem_bytes = new_bytes;
+
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       return ERR_PTR(ret);
+       }
 
        trans->mem = new_mem;
        trans->mem_bytes = new_bytes;
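The allocation strategy changes here: krealloc() is tried with GFP_NOWAIT first so the fast path never sleeps while holding btree locks; only on failure does the function unlock, retry with GFP_KERNEL (falling back to the mempool), and relock, which can surface a transaction restart. Callers of the bch2_trans_kmalloc() wrapper therefore have to check the returned pointer; a sketch (the size expression is illustrative):

    struct bkey_i *k;

    k = bch2_trans_kmalloc(trans, sizeof(*k) + val_bytes);
    if (IS_ERR(k))
            return PTR_ERR(k);      /* may be a restart error from the relock */
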
@@ -2825,24 +2732,44 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
        return p;
 }
 
-static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
+static inline void check_srcu_held_too_long(struct btree_trans *trans)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_path *path;
+       WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+            "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+            (jiffies - trans->srcu_lock_time) / HZ);
+}
 
-       trans_for_each_path(trans, path)
-               if (path->cached && !btree_node_locked(path, 0))
-                       path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+void bch2_trans_srcu_unlock(struct btree_trans *trans)
+{
+       if (trans->srcu_held) {
+               struct bch_fs *c = trans->c;
+               struct btree_path *path;
 
-       srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-       trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-       trans->srcu_lock_time   = jiffies;
+               trans_for_each_path(trans, path)
+                       if (path->cached && !btree_node_locked(path, 0))
+                               path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+
+               check_srcu_held_too_long(trans);
+               srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+               trans->srcu_held = false;
+       }
+}
+
+void bch2_trans_srcu_lock(struct btree_trans *trans)
+{
+       if (!trans->srcu_held) {
+               trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
+               trans->srcu_lock_time   = jiffies;
+               trans->srcu_held = true;
+       }
 }
 
 /**
 * bch2_trans_begin() - reset a transaction after an interrupted attempt
  * @trans: transaction to reset
  *
+ * Returns:    current restart counter, to be used with trans_was_restarted()
+ *
 * While iterating over nodes or updating nodes, an attempt to lock a btree node
  * may return BCH_ERR_transaction_restart when the trylock fails. When this
  * occurs bch2_trans_begin() should be called and the transaction retried.
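The restart convention documented here is what the retry loops build on: on BCH_ERR_transaction_restart, call bch2_trans_begin() and run the body again. Roughly what the lockrestart_do()/commit_do() helpers expand to (do_one_op() stands in for a hypothetical transaction body):

    int ret;

    do {
            bch2_trans_begin(trans);

            ret = do_one_op(trans);         /* hypothetical */
    } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
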
@@ -2850,6 +2777,7 @@ static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
 u32 bch2_trans_begin(struct btree_trans *trans)
 {
        struct btree_path *path;
+       u64 now;
 
        bch2_trans_reset_updates(trans);
 
@@ -2878,16 +2806,18 @@ u32 bch2_trans_begin(struct btree_trans *trans)
                        path->preserve = false;
        }
 
+       now = local_clock();
        if (!trans->restarted &&
            (need_resched() ||
-            local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
-               bch2_trans_unlock(trans);
-               cond_resched();
-               bch2_trans_relock(trans);
+            now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+               drop_locks_do(trans, (cond_resched(), 0));
+               now = local_clock();
        }
+       trans->last_begin_time = now;
 
-       if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
-               bch2_trans_reset_srcu_lock(trans);
+       if (unlikely(trans->srcu_held &&
+                    time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+               bch2_trans_srcu_unlock(trans);
 
        trans->last_begin_ip = _RET_IP_;
        if (trans->restarted) {
@@ -2895,26 +2825,26 @@ u32 bch2_trans_begin(struct btree_trans *trans)
                trans->notrace_relock_fail = false;
        }
 
-       trans->last_begin_time = local_clock();
        return trans->restart_count;
 }
 
-static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
+static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
 {
-       size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
-       size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
-       void *p = NULL;
+       struct btree_trans *trans;
 
-       BUG_ON(trans->used_mempool);
-
-#ifdef __KERNEL__
-       p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
-#endif
-       if (!p)
-               p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
+       if (IS_ENABLED(__KERNEL__)) {
+               trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+               if (trans)
+                       return trans;
+       }
 
-       trans->paths            = p; p += paths_bytes;
-       trans->updates          = p; p += updates_bytes;
+       trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+       /*
+        * paths need to be zeroed, bch2_check_for_deadlock looks at
+        * paths in other threads
+        */
+       memset(&trans->paths, 0, sizeof(trans->paths));
+       return trans;
 }
 
 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
@@ -2934,13 +2864,16 @@ unsigned bch2_trans_get_fn_idx(const char *fn)
        return i;
 }
 
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
        __acquires(&c->btree_trans_barrier)
 {
+       struct btree_trans *trans;
        struct btree_transaction_stats *s;
 
        bch2_assert_btree_nodes_not_locked();
 
+       trans = bch2_trans_alloc(c);
+
        memset(trans, 0, sizeof(*trans));
        trans->c                = c;
        trans->fn               = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
@@ -2949,11 +2882,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
        trans->fn_idx           = fn_idx;
        trans->locking_wait.task = current;
        trans->journal_replay_not_finished =
-               !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+               unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+               atomic_inc_not_zero(&c->journal_keys.ref);
        closure_init_stack(&trans->ref);
 
-       bch2_trans_alloc_paths(trans, c);
-
        s = btree_trans_stats(trans);
        if (s && s->max_mem) {
                unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
@@ -2973,13 +2905,14 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
                trans->wb_updates_size = s->wb_updates_size;
        }
 
-       trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+       trans->srcu_idx         = srcu_read_lock(&c->btree_trans_barrier);
        trans->srcu_lock_time   = jiffies;
+       trans->srcu_held        = true;
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
                struct btree_trans *pos;
 
-               mutex_lock(&c->btree_trans_lock);
+               seqmutex_lock(&c->btree_trans_lock);
                list_for_each_entry(pos, &c->btree_trans_list, list) {
                        /*
                         * We'd much prefer to be stricter here and completely
@@ -2997,8 +2930,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
                }
                list_add_tail(&trans->list, &c->btree_trans_list);
 list_add_done:
-               mutex_unlock(&c->btree_trans_lock);
+               seqmutex_unlock(&c->btree_trans_lock);
        }
+
+       return trans;
 }
 
 static void check_btree_paths_leaked(struct btree_trans *trans)
@@ -3016,14 +2951,14 @@ leaked:
        trans_for_each_path(trans, path)
                if (path->ref)
                        printk(KERN_ERR "  btree %s %pS\n",
-                              bch2_btree_ids[path->btree_id],
+                              bch2_btree_id_str(path->btree_id),
                               (void *) path->ip_allocated);
        /* Be noisy about this: */
        bch2_fatal_error(c);
 #endif
 }
 
-void bch2_trans_exit(struct btree_trans *trans)
+void bch2_trans_put(struct btree_trans *trans)
        __releases(&c->btree_trans_barrier)
 {
        struct btree_insert_entry *i;
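With __bch2_trans_init() becoming __bch2_trans_get() and bch2_trans_exit() becoming bch2_trans_put(), the transaction object no longer lives on the caller's stack: it comes from the per-cpu buffer or the mempool and is handed back on put. The new lifecycle, assuming the bch2_trans_get() convenience wrapper in the header:

    struct btree_trans *trans = bch2_trans_get(c);

    int ret = do_stuff(trans);      /* hypothetical transaction body */

    bch2_trans_put(trans);          /* recycled via per-cpu buf or mempool */
    return ret;
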
@@ -3032,6 +2967,12 @@ void bch2_trans_exit(struct btree_trans *trans)
 
        bch2_trans_unlock(trans);
 
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+               seqmutex_lock(&c->btree_trans_lock);
+               list_del(&trans->list);
+               seqmutex_unlock(&c->btree_trans_lock);
+       }
+
        closure_sync(&trans->ref);
 
        if (s)
@@ -3043,16 +2984,11 @@ void bch2_trans_exit(struct btree_trans *trans)
 
        check_btree_paths_leaked(trans);
 
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
-               mutex_lock(&c->btree_trans_lock);
-               list_del(&trans->list);
-               mutex_unlock(&c->btree_trans_lock);
+       if (trans->srcu_held) {
+               check_srcu_held_too_long(trans);
+               srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
        }
 
-       srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-
-       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
        kfree(trans->extra_journal_entries.data);
 
        if (trans->fs_usage_deltas) {
@@ -3064,23 +3000,19 @@ void bch2_trans_exit(struct btree_trans *trans)
                        kfree(trans->fs_usage_deltas);
        }
 
+       if (unlikely(trans->journal_replay_not_finished))
+               bch2_journal_keys_put(c);
+
        if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
                mempool_free(trans->mem, &c->btree_trans_mem_pool);
        else
                kfree(trans->mem);
 
-#ifdef __KERNEL__
-       /*
-        * Userspace doesn't have a real percpu implementation:
-        */
-       trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
-#endif
-
-       if (trans->paths)
-               mempool_free(trans->paths, &c->btree_paths_pool);
-
-       trans->mem      = (void *) 0x1;
-       trans->paths    = (void *) 0x1;
+       /* Userspace doesn't have a real percpu implementation: */
+       if (IS_ENABLED(__KERNEL__))
+               trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
+       if (trans)
+               mempool_free(trans, &c->btree_trans_pool);
 }
 
 static void __maybe_unused
@@ -3098,7 +3030,7 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
 
        prt_tab(out);
        prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
-                  b->level, bch2_btree_ids[b->btree_id]);
+                  b->level, bch2_btree_id_str(b->btree_id));
        bch2_bpos_to_text(out, btree_node_pos(b));
 
        prt_tab(out);
@@ -3111,16 +3043,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
        struct btree_path *path;
        struct btree_bkey_cached_common *b;
        static char lock_types[] = { 'r', 'i', 'w' };
-       unsigned l;
+       struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+       unsigned l, idx;
 
        if (!out->nr_tabstops) {
                printbuf_tabstop_push(out, 16);
                printbuf_tabstop_push(out, 32);
        }
 
-       prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
+       prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
 
-       trans_for_each_path(trans, path) {
+       trans_for_each_path_safe(trans, path, idx) {
                if (!path->nodes_locked)
                        continue;
 
@@ -3128,7 +3061,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
                       path->idx,
                       path->cached ? 'c' : 'b',
                       path->level,
-                      bch2_btree_ids[path->btree_id]);
+                      bch2_btree_id_str(path->btree_id));
                bch2_bpos_to_text(out, path->pos);
                prt_newline(out);
 
@@ -3158,6 +3091,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
        struct btree_transaction_stats *s;
+       struct btree_trans *trans;
+       int cpu;
+
+       trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+       if (trans)
+               panic("%s leaked btree_trans\n", trans->fn);
+
+       if (c->btree_trans_bufs)
+               for_each_possible_cpu(cpu)
+                       kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
+       free_percpu(c->btree_trans_bufs);
 
        for (s = c->btree_transaction_stats;
             s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
@@ -3169,13 +3113,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
        if (c->btree_trans_barrier_initialized)
                cleanup_srcu_struct(&c->btree_trans_barrier);
        mempool_exit(&c->btree_trans_mem_pool);
-       mempool_exit(&c->btree_paths_pool);
+       mempool_exit(&c->btree_trans_pool);
 }
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
 {
        struct btree_transaction_stats *s;
-       unsigned nr = BTREE_ITER_MAX;
        int ret;
 
        for (s = c->btree_transaction_stats;
@@ -3186,11 +3129,14 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
        }
 
        INIT_LIST_HEAD(&c->btree_trans_list);
-       mutex_init(&c->btree_trans_lock);
+       seqmutex_init(&c->btree_trans_lock);
+
+       c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
+       if (!c->btree_trans_bufs)
+               return -ENOMEM;
 
-       ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
-                       sizeof(struct btree_path) * nr +
-                       sizeof(struct btree_insert_entry) * nr) ?:
+       ret   = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
+                                         sizeof(struct btree_trans)) ?:
                mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
                                          BTREE_TRANS_MEM_MAX) ?:
                init_srcu_struct(&c->btree_trans_barrier);