git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 3f3f969859 bcachefs: Fix some compiler warnings
author Kent Overstreet <kent.overstreet@gmail.com>
Thu, 9 Sep 2021 23:06:29 +0000 (19:06 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
Thu, 9 Sep 2021 23:10:07 +0000 (19:10 -0400)
59 files changed:
.bcachefs_revision
cmd_debug.c
cmd_fusemount.c
include/trace/events/bcachefs.h
libbcachefs/acl.c
libbcachefs/alloc_background.c
libbcachefs/bcachefs.h
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bset.c
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache.h
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/ec.c
libbcachefs/extent_update.c
libbcachefs/extent_update.h
libbcachefs/extents.c
libbcachefs/fs-common.c
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/journal_seq_blacklist.c
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/move_types.h
libbcachefs/movinggc.c
libbcachefs/opts.h
libbcachefs/quota.c
libbcachefs/rebalance.c
libbcachefs/rebalance_types.h
libbcachefs/recovery.c
libbcachefs/reflink.c
libbcachefs/str_hash.h
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/tests.c
libbcachefs/varint.c
libbcachefs/xattr.c

index ab237af5c1f6263e5c5277bfbe84fb43a43be42a..e80bf480fd9b0e264f2d5109f4dda4b81c711e4b 100644 (file)
@@ -1 +1 @@
-60fbf06f49679fdb2b37e1e863c321dfddfc3a4a
+3f3f9698592290e98a727f5023115c1775be7d5f
index 2f56e41e82b70834ecc29b78a69c8ddc842086cb..b3a6ea0c6d804de8afef5ec2c642780b9be9d798 100644 (file)
@@ -64,7 +64,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
                const struct bch_extent_ptr *ptr;
                struct bkey_ptrs_c ptrs;
                struct btree_trans trans;
-               struct btree_iter *iter;
+               struct btree_iter iter;
                struct btree *b;
 
                bch2_trans_init(&trans, c, 0, 0);
@@ -95,6 +95,8 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
                                                  ptr->offset << 9,
                                                  btree_bytes(c));
                }
+
+               bch2_trans_iter_exit(&trans, &iter);
                bch2_trans_exit(&trans);
        }
 
@@ -181,7 +183,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
                      struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        char buf[512];
        int ret;
@@ -196,7 +198,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
                bch2_bkey_val_to_text(&PBUF(buf), c, k);
                puts(buf);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 }
@@ -205,7 +207,7 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne
                               struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        char buf[4096];
 
@@ -218,7 +220,7 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne
                bch2_btree_node_to_text(&PBUF(buf), c, b);
                puts(buf);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 }
@@ -227,7 +229,7 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level,
                       struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        char buf[4096];
 
@@ -241,7 +243,7 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level,
                fputs(buf, stdout);
                putchar('\n');
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 }
@@ -346,7 +348,7 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned
                              struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        char buf[4096];
 
@@ -362,7 +364,7 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned
 
                print_node_ondisk(c, b);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 }
@@ -371,7 +373,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l
                            struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree_node_iter node_iter;
        struct bkey unpacked;
        struct bkey_s_c k;
@@ -393,7 +395,7 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l
                        puts(buf);
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 }
index 2b6b2d7ecc92e8d3b322d9138173b608a55ca534..216094f06ad33e3b34d8ed72e4c96d3b0e4a927e 100644 (file)
@@ -171,7 +171,7 @@ static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t inum,
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked inode_u;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        u64 now;
        int ret;
 
@@ -185,8 +185,7 @@ retry:
        bch2_trans_begin(&trans);
        now = bch2_current_time(c);
 
-       iter = bch2_inode_peek(&trans, &inode_u, inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(iter);
+       ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
@@ -208,11 +207,11 @@ retry:
                inode_u.bi_mtime = now;
        /* TODO: CTIME? */
 
-       ret   = bch2_inode_write(&trans, iter, &inode_u) ?:
+       ret   = bch2_inode_write(&trans, &iter, &inode_u) ?:
                bch2_trans_commit(&trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL);
 err:
-        bch2_trans_iter_put(&trans, iter);
+        bch2_trans_iter_exit(&trans, &iter);
        if (ret == -EINTR)
                goto retry;
 
@@ -523,7 +522,7 @@ static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t inum,
 static int inode_update_times(struct bch_fs *c, fuse_ino_t inum)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bch_inode_unpacked inode_u;
        int ret = 0;
        u64 now;
@@ -533,15 +532,14 @@ retry:
        bch2_trans_begin(&trans);
        now = bch2_current_time(c);
 
-       iter = bch2_inode_peek(&trans, &inode_u, inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(iter);
+       ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        inode_u.bi_mtime = now;
        inode_u.bi_ctime = now;
 
-       ret = bch2_inode_write(&trans, iter, &inode_u);
+       ret = bch2_inode_write(&trans, &iter, &inode_u);
        if (ret)
                goto err;
 
@@ -549,7 +547,7 @@ retry:
                                BTREE_INSERT_NOFAIL);
 
 err:
-        bch2_trans_iter_put(&trans, iter);
+        bch2_trans_iter_exit(&trans, &iter);
        if (ret == -EINTR)
                goto retry;
 
index a11bb5f7180eec21aafe49f7beb9c64d2cb96560..fce3146378f9fbd51b37f115fe9e8e1fd0ffb8f5 100644 (file)
@@ -298,28 +298,6 @@ TRACE_EVENT(btree_reserve_get_fail,
                  __entry->required, __entry->cl)
 );
 
-TRACE_EVENT(btree_insert_key,
-       TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k),
-       TP_ARGS(c, b, k),
-
-       TP_STRUCT__entry(
-               __field(u8,             id                      )
-               __field(u64,            inode                   )
-               __field(u64,            offset                  )
-               __field(u32,            size                    )
-       ),
-
-       TP_fast_assign(
-               __entry->id             = b->c.btree_id;
-               __entry->inode          = k->k.p.inode;
-               __entry->offset         = k->k.p.offset;
-               __entry->size           = k->k.size;
-       ),
-
-       TP_printk("btree %u: %llu:%llu len %u", __entry->id,
-                 __entry->inode, __entry->offset, __entry->size)
-);
-
 DEFINE_EVENT(btree_node, btree_split,
        TP_PROTO(struct bch_fs *c, struct btree *b),
        TP_ARGS(c, b)
@@ -540,69 +518,6 @@ TRACE_EVENT(copygc_wait,
                  __entry->wait_amount, __entry->until)
 );
 
-TRACE_EVENT(trans_get_iter,
-       TP_PROTO(unsigned long trans_ip,
-                unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *got_pos,
-                unsigned got_locks,
-                unsigned got_uptodate,
-                struct bpos *src_pos,
-                unsigned src_locks,
-                unsigned src_uptodate),
-       TP_ARGS(trans_ip, caller_ip, btree_id,
-               got_pos, got_locks, got_uptodate,
-               src_pos, src_locks, src_uptodate),
-
-       TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip                )
-               __field(unsigned long,          caller_ip               )
-               __field(u8,                     btree_id                )
-               __field(u64,                    got_pos_inode           )
-               __field(u64,                    got_pos_offset          )
-               __field(u32,                    got_pos_snapshot        )
-               __field(u8,                     got_locks               )
-               __field(u8,                     got_uptodate            )
-               __field(u64,                    src_pos_inode           )
-               __field(u64,                    src_pos_offset          )
-               __field(u32,                    src_pos_snapshot        )
-               __field(u8,                     src_locks               )
-               __field(u8,                     src_uptodate            )
-       ),
-
-       TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
-               __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = btree_id;
-               __entry->got_pos_inode          = got_pos->inode;
-               __entry->got_pos_offset         = got_pos->offset;
-               __entry->got_pos_snapshot       = got_pos->snapshot;
-               __entry->got_locks              = got_locks;
-               __entry->got_uptodate           = got_uptodate;
-               __entry->src_pos_inode          = src_pos->inode;
-               __entry->src_pos_offset         = src_pos->offset;
-               __entry->src_pos_snapshot       = src_pos->snapshot;
-               __entry->src_locks              = src_locks;
-               __entry->src_uptodate           = src_uptodate;
-       ),
-
-       TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u "
-                 "src %llu:%llu:%u l %u u %u",
-                 (void *) __entry->trans_ip,
-                 (void *) __entry->caller_ip,
-                 __entry->btree_id,
-                 __entry->got_pos_inode,
-                 __entry->got_pos_offset,
-                 __entry->got_pos_snapshot,
-                 __entry->got_locks,
-                 __entry->got_uptodate,
-                 __entry->src_pos_inode,
-                 __entry->src_pos_offset,
-                 __entry->src_pos_snapshot,
-                 __entry->src_locks,
-                 __entry->src_uptodate)
-);
-
 TRACE_EVENT(transaction_restart_ip,
        TP_PROTO(unsigned long caller, unsigned long ip),
        TP_ARGS(caller, ip),
@@ -772,96 +687,6 @@ DEFINE_EVENT(transaction_restart_iter,     trans_restart_traverse,
        TP_ARGS(trans_ip, caller_ip, btree_id, pos)
 );
 
-TRACE_EVENT(iter_traverse,
-       TP_PROTO(unsigned long  trans_ip,
-                unsigned long  caller_ip,
-                bool key_cache,
-                enum btree_id  btree_id,
-                struct bpos    *pos,
-                int ret),
-       TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, ret),
-
-       TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip        )
-               __field(unsigned long,          caller_ip       )
-               __field(u8,                     key_cache       )
-               __field(u8,                     btree_id        )
-               __field(u64,                    pos_inode       )
-               __field(u64,                    pos_offset      )
-               __field(u32,                    pos_snapshot    )
-               __field(s32,                    ret             )
-       ),
-
-       TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
-               __entry->caller_ip              = caller_ip;
-               __entry->key_cache              = key_cache;
-               __entry->btree_id               = btree_id;
-               __entry->pos_inode              = pos->inode;
-               __entry->pos_offset             = pos->offset;
-               __entry->pos_snapshot           = pos->snapshot;
-               __entry->ret                    = ret;
-       ),
-
-       TP_printk("%ps %pS key cache %u btree %u %llu:%llu:%u ret %i",
-                 (void *) __entry->trans_ip,
-                 (void *) __entry->caller_ip,
-                 __entry->key_cache,
-                 __entry->btree_id,
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot,
-                 __entry->ret)
-);
-
-TRACE_EVENT(iter_set_search_pos,
-       TP_PROTO(unsigned long  trans_ip,
-                unsigned long  caller_ip,
-                enum btree_id  btree_id,
-                struct bpos    *old_pos,
-                struct bpos    *new_pos,
-                unsigned       good_level),
-       TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level),
-
-       TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip                )
-               __field(unsigned long,          caller_ip               )
-               __field(u8,                     btree_id                )
-               __field(u64,                    old_pos_inode           )
-               __field(u64,                    old_pos_offset          )
-               __field(u32,                    old_pos_snapshot        )
-               __field(u64,                    new_pos_inode           )
-               __field(u64,                    new_pos_offset          )
-               __field(u32,                    new_pos_snapshot        )
-               __field(u8,                     good_level              )
-       ),
-
-       TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
-               __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = btree_id;
-               __entry->old_pos_inode          = old_pos->inode;
-               __entry->old_pos_offset         = old_pos->offset;
-               __entry->old_pos_snapshot       = old_pos->snapshot;
-               __entry->new_pos_inode          = new_pos->inode;
-               __entry->new_pos_offset         = new_pos->offset;
-               __entry->new_pos_snapshot       = new_pos->snapshot;
-               __entry->good_level             = good_level;
-       ),
-
-       TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u",
-                 (void *) __entry->trans_ip,
-                 (void *) __entry->caller_ip,
-                 __entry->btree_id,
-                 __entry->old_pos_inode,
-                 __entry->old_pos_offset,
-                 __entry->old_pos_snapshot,
-                 __entry->new_pos_inode,
-                 __entry->new_pos_offset,
-                 __entry->new_pos_snapshot,
-                 __entry->good_level)
-);
-
 TRACE_EVENT(trans_restart_would_deadlock,
        TP_PROTO(unsigned long  trans_ip,
                 unsigned long  caller_ip,
@@ -931,99 +756,42 @@ TRACE_EVENT(trans_restart_would_deadlock,
                  __entry->want_pos_snapshot)
 );
 
-TRACE_EVENT(trans_restart_mem_realloced,
-       TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
-                unsigned long bytes),
-       TP_ARGS(trans_ip, caller_ip, bytes),
+TRACE_EVENT(trans_restart_would_deadlock_write,
+       TP_PROTO(unsigned long trans_ip),
+       TP_ARGS(trans_ip),
 
        TP_STRUCT__entry(
                __field(unsigned long,          trans_ip        )
-               __field(unsigned long,          caller_ip       )
-               __field(unsigned long,          bytes           )
        ),
 
        TP_fast_assign(
                __entry->trans_ip       = trans_ip;
-               __entry->caller_ip      = caller_ip;
-               __entry->bytes          = bytes;
        ),
 
-       TP_printk("%ps %pS bytes %lu",
-                 (void *) __entry->trans_ip,
-                 (void *) __entry->caller_ip,
-                 __entry->bytes)
+       TP_printk("%ps", (void *) __entry->trans_ip)
 );
 
-DECLARE_EVENT_CLASS(node_lock_fail,
-       TP_PROTO(unsigned long trans_ip,
-                unsigned long caller_ip,
-                bool key_cache,
-                enum btree_id btree_id,
-                struct bpos *pos,
-                unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
-       TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos,
-               level, iter_seq, node, node_seq),
+TRACE_EVENT(trans_restart_mem_realloced,
+       TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
+                unsigned long bytes),
+       TP_ARGS(trans_ip, caller_ip, bytes),
 
        TP_STRUCT__entry(
                __field(unsigned long,          trans_ip        )
                __field(unsigned long,          caller_ip       )
-               __field(u8,                     key_cache       )
-               __field(u8,                     btree_id        )
-               __field(u64,                    pos_inode       )
-               __field(u64,                    pos_offset      )
-               __field(u32,                    pos_snapshot    )
-               __field(u32,                    level           )
-               __field(u32,                    iter_seq        )
-               __field(u32,                    node            )
-               __field(u32,                    node_seq        )
+               __field(unsigned long,          bytes           )
        ),
 
        TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
-               __entry->caller_ip              = caller_ip;
-               __entry->key_cache              = key_cache;
-               __entry->btree_id               = btree_id;
-               __entry->pos_inode              = pos->inode;
-               __entry->pos_offset             = pos->offset;
-               __entry->pos_snapshot           = pos->snapshot;
-               __entry->level                  = level;
-               __entry->iter_seq               = iter_seq;
-               __entry->node                   = node;
-               __entry->node_seq               = node_seq;
+               __entry->trans_ip       = trans_ip;
+               __entry->caller_ip      = caller_ip;
+               __entry->bytes          = bytes;
        ),
 
-       TP_printk("%ps %pS key cache %u btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u",
+       TP_printk("%ps %pS bytes %lu",
                  (void *) __entry->trans_ip,
                  (void *) __entry->caller_ip,
-                 __entry->key_cache,
-                 __entry->btree_id,
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot,
-                 __entry->level, __entry->iter_seq,
-                 __entry->node, __entry->node_seq)
-);
-
-DEFINE_EVENT(node_lock_fail, node_upgrade_fail,
-       TP_PROTO(unsigned long trans_ip,
-                unsigned long caller_ip,
-                bool key_cache,
-                enum btree_id btree_id,
-                struct bpos *pos,
-                unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
-       TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos,
-               level, iter_seq, node, node_seq)
-);
-
-DEFINE_EVENT(node_lock_fail, node_relock_fail,
-       TP_PROTO(unsigned long trans_ip,
-                unsigned long caller_ip,
-                bool key_cache,
-                enum btree_id btree_id,
-                struct bpos *pos,
-                unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
-       TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos,
-               level, iter_seq, node, node_seq)
+                 __entry->bytes)
 );
 
 #endif /* _TRACE_BCACHE_H */
index eb907e5d33d3fcf364f0f977d390d916336d391d..2146a63d1846353fbac2badd7e028a702bf24e84 100644 (file)
@@ -218,7 +218,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c_xattr xattr;
        struct posix_acl *acl = NULL;
        struct bkey_s_c k;
@@ -228,20 +228,19 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
 retry:
        bch2_trans_begin(&trans);
 
-       iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+       ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
                        &hash, inode->v.i_ino,
                        &X_SEARCH(acl_to_xattr_type(type), "", 0),
                        0);
-       if (IS_ERR(iter)) {
-               if (PTR_ERR(iter) == -EINTR)
+       if (ret) {
+               if (ret == -EINTR)
                        goto retry;
-
-               if (PTR_ERR(iter) != -ENOENT)
-                       acl = ERR_CAST(iter);
+               if (ret != -ENOENT)
+                       acl = ERR_PTR(ret);
                goto out;
        }
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret) {
                acl = ERR_PTR(ret);
@@ -254,8 +253,8 @@ retry:
 
        if (!IS_ERR(acl))
                set_cached_acl(&inode->v, type, acl);
-       bch2_trans_iter_put(&trans, iter);
 out:
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return acl;
 }
@@ -296,7 +295,7 @@ int bch2_set_acl(struct user_namespace *mnt_userns,
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct btree_trans trans;
-       struct btree_iter *inode_iter;
+       struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
        struct bch_hash_info hash_info;
        struct posix_acl *acl;
@@ -309,9 +308,8 @@ retry:
        bch2_trans_begin(&trans);
        acl = _acl;
 
-       inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
-                                    BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
+       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino,
+                             BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
 
@@ -332,11 +330,11 @@ retry:
        inode_u.bi_ctime        = bch2_current_time(c);
        inode_u.bi_mode         = mode;
 
-       ret =   bch2_inode_write(&trans, inode_iter, &inode_u) ?:
+       ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
                bch2_trans_commit(&trans, NULL,
                                  &inode->ei_journal_seq, 0);
 btree_err:
-       bch2_trans_iter_put(&trans, inode_iter);
+       bch2_trans_iter_exit(&trans, &inode_iter);
 
        if (ret == -EINTR)
                goto retry;
@@ -360,22 +358,21 @@ int bch2_acl_chmod(struct btree_trans *trans,
                   struct posix_acl **new_acl)
 {
        struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c_xattr xattr;
        struct bkey_i_xattr *new;
        struct posix_acl *acl;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
+       ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
                        &hash_info, inode->bi_inum,
                        &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
                        BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(iter);
        if (ret)
                return ret == -ENOENT ? 0 : ret;
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
        xattr = bkey_s_c_to_xattr(k);
        if (ret)
                goto err;
@@ -396,12 +393,12 @@ int bch2_acl_chmod(struct btree_trans *trans,
                goto err;
        }
 
-       new->k.p = iter->pos;
-       ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+       new->k.p = iter.pos;
+       ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
        *new_acl = acl;
        acl = NULL;
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        if (!IS_ERR_OR_NULL(acl))
                kfree(acl);
        return ret;
index 886861a00df30ef6393c33c8775cc923f2943dfe..87fa92408f93ec6979e7fccdb4b1467703edccc5 100644 (file)
@@ -353,32 +353,32 @@ err:
 int bch2_alloc_write(struct bch_fs *c, unsigned flags)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bch_dev *ca;
        unsigned i;
        int ret = 0;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN,
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        for_each_member_device(ca, c, i) {
-               bch2_btree_iter_set_pos(iter,
+               bch2_btree_iter_set_pos(&iter,
                        POS(ca->dev_idx, ca->mi.first_bucket));
 
-               while (iter->pos.offset < ca->mi.nbuckets) {
+               while (iter.pos.offset < ca->mi.nbuckets) {
                        bch2_trans_cond_resched(&trans);
 
-                       ret = bch2_alloc_write_key(&trans, iter, flags);
+                       ret = bch2_alloc_write_key(&trans, &iter, flags);
                        if (ret) {
                                percpu_ref_put(&ca->ref);
                                goto err;
                        }
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                }
        }
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -390,18 +390,18 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
 {
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bucket *g;
        struct bkey_alloc_buf *a;
        struct bkey_alloc_unpacked u;
        u64 *time, now;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr),
-                                  BTREE_ITER_CACHED|
-                                  BTREE_ITER_CACHED_NOFILL|
-                                  BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_CACHED_NOFILL|
+                            BTREE_ITER_INTENT);
+       ret = bch2_btree_iter_traverse(&iter);
        if (ret)
                goto out;
 
@@ -412,7 +412,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
 
        percpu_down_read(&c->mark_lock);
        g = bucket(ca, bucket_nr);
-       u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
+       u = alloc_mem_to_key(&iter, g, READ_ONCE(g->mark));
        percpu_up_read(&c->mark_lock);
 
        time = rw == READ ? &u.read_time : &u.write_time;
@@ -423,10 +423,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
        *time = now;
 
        bch2_alloc_pack(c, a, u);
-       ret   = bch2_trans_update(trans, iter, &a->k, 0) ?:
+       ret   = bch2_trans_update(trans, &iter, &a->k, 0) ?:
                bch2_trans_commit(trans, NULL, NULL, 0);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -695,27 +695,28 @@ static int bucket_invalidate_btree(struct btree_trans *trans,
        struct bkey_alloc_unpacked u;
        struct bucket *g;
        struct bucket_mark m;
-       struct btree_iter *iter =
-               bch2_trans_get_iter(trans, BTREE_ID_alloc,
-                                   POS(ca->dev_idx, b),
-                                   BTREE_ITER_CACHED|
-                                   BTREE_ITER_CACHED_NOFILL|
-                                   BTREE_ITER_INTENT);
+       struct btree_iter iter;
        int ret;
 
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                            POS(ca->dev_idx, b),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_CACHED_NOFILL|
+                            BTREE_ITER_INTENT);
+
        a = bch2_trans_kmalloc(trans, sizeof(*a));
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto err;
 
-       ret = bch2_btree_iter_traverse(iter);
+       ret = bch2_btree_iter_traverse(&iter);
        if (ret)
                goto err;
 
        percpu_down_read(&c->mark_lock);
        g = bucket(ca, b);
        m = READ_ONCE(g->mark);
-       u = alloc_mem_to_key(iter, g, m);
+       u = alloc_mem_to_key(&iter, g, m);
        percpu_up_read(&c->mark_lock);
 
        u.gen++;
@@ -726,10 +727,10 @@ static int bucket_invalidate_btree(struct btree_trans *trans,
        u.write_time    = atomic64_read(&c->io_clock[WRITE].now);
 
        bch2_alloc_pack(c, a, u);
-       ret = bch2_trans_update(trans, iter, &a->k,
+       ret = bch2_trans_update(trans, &iter, &a->k,
                                BTREE_TRIGGER_BUCKET_INVALIDATE);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
index 051aba63eaa54ff7b130d1b6a8f618e0e56111f4..9975fc173ccb7a3a87030353386f719bdc3bcb0f 100644 (file)
@@ -557,8 +557,8 @@ struct journal_keys {
        u64                     journal_seq_base;
 };
 
-struct btree_iter_buf {
-       struct btree_iter       *iter;
+struct btree_path_buf {
+       struct btree_path       *path;
 };
 
 #define REPLICAS_DELTA_LIST_MAX        (1U << 16)
@@ -666,9 +666,9 @@ struct bch_fs {
        /* btree_iter.c: */
        struct mutex            btree_trans_lock;
        struct list_head        btree_trans_list;
-       mempool_t               btree_iters_pool;
+       mempool_t               btree_paths_pool;
        mempool_t               btree_trans_mem_pool;
-       struct btree_iter_buf  __percpu *btree_iters_bufs;
+       struct btree_path_buf  __percpu *btree_paths_bufs;
 
        struct srcu_struct      btree_trans_barrier;
 
@@ -791,6 +791,10 @@ struct bch_fs {
        struct write_point      copygc_write_point;
        s64                     copygc_wait;
 
+       /* DATA PROGRESS STATS */
+       struct list_head        data_progress_list;
+       struct mutex            data_progress_lock;
+
        /* STRIPES: */
        GENRADIX(struct stripe) stripes[2];
 
index 2e45d88fab0382cdc9e99e9d5449702adc8f30d0..c4a66f28ef4be08682064acac15cd1cb8f641656 100644 (file)
@@ -163,37 +163,6 @@ static inline struct bpos bpos_max(struct bpos l, struct bpos r)
        return bpos_cmp(l, r) > 0 ? l : r;
 }
 
-#define sbb(a, b, borrow)                              \
-do {                                                   \
-       typeof(a) d1, d2;                               \
-                                                       \
-       d1 = a - borrow;                                \
-       borrow  = d1 > a;                               \
-                                                       \
-       d2 = d1 - b;                                    \
-       borrow += d2 > d1;                              \
-       a = d2;                                         \
-} while (0)
-
-/* returns a - b: */
-static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
-{
-       int borrow = 0;
-
-       sbb(a.snapshot, b.snapshot,     borrow);
-       sbb(a.offset,   b.offset,       borrow);
-       sbb(a.inode,    b.inode,        borrow);
-       return a;
-}
-
-static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
-{
-       if (bpos_cmp(l, r) > 0)
-               swap(l, r);
-
-       return bpos_sub(r, l);
-}
-
 void bch2_bpos_swab(struct bpos *);
 void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
 
index f8adbf4372764852a6838f9ed8d1aa540d2752c2..a03b5514a802288fefe91c18e6e99f8951aefabe 100644 (file)
@@ -215,6 +215,8 @@ void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
                pr_buf(out, "POS_MIN");
        else if (!bpos_cmp(pos, POS_MAX))
                pr_buf(out, "POS_MAX");
+       else if (!bpos_cmp(pos, SPOS_MAX))
+               pr_buf(out, "SPOS_MAX");
        else {
                if (pos.inode == U64_MAX)
                        pr_buf(out, "U64_MAX");
index 0eb85acdbf8bcaec42dfa6f5e33ee69df83fa9d4..59e4c1d1a2a5d0b9575f53a947b6b46b02a1de90 100644 (file)
@@ -197,9 +197,11 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
                return;
 
        /* Verify no duplicates: */
-       btree_node_iter_for_each(iter, set)
+       btree_node_iter_for_each(iter, set) {
+               BUG_ON(set->k > set->end);
                btree_node_iter_for_each(iter, s2)
                        BUG_ON(set != s2 && set->end == s2->end);
+       }
 
        /* Verify that set->end is correct: */
        btree_node_iter_for_each(iter, set) {
index cd0c5009e167b9f529fc8f9d0edfeaac460ff902..5f9ab818e2a7f4c25e27d9c17974e679a21af777 100644 (file)
@@ -128,7 +128,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 
 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
 {
-       rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+       int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+       BUG_ON(ret);
 
        /* Cause future lookups for this node to fail: */
        b->hash_val = 0;
@@ -632,7 +633,8 @@ err:
 
 /* Slowpath, don't want it inlined into btree_iter_traverse() */
 static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
-                               struct btree_iter *iter,
+                               struct btree_trans *trans,
+                               struct btree_path *path,
                                const struct bkey_i *k,
                                enum btree_id btree_id,
                                unsigned level,
@@ -648,8 +650,8 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
-       if (iter && !bch2_btree_node_relock(iter, level + 1)) {
-               btree_trans_restart(iter->trans);
+       if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
+               btree_trans_restart(trans);
                return ERR_PTR(-EINTR);
        }
 
@@ -680,23 +682,23 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
        six_unlock_intent(&b->c.lock);
 
        /* Unlock before doing IO: */
-       if (iter && sync)
-               bch2_trans_unlock(iter->trans);
+       if (trans && sync)
+               bch2_trans_unlock(trans);
 
        bch2_btree_node_read(c, b, sync);
 
        if (!sync)
                return NULL;
 
-       if (iter &&
-           (!bch2_trans_relock(iter->trans) ||
-            !bch2_btree_iter_relock_intent(iter))) {
-               BUG_ON(!iter->trans->restarted);
+       if (trans &&
+           (!bch2_trans_relock(trans) ||
+            !bch2_btree_path_relock_intent(trans, path))) {
+               BUG_ON(!trans->restarted);
                return ERR_PTR(-EINTR);
        }
 
        if (!six_relock_type(&b->c.lock, lock_type, seq)) {
-               btree_trans_restart(iter->trans);
+               btree_trans_restart(trans);
                return ERR_PTR(-EINTR);
        }
 
@@ -754,7 +756,7 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b)
  * The btree node will have either a read or a write lock held, depending on
  * the @write parameter.
  */
-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter *iter,
+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                  const struct bkey_i *k, unsigned level,
                                  enum six_lock_type lock_type,
                                  unsigned long trace_ip)
@@ -779,7 +781,7 @@ retry:
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
-               b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+               b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
                                         level, lock_type, true);
 
                /* We raced and found the btree node in the cache */
@@ -818,10 +820,10 @@ lock_node:
                 * the parent was modified, when the pointer to the node we want
                 * was removed - and we'll bail out:
                 */
-               if (btree_node_read_locked(iter, level + 1))
-                       btree_node_unlock(iter, level + 1);
+               if (btree_node_read_locked(path, level + 1))
+                       btree_node_unlock(path, level + 1);
 
-               if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
+               if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type,
                                     lock_node_check_fn, (void *) k, trace_ip)) {
                        if (!trans->restarted)
                                goto retry;
@@ -832,13 +834,13 @@ lock_node:
                             b->c.level != level ||
                             race_fault())) {
                        six_unlock_type(&b->c.lock, lock_type);
-                       if (bch2_btree_node_relock(iter, level + 1))
+                       if (bch2_btree_node_relock(trans, path, level + 1))
                                goto retry;
 
                        trace_trans_restart_btree_node_reused(trans->ip,
                                                              trace_ip,
-                                                             iter->btree_id,
-                                                             &iter->real_pos);
+                                                             path->btree_id,
+                                                             &path->pos);
                        btree_trans_restart(trans);
                        return ERR_PTR(-EINTR);
                }
@@ -853,12 +855,12 @@ lock_node:
                bch2_btree_node_wait_on_read(b);
 
                /*
-                * should_be_locked is not set on this iterator yet, so we need
-                * to relock it specifically:
+                * should_be_locked is not set on this path yet, so we need to
+                * relock it specifically:
                 */
-               if (iter &&
+               if (trans &&
                    (!bch2_trans_relock(trans) ||
-                    !bch2_btree_iter_relock_intent(iter))) {
+                    !bch2_btree_path_relock_intent(trans, path))) {
                        BUG_ON(!trans->restarted);
                        return ERR_PTR(-EINTR);
                }
@@ -886,7 +888,7 @@ lock_node:
                return ERR_PTR(-EIO);
        }
 
-       EBUG_ON(b->c.btree_id != iter->btree_id);
+       EBUG_ON(b->c.btree_id != path->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);
 
@@ -917,7 +919,7 @@ retry:
                if (nofill)
                        goto out;
 
-               b = bch2_btree_node_fill(c, NULL, k, btree_id,
+               b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);
 
                /* We raced and found the btree node in the cache */
@@ -975,21 +977,24 @@ out:
        return b;
 }
 
-int bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
+int bch2_btree_node_prefetch(struct bch_fs *c,
+                            struct btree_trans *trans,
+                            struct btree_path *path,
                             const struct bkey_i *k,
                             enum btree_id btree_id, unsigned level)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
 
-       BUG_ON(iter && !btree_node_locked(iter, level + 1));
+       BUG_ON(trans && !btree_node_locked(path, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);
 
        b = btree_cache_find(bc, k);
        if (b)
                return 0;
 
-       b = bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
+       b = bch2_btree_node_fill(c, trans, path, k, btree_id,
+                                level, SIX_LOCK_read, false);
        return PTR_ERR_OR_ZERO(b);
 }
 
index 5032293e8628a5fc0c9c23616e30923b1feb9b7c..402cec1802bc4375e2af268e59fa535695be35ff 100644 (file)
@@ -22,14 +22,14 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
 
-struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *,
+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
                                  const struct bkey_i *, unsigned,
                                  enum six_lock_type, unsigned long);
 
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
                                         enum btree_id, unsigned, bool);
 
-int bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
+int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
                             const struct bkey_i *, enum btree_id, unsigned);
 
 void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
index 3dd1094d10c9cdf61ef233ac9c94616141ce4286..307f287d95e674d46039d35b7054cf90a681e63d 100644 (file)
@@ -775,7 +775,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
                         bool initial, bool metadata_only)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        unsigned depth = metadata_only                  ? 1
                : bch2_expensive_debug_checks           ? 0
@@ -800,13 +800,13 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
                if (!initial) {
                        if (max_stale > 64)
-                               bch2_btree_node_rewrite(&trans, iter,
+                               bch2_btree_node_rewrite(&trans, &iter,
                                                b->data->keys.seq,
                                                BTREE_INSERT_NOWAIT|
                                                BTREE_INSERT_GC_LOCK_HELD);
                        else if (!bch2_btree_gc_rewrite_disabled &&
                                 (bch2_btree_gc_always_rewrite || max_stale > 16))
-                               bch2_btree_node_rewrite(&trans, iter,
+                               bch2_btree_node_rewrite(&trans, &iter,
                                                b->data->keys.seq,
                                                BTREE_INSERT_NOWAIT|
                                                BTREE_INSERT_GC_LOCK_HELD);
@@ -814,7 +814,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
                bch2_trans_cond_resched(&trans);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
@@ -1414,7 +1414,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
                                bool metadata_only)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct reflink_gc *r;
        size_t idx = 0;
@@ -1480,7 +1480,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
                }
        }
 fsck_err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 out:
        genradix_free(&c->reflink_gc_table);
@@ -1512,7 +1512,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
                                 bool metadata_only)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct reflink_gc *r;
        int ret;
@@ -1547,7 +1547,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
                r->size         = k.k->size;
                r->refcount     = 0;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        return 0;
@@ -1722,7 +1722,7 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
 static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf sk;
        int ret = 0, commit_err = 0;
@@ -1730,14 +1730,21 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
-                                  BTREE_ITER_PREFETCH|
-                                  BTREE_ITER_NOT_EXTENTS|
-                                  BTREE_ITER_ALL_SNAPSHOTS);
+       bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
-       while ((k = bch2_btree_iter_peek(iter)).k &&
-              !(ret = bkey_err(k))) {
-               c->gc_gens_pos = iter->pos;
+       while ((bch2_trans_begin(&trans),
+               k = bch2_btree_iter_peek(&iter)).k) {
+               ret = bkey_err(k);
+
+               if (ret == -EINTR)
+                       continue;
+               if (ret)
+                       break;
+
+               c->gc_gens_pos = iter.pos;
 
                if (gc_btree_gens_key(c, k) && !commit_err) {
                        bch2_bkey_buf_reassemble(&sk, c, k);
@@ -1745,7 +1752,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
 
 
                        commit_err =
-                               bch2_trans_update(&trans, iter, sk.k, 0) ?:
+                               bch2_trans_update(&trans, &iter, sk.k, 0) ?:
                                bch2_trans_commit(&trans, NULL, NULL,
                                                       BTREE_INSERT_NOWAIT|
                                                       BTREE_INSERT_NOFAIL);
@@ -1755,9 +1762,9 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
                        }
                }
 
-               bch2_btree_iter_advance(iter);
+               bch2_btree_iter_advance(&iter);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
index 40fa0111a3f635cbee8b95582d2aacabe1a93b8a..f11fcab619021f418958301eee4119a2ebc39c1e 100644 (file)
@@ -465,16 +465,13 @@ void bch2_btree_build_aux_trees(struct btree *b)
  *
  * Returns true if we sorted (i.e. invalidated iterators
  */
-void bch2_btree_init_next(struct btree_trans *trans,
-                         struct btree_iter *iter,
-                         struct btree *b)
+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
 {
        struct bch_fs *c = trans->c;
        struct btree_node_entry *bne;
        bool reinit_iter = false;
 
        EBUG_ON(!(b->c.lock.state.seq & 1));
-       EBUG_ON(iter && iter->l[b->c.level].b != b);
        BUG_ON(bset_written(b, bset(b, &b->set[1])));
 
        if (b->nsets == MAX_BSETS &&
@@ -503,8 +500,8 @@ void bch2_btree_init_next(struct btree_trans *trans,
 
        bch2_btree_build_aux_trees(b);
 
-       if (iter && reinit_iter)
-               bch2_btree_iter_reinit_node(iter, b);
+       if (reinit_iter)
+               bch2_trans_node_reinit_iter(trans, b);
 }
 
 static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
@@ -1260,7 +1257,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
        bool dump_bset_maps = false;
        bool have_retry = false;
        int ret = 0, best = -1, write = READ;
-       unsigned i, written, written2;
+       unsigned i, written = 0, written2 = 0;
        __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
                ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
 
index 7fdcf879c7d468ae796c4079a791a9c7570b648a..0f20224e2a77cec3070850226ea52cd45ebb3695 100644 (file)
@@ -134,8 +134,7 @@ void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
 void bch2_btree_node_drop_keys_outside_node(struct btree *);
 
 void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct btree_trans *, struct btree_iter *,
-                         struct btree *);
+void bch2_btree_init_next(struct btree_trans *, struct btree *);
 
 int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
                              struct btree *, bool);
index fe710d19ca1994d5149f2496dc500704b4c10a42..ce4d7c7e6f9b655dff015aa3093653579264965e 100644 (file)
 #include <linux/prefetch.h>
 #include <trace/events/bcachefs.h>
 
-static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
-static void btree_trans_sort_iters(struct btree_trans *);
-static void btree_iter_check_sort(struct btree_trans *, struct btree_iter *);
-static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long);
-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *,
-                                                struct btree_iter *);
-static void btree_iter_copy(struct btree_iter *, struct btree_iter *);
+static void btree_trans_verify_sorted(struct btree_trans *);
+static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
 
-static inline int btree_iter_cmp(const struct btree_iter *l,
-                                const struct btree_iter *r)
+static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
+                                      struct btree_path *);
+
+static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+
+static inline int __btree_path_cmp(const struct btree_path *l,
+                                  enum btree_id        r_btree_id,
+                                  bool                 r_cached,
+                                  struct bpos          r_pos,
+                                  unsigned             r_level)
 {
-       return   cmp_int(l->btree_id, r->btree_id) ?:
-               -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
-                bkey_cmp(l->real_pos, r->real_pos);
+       return   cmp_int(l->btree_id,   r_btree_id) ?:
+                cmp_int(l->cached,     r_cached) ?:
+                bpos_cmp(l->pos,       r_pos) ?:
+               -cmp_int(l->level,      r_level);
 }
 
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+static inline int btree_path_cmp(const struct btree_path *l,
+                                const struct btree_path *r)
 {
-       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+       return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
+}
 
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
        /* Are we iterating over keys in all snapshots? */
        if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
                p = bpos_successor(p);
@@ -50,8 +59,6 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
 
 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
 {
-       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
-
        /* Are we iterating over keys in all snapshots? */
        if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
                p = bpos_predecessor(p);
@@ -63,10 +70,10 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos
        return p;
 }
 
-static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
 {
        return l < BTREE_MAX_DEPTH &&
-               (unsigned long) iter->l[l].b >= 128;
+               (unsigned long) path->l[l].b >= 128;
 }
 
 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
@@ -79,41 +86,40 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
        return pos;
 }
 
-static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
+static inline bool btree_path_pos_before_node(struct btree_path *path,
                                              struct btree *b)
 {
-       return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
+       return bpos_cmp(path->pos, b->data->min_key) < 0;
 }
 
-static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
+static inline bool btree_path_pos_after_node(struct btree_path *path,
                                             struct btree *b)
 {
-       return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
+       return bpos_cmp(b->key.k.p, path->pos) < 0;
 }
 
-static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+static inline bool btree_path_pos_in_node(struct btree_path *path,
                                          struct btree *b)
 {
-       return iter->btree_id == b->c.btree_id &&
-               !btree_iter_pos_before_node(iter, b) &&
-               !btree_iter_pos_after_node(iter, b);
+       return path->btree_id == b->c.btree_id &&
+               !btree_path_pos_before_node(path, b) &&
+               !btree_path_pos_after_node(path, b);
 }
 
 /* Btree node locking: */
 
-void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+                       struct btree_path *path, struct btree *b)
 {
-       bch2_btree_node_unlock_write_inlined(b, iter);
+       bch2_btree_node_unlock_write_inlined(trans, path, b);
 }
 
-void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
 {
-       struct btree_iter *linked;
+       struct btree_path *linked;
        unsigned readers = 0;
 
-       EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
-
-       trans_for_each_iter(iter->trans, linked)
+       trans_for_each_path(trans, linked)
                if (linked->l[b->c.level].b == b &&
                    btree_node_read_locked(linked, b->c.level))
                        readers++;
@@ -126,138 +132,132 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
         */
        atomic64_sub(__SIX_VAL(read_lock, readers),
                     &b->c.lock.state.counter);
-       btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
+       btree_node_lock_type(trans->c, b, SIX_LOCK_write);
        atomic64_add(__SIX_VAL(read_lock, readers),
                     &b->c.lock.state.counter);
 }
 
-bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+                             struct btree_path *path, unsigned level)
 {
-       struct btree *b = btree_iter_node(iter, level);
-       int want = __btree_lock_want(iter, level);
+       struct btree *b = btree_path_node(path, level);
+       int want = __btree_lock_want(path, level);
 
-       if (!is_btree_node(iter, level))
+       if (!is_btree_node(path, level))
                return false;
 
        if (race_fault())
                return false;
 
-       if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
-           (btree_node_lock_seq_matches(iter, b, level) &&
-            btree_node_lock_increment(iter->trans, b, level, want))) {
-               mark_btree_node_locked(iter, level, want);
+       if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+           (btree_node_lock_seq_matches(path, b, level) &&
+            btree_node_lock_increment(trans, b, level, want))) {
+               mark_btree_node_locked(trans, path, level, want);
                return true;
        } else {
                return false;
        }
 }
 
-static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
+static bool bch2_btree_node_upgrade(struct btree_trans *trans,
+                                   struct btree_path *path, unsigned level)
 {
-       struct btree *b = iter->l[level].b;
+       struct btree *b = path->l[level].b;
 
-       EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
+       EBUG_ON(btree_lock_want(path, level) != BTREE_NODE_INTENT_LOCKED);
 
-       if (!is_btree_node(iter, level))
+       if (!is_btree_node(path, level))
                return false;
 
-       if (btree_node_intent_locked(iter, level))
+       if (btree_node_intent_locked(path, level))
                return true;
 
        if (race_fault())
                return false;
 
-       if (btree_node_locked(iter, level)
+       if (btree_node_locked(path, level)
            ? six_lock_tryupgrade(&b->c.lock)
-           : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
+           : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
                goto success;
 
-       if (btree_node_lock_seq_matches(iter, b, level) &&
-           btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
-               btree_node_unlock(iter, level);
+       if (btree_node_lock_seq_matches(path, b, level) &&
+           btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
+               btree_node_unlock(path, level);
                goto success;
        }
 
        return false;
 success:
-       mark_btree_node_intent_locked(iter, level);
+       mark_btree_node_intent_locked(trans, path, level);
        return true;
 }
 
-static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade,
-                                       unsigned long trace_ip)
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+                                       struct btree_path *path,
+                                       bool upgrade, unsigned long trace_ip)
 {
-       unsigned l = iter->level;
+       unsigned l = path->level;
        int fail_idx = -1;
 
        do {
-               if (!btree_iter_node(iter, l))
+               if (!btree_path_node(path, l))
                        break;
 
                if (!(upgrade
-                     ? bch2_btree_node_upgrade(iter, l)
-                     : bch2_btree_node_relock(iter, l))) {
-                       (upgrade
-                        ? trace_node_upgrade_fail
-                        : trace_node_relock_fail)(iter->trans->ip, trace_ip,
-                                       btree_iter_type(iter) == BTREE_ITER_CACHED,
-                                       iter->btree_id, &iter->real_pos,
-                                       l, iter->l[l].lock_seq,
-                                       is_btree_node(iter, l)
-                                       ? 0
-                                       : (unsigned long) iter->l[l].b,
-                                       is_btree_node(iter, l)
-                                       ? iter->l[l].b->c.lock.state.seq
-                                       : 0);
+                     ? bch2_btree_node_upgrade(trans, path, l)
+                     : bch2_btree_node_relock(trans, path, l)))
                        fail_idx = l;
-                       btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-               }
 
                l++;
-       } while (l < iter->locks_want);
+       } while (l < path->locks_want);
 
        /*
         * When we fail to get a lock, we have to ensure that any child nodes
-        * can't be relocked so bch2_btree_iter_traverse has to walk back up to
+        * can't be relocked so bch2_btree_path_traverse has to walk back up to
         * the node that we failed to relock:
         */
-       while (fail_idx >= 0) {
-               btree_node_unlock(iter, fail_idx);
-               iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
-               --fail_idx;
+       if (fail_idx >= 0) {
+               __bch2_btree_path_unlock(path);
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+               do {
+                       path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+                       --fail_idx;
+               } while (fail_idx >= 0);
        }
 
-       if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
-               iter->uptodate = BTREE_ITER_NEED_PEEK;
+       if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+               path->uptodate = BTREE_ITER_UPTODATE;
 
-       bch2_btree_trans_verify_locks(iter->trans);
+       bch2_trans_verify_locks(trans);
 
-       return iter->uptodate < BTREE_ITER_NEED_RELOCK;
+       return path->uptodate < BTREE_ITER_NEED_RELOCK;
 }
 
 static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
-                                 enum btree_iter_type type)
+                                 bool cached)
 {
-       return  type != BTREE_ITER_CACHED
+       return !cached
                ? container_of(_b, struct btree, c)->key.k.p
                : container_of(_b, struct bkey_cached, c)->key.pos;
 }
 
 /* Slowpath: */
-bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
-                           unsigned level, struct btree_iter *iter,
+bool __bch2_btree_node_lock(struct btree_trans *trans,
+                           struct btree_path *path,
+                           struct btree *b,
+                           struct bpos pos, unsigned level,
                            enum six_lock_type type,
                            six_lock_should_sleep_fn should_sleep_fn, void *p,
                            unsigned long ip)
 {
-       struct btree_trans *trans = iter->trans;
-       struct btree_iter *linked, *deadlock_iter = NULL;
+       struct btree_path *linked, *deadlock_path = NULL;
        u64 start_time = local_clock();
        unsigned reason = 9;
        bool ret;
 
        /* Check if it's safe to block: */
-       trans_for_each_iter(trans, linked) {
+       trans_for_each_path(trans, linked) {
                if (!linked->nodes_locked)
                        continue;
 
@@ -275,25 +275,25 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                 */
                if (type == SIX_LOCK_intent &&
                    linked->nodes_locked != linked->nodes_intent_locked) {
-                       deadlock_iter = linked;
+                       deadlock_path = linked;
                        reason = 1;
                }
 
-               if (linked->btree_id != iter->btree_id) {
-                       if (linked->btree_id > iter->btree_id) {
-                               deadlock_iter = linked;
+               if (linked->btree_id != path->btree_id) {
+                       if (linked->btree_id > path->btree_id) {
+                               deadlock_path = linked;
                                reason = 3;
                        }
                        continue;
                }
 
                /*
-                * Within the same btree, cached iterators come before non
-                * cached iterators:
+                * Within the same btree, cached paths come before non
+                * cached paths:
                 */
-               if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
-                       if (btree_iter_is_cached(iter)) {
-                               deadlock_iter = linked;
+               if (linked->cached != path->cached) {
+                       if (path->cached) {
+                               deadlock_path = linked;
                                reason = 4;
                        }
                        continue;
@@ -301,32 +301,32 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 
                /*
                 * Interior nodes must be locked before their descendants: if
-                * another iterator has possible descendants locked of the node
+                * another path has possible descendants locked of the node
                 * we're about to lock, it must have the ancestors locked too:
                 */
                if (level > __fls(linked->nodes_locked)) {
-                       deadlock_iter = linked;
+                       deadlock_path = linked;
                        reason = 5;
                }
 
                /* Must lock btree nodes in key order: */
                if (btree_node_locked(linked, level) &&
                    bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
-                                                btree_iter_type(linked))) <= 0) {
-                       deadlock_iter = linked;
+                                                linked->cached)) <= 0) {
+                       deadlock_path = linked;
                        reason = 7;
                        BUG_ON(trans->in_traverse_all);
                }
        }
 
-       if (unlikely(deadlock_iter)) {
+       if (unlikely(deadlock_path)) {
                trace_trans_restart_would_deadlock(trans->ip, ip,
                                trans->in_traverse_all, reason,
-                               deadlock_iter->btree_id,
-                               btree_iter_type(deadlock_iter),
-                               &deadlock_iter->real_pos,
-                               iter->btree_id,
-                               btree_iter_type(iter),
+                               deadlock_path->btree_id,
+                               deadlock_path->cached,
+                               &deadlock_path->pos,
+                               path->btree_id,
+                               path->cached,
                                &pos);
                btree_trans_restart(trans);
                return false;
@@ -336,9 +336,9 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                return true;
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-       trans->locking_iter_idx = iter->idx;
+       trans->locking_path_idx = path->idx;
        trans->locking_pos      = pos;
-       trans->locking_btree_id = iter->btree_id;
+       trans->locking_btree_id = path->btree_id;
        trans->locking_level    = level;
        trans->locking          = b;
 #endif
@@ -357,59 +357,49 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 /* Btree iterator locking: */
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+
+static void bch2_btree_path_verify_locks(struct btree_path *path)
 {
        unsigned l;
 
-       if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
-               BUG_ON(iter->nodes_locked);
+       if (!path->nodes_locked) {
+               BUG_ON(path->uptodate == BTREE_ITER_UPTODATE);
                return;
        }
 
-       for (l = 0; btree_iter_node(iter, l); l++) {
-               if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
-                   !btree_node_locked(iter, l))
-                       continue;
-
-               BUG_ON(btree_lock_want(iter, l) !=
-                      btree_node_locked_type(iter, l));
-       }
+       for (l = 0; btree_path_node(path, l); l++)
+               BUG_ON(btree_lock_want(path, l) !=
+                      btree_node_locked_type(path, l));
 }
 
-void bch2_btree_trans_verify_locks(struct btree_trans *trans)
+void bch2_trans_verify_locks(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               bch2_btree_iter_verify_locks(iter);
+       trans_for_each_path(trans, path)
+               bch2_btree_path_verify_locks(path);
 }
 #else
-static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
 #endif
 
+/* Btree path locking: */
+
 /*
  * Only for btree_cache.c - only relocks intent locks
  */
-bool bch2_btree_iter_relock_intent(struct btree_iter *iter)
+bool bch2_btree_path_relock_intent(struct btree_trans *trans,
+                                  struct btree_path *path)
 {
        unsigned l;
 
-       for (l = iter->level;
-            l < iter->locks_want && btree_iter_node(iter, l);
+       for (l = path->level;
+            l < path->locks_want && btree_path_node(path, l);
             l++) {
-               if (!bch2_btree_node_relock(iter, l)) {
-                       trace_node_relock_fail(iter->trans->ip, _RET_IP_,
-                                       btree_iter_type(iter) == BTREE_ITER_CACHED,
-                                       iter->btree_id, &iter->real_pos,
-                                       l, iter->l[l].lock_seq,
-                                       is_btree_node(iter, l)
-                                       ? 0
-                                       : (unsigned long) iter->l[l].b,
-                                       is_btree_node(iter, l)
-                                       ? iter->l[l].b->c.lock.state.seq
-                                       : 0);
-                       btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-                       btree_trans_restart(iter->trans);
+               if (!bch2_btree_node_relock(trans, path, l)) {
+                       __bch2_btree_path_unlock(path);
+                       btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+                       btree_trans_restart(trans);
                        return false;
                }
        }
@@ -418,25 +408,27 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter)
 }
 
 __flatten
-bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip)
+static bool bch2_btree_path_relock(struct btree_trans *trans,
+                       struct btree_path *path, unsigned long trace_ip)
 {
-       bool ret = btree_iter_get_locks(iter, false, trace_ip);
+       bool ret = btree_path_get_locks(trans, path, false, trace_ip);
 
        if (!ret)
-               btree_trans_restart(iter->trans);
+               btree_trans_restart(trans);
        return ret;
 }
 
-bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+                              struct btree_path *path,
                               unsigned new_locks_want)
 {
-       struct btree_iter *linked;
+       struct btree_path *linked;
 
-       EBUG_ON(iter->locks_want >= new_locks_want);
+       EBUG_ON(path->locks_want >= new_locks_want);
 
-       iter->locks_want = new_locks_want;
+       path->locks_want = new_locks_want;
 
-       if (btree_iter_get_locks(iter, true, _THIS_IP_))
+       if (btree_path_get_locks(trans, path, true, _THIS_IP_))
                return true;
 
        /*
@@ -444,7 +436,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
         * iterators in the btree_trans here.
         *
         * On failure to upgrade the iterator, setting iter->locks_want and
-        * calling get_locks() is sufficient to make bch2_btree_iter_traverse()
+        * calling get_locks() is sufficient to make bch2_btree_path_traverse()
         * get the locks we want on transaction restart.
         *
         * But if this iterator was a clone, on transaction restart what we did
@@ -456,75 +448,67 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
         *
         * The code below used to be needed to ensure ancestor nodes get locked
         * before interior nodes - now that's handled by
-        * bch2_btree_iter_traverse_all().
+        * bch2_btree_path_traverse_all().
         */
-       trans_for_each_iter(iter->trans, linked)
-               if (linked != iter &&
-                   btree_iter_type(linked) == btree_iter_type(iter) &&
-                   linked->btree_id == iter->btree_id &&
+       trans_for_each_path(trans, linked)
+               if (linked != path &&
+                   linked->cached == path->cached &&
+                   linked->btree_id == path->btree_id &&
                    linked->locks_want < new_locks_want) {
                        linked->locks_want = new_locks_want;
-                       btree_iter_get_locks(linked, true, _THIS_IP_);
+                       btree_path_get_locks(trans, linked, true, _THIS_IP_);
                }
 
-       if (iter->should_be_locked)
-               btree_trans_restart(iter->trans);
        return false;
 }
 
-void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+void __bch2_btree_path_downgrade(struct btree_path *path,
                                 unsigned new_locks_want)
 {
        unsigned l;
 
-       EBUG_ON(iter->locks_want < new_locks_want);
+       EBUG_ON(path->locks_want < new_locks_want);
 
-       iter->locks_want = new_locks_want;
+       path->locks_want = new_locks_want;
 
-       while (iter->nodes_locked &&
-              (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
-               if (l > iter->level) {
-                       btree_node_unlock(iter, l);
+       while (path->nodes_locked &&
+              (l = __fls(path->nodes_locked)) >= path->locks_want) {
+               if (l > path->level) {
+                       btree_node_unlock(path, l);
                } else {
-                       if (btree_node_intent_locked(iter, l)) {
-                               six_lock_downgrade(&iter->l[l].b->c.lock);
-                               iter->nodes_intent_locked ^= 1 << l;
+                       if (btree_node_intent_locked(path, l)) {
+                               six_lock_downgrade(&path->l[l].b->c.lock);
+                               path->nodes_intent_locked ^= 1 << l;
                        }
                        break;
                }
        }
 
-       bch2_btree_trans_verify_locks(iter->trans);
+       bch2_btree_path_verify_locks(path);
 }
 
 void bch2_trans_downgrade(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               bch2_btree_iter_downgrade(iter);
+       trans_for_each_path(trans, path)
+               bch2_btree_path_downgrade(path);
 }
 
 /* Btree transaction locking: */
 
-static inline bool btree_iter_should_be_locked(struct btree_iter *iter)
-{
-       return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) ||
-               iter->should_be_locked;
-}
-
 bool bch2_trans_relock(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
        if (unlikely(trans->restarted))
                return false;
 
-       trans_for_each_iter(trans, iter)
-               if (btree_iter_should_be_locked(iter) &&
-                   !bch2_btree_iter_relock(iter, _RET_IP_)) {
+       trans_for_each_path(trans, path)
+               if (path->should_be_locked &&
+                   !bch2_btree_path_relock(trans, path, _RET_IP_)) {
                        trace_trans_restart_relock(trans->ip, _RET_IP_,
-                                       iter->btree_id, &iter->real_pos);
+                                       path->btree_id, &path->pos);
                        BUG_ON(!trans->restarted);
                        return false;
                }
@@ -533,10 +517,10 @@ bool bch2_trans_relock(struct btree_trans *trans)
 
 void bch2_trans_unlock(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               __bch2_btree_iter_unlock(iter);
+       trans_for_each_path(trans, path)
+               __bch2_btree_path_unlock(path);
 
        BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
 }
@@ -545,26 +529,27 @@ void bch2_trans_unlock(struct btree_trans *trans)
 
 #ifdef CONFIG_BCACHEFS_DEBUG
 
-static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
+static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+                                         struct btree_path *path)
 {
        struct bkey_cached *ck;
-       bool locked = btree_node_locked(iter, 0);
+       bool locked = btree_node_locked(path, 0);
 
-       if (!bch2_btree_node_relock(iter, 0))
+       if (!bch2_btree_node_relock(trans, path, 0))
                return;
 
-       ck = (void *) iter->l[0].b;
-       BUG_ON(ck->key.btree_id != iter->btree_id ||
-              bkey_cmp(ck->key.pos, iter->pos));
+       ck = (void *) path->l[0].b;
+       BUG_ON(ck->key.btree_id != path->btree_id ||
+              bkey_cmp(ck->key.pos, path->pos));
 
        if (!locked)
-               btree_node_unlock(iter, 0);
+               btree_node_unlock(path, 0);
 }
 
-static void bch2_btree_iter_verify_level(struct btree_iter *iter,
-                                        unsigned level)
+static void bch2_btree_path_verify_level(struct btree_trans *trans,
+                               struct btree_path *path, unsigned level)
 {
-       struct btree_iter_level *l;
+       struct btree_path_level *l;
        struct btree_node_iter tmp;
        bool locked;
        struct bkey_packed *p, *k;
@@ -574,65 +559,52 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter,
        if (!bch2_debug_check_iterators)
                return;
 
-       l       = &iter->l[level];
+       l       = &path->l[level];
        tmp     = l->iter;
-       locked  = btree_node_locked(iter, level);
+       locked  = btree_node_locked(path, level);
 
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
+       if (path->cached) {
                if (!level)
-                       bch2_btree_iter_verify_cached(iter);
+                       bch2_btree_path_verify_cached(trans, path);
                return;
        }
 
-       BUG_ON(iter->level < iter->min_depth);
-
-       if (!btree_iter_node(iter, level))
+       if (!btree_path_node(path, level))
                return;
 
-       if (!bch2_btree_node_relock(iter, level))
+       if (!bch2_btree_node_relock(trans, path, level))
                return;
 
-       BUG_ON(!btree_iter_pos_in_node(iter, l->b));
-
-       /*
-        * node iterators don't use leaf node iterator:
-        */
-       if (btree_iter_type(iter) == BTREE_ITER_NODES &&
-           level <= iter->min_depth)
-               goto unlock;
+       BUG_ON(!btree_path_pos_in_node(path, l->b));
 
        bch2_btree_node_iter_verify(&l->iter, l->b);
 
        /*
-        * For interior nodes, the iterator will have skipped past
-        * deleted keys:
-        *
-        * For extents, the iterator may have skipped past deleted keys (but not
-        * whiteouts)
+        * For interior nodes, the iterator will have skipped past deleted keys:
         */
-       p = level || btree_node_type_is_extents(iter->btree_id)
+       p = level
                ? bch2_btree_node_iter_prev(&tmp, l->b)
                : bch2_btree_node_iter_prev_all(&tmp, l->b);
        k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
 
-       if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
+       if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
                msg = "before";
                goto err;
        }
 
-       if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
+       if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
                msg = "after";
                goto err;
        }
-unlock:
+
        if (!locked)
-               btree_node_unlock(iter, level);
+               btree_node_unlock(path, level);
        return;
 err:
        strcpy(buf2, "(none)");
        strcpy(buf3, "(none)");
 
-       bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+       bch2_bpos_to_text(&PBUF(buf1), path->pos);
 
        if (p) {
                struct bkey uk = bkey_unpack_key(l->b, p);
@@ -644,79 +616,84 @@ err:
                bch2_bkey_to_text(&PBUF(buf3), &uk);
        }
 
-       panic("iterator should be %s key at level %u:\n"
-             "iter pos %s\n"
+       panic("path should be %s key at level %u:\n"
+             "path pos %s\n"
              "prev key %s\n"
              "cur  key %s\n",
              msg, level, buf1, buf2, buf3);
 }
 
-static void bch2_btree_iter_verify(struct btree_iter *iter)
+static void bch2_btree_path_verify(struct btree_trans *trans,
+                                  struct btree_path *path)
 {
-       struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
-       enum btree_iter_type type = btree_iter_type(iter);
        unsigned i;
 
-       EBUG_ON(iter->btree_id >= BTREE_ID_NR);
-
-       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
-              iter->pos.snapshot != iter->snapshot);
-
-       BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
-              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
-
-       BUG_ON(type == BTREE_ITER_NODES &&
-              !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
-
-       BUG_ON(type != BTREE_ITER_NODES &&
-              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
-              !btree_type_has_snapshots(iter->btree_id));
+       EBUG_ON(path->btree_id >= BTREE_ID_NR);
 
-       for (i = 0; i < (type != BTREE_ITER_CACHED ? BTREE_MAX_DEPTH : 1); i++) {
-               if (!iter->l[i].b) {
-                       BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i);
+       for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+               if (!path->l[i].b) {
+                       BUG_ON(c->btree_roots[path->btree_id].b->c.level > i);
                        break;
                }
 
-               bch2_btree_iter_verify_level(iter, i);
+               bch2_btree_path_verify_level(trans, path, i);
        }
 
-       bch2_btree_iter_verify_locks(iter);
+       bch2_btree_path_verify_locks(path);
 }
 
-static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+void bch2_trans_verify_paths(struct btree_trans *trans)
+{
+       struct btree_path *path;
+
+       trans_for_each_path(trans, path)
+               bch2_btree_path_verify(trans, path);
+}
+
+static void bch2_btree_iter_verify(struct btree_iter *iter)
 {
-       enum btree_iter_type type = btree_iter_type(iter);
+       struct btree_trans *trans = iter->trans;
+
+       BUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+       BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
 
        BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
               iter->pos.snapshot != iter->snapshot);
 
-       BUG_ON((type == BTREE_ITER_KEYS ||
-               type == BTREE_ITER_CACHED) &&
-              (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
-               bkey_cmp(iter->pos, iter->k.p) > 0));
+       BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
+
+       BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
+              (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              !btree_type_has_snapshots(iter->btree_id));
+
+       bch2_btree_path_verify(trans, iter->path);
 }
 
-void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
 {
-       struct btree_iter *iter;
-
-       if (!bch2_debug_check_iterators)
-               return;
+       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+              iter->pos.snapshot != iter->snapshot);
 
-       trans_for_each_iter_with_node(trans, b, iter)
-               bch2_btree_iter_verify_level(iter, b->c.level);
+       BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
+              bkey_cmp(iter->pos, iter->k.p) > 0);
 }
 
 #else
 
-static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
+static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
+                                               struct btree_path *path, unsigned l) {}
+static inline void bch2_btree_path_verify(struct btree_trans *trans,
+                                         struct btree_path *path) {}
 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
 
 #endif
 
+/* Btree path: fixups after btree updates */
+
 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
                                        struct btree *b,
                                        struct bset_tree *t,
@@ -734,40 +711,38 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
        bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
 }
 
-static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
                                               struct btree *b,
                                               struct bkey_packed *where)
 {
-       struct btree_iter_level *l = &iter->l[b->c.level];
+       struct btree_path_level *l = &path->l[b->c.level];
 
        if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
                return;
 
-       if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
+       if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
                bch2_btree_node_iter_advance(&l->iter, l->b);
-
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
 
-void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
                                      struct btree *b,
                                      struct bkey_packed *where)
 {
-       struct btree_iter *linked;
+       struct btree_path *path;
 
-       trans_for_each_iter_with_node(iter->trans, b, linked) {
-               __bch2_btree_iter_fix_key_modified(linked, b, where);
-               bch2_btree_iter_verify_level(linked, b->c.level);
+       trans_for_each_path_with_node(trans, b, path) {
+               __bch2_btree_path_fix_key_modified(path, b, where);
+               bch2_btree_path_verify_level(trans, path, b->c.level);
        }
 }
 
-static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
-                                     struct btree *b,
-                                     struct btree_node_iter *node_iter,
-                                     struct bset_tree *t,
-                                     struct bkey_packed *where,
-                                     unsigned clobber_u64s,
-                                     unsigned new_u64s)
+static void __bch2_btree_node_iter_fix(struct btree_path *path,
+                                      struct btree *b,
+                                      struct btree_node_iter *node_iter,
+                                      struct bset_tree *t,
+                                      struct bkey_packed *where,
+                                      unsigned clobber_u64s,
+                                      unsigned new_u64s)
 {
        const struct bkey_packed *end = btree_bkey_last(b, t);
        struct btree_node_iter_set *set;
@@ -785,7 +760,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
 
        /* didn't find the bset in the iterator - might have to readd it: */
        if (new_u64s &&
-           bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
+           bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
                bch2_btree_node_iter_push(node_iter, b, where, end);
                goto fixup_done;
        } else {
@@ -800,7 +775,7 @@ found:
                return;
 
        if (new_u64s &&
-           bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
+           bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
                set->k = offset;
        } else if (set->k < offset + clobber_u64s) {
                set->k = offset + new_u64s;
@@ -826,8 +801,7 @@ fixup_done:
         */
        if (!bch2_btree_node_iter_end(node_iter) &&
            iter_current_key_modified &&
-           (b->c.level ||
-            btree_node_type_is_extents(iter->btree_id))) {
+           b->c.level) {
                struct bset_tree *t;
                struct bkey_packed *k, *k2, *p;
 
@@ -852,14 +826,10 @@ fixup_done:
                                                            b, t, k2);
                }
        }
-
-       if (!b->c.level &&
-           node_iter == &iter->l[0].iter &&
-           iter_current_key_modified)
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
 
-void bch2_btree_node_iter_fix(struct btree_iter *iter,
+void bch2_btree_node_iter_fix(struct btree_trans *trans,
+                             struct btree_path *path,
                              struct btree *b,
                              struct btree_node_iter *node_iter,
                              struct bkey_packed *where,
@@ -867,26 +837,28 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
                              unsigned new_u64s)
 {
        struct bset_tree *t = bch2_bkey_to_bset(b, where);
-       struct btree_iter *linked;
+       struct btree_path *linked;
 
-       if (node_iter != &iter->l[b->c.level].iter) {
-               __bch2_btree_node_iter_fix(iter, b, node_iter, t,
+       if (node_iter != &path->l[b->c.level].iter) {
+               __bch2_btree_node_iter_fix(path, b, node_iter, t,
                                           where, clobber_u64s, new_u64s);
 
                if (bch2_debug_check_iterators)
                        bch2_btree_node_iter_verify(node_iter, b);
        }
 
-       trans_for_each_iter_with_node(iter->trans, b, linked) {
+       trans_for_each_path_with_node(trans, b, linked) {
                __bch2_btree_node_iter_fix(linked, b,
                                           &linked->l[b->c.level].iter, t,
                                           where, clobber_u64s, new_u64s);
-               bch2_btree_iter_verify_level(linked, b->c.level);
+               bch2_btree_path_verify_level(trans, linked, b->c.level);
        }
 }
 
-static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
-                                                 struct btree_iter_level *l,
+/* Btree path level: pointer to a particular btree node and node iter */
+
+static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
+                                                 struct btree_path_level *l,
                                                  struct bkey *u,
                                                  struct bkey_packed *k)
 {
@@ -911,48 +883,52 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
         * assertion here:
         */
        if (bch2_debug_check_bkeys && !bkey_deleted(ret.k))
-               bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
+               bch2_bkey_debugcheck(c, l->b, ret);
 
        return ret;
 }
 
-/* peek_all() doesn't skip deleted keys */
-static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
-                                                       struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
+                                                       struct btree_path_level *l,
+                                                       struct bkey *u)
 {
-       return __btree_iter_unpack(iter, l, &iter->k,
+       return __btree_iter_unpack(c, l, u,
                        bch2_btree_node_iter_peek_all(&l->iter, l->b));
 }
 
-static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
-                                                   struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
+                                                   struct btree_path *path,
+                                                   struct btree_path_level *l,
+                                                   struct bkey *u)
 {
-       struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
+       struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
                        bch2_btree_node_iter_peek(&l->iter, l->b));
 
-       iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
+       path->pos = k.k ? k.k->p : l->b->key.k.p;
        return k;
 }
 
-static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
-                                                   struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c,
+                                                   struct btree_path *path,
+                                                   struct btree_path_level *l,
+                                                   struct bkey *u)
 {
-       struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
+       struct bkey_s_c k = __btree_iter_unpack(c, l, u,
                        bch2_btree_node_iter_prev(&l->iter, l->b));
 
-       iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
+       path->pos = k.k ? k.k->p : l->b->data->min_key;
        return k;
 }
 
-static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
-                                            struct btree_iter_level *l,
+static inline bool btree_path_advance_to_pos(struct btree_path *path,
+                                            struct btree_path_level *l,
                                             int max_advance)
 {
        struct bkey_packed *k;
        int nr_advanced = 0;
 
        while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
-              bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
+              bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
                if (max_advance > 0 && nr_advanced >= max_advance)
                        return false;
 
@@ -966,9 +942,10 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
 /*
  * Verify that iterator for parent node points to child node:
  */
-static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
+static void btree_path_verify_new_node(struct btree_trans *trans,
+                                      struct btree_path *path, struct btree *b)
 {
-       struct btree_iter_level *l;
+       struct btree_path_level *l;
        unsigned plevel;
        bool parent_locked;
        struct bkey_packed *k;
@@ -977,15 +954,15 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
                return;
 
        plevel = b->c.level + 1;
-       if (!btree_iter_node(iter, plevel))
+       if (!btree_path_node(path, plevel))
                return;
 
-       parent_locked = btree_node_locked(iter, plevel);
+       parent_locked = btree_node_locked(path, plevel);
 
-       if (!bch2_btree_node_relock(iter, plevel))
+       if (!bch2_btree_node_relock(trans, path, plevel))
                return;
 
-       l = &iter->l[plevel];
+       l = &path->l[plevel];
        k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
        if (!k ||
            bkey_deleted(k) ||
@@ -996,8 +973,8 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
                char buf4[100];
                struct bkey uk = bkey_unpack_key(b, k);
 
-               bch2_dump_btree_node(iter->trans->c, l->b);
-               bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+               bch2_dump_btree_node(trans->c, l->b);
+               bch2_bpos_to_text(&PBUF(buf1), path->pos);
                bch2_bkey_to_text(&PBUF(buf2), &uk);
                bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
                bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
@@ -1005,20 +982,20 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
                      "iter pos %s %s\n"
                      "iter key %s\n"
                      "new node %s-%s\n",
-                     bch2_btree_ids[iter->btree_id], buf1,
+                     bch2_btree_ids[path->btree_id], buf1,
                      buf2, buf3, buf4);
        }
 
        if (!parent_locked)
-               btree_node_unlock(iter, b->c.level + 1);
+               btree_node_unlock(path, plevel);
 }
 
-static inline void __btree_iter_init(struct btree_iter *iter,
-                                    unsigned level)
+static inline void __btree_path_level_init(struct btree_path *path,
+                                          unsigned level)
 {
-       struct btree_iter_level *l = &iter->l[level];
+       struct btree_path_level *l = &path->l[level];
 
-       bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
+       bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
 
        /*
         * Iterators to interior nodes should always be pointed at the first non
@@ -1026,63 +1003,48 @@ static inline void __btree_iter_init(struct btree_iter *iter,
         */
        if (level)
                bch2_btree_node_iter_peek(&l->iter, l->b);
-
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
 
-static inline void btree_iter_node_set(struct btree_iter *iter,
-                                      struct btree *b)
+static inline void btree_path_level_init(struct btree_trans *trans,
+                                        struct btree_path *path,
+                                        struct btree *b)
 {
-       BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
+       BUG_ON(path->cached);
 
-       btree_iter_verify_new_node(iter, b);
+       btree_path_verify_new_node(trans, path, b);
 
-       EBUG_ON(!btree_iter_pos_in_node(iter, b));
+       EBUG_ON(!btree_path_pos_in_node(path, b));
        EBUG_ON(b->c.lock.state.seq & 1);
 
-       iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
-       iter->l[b->c.level].b = b;
-       __btree_iter_init(iter, b->c.level);
+       path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+       path->l[b->c.level].b = b;
+       __btree_path_level_init(path, b->c.level);
 }
 
+/* Btree path: fixups after btree node updates: */
+
 /*
  * A btree node is being replaced - update the iterator to point to the new
  * node:
  */
-void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
 {
-       enum btree_node_locked_type t;
-       struct btree_iter *linked;
+       struct btree_path *path;
 
-       trans_for_each_iter(iter->trans, linked)
-               if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
-                   btree_iter_pos_in_node(linked, b)) {
-                       /*
-                        * bch2_btree_iter_node_drop() has already been called -
-                        * the old node we're replacing has already been
-                        * unlocked and the pointer invalidated
-                        */
-                       BUG_ON(btree_node_locked(linked, b->c.level));
+       trans_for_each_path(trans, path)
+               if (!path->cached &&
+                   btree_path_pos_in_node(path, b)) {
+                       enum btree_node_locked_type t =
+                               btree_lock_want(path, b->c.level);
 
-                       t = btree_lock_want(linked, b->c.level);
-                       if (t != BTREE_NODE_UNLOCKED) {
+                       if (path->nodes_locked &&
+                           t != BTREE_NODE_UNLOCKED) {
+                               btree_node_unlock(path, b->c.level);
                                six_lock_increment(&b->c.lock, t);
-                               mark_btree_node_locked(linked, b->c.level, t);
+                               mark_btree_node_locked(trans, path, b->c.level, t);
                        }
 
-                       btree_iter_node_set(linked, b);
-               }
-}
-
-void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
-{
-       struct btree_iter *linked;
-       unsigned level = b->c.level;
-
-       trans_for_each_iter(iter->trans, linked)
-               if (linked->l[level].b == b) {
-                       btree_node_unlock(linked, level);
-                       linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
+                       btree_path_level_init(trans, path, b);
                }
 }
 
@@ -1090,14 +1052,16 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
  * A btree node has been modified in such a way as to invalidate iterators - fix
  * them:
  */
-void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
 {
-       struct btree_iter *linked;
+       struct btree_path *path;
 
-       trans_for_each_iter_with_node(iter->trans, b, linked)
-               __btree_iter_init(linked, b->c.level);
+       trans_for_each_path_with_node(trans, b, path)
+               __btree_path_level_init(path, b->c.level);
 }
 
+/* Btree path: traverse, set_pos: */
+
 static int lock_root_check_fn(struct six_lock *lock, void *p)
 {
        struct btree *b = container_of(lock, struct btree, c.lock);
@@ -1106,38 +1070,38 @@ static int lock_root_check_fn(struct six_lock *lock, void *p)
        return b == *rootp ? 0 : -1;
 }
 
-static inline int btree_iter_lock_root(struct btree_trans *trans,
-                                      struct btree_iter *iter,
+static inline int btree_path_lock_root(struct btree_trans *trans,
+                                      struct btree_path *path,
                                       unsigned depth_want,
                                       unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
+       struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
        enum six_lock_type lock_type;
        unsigned i;
 
-       EBUG_ON(iter->nodes_locked);
+       EBUG_ON(path->nodes_locked);
 
        while (1) {
                b = READ_ONCE(*rootp);
-               iter->level = READ_ONCE(b->c.level);
+               path->level = READ_ONCE(b->c.level);
 
-               if (unlikely(iter->level < depth_want)) {
+               if (unlikely(path->level < depth_want)) {
                        /*
                         * the root is at a lower depth than the depth we want:
                         * got to the end of the btree, or we're walking nodes
                         * greater than some depth and there are no nodes >=
                         * that depth
                         */
-                       iter->level = depth_want;
-                       for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
-                               iter->l[i].b = NULL;
+                       path->level = depth_want;
+                       for (i = path->level; i < BTREE_MAX_DEPTH; i++)
+                               path->l[i].b = NULL;
                        return 1;
                }
 
-               lock_type = __btree_lock_want(iter, iter->level);
-               if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level,
-                                             iter, lock_type,
+               lock_type = __btree_lock_want(path, path->level);
+               if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX,
+                                             path->level, lock_type,
                                              lock_root_check_fn, rootp,
                                              trace_ip))) {
                        if (trans->restarted)
@@ -1146,16 +1110,16 @@ static inline int btree_iter_lock_root(struct btree_trans *trans,
                }
 
                if (likely(b == READ_ONCE(*rootp) &&
-                          b->c.level == iter->level &&
+                          b->c.level == path->level &&
                           !race_fault())) {
-                       for (i = 0; i < iter->level; i++)
-                               iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
-                       iter->l[iter->level].b = b;
-                       for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
-                               iter->l[i].b = NULL;
-
-                       mark_btree_node_locked(iter, iter->level, lock_type);
-                       btree_iter_node_set(iter, b);
+                       for (i = 0; i < path->level; i++)
+                               path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
+                       path->l[path->level].b = b;
+                       for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
+                               path->l[i].b = NULL;
+
+                       mark_btree_node_locked(trans, path, path->level, lock_type);
+                       btree_path_level_init(trans, path, b);
                        return 0;
                }
 
@@ -1164,23 +1128,23 @@ static inline int btree_iter_lock_root(struct btree_trans *trans,
 }
 
 noinline
-static int btree_iter_prefetch(struct btree_iter *iter)
+static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
 {
-       struct bch_fs *c = iter->trans->c;
-       struct btree_iter_level *l = &iter->l[iter->level];
+       struct bch_fs *c = trans->c;
+       struct btree_path_level *l = path_l(path);
        struct btree_node_iter node_iter = l->iter;
        struct bkey_packed *k;
        struct bkey_buf tmp;
        unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
-               ? (iter->level > 1 ? 0 :  2)
-               : (iter->level > 1 ? 1 : 16);
-       bool was_locked = btree_node_locked(iter, iter->level);
+               ? (path->level > 1 ? 0 :  2)
+               : (path->level > 1 ? 1 : 16);
+       bool was_locked = btree_node_locked(path, path->level);
        int ret = 0;
 
        bch2_bkey_buf_init(&tmp);
 
        while (nr && !ret) {
-               if (!bch2_btree_node_relock(iter, iter->level))
+               if (!bch2_btree_node_relock(trans, path, path->level))
                        break;
 
                bch2_btree_node_iter_advance(&node_iter, l->b);
@@ -1189,26 +1153,27 @@ static int btree_iter_prefetch(struct btree_iter *iter)
                        break;
 
                bch2_bkey_buf_unpack(&tmp, c, l->b, k);
-               ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
-                                              iter->level - 1);
+               ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+                                              path->level - 1);
        }
 
        if (!was_locked)
-               btree_node_unlock(iter, iter->level);
+               btree_node_unlock(path, path->level);
 
        bch2_bkey_buf_exit(&tmp, c);
        return ret;
 }
 
-static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
+static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
+                                           struct btree_path *path,
                                            unsigned plevel, struct btree *b)
 {
-       struct btree_iter_level *l = &iter->l[plevel];
-       bool locked = btree_node_locked(iter, plevel);
+       struct btree_path_level *l = &path->l[plevel];
+       bool locked = btree_node_locked(path, plevel);
        struct bkey_packed *k;
        struct bch_btree_ptr_v2 *bp;
 
-       if (!bch2_btree_node_relock(iter, plevel))
+       if (!bch2_btree_node_relock(trans, path, plevel))
                return;
 
        k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
@@ -1218,59 +1183,61 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
        bp->mem_ptr = (unsigned long)b;
 
        if (!locked)
-               btree_node_unlock(iter, plevel);
+               btree_node_unlock(path, plevel);
 }
 
-static __always_inline int btree_iter_down(struct btree_trans *trans,
-                                          struct btree_iter *iter,
+static __always_inline int btree_path_down(struct btree_trans *trans,
+                                          struct btree_path *path,
+                                          unsigned flags,
                                           unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter_level *l = &iter->l[iter->level];
+       struct btree_path_level *l = path_l(path);
        struct btree *b;
-       unsigned level = iter->level - 1;
-       enum six_lock_type lock_type = __btree_lock_want(iter, level);
+       unsigned level = path->level - 1;
+       enum six_lock_type lock_type = __btree_lock_want(path, level);
        struct bkey_buf tmp;
        int ret;
 
-       EBUG_ON(!btree_node_locked(iter, iter->level));
+       EBUG_ON(!btree_node_locked(path, path->level));
 
        bch2_bkey_buf_init(&tmp);
        bch2_bkey_buf_unpack(&tmp, c, l->b,
                         bch2_btree_node_iter_peek(&l->iter, l->b));
 
-       b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip);
+       b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
        ret = PTR_ERR_OR_ZERO(b);
        if (unlikely(ret))
                goto err;
 
-       mark_btree_node_locked(iter, level, lock_type);
-       btree_iter_node_set(iter, b);
+       mark_btree_node_locked(trans, path, level, lock_type);
+       btree_path_level_init(trans, path, b);
 
        if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
            unlikely(b != btree_node_mem_ptr(tmp.k)))
-               btree_node_mem_ptr_set(iter, level + 1, b);
+               btree_node_mem_ptr_set(trans, path, level + 1, b);
 
-       if (iter->flags & BTREE_ITER_PREFETCH)
-               ret = btree_iter_prefetch(iter);
+       if (flags & BTREE_ITER_PREFETCH)
+               ret = btree_path_prefetch(trans, path);
 
-       if (btree_node_read_locked(iter, level + 1))
-               btree_node_unlock(iter, level + 1);
-       iter->level = level;
+       if (btree_node_read_locked(path, level + 1))
+               btree_node_unlock(path, level + 1);
+       path->level = level;
 
-       bch2_btree_iter_verify_locks(iter);
+       bch2_btree_path_verify_locks(path);
 err:
        bch2_bkey_buf_exit(&tmp, c);
        return ret;
 }
 
-static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
+static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+                                  unsigned, unsigned long);
 
-static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
+static int __btree_path_traverse_all(struct btree_trans *trans, int ret,
                                     unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_path *path;
        int i;
 
        if (trans->in_traverse_all)
@@ -1280,20 +1247,24 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
 retry_all:
        trans->restarted = false;
 
-       trans_for_each_iter(trans, iter)
-               iter->should_be_locked = false;
+       trans_for_each_path(trans, path)
+               path->should_be_locked = false;
+
+       btree_trans_verify_sorted(trans);
 
-       btree_trans_sort_iters(trans);
+#ifdef CONFIG_BCACHEFS_DEBUG
+       trans->traverse_all_idx = U8_MAX;
+#endif
 
        for (i = trans->nr_sorted - 2; i >= 0; --i) {
-               struct btree_iter *iter1 = trans->iters + trans->sorted[i];
-               struct btree_iter *iter2 = trans->iters + trans->sorted[i + 1];
-
-               if (iter1->btree_id == iter2->btree_id &&
-                   iter1->locks_want < iter2->locks_want)
-                       __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
-               else if (!iter1->locks_want && iter2->locks_want)
-                       __bch2_btree_iter_upgrade(iter1, 1);
+               struct btree_path *path1 = trans->paths + trans->sorted[i];
+               struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
+
+               if (path1->btree_id == path2->btree_id &&
+                   path1->locks_want < path2->locks_want)
+                       __bch2_btree_path_upgrade(trans, path1, path2->locks_want);
+               else if (!path1->locks_want && path2->locks_want)
+                       __bch2_btree_path_upgrade(trans, path1, 1);
        }
 
        bch2_trans_unlock(trans);
@@ -1318,18 +1289,32 @@ retry_all:
        BUG_ON(ret && ret != -EINTR);
 
        /* Now, redo traversals in correct order: */
-       trans_for_each_iter_inorder(trans, iter) {
-               EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+       i = 0;
+       while (i < trans->nr_sorted) {
+               path = trans->paths + trans->sorted[i];
+
+               EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
+#ifdef CONFIG_BCACHEFS_DEBUG
+               trans->traverse_all_idx = path->idx;
+#endif
 
-               ret = btree_iter_traverse_one(iter, _THIS_IP_);
+               ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
                if (ret)
                        goto retry_all;
 
-               EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+               EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
+
+               if (path->nodes_locked)
+                       i++;
        }
 
-       trans_for_each_iter(trans, iter)
-               BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+       /*
+        * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock()
+        * and relock(), relock() won't relock since path->should_be_locked
+        * isn't set yet, which is all fine
+        */
+       trans_for_each_path(trans, path)
+               BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
 out:
        bch2_btree_cache_cannibalize_unlock(c);
 
@@ -1339,37 +1324,50 @@ out:
        return ret;
 }
 
-static int bch2_btree_iter_traverse_all(struct btree_trans *trans)
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
 {
-       return __btree_iter_traverse_all(trans, 0, _RET_IP_);
+       return __btree_path_traverse_all(trans, 0, _RET_IP_);
 }
 
-static inline bool btree_iter_good_node(struct btree_iter *iter,
+static inline bool btree_path_good_node(struct btree_trans *trans,
+                                       struct btree_path *path,
                                        unsigned l, int check_pos)
 {
-       if (!is_btree_node(iter, l) ||
-           !bch2_btree_node_relock(iter, l))
+       if (!is_btree_node(path, l) ||
+           !bch2_btree_node_relock(trans, path, l))
                return false;
 
-       if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
+       if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
                return false;
-       if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
+       if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
                return false;
        return true;
 }
 
-static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+                                                    struct btree_path *path,
                                                     int check_pos)
 {
-       unsigned l = iter->level;
+       unsigned i, l = path->level;
 
-       while (btree_iter_node(iter, l) &&
-              !btree_iter_good_node(iter, l, check_pos)) {
-               btree_node_unlock(iter, l);
-               iter->l[l].b = BTREE_ITER_NO_NODE_UP;
+       while (btree_path_node(path, l) &&
+              !btree_path_good_node(trans, path, l, check_pos)) {
+               btree_node_unlock(path, l);
+               path->l[l].b = BTREE_ITER_NO_NODE_UP;
                l++;
        }
 
+       /* If we need intent locks, take them too: */
+       for (i = l + 1;
+            i < path->locks_want && btree_path_node(path, i);
+            i++)
+               if (!bch2_btree_node_relock(trans, path, i))
+                       while (l <= i) {
+                               btree_node_unlock(path, l);
+                               path->l[l].b = BTREE_ITER_NO_NODE_UP;
+                               l++;
+                       }
+
        return l;
 }
 
@@ -1382,249 +1380,167 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
  * stashed in the iterator and returned from bch2_trans_exit().
  */
-static int btree_iter_traverse_one(struct btree_iter *iter,
+static int btree_path_traverse_one(struct btree_trans *trans,
+                                  struct btree_path *path,
+                                  unsigned flags,
                                   unsigned long trace_ip)
 {
-       struct btree_trans *trans = iter->trans;
-       unsigned l, depth_want = iter->level;
+       unsigned depth_want = path->level;
        int ret = 0;
 
        /*
-        * Ensure we obey iter->should_be_locked: if it's set, we can't unlock
-        * and re-traverse the iterator without a transaction restart:
+        * Ensure we obey path->should_be_locked: if it's set, we can't unlock
+        * and re-traverse the path without a transaction restart:
         */
-       if (iter->should_be_locked) {
-               ret = bch2_btree_iter_relock(iter, trace_ip) ? 0 : -EINTR;
+       if (path->should_be_locked) {
+               ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR;
                goto out;
        }
 
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
-               ret = bch2_btree_iter_traverse_cached(iter);
+       if (path->cached) {
+               ret = bch2_btree_path_traverse_cached(trans, path, flags);
                goto out;
        }
 
-       if (unlikely(iter->level >= BTREE_MAX_DEPTH))
+       if (unlikely(path->level >= BTREE_MAX_DEPTH))
                goto out;
 
-       iter->level = btree_iter_up_until_good_node(iter, 0);
-
-       /* If we need intent locks, take them too: */
-       for (l = iter->level + 1;
-            l < iter->locks_want && btree_iter_node(iter, l);
-            l++)
-               if (!bch2_btree_node_relock(iter, l))
-                       while (iter->level <= l) {
-                               btree_node_unlock(iter, iter->level);
-                               iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
-                               iter->level++;
-                       }
+       path->level = btree_path_up_until_good_node(trans, path, 0);
 
        /*
-        * Note: iter->nodes[iter->level] may be temporarily NULL here - that
+        * Note: path->nodes[path->level] may be temporarily NULL here - that
         * would indicate to other code that we got to the end of the btree,
         * here it indicates that relocking the root failed - it's critical that
-        * btree_iter_lock_root() comes next and that it can't fail
+        * btree_path_lock_root() comes next and that it can't fail
         */
-       while (iter->level > depth_want) {
-               ret = btree_iter_node(iter, iter->level)
-                       ? btree_iter_down(trans, iter, trace_ip)
-                       : btree_iter_lock_root(trans, iter, depth_want, trace_ip);
+       while (path->level > depth_want) {
+               ret = btree_path_node(path, path->level)
+                       ? btree_path_down(trans, path, flags, trace_ip)
+                       : btree_path_lock_root(trans, path, depth_want, trace_ip);
                if (unlikely(ret)) {
                        if (ret == 1) {
                                /*
-                                * Got to the end of the btree (in
-                                * BTREE_ITER_NODES mode)
+                                * No nodes at this level - got to the end of
+                                * the btree:
                                 */
                                ret = 0;
                                goto out;
                        }
 
-                       __bch2_btree_iter_unlock(iter);
-                       iter->level = depth_want;
+                       __bch2_btree_path_unlock(path);
+                       path->level = depth_want;
 
-                       if (ret == -EIO) {
-                               iter->flags |= BTREE_ITER_ERROR;
-                               iter->l[iter->level].b =
+                       if (ret == -EIO)
+                               path->l[path->level].b =
                                        BTREE_ITER_NO_NODE_ERROR;
-                       } else {
-                               iter->l[iter->level].b =
+                       else
+                               path->l[path->level].b =
                                        BTREE_ITER_NO_NODE_DOWN;
-                       }
                        goto out;
                }
        }
 
-       iter->uptodate = BTREE_ITER_NEED_PEEK;
+       path->uptodate = BTREE_ITER_UPTODATE;
 out:
        BUG_ON((ret == -EINTR) != !!trans->restarted);
-       trace_iter_traverse(trans->ip, trace_ip,
-                           btree_iter_type(iter) == BTREE_ITER_CACHED,
-                           iter->btree_id, &iter->real_pos, ret);
-       bch2_btree_iter_verify(iter);
+       bch2_btree_path_verify(trans, path);
        return ret;
 }
 
-static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
+static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long);
+
+int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+                                         struct btree_path *path, unsigned flags)
 {
-       struct btree_trans *trans = iter->trans;
        int ret;
 
+       if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+               return 0;
+
        ret =   bch2_trans_cond_resched(trans) ?:
-               btree_iter_traverse_one(iter, _RET_IP_);
-       if (unlikely(ret) && hweight64(trans->iters_linked) == 1) {
-               ret = __btree_iter_traverse_all(trans, ret, _RET_IP_);
+               btree_path_traverse_one(trans, path, flags, _RET_IP_);
+       if (unlikely(ret) && hweight64(trans->paths_allocated) == 1) {
+               ret = __btree_path_traverse_all(trans, ret, _RET_IP_);
                BUG_ON(ret == -EINTR);
        }
 
        return ret;
 }
 
-/*
- * Note:
- * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
- * for internal btree iterator users
- *
- * bch2_btree_iter_traverse sets iter->real_pos to iter->pos,
- * btree_iter_traverse() does not:
- */
-static inline int __must_check
-btree_iter_traverse(struct btree_iter *iter)
-{
-       return iter->uptodate >= BTREE_ITER_NEED_RELOCK
-               ? __bch2_btree_iter_traverse(iter)
-               : 0;
-}
-
-int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
+static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+                           struct btree_path *src)
 {
-       int ret;
+       unsigned i;
 
-       btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
+       memcpy(&dst->pos, &src->pos,
+              sizeof(struct btree_path) - offsetof(struct btree_path, pos));
 
-       ret = btree_iter_traverse(iter);
-       if (ret)
-               return ret;
+       for (i = 0; i < BTREE_MAX_DEPTH; i++)
+               if (btree_node_locked(dst, i))
+                       six_lock_increment(&dst->l[i].b->c.lock,
+                                          __btree_lock_want(dst, i));
 
-       iter->should_be_locked = true;
-       return 0;
+       btree_path_check_sort(trans, dst, 0);
 }
 
-/* Iterate across nodes (leaf and interior nodes) */
-
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
+                                          bool intent)
 {
-       struct btree *b;
-       int ret;
-
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
-       bch2_btree_iter_verify(iter);
-
-       ret = btree_iter_traverse(iter);
-       if (ret)
-               return NULL;
-
-       b = btree_iter_node(iter, iter->level);
-       if (!b)
-               return NULL;
+       struct btree_path *new = btree_path_alloc(trans, src);
 
-       BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
-
-       iter->pos = iter->real_pos = b->key.k.p;
-
-       bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
-
-       return b;
+       btree_path_copy(trans, new, src);
+       __btree_path_get(new, intent);
+       return new;
 }
 
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+                        struct btree_path *path, bool intent)
 {
-       struct btree *b;
-       int ret;
-
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
-       bch2_btree_iter_verify(iter);
-
-       /* already got to end? */
-       if (!btree_iter_node(iter, iter->level))
-               return NULL;
-
-       bch2_trans_cond_resched(iter->trans);
-
-       btree_node_unlock(iter, iter->level);
-       iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
-       iter->level++;
-
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-       ret = btree_iter_traverse(iter);
-       if (ret)
-               return NULL;
-
-       /* got to end? */
-       b = btree_iter_node(iter, iter->level);
-       if (!b)
-               return NULL;
-
-       if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
-               /*
-                * Haven't gotten to the end of the parent node: go back down to
-                * the next child node
-                */
-               btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
-
-               /* Unlock to avoid screwing up our lock invariants: */
-               btree_node_unlock(iter, iter->level);
-
-               iter->level = iter->min_depth;
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-               bch2_btree_iter_verify(iter);
-
-               ret = btree_iter_traverse(iter);
-               if (ret)
-                       return NULL;
-
-               b = iter->l[iter->level].b;
+       if (path->ref > 1 || path->preserve) {
+               __btree_path_put(path, intent);
+               path = btree_path_clone(trans, path, intent);
+               path->preserve = false;
+#ifdef CONFIG_BCACHEFS_DEBUG
+               path->ip_allocated = _RET_IP_;
+#endif
+               btree_trans_verify_sorted(trans);
        }
 
-       iter->pos = iter->real_pos = b->key.k.p;
-
-       bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
-
-       return b;
+       return path;
 }
 
-/* Iterate across keys (in leaf nodes only) */
-
-static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
+static struct btree_path * __must_check
+btree_path_set_pos(struct btree_trans *trans,
+                  struct btree_path *path, struct bpos new_pos,
+                  bool intent)
 {
-#ifdef CONFIG_BCACHEFS_DEBUG
-       struct bpos old_pos = iter->real_pos;
-#endif
-       int cmp = bpos_cmp(new_pos, iter->real_pos);
-       unsigned l = iter->level;
+       int cmp = bpos_cmp(new_pos, path->pos);
+       unsigned l = path->level;
 
-       EBUG_ON(iter->trans->restarted);
+       EBUG_ON(trans->restarted);
+       EBUG_ON(!path->ref);
 
        if (!cmp)
-               goto out;
+               return path;
 
-       iter->real_pos = new_pos;
-       iter->should_be_locked = false;
+       path = bch2_btree_path_make_mut(trans, path, intent);
 
-       btree_iter_check_sort(iter->trans, iter);
+       path->pos               = new_pos;
+       path->should_be_locked  = false;
 
-       if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
-               btree_node_unlock(iter, 0);
-               iter->l[0].b = BTREE_ITER_NO_NODE_CACHED;
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-               return;
+       btree_path_check_sort(trans, path, cmp);
+
+       if (unlikely(path->cached)) {
+               btree_node_unlock(path, 0);
+               path->l[0].b = BTREE_ITER_NO_NODE_CACHED;
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+               goto out;
        }
 
-       l = btree_iter_up_until_good_node(iter, cmp);
+       l = btree_path_up_until_good_node(trans, path, cmp);
 
-       if (btree_iter_node(iter, l)) {
+       if (btree_path_node(path, l)) {
                /*
                 * We might have to skip over many keys, or just a few: try
                 * advancing the node iterator, and if we have to skip over too
@@ -1632,143 +1548,457 @@ static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_p
                 * is expensive).
                 */
                if (cmp < 0 ||
-                   !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
-                       __btree_iter_init(iter, l);
+                   !btree_path_advance_to_pos(path, &path->l[l], 8))
+                       __btree_path_level_init(path, l);
+       }
 
-               /* Don't leave it locked if we're not supposed to: */
-               if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
-                       btree_node_unlock(iter, l);
+       if (l != path->level) {
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+               __bch2_btree_path_unlock(path);
        }
 out:
-       if (l != iter->level)
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-       else
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
-       bch2_btree_iter_verify(iter);
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trace_iter_set_search_pos(iter->trans->ip, _RET_IP_,
-                                 iter->btree_id,
-                                 &old_pos, &new_pos, l);
-#endif
+       bch2_btree_path_verify(trans, path);
+       return path;
 }
 
-inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+/* Btree path: main interface: */
+
+static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
 {
-       struct bpos pos = iter->k.p;
-       bool ret = bpos_cmp(pos, SPOS_MAX) != 0;
+       struct btree_path *next;
 
-       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-               pos = bkey_successor(iter, pos);
-       bch2_btree_iter_set_pos(iter, pos);
-       return ret;
+       next = prev_btree_path(trans, path);
+       if (next && !btree_path_cmp(next, path))
+               return next;
+
+       next = next_btree_path(trans, path);
+       if (next && !btree_path_cmp(next, path))
+               return next;
+
+       return NULL;
 }
 
-inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+static bool have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
 {
-       struct bpos pos = bkey_start_pos(&iter->k);
-       bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
-                   ? bpos_cmp(pos, POS_MIN)
-                   : bkey_cmp(pos, POS_MIN)) != 0;
+       struct btree_path *next;
 
-       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-               pos = bkey_predecessor(iter, pos);
-       bch2_btree_iter_set_pos(iter, pos);
-       return ret;
+       next = prev_btree_path(trans, path);
+       if (next && path_l(next)->b == path_l(path)->b)
+               return true;
+
+       next = next_btree_path(trans, path);
+       if (next && path_l(next)->b == path_l(path)->b)
+               return true;
+
+       return false;
+}
+
+static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+{
+       __bch2_btree_path_unlock(path);
+       btree_path_list_remove(trans, path);
+       trans->paths_allocated &= ~(1ULL << path->idx);
 }
 
-static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
+void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
 {
-       struct bpos next_pos = iter->l[0].b->key.k.p;
-       bool ret = bpos_cmp(next_pos, SPOS_MAX) != 0;
+       struct btree_path *dup;
+
+       EBUG_ON(trans->paths + path->idx != path);
+       EBUG_ON(!path->ref);
+
+       if (!__btree_path_put(path, intent))
+               return;
 
        /*
-        * Typically, we don't want to modify iter->pos here, since that
-        * indicates where we searched from - unless we got to the end of the
-        * btree, in that case we want iter->pos to reflect that:
+        * Perhaps instead we should check for duplicate paths in traverse_all:
         */
-       if (ret)
-               btree_iter_set_search_pos(iter, bpos_successor(next_pos));
-       else
-               bch2_btree_iter_set_pos(iter, SPOS_MAX);
+       if (path->preserve &&
+           (dup = have_path_at_pos(trans, path))) {
+               dup->preserve = true;
+               path->preserve = false;
+       }
 
-       return ret;
+       if (!path->preserve &&
+           have_node_at_pos(trans, path))
+               __bch2_path_free(trans, path);
 }
 
-static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
 {
-       struct bpos next_pos = iter->l[0].b->data->min_key;
-       bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
+       struct btree_path *path;
+       struct btree_insert_entry *i;
+       unsigned idx;
+       char buf[300];
 
-       if (ret)
-               btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
-       else
-               bch2_btree_iter_set_pos(iter, POS_MIN);
+       btree_trans_verify_sorted(trans);
 
-       return ret;
+       trans_for_each_path_inorder(trans, path, idx)
+               printk(KERN_ERR "path: idx %u ref %u:%u%s btree %s pos %s %pS\n",
+                      path->idx, path->ref, path->intent_ref,
+                      path->preserve ? " preserve" : "",
+                      bch2_btree_ids[path->btree_id],
+                      (bch2_bpos_to_text(&PBUF(buf), path->pos), buf),
+#ifdef CONFIG_BCACHEFS_DEBUG
+                      (void *) path->ip_allocated
+#else
+                      NULL
+#endif
+                      );
+
+       trans_for_each_update(trans, i)
+               printk(KERN_ERR "update: btree %s %s %pS\n",
+                      bch2_btree_ids[i->btree_id],
+                      (bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)), buf),
+                      (void *) i->ip_allocated);
 }
 
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter,
-                                                     struct bpos pos)
+static struct btree_path *btree_path_alloc(struct btree_trans *trans,
+                                          struct btree_path *pos)
 {
-       struct btree_insert_entry *i;
+       struct btree_path *path;
+       unsigned idx;
 
-       if (!(iter->flags & BTREE_ITER_WITH_UPDATES))
-               return NULL;
+       if (unlikely(trans->paths_allocated ==
+                    ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) {
+               bch2_dump_trans_paths_updates(trans);
+               panic("trans path overflow\n");
+       }
 
-       trans_for_each_update(iter->trans, i)
-               if ((cmp_int(iter->btree_id,    i->iter->btree_id) ?:
-                    bkey_cmp(pos,              i->k->k.p)) <= 0) {
-                       if (iter->btree_id ==   i->iter->btree_id)
-                               return i->k;
-                       break;
-               }
+       idx = __ffs64(~trans->paths_allocated);
+       trans->paths_allocated |= 1ULL << idx;
 
-       return NULL;
+       path = &trans->paths[idx];
+
+       path->idx               = idx;
+       path->ref               = 0;
+       path->intent_ref        = 0;
+       path->nodes_locked      = 0;
+       path->nodes_intent_locked = 0;
+
+       btree_path_list_add(trans, pos, path);
+       return path;
 }
 
-/**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
- */
-struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+struct btree_path *bch2_path_get(struct btree_trans *trans, bool cached,
+                                enum btree_id btree_id, struct bpos pos,
+                                unsigned locks_want, unsigned level,
+                                bool intent)
 {
-       struct bpos search_key = btree_iter_search_key(iter);
-       struct bkey_i *next_update;
-       struct bkey_s_c k;
-       int ret;
-
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
-       bch2_btree_iter_verify(iter);
-       bch2_btree_iter_verify_entry_exit(iter);
-start:
-       next_update = btree_trans_peek_updates(iter, search_key);
-       btree_iter_set_search_pos(iter, search_key);
+       struct btree_path *path, *path_pos = NULL;
+       int i;
 
-       while (1) {
-               ret = btree_iter_traverse(iter);
-               if (unlikely(ret))
-                       return bkey_s_c_err(ret);
+       BUG_ON(trans->restarted);
 
-               k = btree_iter_level_peek(iter, &iter->l[0]);
+       trans_for_each_path_inorder(trans, path, i) {
+               if (__btree_path_cmp(path,
+                                    btree_id,
+                                    cached,
+                                    pos,
+                                    level) > 0)
+                       break;
 
-               if (next_update &&
-                   bpos_cmp(next_update->k.p, iter->real_pos) <= 0) {
-                       iter->k = next_update->k;
-                       k = bkey_i_to_s_c(next_update);
-               }
+               path_pos = path;
+       }
 
-               if (likely(k.k)) {
-                       if (bkey_deleted(k.k)) {
-                               search_key = bkey_successor(iter, k.k->p);
-                               goto start;
-                       }
+       if (path_pos &&
+           path_pos->cached    == cached &&
+           path_pos->btree_id  == btree_id &&
+           path_pos->level     == level) {
+               __btree_path_get(path_pos, intent);
+               path = btree_path_set_pos(trans, path_pos, pos, intent);
+               path->preserve = true;
+       } else {
+               path = btree_path_alloc(trans, path_pos);
+               path_pos = NULL;
+
+               __btree_path_get(path, intent);
+               path->pos                       = pos;
+               path->btree_id                  = btree_id;
+               path->cached                    = cached;
+               path->preserve                  = true;
+               path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
+               path->should_be_locked          = false;
+               path->level                     = level;
+               path->locks_want                = locks_want;
+               path->nodes_locked              = 0;
+               path->nodes_intent_locked       = 0;
+               for (i = 0; i < ARRAY_SIZE(path->l); i++)
+                       path->l[i].b            = BTREE_ITER_NO_NODE_INIT;
+#ifdef CONFIG_BCACHEFS_DEBUG
+               path->ip_allocated              = _RET_IP_;
+#endif
+               btree_trans_verify_sorted(trans);
+       }
 
-                       break;
+       if (path->intent_ref)
+               locks_want = max(locks_want, level + 1);
+
+       /*
+        * If the path has locks_want greater than requested, we don't downgrade
+        * it here - on transaction restart because btree node split needs to
+        * upgrade locks, we might be putting/getting the iterator again.
+        * Downgrading iterators only happens via bch2_trans_downgrade(), after
+        * a successful transaction commit.
+        */
+
+       locks_want = min(locks_want, BTREE_MAX_DEPTH);
+       if (locks_want > path->locks_want) {
+               path->locks_want = locks_want;
+               btree_path_get_locks(trans, path, true, _THIS_IP_);
+       }
+
+       return path;
+}
+
+inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+{
+
+       struct bkey_s_c k;
+
+       BUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+
+       if (!path->cached) {
+               struct btree_path_level *l = path_l(path);
+               struct bkey_packed *_k =
+                       bch2_btree_node_iter_peek_all(&l->iter, l->b);
+
+               k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
+
+               EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
+
+               if (!k.k || bpos_cmp(path->pos, k.k->p))
+                       goto hole;
+       } else {
+               struct bkey_cached *ck = (void *) path->l[0].b;
+
+               EBUG_ON(path->btree_id != ck->key.btree_id ||
+                       bkey_cmp(path->pos, ck->key.pos));
+
+               /* BTREE_ITER_CACHED_NOFILL? */
+               if (unlikely(!ck->valid))
+                       goto hole;
+
+               k = bkey_i_to_s_c(ck->k);
+       }
+
+       return k;
+hole:
+       bkey_init(u);
+       u->p = path->pos;
+       return (struct bkey_s_c) { u, NULL };
+}
+
+/* Btree iterators: */
+
+int __must_check
+__bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+       return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+}
+
+int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+       int ret;
+
+       iter->path = btree_path_set_pos(iter->trans, iter->path,
+                                       btree_iter_search_key(iter),
+                                       iter->flags & BTREE_ITER_INTENT);
+
+       ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+       if (ret)
+               return ret;
+
+       iter->path->should_be_locked = true;
+       return 0;
+}
+
+/* Iterate across nodes (leaf and interior nodes) */
+
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+{
+       struct btree *b = NULL;
+       int ret;
+
+       EBUG_ON(iter->path->cached);
+       bch2_btree_iter_verify(iter);
+
+       ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+       if (ret)
+               goto out;
+
+       b = btree_path_node(iter->path, iter->path->level);
+       if (!b)
+               goto out;
+
+       BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
+
+       bkey_init(&iter->k);
+       iter->k.p = iter->pos = b->key.k.p;
+       iter->path->should_be_locked = true;
+out:
+       bch2_btree_iter_verify_entry_exit(iter);
+       bch2_btree_iter_verify(iter);
+
+       return b;
+}
+
+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+{
+       struct btree_trans *trans = iter->trans;
+       struct btree_path *path = iter->path;
+       struct btree *b = NULL;
+       int ret;
+
+       EBUG_ON(iter->path->cached);
+       bch2_btree_iter_verify(iter);
+
+       /* already got to end? */
+       if (!btree_path_node(path, path->level))
+               goto out;
+
+       bch2_trans_cond_resched(trans);
+
+       btree_node_unlock(path, path->level);
+       path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
+       path->level++;
+
+       btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+       ret = bch2_btree_path_traverse(trans, path, iter->flags);
+       if (ret)
+               goto out;
+
+       /* got to end? */
+       b = btree_path_node(path, path->level);
+       if (!b)
+               goto out;
+
+       if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
+               /*
+                * Haven't gotten to the end of the parent node: go back down to
+                * the next child node
+                */
+               path = iter->path =
+                       btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+                                          iter->flags & BTREE_ITER_INTENT);
+
+               /* Unlock to avoid screwing up our lock invariants: */
+               btree_node_unlock(path, path->level);
+
+               path->level = iter->min_depth;
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+               bch2_btree_iter_verify(iter);
+
+               ret = bch2_btree_path_traverse(trans, path, iter->flags);
+               if (ret) {
+                       b = NULL;
+                       goto out;
                }
 
-               if (!btree_iter_set_pos_to_next_leaf(iter))
-                       return bkey_s_c_null;
+               b = path->l[path->level].b;
+       }
+
+       bkey_init(&iter->k);
+       iter->k.p = iter->pos = b->key.k.p;
+       iter->path->should_be_locked = true;
+out:
+       bch2_btree_iter_verify_entry_exit(iter);
+       bch2_btree_iter_verify(iter);
+
+       return b;
+}
+
+/* Iterate across keys (in leaf nodes only) */
+
+inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+{
+       struct bpos pos = iter->k.p;
+       bool ret = bpos_cmp(pos, SPOS_MAX) != 0;
+
+       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+               pos = bkey_successor(iter, pos);
+       bch2_btree_iter_set_pos(iter, pos);
+       return ret;
+}
+
+inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+{
+       struct bpos pos = bkey_start_pos(&iter->k);
+       bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+                   ? bpos_cmp(pos, POS_MIN)
+                   : bkey_cmp(pos, POS_MIN)) != 0;
+
+       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+               pos = bkey_predecessor(iter, pos);
+       bch2_btree_iter_set_pos(iter, pos);
+       return ret;
+}
+
+/**
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
+ * current position
+ */
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+{
+       struct btree_trans *trans = iter->trans;
+       struct bpos search_key = btree_iter_search_key(iter);
+       struct bkey_i *next_update;
+       struct bkey_s_c k;
+       int ret, cmp;
+
+       EBUG_ON(iter->path->cached || iter->path->level);
+       bch2_btree_iter_verify(iter);
+       bch2_btree_iter_verify_entry_exit(iter);
+
+       while (1) {
+               iter->path = btree_path_set_pos(trans, iter->path, search_key,
+                                  iter->flags & BTREE_ITER_INTENT);
+
+               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+               if (unlikely(ret)) {
+                       /* ensure that iter->k is consistent with iter->pos: */
+                       bch2_btree_iter_set_pos(iter, iter->pos);
+                       k = bkey_s_c_err(ret);
+                       goto out;
+               }
+
+               next_update = iter->flags & BTREE_ITER_WITH_UPDATES
+                       ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
+                       : NULL;
+               k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
+
+               /* In the btree, deleted keys sort before non-deleted: */
+               if (k.k && bkey_deleted(k.k) &&
+                   (!next_update ||
+                    bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
+                       search_key = k.k->p;
+                       continue;
+               }
+
+               if (next_update &&
+                   bpos_cmp(next_update->k.p,
+                            k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+                       iter->k = next_update->k;
+                       k = bkey_i_to_s_c(next_update);
+               }
+
+               if (likely(k.k)) {
+                       if (likely(!bkey_deleted(k.k)))
+                               break;
+
+                       /* Advance to next key: */
+                       search_key = bkey_successor(iter, k.k->p);
+               } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
+                       /* Advance to next leaf node: */
+                       search_key = bpos_successor(iter->path->l[0].b->key.k.p);
+               } else {
+                       /* End of btree: */
+                       bch2_btree_iter_set_pos(iter, SPOS_MAX);
+                       k = bkey_s_c_null;
+                       goto out;
+               }
        }
 
        /*
@@ -1780,9 +2010,18 @@ start:
        else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
                iter->pos = bkey_start_pos(k.k);
 
+       cmp = bpos_cmp(k.k->p, iter->path->pos);
+       if (cmp) {
+               iter->path = bch2_btree_path_make_mut(trans, iter->path,
+                                       iter->flags & BTREE_ITER_INTENT);
+               iter->path->pos = k.k->p;
+               btree_path_check_sort(trans, iter->path, cmp);
+       }
+out:
+       iter->path->should_be_locked = true;
+
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
        return k;
 }
 
@@ -1804,37 +2043,49 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
  */
 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 {
-       struct btree_iter_level *l = &iter->l[0];
+       struct btree_trans *trans = iter->trans;
+       struct bpos search_key = iter->pos;
        struct bkey_s_c k;
        int ret;
 
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
+       EBUG_ON(iter->path->cached || iter->path->level);
        EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
 
-       btree_iter_set_search_pos(iter, iter->pos);
-
        while (1) {
-               ret = btree_iter_traverse(iter);
+               iter->path = btree_path_set_pos(trans, iter->path, search_key,
+                                               iter->flags & BTREE_ITER_INTENT);
+
+               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
                if (unlikely(ret)) {
+                       /* ensure that iter->k is consistent with iter->pos: */
+                       bch2_btree_iter_set_pos(iter, iter->pos);
                        k = bkey_s_c_err(ret);
-                       goto no_key;
+                       goto out;
                }
 
-               k = btree_iter_level_peek(iter, l);
+               k = btree_path_level_peek(trans, iter->path,
+                                         &iter->path->l[0], &iter->k);
                if (!k.k ||
                    ((iter->flags & BTREE_ITER_IS_EXTENTS)
                     ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
                     : bkey_cmp(k.k->p, iter->pos) > 0))
-                       k = btree_iter_level_prev(iter, l);
+                       k = btree_path_level_prev(trans->c, iter->path,
+                                                 &iter->path->l[0], &iter->k);
 
-               if (likely(k.k))
-                       break;
+               btree_path_check_sort(trans, iter->path, 0);
 
-               if (!btree_iter_set_pos_to_prev_leaf(iter)) {
+               if (likely(k.k)) {
+                       break;
+               } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
+                       /* Advance to previous leaf node: */
+                       search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+               } else {
+                       /* Start of btree: */
+                       bch2_btree_iter_set_pos(iter, POS_MIN);
                        k = bkey_s_c_null;
-                       goto no_key;
+                       goto out;
                }
        }
 
@@ -1844,19 +2095,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
        if (bkey_cmp(k.k->p, iter->pos) < 0)
                iter->pos = k.k->p;
 out:
+       iter->path->should_be_locked = true;
+
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
+
        return k;
-no_key:
-       /*
-        * btree_iter_level_peek() may have set iter->k to a key we didn't want, and
-        * then we errored going to the previous leaf - make sure it's
-        * consistent with iter->pos:
-        */
-       bkey_init(&iter->k);
-       iter->k.p = iter->pos;
-       goto out;
 }
 
 /**
@@ -1873,12 +2117,12 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
 
 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 {
+       struct btree_trans *trans = iter->trans;
        struct bpos search_key;
        struct bkey_s_c k;
        int ret;
 
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS &&
-               btree_iter_type(iter) != BTREE_ITER_CACHED);
+       EBUG_ON(iter->path->level);
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
 
@@ -1892,50 +2136,41 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        }
 
        search_key = btree_iter_search_key(iter);
-       btree_iter_set_search_pos(iter, search_key);
+       iter->path = btree_path_set_pos(trans, iter->path, search_key,
+                                       iter->flags & BTREE_ITER_INTENT);
 
-       ret = btree_iter_traverse(iter);
+       ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
        if (unlikely(ret))
                return bkey_s_c_err(ret);
 
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED ||
-           !(iter->flags & BTREE_ITER_IS_EXTENTS)) {
+       if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) {
                struct bkey_i *next_update;
-               struct bkey_cached *ck;
 
-               switch (btree_iter_type(iter)) {
-               case BTREE_ITER_KEYS:
-                       k = btree_iter_level_peek_all(iter, &iter->l[0]);
-                       EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0);
-                       break;
-               case BTREE_ITER_CACHED:
-                       ck = (void *) iter->l[0].b;
-                       EBUG_ON(iter->btree_id != ck->key.btree_id ||
-                               bkey_cmp(iter->pos, ck->key.pos));
-                       BUG_ON(!ck->valid);
-
-                       k = bkey_i_to_s_c(ck->k);
-                       break;
-               case BTREE_ITER_NODES:
-                       BUG();
-               }
+               next_update = iter->flags & BTREE_ITER_WITH_UPDATES
+                       ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
+                       : NULL;
 
-               next_update = btree_trans_peek_updates(iter, search_key);
                if (next_update &&
-                   (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) {
+                   !bpos_cmp(next_update->k.p, iter->pos)) {
                        iter->k = next_update->k;
                        k = bkey_i_to_s_c(next_update);
+               } else {
+                       k = bch2_btree_path_peek_slot(iter->path, &iter->k);
                }
        } else {
-               if ((iter->flags & BTREE_ITER_INTENT)) {
-                       struct btree_iter *child =
-                               btree_iter_child_alloc(iter, _THIS_IP_);
+               struct bpos next;
+
+               if (iter->flags & BTREE_ITER_INTENT) {
+                       struct btree_iter iter2;
 
-                       btree_iter_copy(child, iter);
-                       k = bch2_btree_iter_peek(child);
+                       bch2_trans_copy_iter(&iter2, iter);
+                       k = bch2_btree_iter_peek(&iter2);
 
-                       if (k.k && !bkey_err(k))
-                               iter->k = child->k;
+                       if (k.k && !bkey_err(k)) {
+                               iter->k = iter2.k;
+                               k.k = &iter->k;
+                       }
+                       bch2_trans_iter_exit(trans, &iter2);
                } else {
                        struct bpos pos = iter->pos;
 
@@ -1945,19 +2180,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
                if (unlikely(bkey_err(k)))
                        return k;
-       }
 
-       if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) {
-               if (!k.k ||
-                   ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
-                    ? bpos_cmp(iter->pos, k.k->p)
-                    : bkey_cmp(iter->pos, k.k->p))) {
-                       bkey_init(&iter->k);
-                       iter->k.p = iter->pos;
-                       k = (struct bkey_s_c) { &iter->k, NULL };
-               }
-       } else {
-               struct bpos next = k.k ? bkey_start_pos(k.k) : POS_MAX;
+               next = k.k ? bkey_start_pos(k.k) : POS_MAX;
 
                if (bkey_cmp(iter->pos, next) < 0) {
                        bkey_init(&iter->k);
@@ -1974,9 +2198,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                }
        }
 
+       iter->path->should_be_locked = true;
+
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
 
        return k;
 }
@@ -1997,35 +2222,14 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
        return bch2_btree_iter_peek_slot(iter);
 }
 
-static inline void bch2_btree_iter_init(struct btree_trans *trans,
-                       struct btree_iter *iter, enum btree_id btree_id)
-{
-       struct bch_fs *c = trans->c;
-       unsigned i;
-
-       iter->trans                     = trans;
-       iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
-       iter->btree_id                  = btree_id;
-       iter->real_pos                  = POS_MIN;
-       iter->level                     = 0;
-       iter->min_depth                 = 0;
-       iter->locks_want                = 0;
-       iter->nodes_locked              = 0;
-       iter->nodes_intent_locked       = 0;
-       for (i = 0; i < ARRAY_SIZE(iter->l); i++)
-               iter->l[i].b            = BTREE_ITER_NO_NODE_INIT;
-
-       prefetch(c->btree_roots[btree_id].b);
-}
-
 /* new transactional stuff: */
 
-static inline void btree_iter_verify_sorted_ref(struct btree_trans *trans,
-                                               struct btree_iter *iter)
+static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
+                                               struct btree_path *path)
 {
-       EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
-       EBUG_ON(trans->sorted[iter->sorted_idx] != iter->idx);
-       EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+       EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+       EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+       EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
 }
 
 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
@@ -2034,432 +2238,180 @@ static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
        unsigned i;
 
        for (i = 0; i < trans->nr_sorted; i++)
-               btree_iter_verify_sorted_ref(trans, trans->iters + trans->sorted[i]);
+               btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
 #endif
 }
 
-static inline void btree_trans_verify_sorted(struct btree_trans *trans)
+static void btree_trans_verify_sorted(struct btree_trans *trans)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
-       struct btree_iter *iter, *prev = NULL;
+       struct btree_path *path, *prev = NULL;
+       unsigned i;
 
-       trans_for_each_iter_inorder(trans, iter)
-               BUG_ON(prev && btree_iter_cmp(prev, iter) > 0);
+       trans_for_each_path_inorder(trans, path, i) {
+               BUG_ON(prev && btree_path_cmp(prev, path) > 0);
+               prev = path;
+       }
 #endif
 }
 
-static inline void btree_iter_swap(struct btree_trans *trans,
-                                  struct btree_iter *l, struct btree_iter *r)
+static inline void btree_path_swap(struct btree_trans *trans,
+                                  struct btree_path *l, struct btree_path *r)
 {
        swap(l->sorted_idx, r->sorted_idx);
        swap(trans->sorted[l->sorted_idx],
             trans->sorted[r->sorted_idx]);
 
-       btree_iter_verify_sorted_ref(trans, l);
-       btree_iter_verify_sorted_ref(trans, r);
+       btree_path_verify_sorted_ref(trans, l);
+       btree_path_verify_sorted_ref(trans, r);
 }
 
-static void btree_trans_sort_iters(struct btree_trans *trans)
+static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
+                                 int cmp)
 {
-       bool swapped = false;
-       int i, l = 0, r = trans->nr_sorted;
-
-       while (1) {
-               for (i = l; i + 1 < r; i++) {
-                       if (btree_iter_cmp(trans->iters + trans->sorted[i],
-                                          trans->iters + trans->sorted[i + 1]) > 0) {
-                               swap(trans->sorted[i], trans->sorted[i + 1]);
-                               trans->iters[trans->sorted[i]].sorted_idx = i;
-                               trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1;
-                               swapped = true;
-                       }
-               }
+       struct btree_path *n;
 
-               if (!swapped)
-                       break;
+       if (cmp <= 0) {
+               n = prev_btree_path(trans, path);
+               if (n && btree_path_cmp(n, path) > 0) {
+                       do {
+                               btree_path_swap(trans, n, path);
+                               n = prev_btree_path(trans, path);
+                       } while (n && btree_path_cmp(n, path) > 0);
 
-               r--;
-               swapped = false;
-
-               for (i = r - 2; i >= l; --i) {
-                       if (btree_iter_cmp(trans->iters + trans->sorted[i],
-                                          trans->iters + trans->sorted[i + 1]) > 0) {
-                               swap(trans->sorted[i],
-                                    trans->sorted[i + 1]);
-                               trans->iters[trans->sorted[i]].sorted_idx = i;
-                               trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1;
-                               swapped = true;
-                       }
+                       goto out;
                }
-
-               if (!swapped)
-                       break;
-
-               l++;
-               swapped = false;
-       }
-
-       btree_trans_verify_sorted_refs(trans);
-       btree_trans_verify_sorted(trans);
-}
-
-static void btree_iter_check_sort(struct btree_trans *trans, struct btree_iter *iter)
-{
-       struct btree_iter *n;
-
-       EBUG_ON(iter->sorted_idx == U8_MAX);
-
-       n = next_btree_iter(trans, iter);
-       if (n && btree_iter_cmp(iter, n) > 0) {
-               do {
-                       btree_iter_swap(trans, iter, n);
-                       n = next_btree_iter(trans, iter);
-               } while (n && btree_iter_cmp(iter, n) > 0);
-
-               return;
        }
 
-       n = prev_btree_iter(trans, iter);
-       if (n && btree_iter_cmp(n, iter) > 0) {
-               do {
-                       btree_iter_swap(trans, n, iter);
-                       n = prev_btree_iter(trans, iter);
-               } while (n && btree_iter_cmp(n, iter) > 0);
+       if (cmp >= 0) {
+               n = next_btree_path(trans, path);
+               if (n && btree_path_cmp(path, n) > 0) {
+                       do {
+                               btree_path_swap(trans, path, n);
+                               n = next_btree_path(trans, path);
+                       } while (n && btree_path_cmp(path, n) > 0);
+               }
        }
-
+out:
        btree_trans_verify_sorted(trans);
 }
 
-static inline void btree_iter_list_remove(struct btree_trans *trans,
-                                         struct btree_iter *iter)
+static inline void btree_path_list_remove(struct btree_trans *trans,
+                                         struct btree_path *path)
 {
        unsigned i;
 
-       EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
+       EBUG_ON(path->sorted_idx >= trans->nr_sorted);
 
-       array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx);
+       array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
 
-       for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
-               trans->iters[trans->sorted[i]].sorted_idx = i;
+       for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+               trans->paths[trans->sorted[i]].sorted_idx = i;
 
-       iter->sorted_idx = U8_MAX;
+       path->sorted_idx = U8_MAX;
 
        btree_trans_verify_sorted_refs(trans);
 }
 
-static inline void btree_iter_list_add(struct btree_trans *trans,
-                                      struct btree_iter *pos,
-                                      struct btree_iter *iter)
+static inline void btree_path_list_add(struct btree_trans *trans,
+                                      struct btree_path *pos,
+                                      struct btree_path *path)
 {
        unsigned i;
 
        btree_trans_verify_sorted_refs(trans);
 
-       iter->sorted_idx = pos ? pos->sorted_idx : trans->nr_sorted;
+       path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
 
-       array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx);
+       array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
 
-       for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
-               trans->iters[trans->sorted[i]].sorted_idx = i;
+       for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+               trans->paths[trans->sorted[i]].sorted_idx = i;
 
        btree_trans_verify_sorted_refs(trans);
 }
 
-static void btree_iter_child_free(struct btree_iter *iter)
-{
-       struct btree_iter *child = btree_iter_child(iter);
-
-       if (child) {
-               bch2_trans_iter_free(iter->trans, child);
-               iter->child_idx = U8_MAX;
-       }
-}
-
-static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter,
-                                                unsigned long ip)
-{
-       struct btree_trans *trans = iter->trans;
-       struct btree_iter *child = btree_iter_child(iter);
-
-       if (!child) {
-               child = btree_trans_iter_alloc(trans, iter);
-               child->ip_allocated     = ip;
-               iter->child_idx         = child->idx;
-
-               trans->iters_live       |= 1ULL << child->idx;
-               trans->iters_touched    |= 1ULL << child->idx;
-       }
-
-       return child;
-}
-
-static inline void __bch2_trans_iter_free(struct btree_trans *trans,
-                                         unsigned idx)
-{
-       btree_iter_child_free(&trans->iters[idx]);
-
-       btree_iter_list_remove(trans, &trans->iters[idx]);
-
-       __bch2_btree_iter_unlock(&trans->iters[idx]);
-       trans->iters_linked             &= ~(1ULL << idx);
-       trans->iters_live               &= ~(1ULL << idx);
-       trans->iters_touched            &= ~(1ULL << idx);
-}
-
-int bch2_trans_iter_put(struct btree_trans *trans,
-                       struct btree_iter *iter)
-{
-       int ret;
-
-       if (IS_ERR_OR_NULL(iter))
-               return 0;
-
-       BUG_ON(trans->iters + iter->idx != iter);
-       BUG_ON(!btree_iter_live(trans, iter));
-
-       ret = btree_iter_err(iter);
-
-       if (!(trans->iters_touched & (1ULL << iter->idx)) &&
-           !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
-               __bch2_trans_iter_free(trans, iter->idx);
-
-       trans->iters_live       &= ~(1ULL << iter->idx);
-       return ret;
-}
-
-int bch2_trans_iter_free(struct btree_trans *trans,
-                        struct btree_iter *iter)
-{
-       if (IS_ERR_OR_NULL(iter))
-               return 0;
-
-       set_btree_iter_dontneed(trans, iter);
-
-       return bch2_trans_iter_put(trans, iter);
-}
-
-noinline __cold
-static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
-{
-
-       struct btree_iter *iter;
-       struct btree_insert_entry *i;
-       char buf[100];
-
-       btree_trans_sort_iters(trans);
-
-       trans_for_each_iter_inorder(trans, iter)
-               printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
-                      bch2_btree_ids[iter->btree_id],
-                      (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf),
-                      btree_iter_live(trans, iter) ? " live" : "",
-                      (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
-                      iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
-                      (void *) iter->ip_allocated);
-
-       trans_for_each_update(trans, i) {
-               char buf[300];
-
-               bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
-               printk(KERN_ERR "update: btree %s %s\n",
-                      bch2_btree_ids[i->iter->btree_id], buf);
-       }
-       panic("trans iter oveflow\n");
-}
-
-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans,
-                                                struct btree_iter *pos)
-{
-       struct btree_iter *iter;
-       unsigned idx;
-
-       if (unlikely(trans->iters_linked ==
-                    ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
-               btree_trans_iter_alloc_fail(trans);
-
-       idx = __ffs64(~trans->iters_linked);
-       iter = &trans->iters[idx];
-
-       iter->trans             = trans;
-       iter->idx               = idx;
-       iter->child_idx         = U8_MAX;
-       iter->sorted_idx        = U8_MAX;
-       iter->flags             = 0;
-       iter->nodes_locked      = 0;
-       iter->nodes_intent_locked = 0;
-       trans->iters_linked     |= 1ULL << idx;
-
-       btree_iter_list_add(trans, pos, iter);
-       return iter;
-}
-
-static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
+void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
 {
-       unsigned i;
-
-       __bch2_btree_iter_unlock(dst);
-       btree_iter_child_free(dst);
-
-       memcpy(&dst->flags, &src->flags,
-              sizeof(struct btree_iter) - offsetof(struct btree_iter, flags));
-
-       for (i = 0; i < BTREE_MAX_DEPTH; i++)
-               if (btree_node_locked(dst, i))
-                       six_lock_increment(&dst->l[i].b->c.lock,
-                                          __btree_lock_want(dst, i));
-
-       dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
-       dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
-
-       btree_iter_check_sort(dst->trans, dst);
+       if (iter->path)
+               bch2_path_put(trans, iter->path,
+                             iter->flags & BTREE_ITER_INTENT);
+       iter->path = NULL;
 }
 
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
-                                        unsigned btree_id, struct bpos pos,
-                                        unsigned locks_want,
-                                        unsigned depth,
-                                        unsigned flags)
+static void __bch2_trans_iter_init(struct btree_trans *trans,
+                                  struct btree_iter *iter,
+                                  unsigned btree_id, struct bpos pos,
+                                  unsigned locks_want,
+                                  unsigned depth,
+                                  unsigned flags)
 {
-       struct btree_iter *iter, *best = NULL;
-       struct bpos real_pos, pos_min = POS_MIN;
-
        EBUG_ON(trans->restarted);
 
-       if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
-           btree_node_type_is_extents(btree_id) &&
-           !(flags & BTREE_ITER_NOT_EXTENTS) &&
-           !(flags & BTREE_ITER_ALL_SNAPSHOTS))
+       if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+           btree_node_type_is_extents(btree_id))
                flags |= BTREE_ITER_IS_EXTENTS;
 
-       if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
-           !btree_type_has_snapshots(btree_id))
+       if (!btree_type_has_snapshots(btree_id) &&
+           !(flags & __BTREE_ITER_ALL_SNAPSHOTS))
                flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
 
        if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
                pos.snapshot = btree_type_has_snapshots(btree_id)
                        ? U32_MAX : 0;
 
-       real_pos = pos;
-
-       if ((flags & BTREE_ITER_IS_EXTENTS) &&
-           bkey_cmp(pos, POS_MAX))
-               real_pos = bpos_nosnap_successor(pos);
-
-       trans_for_each_iter(trans, iter) {
-               if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
-                       continue;
-
-               if (iter->btree_id != btree_id)
-                       continue;
-
-               if (best) {
-                       int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos),
-                                          bpos_diff(iter->real_pos, real_pos));
-
-                       if (cmp < 0 ||
-                           ((cmp == 0 && btree_iter_keep(trans, iter))))
-                               continue;
-               }
-
-               best = iter;
-       }
-
-       if (!best) {
-               iter = btree_trans_iter_alloc(trans, NULL);
-               bch2_btree_iter_init(trans, iter, btree_id);
-       } else if (btree_iter_keep(trans, best)) {
-               iter = btree_trans_iter_alloc(trans, best);
-               btree_iter_copy(iter, best);
-       } else {
-               iter = best;
-       }
-
-       trans->iters_live       |= 1ULL << iter->idx;
-       trans->iters_touched    |= 1ULL << iter->idx;
-
-       iter->flags = flags;
-
-       iter->snapshot = pos.snapshot;
-
-       /*
-        * If the iterator has locks_want greater than requested, we explicitly
-        * do not downgrade it here - on transaction restart because btree node
-        * split needs to upgrade locks, we might be putting/getting the
-        * iterator again. Downgrading iterators only happens via an explicit
-        * bch2_trans_downgrade().
-        */
-
-       locks_want = min(locks_want, BTREE_MAX_DEPTH);
-       if (locks_want > iter->locks_want) {
-               iter->locks_want = locks_want;
-               btree_iter_get_locks(iter, true, _THIS_IP_);
-       }
-
-       while (iter->level != depth) {
-               btree_node_unlock(iter, iter->level);
-               iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
-               iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
-               if (iter->level < depth)
-                       iter->level++;
-               else
-                       iter->level--;
-       }
-
+       iter->trans     = trans;
+       iter->path      = NULL;
+       iter->btree_id  = btree_id;
        iter->min_depth = depth;
+       iter->flags     = flags;
+       iter->snapshot  = pos.snapshot;
+       iter->pos       = pos;
+       iter->k.type    = KEY_TYPE_deleted;
+       iter->k.p       = pos;
+       iter->k.size    = 0;
 
-       bch2_btree_iter_set_pos(iter, pos);
-       btree_iter_set_search_pos(iter, real_pos);
-
-       trace_trans_get_iter(_RET_IP_, trans->ip,
-                            btree_id,
-                            &real_pos, locks_want, iter->uptodate,
-                            best ? &best->real_pos     : &pos_min,
-                            best ? best->locks_want    : U8_MAX,
-                            best ? best->uptodate      : U8_MAX);
-
-       return iter;
+       iter->path = bch2_path_get(trans,
+                                  flags & BTREE_ITER_CACHED,
+                                  btree_id,
+                                  iter->pos,
+                                  locks_want,
+                                  depth,
+                                  flags & BTREE_ITER_INTENT);
 }
 
-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
-                                           enum btree_id btree_id,
-                                           struct bpos pos,
-                                           unsigned locks_want,
-                                           unsigned depth,
-                                           unsigned flags)
+void bch2_trans_iter_init(struct btree_trans *trans,
+                         struct btree_iter *iter,
+                         unsigned btree_id, struct bpos pos,
+                         unsigned flags)
 {
-       struct btree_iter *iter =
-               __bch2_trans_get_iter(trans, btree_id, pos,
-                                     locks_want, depth,
-                                     BTREE_ITER_NODES|
-                                     BTREE_ITER_NOT_EXTENTS|
-                                     BTREE_ITER_ALL_SNAPSHOTS|
-                                     flags);
-
-       BUG_ON(bkey_cmp(iter->pos, pos));
-       BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
-       BUG_ON(iter->level      != depth);
-       BUG_ON(iter->min_depth  != depth);
-       iter->ip_allocated = _RET_IP_;
-
-       return iter;
+       __bch2_trans_iter_init(trans, iter, btree_id, pos,
+                              0, 0, flags);
 }
 
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
-                                       struct btree_iter *src)
+void bch2_trans_node_iter_init(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              enum btree_id btree_id,
+                              struct bpos pos,
+                              unsigned locks_want,
+                              unsigned depth,
+                              unsigned flags)
 {
-       struct btree_iter *iter;
-
-       iter = btree_trans_iter_alloc(trans, src);
-       btree_iter_copy(iter, src);
-
-       trans->iters_live |= 1ULL << iter->idx;
-       /*
-        * We don't need to preserve this iter since it's cheap to copy it
-        * again - this will cause trans_iter_put() to free it right away:
-        */
-       set_btree_iter_dontneed(trans, iter);
+       __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth,
+                              BTREE_ITER_NOT_EXTENTS|
+                              __BTREE_ITER_ALL_SNAPSHOTS|
+                              BTREE_ITER_ALL_SNAPSHOTS|
+                              flags);
+       BUG_ON(iter->path->locks_want    < min(locks_want, BTREE_MAX_DEPTH));
+       BUG_ON(iter->path->level        != depth);
+       BUG_ON(iter->min_depth          != depth);
+}
 
-       return iter;
+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+{
+       *dst = *src;
+       if (src->path)
+               __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
 }
 
 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
@@ -2500,20 +2452,6 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
        return p;
 }
 
-inline void bch2_trans_unlink_iters(struct btree_trans *trans)
-{
-       u64 iters = trans->iters_linked &
-               ~trans->iters_touched &
-               ~trans->iters_live;
-
-       while (iters) {
-               unsigned idx = __ffs64(iters);
-
-               iters &= ~(1ULL << idx);
-               __bch2_trans_iter_free(trans, idx);
-       }
-}
-
 /**
  * bch2_trans_begin() - reset a transaction after a interrupted attempt
  * @trans: transaction to reset
@@ -2524,18 +2462,11 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans)
  */
 void bch2_trans_begin(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
-
-       trans_for_each_iter(trans, iter)
-               iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
-                                BTREE_ITER_SET_POS_AFTER_COMMIT);
+       struct btree_insert_entry *i;
+       struct btree_path *path;
 
-       /*
-        * XXX: we shouldn't be doing this if the transaction was restarted, but
-        * currently we still overflow transaction iterators if we do that
-        * */
-       bch2_trans_unlink_iters(trans);
-       trans->iters_touched &= trans->iters_live;
+       trans_for_each_update(trans, i)
+               __btree_path_put(i->path, true);
 
        trans->extra_journal_res        = 0;
        trans->nr_updates               = 0;
@@ -2552,32 +2483,44 @@ void bch2_trans_begin(struct btree_trans *trans)
                       (void *) &trans->fs_usage_deltas->memset_start);
        }
 
+       trans_for_each_path(trans, path) {
+               path->should_be_locked = false;
+
+               /*
+                * XXX: we probably shouldn't be doing this if the transaction
+                * was restarted, but currently we still overflow transaction
+                * iterators if we do that
+                */
+               if (!path->ref && !path->preserve)
+                       __bch2_path_free(trans, path);
+               else if (!path->ref)
+                       path->preserve = false;
+       }
+
        bch2_trans_cond_resched(trans);
 
        if (trans->restarted)
-               bch2_btree_iter_traverse_all(trans);
+               bch2_btree_path_traverse_all(trans);
 
        trans->restarted = false;
 }
 
-static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
 {
-       size_t iters_bytes      = sizeof(struct btree_iter) * BTREE_ITER_MAX;
+       size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
        size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
-       size_t sorted_bytes     = sizeof(u8) * BTREE_ITER_MAX;
        void *p = NULL;
 
        BUG_ON(trans->used_mempool);
 
 #ifdef __KERNEL__
-       p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+       p = this_cpu_xchg(c->btree_paths_bufs->path , NULL);
 #endif
        if (!p)
-               p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+               p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
 
-       trans->iters            = p; p += iters_bytes;
+       trans->paths            = p; p += paths_bytes;
        trans->updates          = p; p += updates_bytes;
-       trans->sorted           = p; p += sorted_bytes;
 }
 
 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
@@ -2589,11 +2532,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
        trans->c                = c;
        trans->ip               = _RET_IP_;
 
-       /*
-        * reallocating iterators currently completely breaks
-        * bch2_trans_iter_put(), we always allocate the max:
-        */
-       bch2_trans_alloc_iters(trans, c);
+       bch2_trans_alloc_paths(trans, c);
 
        if (expected_mem_bytes) {
                trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
@@ -2615,54 +2554,63 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
 #endif
 }
 
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct bch_fs *c = trans->c;
+       struct btree_path *path;
+
+       trans_for_each_path(trans, path)
+               if (path->ref)
+                       goto leaked;
+       return;
+leaked:
+       bch_err(c, "btree paths leaked from %pS!", (void *) trans->ip);
+       trans_for_each_path(trans, path)
+               if (path->ref)
+                       printk(KERN_ERR "  btree %s %pS\n",
+                              bch2_btree_ids[path->btree_id],
+                              (void *) path->ip_allocated);
+       /* Be noisy about this: */
+       bch2_fatal_error(c);
+#endif
+}
+
 int bch2_trans_exit(struct btree_trans *trans)
        __releases(&c->btree_trans_barrier)
 {
+       struct btree_insert_entry *i;
        struct bch_fs *c = trans->c;
 
        bch2_trans_unlock(trans);
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       if (trans->iters_live) {
-               struct btree_iter *iter;
-
-               trans_for_each_iter(trans, iter)
-                       btree_iter_child_free(iter);
-       }
+       trans_for_each_update(trans, i)
+               __btree_path_put(i->path, true);
+       trans->nr_updates               = 0;
 
-       if (trans->iters_live) {
-               struct btree_iter *iter;
-
-               bch_err(c, "btree iterators leaked!");
-               trans_for_each_iter(trans, iter)
-                       if (btree_iter_live(trans, iter))
-                               printk(KERN_ERR "  btree %s allocated at %pS\n",
-                                      bch2_btree_ids[iter->btree_id],
-                                      (void *) iter->ip_allocated);
-               /* Be noisy about this: */
-               bch2_fatal_error(c);
-       }
+       check_btree_paths_leaked(trans);
 
-       mutex_lock(&trans->c->btree_trans_lock);
+#ifdef CONFIG_BCACHEFS_DEBUG
+       mutex_lock(&c->btree_trans_lock);
        list_del(&trans->list);
-       mutex_unlock(&trans->c->btree_trans_lock);
+       mutex_unlock(&c->btree_trans_lock);
 #endif
 
        srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
 
-       bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
+       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
 
        if (trans->fs_usage_deltas) {
                if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
                    REPLICAS_DELTA_LIST_MAX)
                        mempool_free(trans->fs_usage_deltas,
-                                    &trans->c->replicas_delta_pool);
+                                    &c->replicas_delta_pool);
                else
                        kfree(trans->fs_usage_deltas);
        }
 
        if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
-               mempool_free(trans->mem, &trans->c->btree_trans_mem_pool);
+               mempool_free(trans->mem, &c->btree_trans_mem_pool);
        else
                kfree(trans->mem);
 
@@ -2670,36 +2618,35 @@ int bch2_trans_exit(struct btree_trans *trans)
        /*
         * Userspace doesn't have a real percpu implementation:
         */
-       trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+       trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
 #endif
 
-       if (trans->iters)
-               mempool_free(trans->iters, &trans->c->btree_iters_pool);
+       if (trans->paths)
+               mempool_free(trans->paths, &c->btree_paths_pool);
 
        trans->mem      = (void *) 0x1;
-       trans->iters    = (void *) 0x1;
+       trans->paths    = (void *) 0x1;
 
        return trans->error ? -EIO : 0;
 }
 
 static void __maybe_unused
-bch2_btree_iter_node_to_text(struct printbuf *out,
+bch2_btree_path_node_to_text(struct printbuf *out,
                             struct btree_bkey_cached_common *_b,
-                            enum btree_iter_type type)
+                            bool cached)
 {
        pr_buf(out, "    l=%u %s:",
               _b->level, bch2_btree_ids[_b->btree_id]);
-       bch2_bpos_to_text(out, btree_node_pos(_b, type));
+       bch2_bpos_to_text(out, btree_node_pos(_b, cached));
 }
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
+static bool trans_has_locks(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
-                   iter->nodes_locked)
+       trans_for_each_path(trans, path)
+               if (path->nodes_locked)
                        return true;
        return false;
 }
@@ -2709,35 +2656,36 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
        struct btree_trans *trans;
-       struct btree_iter *iter;
+       struct btree_path *path;
        struct btree *b;
        unsigned l;
 
        mutex_lock(&c->btree_trans_lock);
        list_for_each_entry(trans, &c->btree_trans_list, list) {
-               if (!trans_has_btree_nodes_locked(trans))
+               if (!trans_has_locks(trans))
                        continue;
 
                pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
 
-               trans_for_each_iter(trans, iter) {
-                       if (!iter->nodes_locked)
+               trans_for_each_path(trans, path) {
+                       if (!path->nodes_locked)
                                continue;
 
-                       pr_buf(out, "  iter %u %c %s:",
-                              iter->idx,
-                              btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
-                              bch2_btree_ids[iter->btree_id]);
-                       bch2_bpos_to_text(out, iter->pos);
+                       pr_buf(out, "  path %u %c l=%u %s:",
+                              path->idx,
+                              path->cached ? 'c' : 'b',
+                              path->level,
+                              bch2_btree_ids[path->btree_id]);
+                       bch2_bpos_to_text(out, path->pos);
                        pr_buf(out, "\n");
 
                        for (l = 0; l < BTREE_MAX_DEPTH; l++) {
-                               if (btree_node_locked(iter, l)) {
+                               if (btree_node_locked(path, l)) {
                                        pr_buf(out, "    %s l=%u ",
-                                              btree_node_intent_locked(iter, l) ? "i" : "r", l);
-                                       bch2_btree_iter_node_to_text(out,
-                                                       (void *) iter->l[l].b,
-                                                       btree_iter_type(iter));
+                                              btree_node_intent_locked(path, l) ? "i" : "r", l);
+                                       bch2_btree_path_node_to_text(out,
+                                                       (void *) path->l[l].b,
+                                                       path->cached);
                                        pr_buf(out, "\n");
                                }
                        }
@@ -2745,18 +2693,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 
                b = READ_ONCE(trans->locking);
                if (b) {
-                       iter = &trans->iters[trans->locking_iter_idx];
-                       pr_buf(out, "  locking iter %u %c l=%u %s:",
-                              trans->locking_iter_idx,
-                              btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
+                       path = &trans->paths[trans->locking_path_idx];
+                       pr_buf(out, "  locking path %u %c l=%u %s:",
+                              trans->locking_path_idx,
+                              path->cached ? 'c' : 'b',
                               trans->locking_level,
                               bch2_btree_ids[trans->locking_btree_id]);
                        bch2_bpos_to_text(out, trans->locking_pos);
 
                        pr_buf(out, " node ");
-                       bch2_btree_iter_node_to_text(out,
-                                       (void *) b,
-                                       btree_iter_type(iter));
+                       bch2_btree_path_node_to_text(out,
+                                       (void *) b, path->cached);
                        pr_buf(out, "\n");
                }
        }
@@ -2767,7 +2714,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
        mempool_exit(&c->btree_trans_mem_pool);
-       mempool_exit(&c->btree_iters_pool);
+       mempool_exit(&c->btree_paths_pool);
        cleanup_srcu_struct(&c->btree_trans_barrier);
 }
 
@@ -2779,9 +2726,8 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
        mutex_init(&c->btree_trans_lock);
 
        return  init_srcu_struct(&c->btree_trans_barrier) ?:
-               mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
-                       sizeof(u8) * nr +
-                       sizeof(struct btree_iter) * nr +
+               mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
+                       sizeof(struct btree_path) * nr +
                        sizeof(struct btree_insert_entry) * nr) ?:
                mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
                                          BTREE_TRANS_MEM_MAX);
index 39124e68e48828f3870522b625d8a46cc5b0a244..be1bb489f3d63816b325368935c3a78b54a96a70 100644 (file)
@@ -5,40 +5,49 @@
 #include "bset.h"
 #include "btree_types.h"
 
-static inline void btree_iter_set_dirty(struct btree_iter *iter,
-                                       enum btree_iter_uptodate u)
+static inline void __btree_path_get(struct btree_path *path, bool intent)
 {
-       iter->uptodate = max_t(unsigned, iter->uptodate, u);
+       path->ref++;
+       path->intent_ref += intent;
 }
 
-static inline struct btree *btree_iter_node(struct btree_iter *iter,
+static inline bool __btree_path_put(struct btree_path *path, bool intent)
+{
+       EBUG_ON(!path->ref);
+       EBUG_ON(!path->intent_ref && intent);
+       path->intent_ref -= intent;
+       return --path->ref == 0;
+}
+
+static inline void btree_path_set_dirty(struct btree_path *path,
+                                       enum btree_path_uptodate u)
+{
+       path->uptodate = max_t(unsigned, path->uptodate, u);
+}
+
+static inline struct btree *btree_path_node(struct btree_path *path,
                                            unsigned level)
 {
-       return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
+       return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
 }
 
-static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter,
+static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
                                        const struct btree *b, unsigned level)
 {
        /*
         * We don't compare the low bits of the lock sequence numbers because
-        * @iter might have taken a write lock on @b, and we don't want to skip
-        * the linked iterator if the sequence numbers were equal before taking
-        * that write lock. The lock sequence number is incremented by taking
-        * and releasing write locks and is even when unlocked:
+        * @path might have taken a write lock on @b, and we don't want to skip
+        * the linked path if the sequence numbers were equal before taking that
+        * write lock. The lock sequence number is incremented by taking and
+        * releasing write locks and is even when unlocked:
         */
-       return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
+       return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
 }
 
-static inline struct btree *btree_node_parent(struct btree_iter *iter,
+static inline struct btree *btree_node_parent(struct btree_path *path,
                                              struct btree *b)
 {
-       return btree_iter_node(iter, b->c.level + 1);
-}
-
-static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans)
-{
-       return hweight64(trans->iters_linked) > 1;
+       return btree_path_node(path, b->c.level + 1);
 }
 
 static inline int btree_iter_err(const struct btree_iter *iter)
@@ -46,97 +55,105 @@ static inline int btree_iter_err(const struct btree_iter *iter)
        return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
 }
 
-/* Iterate over iters within a transaction: */
+/* Iterate over paths within a transaction: */
 
-static inline struct btree_iter *
-__trans_next_iter(struct btree_trans *trans, unsigned idx)
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned idx)
 {
        u64 l;
 
        if (idx == BTREE_ITER_MAX)
                return NULL;
 
-       l = trans->iters_linked >> idx;
+       l = trans->paths_allocated >> idx;
        if (!l)
                return NULL;
 
        idx += __ffs64(l);
        EBUG_ON(idx >= BTREE_ITER_MAX);
-       EBUG_ON(trans->iters[idx].idx != idx);
-       return &trans->iters[idx];
+       EBUG_ON(trans->paths[idx].idx != idx);
+       return &trans->paths[idx];
 }
 
-#define trans_for_each_iter(_trans, _iter)                             \
-       for (_iter = __trans_next_iter((_trans), 0);                    \
-            (_iter);                                                   \
-            _iter = __trans_next_iter((_trans), (_iter)->idx + 1))
+#define trans_for_each_path(_trans, _path)                             \
+       for (_path = __trans_next_path((_trans), 0);                    \
+            (_path);                                                   \
+            _path = __trans_next_path((_trans), (_path)->idx + 1))
 
-static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter)
+static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
 {
-       unsigned idx = iter ? iter->sorted_idx + 1 : 0;
+       unsigned idx = path ? path->sorted_idx + 1 : 0;
 
        EBUG_ON(idx > trans->nr_sorted);
 
        return idx < trans->nr_sorted
-               ? trans->iters + trans->sorted[idx]
+               ? trans->paths + trans->sorted[idx]
                : NULL;
 }
 
-static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter)
+static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
 {
-       EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
-       return iter->sorted_idx
-               ? trans->iters + trans->sorted[iter->sorted_idx - 1]
+       EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+       return path->sorted_idx
+               ? trans->paths + trans->sorted[path->sorted_idx - 1]
                : NULL;
 }
 
-#define trans_for_each_iter_inorder(_trans, _iter)                     \
-       for (_iter = next_btree_iter(trans, NULL);                      \
-            (_iter);                                                   \
-            _iter = next_btree_iter((_trans), (_iter)))
+#define trans_for_each_path_inorder(_trans, _path, _i)                 \
+       for (_i = 0;                                                    \
+            ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\
+            _i++)
 
-static inline bool __iter_has_node(const struct btree_iter *iter,
+static inline bool __path_has_node(const struct btree_path *path,
                                   const struct btree *b)
 {
-       return iter->l[b->c.level].b == b &&
-               btree_node_lock_seq_matches(iter, b, b->c.level);
+       return path->l[b->c.level].b == b &&
+               btree_node_lock_seq_matches(path, b, b->c.level);
 }
 
-static inline struct btree_iter *
-__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b,
+static inline struct btree_path *
+__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
                            unsigned idx)
 {
-       struct btree_iter *iter = __trans_next_iter(trans, idx);
+       struct btree_path *path = __trans_next_path(trans, idx);
 
-       while (iter && !__iter_has_node(iter, b))
-               iter = __trans_next_iter(trans, iter->idx + 1);
+       while (path && !__path_has_node(path, b))
+               path = __trans_next_path(trans, path->idx + 1);
 
-       return iter;
+       return path;
 }
 
-#define trans_for_each_iter_with_node(_trans, _b, _iter)               \
-       for (_iter = __trans_next_iter_with_node((_trans), (_b), 0);    \
-            (_iter);                                                   \
-            _iter = __trans_next_iter_with_node((_trans), (_b),        \
-                                                (_iter)->idx + 1))
+#define trans_for_each_path_with_node(_trans, _b, _path)               \
+       for (_path = __trans_next_path_with_node((_trans), (_b), 0);    \
+            (_path);                                                   \
+            _path = __trans_next_path_with_node((_trans), (_b),        \
+                                                (_path)->idx + 1))
+
+struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, bool);
+int __must_check bch2_btree_path_traverse(struct btree_trans *,
+                                         struct btree_path *, unsigned);
+struct btree_path *bch2_path_get(struct btree_trans *, bool, enum btree_id,
+                                struct bpos, unsigned, unsigned, bool);
+inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *);
-void bch2_btree_trans_verify_locks(struct btree_trans *);
+void bch2_trans_verify_paths(struct btree_trans *);
+void bch2_trans_verify_locks(struct btree_trans *);
 #else
-static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans,
-                                                struct btree *b) {}
-static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {}
+static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
 #endif
 
-void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *,
-                                          struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
-                             struct btree_node_iter *, struct bkey_packed *,
-                             unsigned, unsigned);
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+                                     struct btree *, struct bkey_packed *);
+void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
+                             struct btree *, struct btree_node_iter *,
+                             struct bkey_packed *, unsigned, unsigned);
+
+bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
 
-bool bch2_btree_iter_relock_intent(struct btree_iter *);
-bool bch2_btree_iter_relock(struct btree_iter *, unsigned long);
+void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
 
 bool bch2_trans_relock(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
@@ -149,35 +166,36 @@ static inline int btree_trans_restart(struct btree_trans *trans)
        return -EINTR;
 }
 
-bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+                              struct btree_path *, unsigned);
 
-static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
+static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
+                                          struct btree_path *path,
                                           unsigned new_locks_want)
 {
        new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
 
-       return iter->locks_want < new_locks_want
-               ? __bch2_btree_iter_upgrade(iter, new_locks_want)
-               : iter->uptodate <= BTREE_ITER_NEED_PEEK;
+       return path->locks_want < new_locks_want
+               ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+               : path->uptodate == BTREE_ITER_UPTODATE;
 }
 
-void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
+void __bch2_btree_path_downgrade(struct btree_path *, unsigned);
 
-static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+static inline void bch2_btree_path_downgrade(struct btree_path *path)
 {
-       unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT);
+       unsigned new_locks_want = path->level + !!path->intent_ref;
 
-       if (iter->locks_want > new_locks_want)
-               __bch2_btree_iter_downgrade(iter, new_locks_want);
+       if (path->locks_want > new_locks_want)
+               __bch2_btree_path_downgrade(path, new_locks_want);
 }
 
 void bch2_trans_downgrade(struct btree_trans *);
 
-void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
-void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
-
-void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
 
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
 int __must_check bch2_btree_iter_traverse(struct btree_iter *);
 
 struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
@@ -206,7 +224,8 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos
        iter->k.p.offset        = iter->pos.offset      = new_pos.offset;
        iter->k.p.snapshot      = iter->pos.snapshot    = new_pos.snapshot;
        iter->k.size = 0;
-       iter->should_be_locked = false;
+       if (iter->path->ref == 1)
+               iter->path->should_be_locked = false;
 }
 
 static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
@@ -215,16 +234,6 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
        iter->pos = bkey_start_pos(&iter->k);
 }
 
-static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx)
-{
-       return idx != U8_MAX ? trans->iters + idx : NULL;
-}
-
-static inline struct btree_iter *btree_iter_child(struct btree_iter *iter)
-{
-       return idx_to_btree_iter(iter->trans, iter->child_idx);
-}
-
 /*
  * Unlocks before scheduling
  * Note: does not revalidate iterator
@@ -242,11 +251,11 @@ static inline int bch2_trans_cond_resched(struct btree_trans *trans)
 
 #define __for_each_btree_node(_trans, _iter, _btree_id, _start,        \
                              _locks_want, _depth, _flags, _b)          \
-       for (iter = bch2_trans_get_node_iter((_trans), (_btree_id),     \
+       for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \
                                _start, _locks_want, _depth, _flags),   \
-            _b = bch2_btree_iter_peek_node(_iter);                     \
+            _b = bch2_btree_iter_peek_node(&(_iter));                  \
             (_b);                                                      \
-            (_b) = bch2_btree_iter_next_node(_iter))
+            (_b) = bch2_btree_iter_next_node(&(_iter)))
 
 #define for_each_btree_node(_trans, _iter, _btree_id, _start,          \
                            _flags, _b)                                 \
@@ -276,75 +285,36 @@ static inline int bkey_err(struct bkey_s_c k)
 
 #define for_each_btree_key(_trans, _iter, _btree_id,                   \
                           _start, _flags, _k, _ret)                    \
-       for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id),       \
-                                          (_start), (_flags)),         \
-            (_k) = __bch2_btree_iter_peek(_iter, _flags);              \
+       for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),      \
+                                 (_start), (_flags)),                  \
+            (_k) = __bch2_btree_iter_peek(&(_iter), _flags);           \
             !((_ret) = bkey_err(_k)) && (_k).k;                        \
-            (_k) = __bch2_btree_iter_next(_iter, _flags))
+            (_k) = __bch2_btree_iter_next(&(_iter), _flags))
 
 #define for_each_btree_key_continue(_iter, _flags, _k, _ret)           \
-       for ((_k) = __bch2_btree_iter_peek(_iter, _flags);              \
+       for ((_k) = __bch2_btree_iter_peek(&(_iter), _flags);           \
             !((_ret) = bkey_err(_k)) && (_k).k;                        \
-            (_k) = __bch2_btree_iter_next(_iter, _flags))
+            (_k) = __bch2_btree_iter_next(&(_iter), _flags))
 
 /* new multiple iterator interface: */
 
-int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
-int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
 
-void bch2_trans_unlink_iters(struct btree_trans *);
+void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
+void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *,
+                         unsigned, struct bpos, unsigned);
+void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
+                              enum btree_id, struct bpos,
+                              unsigned, unsigned, unsigned);
+void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
 
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
-                                        struct bpos, unsigned,
-                                        unsigned, unsigned);
-
-static inline struct btree_iter *
-bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
-                   struct bpos pos, unsigned flags)
-{
-       struct btree_iter *iter =
-               __bch2_trans_get_iter(trans, btree_id, pos,
-                                     (flags & BTREE_ITER_INTENT) != 0, 0,
-                                     flags);
-       iter->ip_allocated = _THIS_IP_;
-       return iter;
-}
-
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
-                                       struct btree_iter *);
-static inline struct btree_iter *
-bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
-{
-       struct btree_iter *iter =
-               __bch2_trans_copy_iter(trans, src);
-
-       iter->ip_allocated = _THIS_IP_;
-       return iter;
-}
-
-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
-                               enum btree_id, struct bpos,
-                               unsigned, unsigned, unsigned);
-
-static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter)
+static inline void set_btree_iter_dontneed(struct btree_iter *iter)
 {
-       return (trans->iters_live & (1ULL << iter->idx)) != 0;
+       iter->path->preserve = false;
 }
 
-static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter)
-{
-       return btree_iter_live(trans, iter) ||
-               (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
-}
-
-static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
-{
-       trans->iters_touched &= ~(1ULL << iter->idx);
-}
-
-void bch2_trans_begin(struct btree_trans *);
-
 void *bch2_trans_kmalloc(struct btree_trans *, size_t);
+void bch2_trans_begin(struct btree_trans *);
 void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
 int bch2_trans_exit(struct btree_trans *);
 
index e327ef39d4329512f3e576b6f2bed239ea1c0272..938ced36af73c3eb6d7be666d55cc953890f5371 100644 (file)
@@ -196,23 +196,23 @@ btree_key_cache_create(struct btree_key_cache *c,
 }
 
 static int btree_key_cache_fill(struct btree_trans *trans,
-                               struct btree_iter *ck_iter,
+                               struct btree_path *ck_path,
                                struct bkey_cached *ck)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        unsigned new_u64s = 0;
        struct bkey_i *new_k = NULL;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, ck->key.btree_id,
-                                  ck->key.pos, BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, ck->key.btree_id,
+                            ck->key.pos, BTREE_ITER_SLOTS);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
 
-       if (!bch2_btree_node_relock(ck_iter, 0)) {
+       if (!bch2_btree_node_relock(trans, ck_path, 0)) {
                trace_transaction_restart_ip(trans->ip, _THIS_IP_);
                ret = btree_trans_restart(trans);
                goto err;
@@ -237,7 +237,7 @@ static int btree_key_cache_fill(struct btree_trans *trans,
         * XXX: not allowed to be holding read locks when we take a write lock,
         * currently
         */
-       bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter);
+       bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
        if (new_k) {
                kfree(ck->k);
                ck->u64s = new_u64s;
@@ -246,63 +246,64 @@ static int btree_key_cache_fill(struct btree_trans *trans,
 
        bkey_reassemble(ck->k, k);
        ck->valid = true;
-       bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);
+       bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
 
        /* We're not likely to need this iterator again: */
-       set_btree_iter_dontneed(trans, iter);
+       set_btree_iter_dontneed(&iter);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
 static int bkey_cached_check_fn(struct six_lock *lock, void *p)
 {
        struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
-       const struct btree_iter *iter = p;
+       const struct btree_path *path = p;
 
-       return ck->key.btree_id == iter->btree_id &&
-               !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1;
+       return ck->key.btree_id == path->btree_id &&
+               !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1;
 }
 
 __flatten
-int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+                                   unsigned flags)
 {
-       struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck;
        int ret = 0;
 
-       BUG_ON(iter->level);
+       BUG_ON(path->level);
 
-       iter->l[1].b = NULL;
+       path->l[1].b = NULL;
 
-       if (bch2_btree_node_relock(iter, 0)) {
-               ck = (void *) iter->l[0].b;
+       if (bch2_btree_node_relock(trans, path, 0)) {
+               ck = (void *) path->l[0].b;
                goto fill;
        }
 retry:
-       ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
+       ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
        if (!ck) {
-               if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
-                       iter->l[0].b = NULL;
+               if (flags & BTREE_ITER_CACHED_NOCREATE) {
+                       path->l[0].b = NULL;
                        return 0;
                }
 
                ck = btree_key_cache_create(&c->btree_key_cache,
-                                           iter->btree_id, iter->pos);
+                                           path->btree_id, path->pos);
                ret = PTR_ERR_OR_ZERO(ck);
                if (ret)
                        goto err;
                if (!ck)
                        goto retry;
 
-               mark_btree_node_locked(iter, 0, SIX_LOCK_intent);
-               iter->locks_want = 1;
+               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+               path->locks_want = 1;
        } else {
-               enum six_lock_type lock_want = __btree_lock_want(iter, 0);
+               enum six_lock_type lock_want = __btree_lock_want(path, 0);
 
-               if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
-                                    bkey_cached_check_fn, iter, _THIS_IP_)) {
+               if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0,
+                                    lock_want,
+                                    bkey_cached_check_fn, path, _THIS_IP_)) {
                        if (!trans->restarted)
                                goto retry;
 
@@ -311,28 +312,27 @@ retry:
                        goto err;
                }
 
-               if (ck->key.btree_id != iter->btree_id ||
-                   bpos_cmp(ck->key.pos, iter->pos)) {
+               if (ck->key.btree_id != path->btree_id ||
+                   bpos_cmp(ck->key.pos, path->pos)) {
                        six_unlock_type(&ck->c.lock, lock_want);
                        goto retry;
                }
 
-               mark_btree_node_locked(iter, 0, lock_want);
+               mark_btree_node_locked(trans, path, 0, lock_want);
        }
 
-       iter->l[0].lock_seq     = ck->c.lock.state.seq;
-       iter->l[0].b            = (void *) ck;
+       path->l[0].lock_seq     = ck->c.lock.state.seq;
+       path->l[0].b            = (void *) ck;
 fill:
-       if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) {
-               if (!iter->locks_want &&
-                   !!__bch2_btree_iter_upgrade(iter, 1)) {
+       if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+               if (!path->locks_want &&
+                   !__bch2_btree_path_upgrade(trans, path, 1)) {
                        trace_transaction_restart_ip(trans->ip, _THIS_IP_);
-                       BUG_ON(!trans->restarted);
-                       ret = -EINTR;
+                       ret = btree_trans_restart(trans);
                        goto err;
                }
 
-               ret = btree_key_cache_fill(trans, iter, ck);
+               ret = btree_key_cache_fill(trans, path, ck);
                if (ret)
                        goto err;
        }
@@ -340,22 +340,14 @@ fill:
        if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
                set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 
-       iter->uptodate = BTREE_ITER_NEED_PEEK;
-
-       if ((iter->flags & BTREE_ITER_INTENT) &&
-           !bch2_btree_iter_upgrade(iter, 1)) {
-               BUG_ON(!trans->restarted);
-               ret = -EINTR;
-       }
-
-       BUG_ON(!ret && !btree_node_locked(iter, 0));
+       path->uptodate = BTREE_ITER_UPTODATE;
+       BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
 
        return ret;
 err:
        if (ret != -EINTR) {
-               btree_node_unlock(iter, 0);
-               iter->flags |= BTREE_ITER_ERROR;
-               iter->l[0].b = BTREE_ITER_NO_NODE_ERROR;
+               btree_node_unlock(path, 0);
+               path->l[0].b = BTREE_ITER_NO_NODE_ERROR;
        }
        return ret;
 }
@@ -368,23 +360,23 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct journal *j = &c->journal;
-       struct btree_iter *c_iter = NULL, *b_iter = NULL;
+       struct btree_iter c_iter, b_iter;
        struct bkey_cached *ck = NULL;
        int ret;
 
-       b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
-                                    BTREE_ITER_SLOTS|
-                                    BTREE_ITER_INTENT);
-       c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
-                                    BTREE_ITER_CACHED|
-                                    BTREE_ITER_CACHED_NOFILL|
-                                    BTREE_ITER_CACHED_NOCREATE|
-                                    BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(c_iter);
+       bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
+                            BTREE_ITER_SLOTS|
+                            BTREE_ITER_INTENT);
+       bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_CACHED_NOFILL|
+                            BTREE_ITER_CACHED_NOCREATE|
+                            BTREE_ITER_INTENT);
+       ret = bch2_btree_iter_traverse(&c_iter);
        if (ret)
                goto out;
 
-       ck = (void *) c_iter->l[0].b;
+       ck = (void *) c_iter.path->l[0].b;
        if (!ck ||
            (journal_seq && ck->journal.seq != journal_seq))
                goto out;
@@ -400,8 +392,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
         * allocator/copygc depend on journal reclaim making progress, we need
         * to be using alloc reserves:
         * */
-       ret   = bch2_btree_iter_traverse(b_iter) ?:
-               bch2_trans_update(trans, b_iter, ck->k,
+       ret   = bch2_btree_iter_traverse(&b_iter) ?:
+               bch2_trans_update(trans, &b_iter, ck->k,
                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
                                  BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
@@ -423,7 +415,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
        bch2_journal_pin_drop(j, &ck->journal);
        bch2_journal_preres_put(j, &ck->res);
 
-       BUG_ON(!btree_node_locked(c_iter, 0));
+       BUG_ON(!btree_node_locked(c_iter.path, 0));
 
        if (!evict) {
                if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -432,10 +424,10 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                }
        } else {
 evict:
-               BUG_ON(!btree_node_intent_locked(c_iter, 0));
+               BUG_ON(!btree_node_intent_locked(c_iter.path, 0));
 
-               mark_btree_node_unlocked(c_iter, 0);
-               c_iter->l[0].b = NULL;
+               mark_btree_node_unlocked(c_iter.path, 0);
+               c_iter.path->l[0].b = NULL;
 
                six_lock_write(&ck->c.lock, NULL, NULL);
 
@@ -451,8 +443,8 @@ evict:
                mutex_unlock(&c->btree_key_cache.lock);
        }
 out:
-       bch2_trans_iter_put(trans, b_iter);
-       bch2_trans_iter_put(trans, c_iter);
+       bch2_trans_iter_exit(trans, &b_iter);
+       bch2_trans_iter_exit(trans, &c_iter);
        return ret;
 }
 
@@ -503,11 +495,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
 }
 
 bool bch2_btree_insert_key_cached(struct btree_trans *trans,
-                                 struct btree_iter *iter,
+                                 struct btree_path *path,
                                  struct bkey_i *insert)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_cached *ck = (void *) iter->l[0].b;
+       struct bkey_cached *ck = (void *) path->l[0].b;
        bool kick_reclaim = false;
 
        BUG_ON(insert->u64s > ck->u64s);
index 7e2b0a08f745255b3b5c6986f4837d1674e00ec1..0768ef3ca77600d96b7b49e6bd101dcd09e4ff87 100644 (file)
@@ -26,10 +26,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *,
 struct bkey_cached *
 bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
 
-int bch2_btree_iter_traverse_cached(struct btree_iter *);
+int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
+                                   unsigned);
 
 bool bch2_btree_insert_key_cached(struct btree_trans *,
-                       struct btree_iter *, struct bkey_i *);
+                       struct btree_path *, struct bkey_i *);
 int bch2_btree_key_cache_flush(struct btree_trans *,
                               enum btree_id, struct bpos);
 #ifdef CONFIG_BCACHEFS_DEBUG
index 7532bcdef96732b44bafac02aeb77ae92a82ebbd..5c6b758070e165c040214c1d15d85b675507f685 100644 (file)
@@ -21,7 +21,7 @@ enum btree_node_locked_type {
        BTREE_NODE_INTENT_LOCKED        = SIX_LOCK_intent,
 };
 
-static inline int btree_node_locked_type(struct btree_iter *iter,
+static inline int btree_node_locked_type(struct btree_path *path,
                                         unsigned level)
 {
        /*
@@ -30,35 +30,36 @@ static inline int btree_node_locked_type(struct btree_iter *iter,
         * branches:
         */
        return BTREE_NODE_UNLOCKED +
-               ((iter->nodes_locked >> level) & 1) +
-               ((iter->nodes_intent_locked >> level) & 1);
+               ((path->nodes_locked >> level) & 1) +
+               ((path->nodes_intent_locked >> level) & 1);
 }
 
-static inline bool btree_node_intent_locked(struct btree_iter *iter,
+static inline bool btree_node_intent_locked(struct btree_path *path,
                                            unsigned level)
 {
-       return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
+       return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED;
 }
 
-static inline bool btree_node_read_locked(struct btree_iter *iter,
+static inline bool btree_node_read_locked(struct btree_path *path,
                                          unsigned level)
 {
-       return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
+       return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED;
 }
 
-static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
 {
-       return iter->nodes_locked & (1 << level);
+       return path->nodes_locked & (1 << level);
 }
 
-static inline void mark_btree_node_unlocked(struct btree_iter *iter,
+static inline void mark_btree_node_unlocked(struct btree_path *path,
                                            unsigned level)
 {
-       iter->nodes_locked &= ~(1 << level);
-       iter->nodes_intent_locked &= ~(1 << level);
+       path->nodes_locked &= ~(1 << level);
+       path->nodes_intent_locked &= ~(1 << level);
 }
 
-static inline void mark_btree_node_locked(struct btree_iter *iter,
+static inline void mark_btree_node_locked(struct btree_trans *trans,
+                                         struct btree_path *path,
                                          unsigned level,
                                          enum six_lock_type type)
 {
@@ -66,52 +67,62 @@ static inline void mark_btree_node_locked(struct btree_iter *iter,
        BUILD_BUG_ON(SIX_LOCK_read   != 0);
        BUILD_BUG_ON(SIX_LOCK_intent != 1);
 
-       iter->nodes_locked |= 1 << level;
-       iter->nodes_intent_locked |= type << level;
+       path->nodes_locked |= 1 << level;
+       path->nodes_intent_locked |= type << level;
+#ifdef CONFIG_BCACHEFS_DEBUG
+       path->ip_locked = _RET_IP_;
+       BUG_ON(trans->in_traverse_all &&
+              trans->traverse_all_idx != U8_MAX &&
+              path->sorted_idx > trans->paths[trans->traverse_all_idx].sorted_idx);
+#endif
 }
 
-static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
+static inline void mark_btree_node_intent_locked(struct btree_trans *trans,
+                                                struct btree_path *path,
                                                 unsigned level)
 {
-       mark_btree_node_locked(iter, level, SIX_LOCK_intent);
+       mark_btree_node_locked(trans, path, level, SIX_LOCK_intent);
 }
 
-static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
 {
-       return level < iter->locks_want
+       return level < path->locks_want
                ? SIX_LOCK_intent
                : SIX_LOCK_read;
 }
 
 static inline enum btree_node_locked_type
-btree_lock_want(struct btree_iter *iter, int level)
+btree_lock_want(struct btree_path *path, int level)
 {
-       if (level < iter->level)
+       if (level < path->level)
                return BTREE_NODE_UNLOCKED;
-       if (level < iter->locks_want)
+       if (level < path->locks_want)
                return BTREE_NODE_INTENT_LOCKED;
-       if (level == iter->level)
+       if (level == path->level)
                return BTREE_NODE_READ_LOCKED;
        return BTREE_NODE_UNLOCKED;
 }
 
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+static inline void btree_node_unlock(struct btree_path *path, unsigned level)
 {
-       int lock_type = btree_node_locked_type(iter, level);
+       int lock_type = btree_node_locked_type(path, level);
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 
        if (lock_type != BTREE_NODE_UNLOCKED)
-               six_unlock_type(&iter->l[level].b->c.lock, lock_type);
-       mark_btree_node_unlocked(iter, level);
+               six_unlock_type(&path->l[level].b->c.lock, lock_type);
+       mark_btree_node_unlocked(path, level);
 }
 
-static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
+static inline void __bch2_btree_path_unlock(struct btree_path *path)
 {
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+       btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
 
-       while (iter->nodes_locked)
-               btree_node_unlock(iter, __ffs(iter->nodes_locked));
+       while (path->nodes_locked)
+               btree_node_unlock(path, __ffs(path->nodes_locked));
+#ifdef CONFIG_BCACHEFS_DEBUG
+       path->ip_locked = 0;
+#endif
 }
 
 static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
@@ -155,11 +166,11 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
                                             struct btree *b, unsigned level,
                                             enum btree_node_locked_type want)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               if (iter->l[level].b == b &&
-                   btree_node_locked_type(iter, level) >= want) {
+       trans_for_each_path(trans, path)
+               if (path->l[level].b == b &&
+                   btree_node_locked_type(path, level) >= want) {
                        six_lock_increment(&b->c.lock, want);
                        return true;
                }
@@ -167,40 +178,39 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
        return false;
 }
 
-bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
-                           struct btree_iter *, enum six_lock_type,
+bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
+                           struct btree *, struct bpos, unsigned,
+                           enum six_lock_type,
                            six_lock_should_sleep_fn, void *,
                            unsigned long);
 
-static inline bool btree_node_lock(struct btree *b,
-                       struct bpos pos, unsigned level,
-                       struct btree_iter *iter,
+static inline bool btree_node_lock(struct btree_trans *trans,
+                       struct btree_path *path,
+                       struct btree *b, struct bpos pos, unsigned level,
                        enum six_lock_type type,
                        six_lock_should_sleep_fn should_sleep_fn, void *p,
                        unsigned long ip)
 {
-       struct btree_trans *trans = iter->trans;
-
        EBUG_ON(level >= BTREE_MAX_DEPTH);
-       EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+       EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
 
        return likely(six_trylock_type(&b->c.lock, type)) ||
                btree_node_lock_increment(trans, b, level, type) ||
-               __bch2_btree_node_lock(b, pos, level, iter, type,
+               __bch2_btree_node_lock(trans, path, b, pos, level, type,
                                       should_sleep_fn, p, ip);
 }
 
-bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
 
-static inline bool bch2_btree_node_relock(struct btree_iter *iter,
-                                         unsigned level)
+static inline bool bch2_btree_node_relock(struct btree_trans *trans,
+                                         struct btree_path *path, unsigned level)
 {
-       EBUG_ON(btree_node_locked(iter, level) &&
-               btree_node_locked_type(iter, level) !=
-               __btree_lock_want(iter, level));
+       EBUG_ON(btree_node_locked(path, level) &&
+               btree_node_locked_type(path, level) !=
+               __btree_lock_want(path, level));
 
-       return likely(btree_node_locked(iter, level)) ||
-               __bch2_btree_node_relock(iter, level);
+       return likely(btree_node_locked(path, level)) ||
+               __bch2_btree_node_relock(trans, path, level);
 }
 
 /*
@@ -208,30 +218,35 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
  * succeed:
  */
 static inline void
-bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter)
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+                                    struct btree *b)
 {
-       struct btree_iter *linked;
+       struct btree_path *linked;
 
-       EBUG_ON(iter->l[b->c.level].b != b);
-       EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+       EBUG_ON(path->l[b->c.level].b != b);
+       EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
 
-       trans_for_each_iter_with_node(iter->trans, b, linked)
+       trans_for_each_path_with_node(trans, b, linked)
                linked->l[b->c.level].lock_seq += 2;
 
        six_unlock_write(&b->c.lock);
 }
 
-void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
+void bch2_btree_node_unlock_write(struct btree_trans *,
+                       struct btree_path *, struct btree *);
 
-void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *);
 
-static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+static inline void bch2_btree_node_lock_write(struct btree_trans *trans,
+                                             struct btree_path *path,
+                                             struct btree *b)
 {
-       EBUG_ON(iter->l[b->c.level].b != b);
-       EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq);
+       EBUG_ON(path->l[b->c.level].b != b);
+       EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq);
+       EBUG_ON(!btree_node_intent_locked(path, b->c.level));
 
        if (unlikely(!six_trylock_write(&b->c.lock)))
-               __bch2_btree_node_lock_write(b, iter);
+               __bch2_btree_node_lock_write(trans, b);
 }
 
 #endif /* _BCACHEFS_BTREE_LOCKING_H */
index a1e5debf19f3623d10366b399fd944813caa71a4..ccf91ebd94aa15410928bbcb2d685e949f9f736b 100644 (file)
@@ -176,52 +176,44 @@ struct btree_node_iter {
        } data[MAX_BSETS];
 };
 
-enum btree_iter_type {
-       BTREE_ITER_KEYS,
-       BTREE_ITER_NODES,
-       BTREE_ITER_CACHED,
-};
-
-#define BTREE_ITER_TYPE                        ((1 << 2) - 1)
-
 /*
  * Iterate over all possible positions, synthesizing deleted keys for holes:
  */
-#define BTREE_ITER_SLOTS               (1 << 2)
+#define BTREE_ITER_SLOTS               (1 << 0)
 /*
  * Indicates that intent locks should be taken on leaf nodes, because we expect
  * to be doing updates:
  */
-#define BTREE_ITER_INTENT              (1 << 3)
+#define BTREE_ITER_INTENT              (1 << 1)
 /*
  * Causes the btree iterator code to prefetch additional btree nodes from disk:
  */
-#define BTREE_ITER_PREFETCH            (1 << 4)
+#define BTREE_ITER_PREFETCH            (1 << 2)
 /*
  * Indicates that this iterator should not be reused until transaction commit,
  * either because a pending update references it or because the update depends
  * on that particular key being locked (e.g. by the str_hash code, for hash
  * table consistency)
  */
-#define BTREE_ITER_KEEP_UNTIL_COMMIT   (1 << 5)
+#define BTREE_ITER_KEEP_UNTIL_COMMIT   (1 << 3)
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
  * @pos or the first key strictly greater than @pos
  */
-#define BTREE_ITER_IS_EXTENTS          (1 << 6)
-#define BTREE_ITER_NOT_EXTENTS         (1 << 7)
-#define BTREE_ITER_ERROR               (1 << 8)
-#define BTREE_ITER_SET_POS_AFTER_COMMIT        (1 << 9)
-#define BTREE_ITER_CACHED_NOFILL       (1 << 10)
-#define BTREE_ITER_CACHED_NOCREATE     (1 << 11)
-#define BTREE_ITER_WITH_UPDATES                (1 << 12)
-#define BTREE_ITER_ALL_SNAPSHOTS       (1 << 13)
-
-enum btree_iter_uptodate {
+#define BTREE_ITER_IS_EXTENTS          (1 << 4)
+#define BTREE_ITER_NOT_EXTENTS         (1 << 5)
+#define BTREE_ITER_ERROR               (1 << 6)
+#define BTREE_ITER_CACHED              (1 << 7)
+#define BTREE_ITER_CACHED_NOFILL       (1 << 8)
+#define BTREE_ITER_CACHED_NOCREATE     (1 << 9)
+#define BTREE_ITER_WITH_UPDATES                (1 << 10)
+#define __BTREE_ITER_ALL_SNAPSHOTS     (1 << 11)
+#define BTREE_ITER_ALL_SNAPSHOTS       (1 << 12)
+
+enum btree_path_uptodate {
        BTREE_ITER_UPTODATE             = 0,
-       BTREE_ITER_NEED_PEEK            = 1,
-       BTREE_ITER_NEED_RELOCK          = 2,
-       BTREE_ITER_NEED_TRAVERSE        = 3,
+       BTREE_ITER_NEED_RELOCK          = 1,
+       BTREE_ITER_NEED_TRAVERSE        = 2,
 };
 
 #define BTREE_ITER_NO_NODE_GET_LOCKS   ((struct btree *) 1)
@@ -233,51 +225,67 @@ enum btree_iter_uptodate {
 #define BTREE_ITER_NO_NODE_ERROR       ((struct btree *) 7)
 #define BTREE_ITER_NO_NODE_CACHED      ((struct btree *) 8)
 
-/*
- * @pos                        - iterator's current position
- * @level              - current btree depth
- * @locks_want         - btree level below which we start taking intent locks
- * @nodes_locked       - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked        - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
-       struct btree_trans      *trans;
-       unsigned long           ip_allocated;
-
+struct btree_path {
        u8                      idx;
-       u8                      child_idx;
        u8                      sorted_idx;
+       u8                      ref;
+       u8                      intent_ref;
 
        /* btree_iter_copy starts here: */
-       u16                     flags;
-
-       /* When we're filtering by snapshot, the snapshot ID we're looking for: */
-       unsigned                snapshot;
-
        struct bpos             pos;
-       struct bpos             real_pos;
-       struct bpos             pos_after_commit;
 
        enum btree_id           btree_id:4;
-       enum btree_iter_uptodate uptodate:3;
+       bool                    cached:1;
+       bool                    preserve:1;
+       enum btree_path_uptodate uptodate:2;
        /*
-        * True if we've returned a key (and thus are expected to keep it
-        * locked), false after set_pos - for avoiding spurious transaction
-        * restarts in bch2_trans_relock():
+        * When true, failing to relock this path will cause the transaction to
+        * restart:
         */
        bool                    should_be_locked:1;
-       unsigned                level:4,
-                               min_depth:4,
+       unsigned                level:3,
                                locks_want:4,
                                nodes_locked:4,
                                nodes_intent_locked:4;
 
-       struct btree_iter_level {
+       struct btree_path_level {
                struct btree    *b;
                struct btree_node_iter iter;
                u32             lock_seq;
        }                       l[BTREE_MAX_DEPTH];
+#ifdef CONFIG_BCACHEFS_DEBUG
+       unsigned long           ip_allocated;
+       unsigned long           ip_locked;
+#endif
+};
 
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+       return path->l + path->level;
+}
+
+/*
+ * @pos                        - iterator's current position
+ * @level              - current btree depth
+ * @locks_want         - btree level below which we start taking intent locks
+ * @nodes_locked       - bitmask indicating which nodes in @nodes are locked
+ * @nodes_intent_locked        - bitmask indicating which locks are intent locks
+ */
+struct btree_iter {
+       struct btree_trans      *trans;
+       struct btree_path       *path;
+
+       enum btree_id           btree_id:4;
+       unsigned                min_depth:4;
+
+       /* btree_iter_copy starts here: */
+       u16                     flags;
+
+       /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+       unsigned                snapshot;
+
+       struct bpos             pos;
+       struct bpos             pos_after_commit;
        /*
         * Current unpacked key - so that bch2_btree_iter_next()/
         * bch2_btree_iter_next_slot() can correctly advance pos.
@@ -285,22 +293,6 @@ struct btree_iter {
        struct bkey             k;
 };
 
-static inline enum btree_iter_type
-btree_iter_type(const struct btree_iter *iter)
-{
-       return iter->flags & BTREE_ITER_TYPE;
-}
-
-static inline bool btree_iter_is_cached(const struct btree_iter *iter)
-{
-       return btree_iter_type(iter) == BTREE_ITER_CACHED;
-}
-
-static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
-{
-       return iter->l + iter->level;
-}
-
 struct btree_key_cache {
        struct mutex            lock;
        struct rhashtable       table;
@@ -345,9 +337,11 @@ struct btree_insert_entry {
        u8                      bkey_type;
        enum btree_id           btree_id:8;
        u8                      level;
-       unsigned                trans_triggers_run:1;
+       bool                    cached:1;
+       bool                    trans_triggers_run:1;
        struct bkey_i           *k;
-       struct btree_iter       *iter;
+       struct btree_path       *path;
+       unsigned long           ip_allocated;
 };
 
 #ifndef CONFIG_LOCKDEP
@@ -371,10 +365,11 @@ struct btree_trans {
 #ifdef CONFIG_BCACHEFS_DEBUG
        struct list_head        list;
        struct btree            *locking;
-       unsigned                locking_iter_idx;
+       unsigned                locking_path_idx;
        struct bpos             locking_pos;
        u8                      locking_btree_id;
        u8                      locking_level;
+       u8                      traverse_all_idx;
        pid_t                   pid;
 #endif
        unsigned long           ip;
@@ -392,16 +387,14 @@ struct btree_trans {
         */
        unsigned                extra_journal_res;
 
-       u64                     iters_linked;
-       u64                     iters_live;
-       u64                     iters_touched;
+       u64                     paths_allocated;
 
        unsigned                mem_top;
        unsigned                mem_bytes;
        void                    *mem;
 
-       u8                      *sorted;
-       struct btree_iter       *iters;
+       u8                      sorted[BTREE_ITER_MAX];
+       struct btree_path       *paths;
        struct btree_insert_entry *updates;
 
        /* update path: */
@@ -605,16 +598,6 @@ static inline bool btree_node_is_extents(struct btree *b)
        return btree_node_type_is_extents(btree_node_type(b));
 }
 
-static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter)
-{
-       return __btree_node_type(iter->level, iter->btree_id);
-}
-
-static inline bool btree_iter_is_extents(struct btree_iter *iter)
-{
-       return btree_node_type_is_extents(btree_iter_key_type(iter));
-}
-
 #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS             \
        ((1U << BKEY_TYPE_extents)|                     \
         (1U << BKEY_TYPE_inodes)|                      \
index 217b52e1a1683a7977198acce09e3f9668eab296..23b73d3a172cf2ff71d6ad3f85acb1255dddea17 100644 (file)
@@ -8,10 +8,11 @@
 struct bch_fs;
 struct btree;
 
-void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *,
+void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *,
                                     struct btree *);
-bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
-                               struct btree_node_iter *, struct bkey_i *);
+bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
+                               struct btree *, struct btree_node_iter *,
+                               struct bkey_i *);
 void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
 
 enum btree_insert_flags {
@@ -134,4 +135,21 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
             (_i) < (_trans)->updates + (_trans)->nr_updates;           \
             (_i)++)
 
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+                                                     enum btree_id btree_id,
+                                                     struct bpos pos)
+{
+       struct btree_insert_entry *i;
+
+       trans_for_each_update(trans, i)
+               if ((cmp_int(btree_id,  i->btree_id) ?:
+                    bpos_cmp(pos,      i->k->k.p)) <= 0) {
+                       if (btree_id == i->btree_id)
+                               return i->k;
+                       break;
+               }
+
+       return NULL;
+}
+
 #endif /* _BCACHEFS_BTREE_UPDATE_H */
index c8c3382f48c7e31ded2872dbf34af9057306cff9..5a1420b392bad9041b242946caceba9ff25ec880 100644 (file)
@@ -23,8 +23,9 @@
 #include <trace/events/bcachefs.h>
 
 static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
-                                  struct btree_iter *, struct btree *,
+                                  struct btree_path *, struct btree *,
                                   struct keylist *, unsigned);
+static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
 
 /* Debug code: */
 
@@ -152,38 +153,25 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
 
        clear_btree_node_noevict(b);
 
-       bch2_btree_node_hash_remove(&c->btree_cache, b);
-
        mutex_lock(&c->btree_cache.lock);
        list_move(&b->list, &c->btree_cache.freeable);
        mutex_unlock(&c->btree_cache.lock);
 }
 
-void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
+static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+                                      struct btree *b)
 {
-       struct open_buckets ob = b->ob;
+       struct bch_fs *c = trans->c;
+       struct btree_path *path;
 
-       b->ob.nr = 0;
+       trans_for_each_path(trans, path)
+               BUG_ON(path->l[b->c.level].b == b);
 
-       clear_btree_node_dirty(c, b);
+       six_lock_write(&b->c.lock, NULL, NULL);
 
-       btree_node_lock_type(c, b, SIX_LOCK_write);
+       bch2_btree_node_hash_remove(&c->btree_cache, b);
        __btree_node_free(c, b);
-       six_unlock_write(&b->c.lock);
 
-       bch2_open_buckets_put(c, &ob);
-}
-
-void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
-                               struct btree_iter *iter)
-{
-       struct btree_iter *linked;
-
-       trans_for_each_iter(iter->trans, linked)
-               BUG_ON(linked->l[b->c.level].b == b);
-
-       six_lock_write(&b->c.lock, NULL, NULL);
-       __btree_node_free(c, b);
        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
 }
@@ -773,7 +761,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
  * And it adds @b to the list of @as's new nodes, so that we can update sector
  * counts in bch2_btree_update_nodes_written:
  */
-void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
+static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
 {
        struct bch_fs *c = as->c;
 
@@ -827,7 +815,7 @@ found:
                closure_put(&as->cl);
 }
 
-void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
+static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
 {
        while (b->ob.nr)
                as->open_buckets[as->nr_open_buckets++] =
@@ -839,7 +827,7 @@ void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b
  * nodes and thus outstanding btree_updates - redirect @b's
  * btree_updates to point to this btree_update:
  */
-void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                                               struct btree *b)
 {
        struct bch_fs *c = as->c;
@@ -911,7 +899,7 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        as->nr_old_nodes++;
 }
 
-void bch2_btree_update_done(struct btree_update *as)
+static void bch2_btree_update_done(struct btree_update *as)
 {
        BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
 
@@ -925,11 +913,10 @@ void bch2_btree_update_done(struct btree_update *as)
                    as->c->btree_interior_update_worker);
 }
 
-struct btree_update *
-bch2_btree_update_start(struct btree_iter *iter, unsigned level,
-                       unsigned nr_nodes, unsigned flags)
+static struct btree_update *
+bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
+                       unsigned level, unsigned nr_nodes, unsigned flags)
 {
-       struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
        struct btree_update *as;
        struct closure cl;
@@ -938,7 +925,7 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level,
        int journal_flags = 0;
        int ret = 0;
 
-       BUG_ON(!iter->should_be_locked);
+       BUG_ON(!path->should_be_locked);
 
        if (flags & BTREE_INSERT_JOURNAL_RESERVED)
                journal_flags |= JOURNAL_RES_GET_RESERVED;
@@ -950,11 +937,11 @@ retry:
         * XXX: figure out how far we might need to split,
         * instead of locking/reserving all the way to the root:
         */
-       if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
+       if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
                trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_,
-                                                iter->btree_id,
-                                                &iter->real_pos);
-               return ERR_PTR(-EINTR);
+                                                path->btree_id, &path->pos);
+               ret = btree_trans_restart(trans);
+               return ERR_PTR(ret);
        }
 
        if (flags & BTREE_INSERT_GC_LOCK_HELD)
@@ -974,7 +961,7 @@ retry:
        as->c           = c;
        as->mode        = BTREE_INTERIOR_NO_UPDATE;
        as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
-       as->btree_id    = iter->btree_id;
+       as->btree_id    = path->btree_id;
        INIT_LIST_HEAD(&as->list);
        INIT_LIST_HEAD(&as->unwritten_list);
        INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1092,8 +1079,10 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
  * is nothing new to be done.  This just guarantees that there is a
  * journal write.
  */
-static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
-                               struct btree_iter *iter)
+static void bch2_btree_set_root(struct btree_update *as,
+                               struct btree_trans *trans,
+                               struct btree_path *path,
+                               struct btree *b)
 {
        struct bch_fs *c = as->c;
        struct btree *old;
@@ -1108,7 +1097,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
         * Ensure no one is using the old root while we switch to the
         * new root:
         */
-       bch2_btree_node_lock_write(old, iter);
+       bch2_btree_node_lock_write(trans, path, old);
 
        bch2_btree_set_root_inmem(c, b);
 
@@ -1121,15 +1110,17 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
         * an intent lock on the new root, and any updates that would
         * depend on the new root would have to update the new root.
         */
-       bch2_btree_node_unlock_write(old, iter);
+       bch2_btree_node_unlock_write(trans, path, old);
 }
 
 /* Interior node updates: */
 
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
-                                       struct btree_iter *iter,
-                                       struct bkey_i *insert,
-                                       struct btree_node_iter *node_iter)
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
+                                       struct btree_trans *trans,
+                                       struct btree_path *path,
+                                       struct btree *b,
+                                       struct btree_node_iter *node_iter,
+                                       struct bkey_i *insert)
 {
        struct bch_fs *c = as->c;
        struct bkey_packed *k;
@@ -1161,15 +1152,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
               bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
                bch2_btree_node_iter_advance(node_iter, b);
 
-       bch2_btree_bset_insert_key(iter, b, node_iter, insert);
+       bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
        set_btree_node_dirty(c, b);
        set_btree_node_need_write(b);
 }
 
 static void
-__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
-                                 struct btree_iter *iter, struct keylist *keys,
-                                 struct btree_node_iter node_iter)
+__bch2_btree_insert_keys_interior(struct btree_update *as,
+                                 struct btree_trans *trans,
+                                 struct btree_path *path,
+                                 struct btree *b,
+                                 struct btree_node_iter node_iter,
+                                 struct keylist *keys)
 {
        struct bkey_i *insert = bch2_keylist_front(keys);
        struct bkey_packed *k;
@@ -1181,8 +1175,8 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
                ;
 
        while (!bch2_keylist_empty(keys)) {
-               bch2_insert_fixup_btree_ptr(as, b, iter,
-                               bch2_keylist_front(keys), &node_iter);
+               bch2_insert_fixup_btree_ptr(as, trans, path, b,
+                               &node_iter, bch2_keylist_front(keys));
                bch2_keylist_pop_front(keys);
        }
 }
@@ -1192,8 +1186,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
  * node)
  */
 static struct btree *__btree_split_node(struct btree_update *as,
-                                       struct btree *n1,
-                                       struct btree_iter *iter)
+                                       struct btree *n1)
 {
        struct bkey_format_state s;
        size_t nr_packed = 0, nr_unpacked = 0;
@@ -1308,8 +1301,10 @@ static struct btree *__btree_split_node(struct btree_update *as,
  * nodes that were coalesced, and thus in the middle of a child node post
  * coalescing:
  */
-static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
-                                   struct btree_iter *iter,
+static void btree_split_insert_keys(struct btree_update *as,
+                                   struct btree_trans *trans,
+                                   struct btree_path *path,
+                                   struct btree *b,
                                    struct keylist *keys)
 {
        struct btree_node_iter node_iter;
@@ -1319,7 +1314,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 
        bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
 
-       __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter);
+       __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
 
        /*
         * We can't tolerate whiteouts here - with whiteouts there can be
@@ -1349,18 +1344,17 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
        btree_node_interior_verify(as->c, b);
 }
 
-static void btree_split(struct btree_update *as,
-                       struct btree_trans *trans, struct btree_iter *iter,
-                       struct btree *b, struct keylist *keys,
-                       unsigned flags)
+static void btree_split(struct btree_update *as, struct btree_trans *trans,
+                       struct btree_path *path, struct btree *b,
+                       struct keylist *keys, unsigned flags)
 {
        struct bch_fs *c = as->c;
-       struct btree *parent = btree_node_parent(iter, b);
+       struct btree *parent = btree_node_parent(path, b);
        struct btree *n1, *n2 = NULL, *n3 = NULL;
        u64 start_time = local_clock();
 
        BUG_ON(!parent && (b != btree_node_root(c, b)));
-       BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
+       BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
 
        bch2_btree_interior_update_will_free_node(as, b);
 
@@ -1368,12 +1362,12 @@ static void btree_split(struct btree_update *as,
        bch2_btree_update_add_new_node(as, n1);
 
        if (keys)
-               btree_split_insert_keys(as, n1, iter, keys);
+               btree_split_insert_keys(as, trans, path, n1, keys);
 
        if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
                trace_btree_split(c, b);
 
-               n2 = __btree_split_node(as, n1, iter);
+               n2 = __btree_split_node(as, n1);
 
                bch2_btree_build_aux_trees(n2);
                bch2_btree_build_aux_trees(n1);
@@ -1398,7 +1392,7 @@ static void btree_split(struct btree_update *as,
                        n3->sib_u64s[0] = U16_MAX;
                        n3->sib_u64s[1] = U16_MAX;
 
-                       btree_split_insert_keys(as, n3, iter, &as->parent_keys);
+                       btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
 
                        bch2_btree_node_write(c, n3, SIX_LOCK_intent);
                }
@@ -1418,12 +1412,12 @@ static void btree_split(struct btree_update *as,
 
        if (parent) {
                /* Split a non root node */
-               bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags);
+               bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
        } else if (n3) {
-               bch2_btree_set_root(as, n3, iter);
+               bch2_btree_set_root(as, trans, path, n3);
        } else {
                /* Root filled up but didn't need to be split */
-               bch2_btree_set_root(as, n1, iter);
+               bch2_btree_set_root(as, trans, path, n1);
        }
 
        bch2_btree_update_get_open_buckets(as, n1);
@@ -1432,15 +1426,14 @@ static void btree_split(struct btree_update *as,
        if (n3)
                bch2_btree_update_get_open_buckets(as, n3);
 
-       /* Successful split, update the iterator to point to the new nodes: */
+       /* Successful split, update the path to point to the new nodes: */
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
-       bch2_btree_iter_node_drop(iter, b);
        if (n3)
-               bch2_btree_iter_node_replace(iter, n3);
+               bch2_trans_node_add(trans, n3);
        if (n2)
-               bch2_btree_iter_node_replace(iter, n2);
-       bch2_btree_iter_node_replace(iter, n1);
+               bch2_trans_node_add(trans, n2);
+       bch2_trans_node_add(trans, n1);
 
        /*
         * The old node must be freed (in memory) _before_ unlocking the new
@@ -1448,7 +1441,7 @@ static void btree_split(struct btree_update *as,
         * node after another thread has locked and updated the new node, thus
         * seeing stale data:
         */
-       bch2_btree_node_free_inmem(c, b, iter);
+       bch2_btree_node_free_inmem(trans, b);
 
        if (n3)
                six_unlock_intent(&n3->c.lock);
@@ -1456,26 +1449,30 @@ static void btree_split(struct btree_update *as,
                six_unlock_intent(&n2->c.lock);
        six_unlock_intent(&n1->c.lock);
 
-       bch2_btree_trans_verify_locks(trans);
+       bch2_trans_verify_locks(trans);
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
                               start_time);
 }
 
 static void
-bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
-                               struct btree_iter *iter, struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+                               struct btree_trans *trans,
+                               struct btree_path *path,
+                               struct btree *b,
+                               struct keylist *keys)
 {
-       struct btree_iter *linked;
+       struct btree_path *linked;
 
-       __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter);
+       __bch2_btree_insert_keys_interior(as, trans, path, b,
+                                         path->l[b->c.level].iter, keys);
 
        btree_update_updated_node(as, b);
 
-       trans_for_each_iter_with_node(iter->trans, b, linked)
+       trans_for_each_path_with_node(trans, b, linked)
                bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
 
-       bch2_btree_trans_verify_iters(iter->trans, b);
+       bch2_trans_verify_paths(trans);
 }
 
 /**
@@ -1490,10 +1487,9 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
  * If a split occurred, this function will return early. This can only happen
  * for leaf nodes -- inserts into interior nodes have to be atomic.
  */
-static void bch2_btree_insert_node(struct btree_update *as,
-                                  struct btree_trans *trans, struct btree_iter *iter,
-                                  struct btree *b, struct keylist *keys,
-                                  unsigned flags)
+static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+                                  struct btree_path *path, struct btree *b,
+                                  struct keylist *keys, unsigned flags)
 {
        struct bch_fs *c = as->c;
        int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
@@ -1501,21 +1497,21 @@ static void bch2_btree_insert_node(struct btree_update *as,
        int live_u64s_added, u64s_added;
 
        lockdep_assert_held(&c->gc_lock);
-       BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
+       BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
        BUG_ON(!b->c.level);
        BUG_ON(!as || as->b);
        bch2_verify_keylist_sorted(keys);
 
-       bch2_btree_node_lock_for_insert(trans, iter, b);
+       bch2_btree_node_lock_for_insert(trans, path, b);
 
        if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
-               bch2_btree_node_unlock_write(b, iter);
+               bch2_btree_node_unlock_write(trans, path, b);
                goto split;
        }
 
        btree_node_interior_verify(c, b);
 
-       bch2_btree_insert_keys_interior(as, b, iter, keys);
+       bch2_btree_insert_keys_interior(as, trans, path, b, keys);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
        u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1527,48 +1523,48 @@ static void bch2_btree_insert_node(struct btree_update *as,
 
        if (u64s_added > live_u64s_added &&
            bch2_maybe_compact_whiteouts(c, b))
-               bch2_btree_iter_reinit_node(iter, b);
+               bch2_trans_node_reinit_iter(trans, b);
 
-       bch2_btree_node_unlock_write(b, iter);
+       bch2_btree_node_unlock_write(trans, path, b);
 
        btree_node_interior_verify(c, b);
        return;
 split:
-       btree_split(as, trans, iter, b, keys, flags);
+       btree_split(as, trans, path, b, keys, flags);
 }
 
 int bch2_btree_split_leaf(struct btree_trans *trans,
-                         struct btree_iter *iter,
+                         struct btree_path *path,
                          unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter_l(iter)->b;
+       struct btree *b = path_l(path)->b;
        struct btree_update *as;
        unsigned l;
        int ret = 0;
 
-       as = bch2_btree_update_start(iter, iter->level,
+       as = bch2_btree_update_start(trans, path, path->level,
                btree_update_reserve_required(c, b), flags);
        if (IS_ERR(as))
                return PTR_ERR(as);
 
-       btree_split(as, trans, iter, b, NULL, flags);
+       btree_split(as, trans, path, b, NULL, flags);
        bch2_btree_update_done(as);
 
-       for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++)
-               ret = bch2_foreground_maybe_merge(trans, iter, l, flags);
+       for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
+               ret = bch2_foreground_maybe_merge(trans, path, l, flags);
 
        return ret;
 }
 
 int __bch2_foreground_maybe_merge(struct btree_trans *trans,
-                                 struct btree_iter *iter,
+                                 struct btree_path *path,
                                  unsigned level,
                                  unsigned flags,
                                  enum btree_node_sibling sib)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *sib_iter = NULL;
+       struct btree_path *sib_path = NULL;
        struct btree_update *as;
        struct bkey_format_state new_s;
        struct bkey_format new_f;
@@ -1576,39 +1572,35 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
        struct btree *b, *m, *n, *prev, *next, *parent;
        struct bpos sib_pos;
        size_t sib_u64s;
-       int ret = 0, ret2 = 0;
-
-retry:
-       ret = bch2_btree_iter_traverse(iter);
-       if (ret)
-               return ret;
+       int ret = 0;
 
-       BUG_ON(!iter->should_be_locked);
-       BUG_ON(!btree_node_locked(iter, level));
+       BUG_ON(!path->should_be_locked);
+       BUG_ON(!btree_node_locked(path, level));
 
-       b = iter->l[level].b;
+       b = path->l[level].b;
 
        if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
            (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) {
                b->sib_u64s[sib] = U16_MAX;
-               goto out;
+               return 0;
        }
 
        sib_pos = sib == btree_prev_sib
                ? bpos_predecessor(b->data->min_key)
                : bpos_successor(b->data->max_key);
 
-       sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
-                                           sib_pos, U8_MAX, level,
-                                           BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(sib_iter);
+       sib_path = bch2_path_get(trans, false, path->btree_id,
+                                sib_pos, U8_MAX, level, true);
+       ret = bch2_btree_path_traverse(trans, sib_path, false);
        if (ret)
                goto err;
 
-       m = sib_iter->l[level].b;
+       sib_path->should_be_locked = true;
 
-       if (btree_node_parent(iter, b) !=
-           btree_node_parent(sib_iter, m)) {
+       m = sib_path->l[level].b;
+
+       if (btree_node_parent(path, b) !=
+           btree_node_parent(sib_path, m)) {
                b->sib_u64s[sib] = U16_MAX;
                goto out;
        }
@@ -1659,8 +1651,8 @@ retry:
        if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
                goto out;
 
-       parent = btree_node_parent(iter, b);
-       as = bch2_btree_update_start(iter, level,
+       parent = btree_node_parent(path, b);
+       as = bch2_btree_update_start(trans, path, level,
                         btree_update_reserve_required(c, parent) + 1,
                         flags|
                         BTREE_INSERT_NOFAIL|
@@ -1696,47 +1688,32 @@ retry:
        bch2_keylist_add(&as->parent_keys, &delete);
        bch2_keylist_add(&as->parent_keys, &n->key);
 
-       bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags);
+       bch2_trans_verify_paths(trans);
+
+       bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+
+       bch2_trans_verify_paths(trans);
 
        bch2_btree_update_get_open_buckets(as, n);
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
        six_lock_increment(&m->c.lock, SIX_LOCK_intent);
-       bch2_btree_iter_node_drop(iter, b);
-       bch2_btree_iter_node_drop(iter, m);
 
-       bch2_btree_iter_node_replace(iter, n);
+       bch2_trans_node_add(trans, n);
 
-       bch2_btree_trans_verify_iters(trans, n);
+       bch2_trans_verify_paths(trans);
 
-       bch2_btree_node_free_inmem(c, b, iter);
-       bch2_btree_node_free_inmem(c, m, iter);
+       bch2_btree_node_free_inmem(trans, b);
+       bch2_btree_node_free_inmem(trans, m);
 
        six_unlock_intent(&n->c.lock);
 
        bch2_btree_update_done(as);
 out:
-       bch2_btree_trans_verify_locks(trans);
-       bch2_trans_iter_free(trans, sib_iter);
-
-       /*
-        * Don't downgrade locks here: we're called after successful insert,
-        * and the caller will downgrade locks after a successful insert
-        * anyways (in case e.g. a split was required first)
-        *
-        * And we're also called when inserting into interior nodes in the
-        * split path, and downgrading to read locks in there is potentially
-        * confusing:
-        */
-       return ret ?: ret2;
 err:
-       bch2_trans_iter_put(trans, sib_iter);
-       sib_iter = NULL;
-
-       if (ret == -EINTR && bch2_trans_relock(trans))
-               goto retry;
-
-       goto out;
+       bch2_path_put(trans, sib_path, true);
+       bch2_trans_verify_locks(trans);
+       return ret;
 }
 
 /**
@@ -1761,8 +1738,8 @@ retry:
        if (!b || b->data->keys.seq != seq)
                goto out;
 
-       parent = btree_node_parent(iter, b);
-       as = bch2_btree_update_start(iter, b->c.level,
+       parent = btree_node_parent(iter->path, b);
+       as = bch2_btree_update_start(trans, iter->path, b->c.level,
                (parent
                 ? btree_update_reserve_required(c, parent)
                 : 0) + 1,
@@ -1789,23 +1766,22 @@ retry:
 
        if (parent) {
                bch2_keylist_add(&as->parent_keys, &n->key);
-               bch2_btree_insert_node(as, trans, iter, parent,
+               bch2_btree_insert_node(as, trans, iter->path, parent,
                                       &as->parent_keys, flags);
        } else {
-               bch2_btree_set_root(as, n, iter);
+               bch2_btree_set_root(as, trans, iter->path, n);
        }
 
        bch2_btree_update_get_open_buckets(as, n);
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
-       bch2_btree_iter_node_drop(iter, b);
-       bch2_btree_iter_node_replace(iter, n);
-       bch2_btree_node_free_inmem(c, b, iter);
+       bch2_trans_node_add(trans, n);
+       bch2_btree_node_free_inmem(trans, b);
        six_unlock_intent(&n->c.lock);
 
        bch2_btree_update_done(as);
 out:
-       bch2_btree_iter_downgrade(iter);
+       bch2_btree_path_downgrade(iter->path);
        return ret;
 }
 
@@ -1824,13 +1800,13 @@ void async_btree_node_rewrite_work(struct work_struct *work)
                container_of(work, struct async_btree_rewrite, work);
        struct bch_fs *c = a->c;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos,
+       bch2_trans_node_iter_init(&trans, &iter, a->btree_id, a->pos,
                                        BTREE_MAX_DEPTH, a->level, 0);
-       bch2_btree_node_rewrite(&trans, iter, a->seq, 0);
-       bch2_trans_iter_put(&trans, iter);
+       bch2_btree_node_rewrite(&trans, &iter, a->seq, 0);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        percpu_ref_put(&c->writes);
        kfree(a);
@@ -1869,7 +1845,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
                                        bool skip_triggers)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter2 = NULL;
+       struct btree_iter iter2 = { NULL };
        struct btree *parent;
        u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
        int ret;
@@ -1897,19 +1873,22 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
                BUG_ON(ret);
        }
 
-       parent = btree_node_parent(iter, b);
+       parent = btree_node_parent(iter->path, b);
        if (parent) {
-               iter2 = bch2_trans_copy_iter(trans, iter);
+               bch2_trans_copy_iter(&iter2, iter);
 
-               BUG_ON(iter2->level != b->c.level);
-               BUG_ON(bpos_cmp(iter2->pos, new_key->k.p));
+               iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
+                               iter2.flags & BTREE_ITER_INTENT);
 
-               btree_node_unlock(iter2, iter2->level);
-               iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP;
-               iter2->level++;
+               BUG_ON(iter2.path->level != b->c.level);
+               BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));
 
-               ret   = bch2_btree_iter_traverse(iter2) ?:
-                       bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN);
+               btree_node_unlock(iter2.path, iter2.path->level);
+               path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
+               iter2.path->level++;
+
+               ret   = bch2_btree_iter_traverse(&iter2) ?:
+                       bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
                if (ret)
                        goto err;
        } else {
@@ -1931,7 +1910,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       bch2_btree_node_lock_write(b, iter);
+       bch2_btree_node_lock_write(trans, iter->path, b);
 
        if (new_hash) {
                mutex_lock(&c->btree_cache.lock);
@@ -1946,9 +1925,9 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
                bkey_copy(&b->key, new_key);
        }
 
-       bch2_btree_node_unlock_write(b, iter);
+       bch2_btree_node_unlock_write(trans, iter->path, b);
 out:
-       bch2_trans_iter_put(trans, iter2);
+       bch2_trans_iter_exit(trans, &iter2);
        return ret;
 err:
        if (new_hash) {
@@ -2006,18 +1985,18 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
                                        struct btree *b, struct bkey_i *new_key,
                                        bool skip_triggers)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
-       iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p,
-                                       BTREE_MAX_DEPTH, b->c.level,
-                                       BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(iter);
+       bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
+                                 BTREE_MAX_DEPTH, b->c.level,
+                                 BTREE_ITER_INTENT);
+       ret = bch2_btree_iter_traverse(&iter);
        if (ret)
                goto out;
 
        /* has node been freed? */
-       if (iter->l[b->c.level].b != b) {
+       if (iter.path->l[b->c.level].b != b) {
                /* node has been freed: */
                BUG_ON(!btree_node_dying(b));
                goto out;
@@ -2025,9 +2004,9 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
 
        BUG_ON(!btree_node_hashed(b));
 
-       ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers);
+       ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
index e88e737ee8134e8365fcd451bba8257f08a4820a..8e03bd987d6dc52bcc74fafe98c830fc5054d9d1 100644 (file)
@@ -113,57 +113,39 @@ struct btree_update {
        u64                             inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
 };
 
-void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
-                               struct btree_iter *);
-void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
-
-void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *);
-
 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
                                                  struct btree *,
                                                  struct bkey_format);
 
-void bch2_btree_update_done(struct btree_update *);
-struct btree_update *
-bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned);
-
-void bch2_btree_interior_update_will_free_node(struct btree_update *,
-                                              struct btree *);
-void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-
-int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned);
+int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
 
-int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *,
+int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
                                  unsigned, unsigned, enum btree_node_sibling);
 
 static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
-                                       struct btree_iter *iter,
+                                       struct btree_path *path,
                                        unsigned level, unsigned flags,
                                        enum btree_node_sibling sib)
 {
        struct btree *b;
 
-       if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
-               return 0;
-
-       if (!bch2_btree_node_relock(iter, level))
-               return 0;
+       EBUG_ON(!btree_node_locked(path, level));
 
-       b = iter->l[level].b;
+       b = path->l[level].b;
        if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
                return 0;
 
-       return __bch2_foreground_maybe_merge(trans, iter, level, flags, sib);
+       return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
 }
 
 static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
-                                             struct btree_iter *iter,
+                                             struct btree_path *path,
                                              unsigned level,
                                              unsigned flags)
 {
-       return  bch2_foreground_maybe_merge_sibling(trans, iter, level, flags,
+       return  bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
                                                    btree_prev_sib) ?:
-               bch2_foreground_maybe_merge_sibling(trans, iter, level, flags,
+               bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
                                                    btree_next_sib);
 }
 
index 7e9909e2dcaf5ef305effe1bfedd9f74625ee710..9c8c5cacc4fced52c1b6f7074a599f30a4b24309 100644 (file)
@@ -29,40 +29,59 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
                 bpos_cmp(l->k->k.p,    r->k->k.p);
 }
 
+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+{
+       return i->path->l + i->level;
+}
+
 static inline bool same_leaf_as_prev(struct btree_trans *trans,
                                     struct btree_insert_entry *i)
 {
        return i != trans->updates &&
-               iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
+               insert_l(&i[0])->b == insert_l(&i[-1])->b;
+}
+
+static inline bool same_leaf_as_next(struct btree_trans *trans,
+                                    struct btree_insert_entry *i)
+{
+       return i + 1 < trans->updates + trans->nr_updates &&
+               insert_l(&i[0])->b == insert_l(&i[1])->b;
 }
 
-inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
-                                           struct btree_iter *iter,
-                                           struct btree *b)
+static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+                                                 struct btree_path *path,
+                                                 struct btree *b)
 {
        struct bch_fs *c = trans->c;
 
-       bch2_btree_node_lock_write(b, iter);
-
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED)
+       if (path->cached)
                return;
 
        if (unlikely(btree_node_just_written(b)) &&
            bch2_btree_post_write_cleanup(c, b))
-               bch2_btree_iter_reinit_node(iter, b);
+               bch2_trans_node_reinit_iter(trans, b);
 
        /*
         * If the last bset has been written, or if it's gotten too big - start
         * a new bset to insert into:
         */
        if (want_new_bset(c, b))
-               bch2_btree_init_next(trans, iter, b);
+               bch2_btree_init_next(trans, b);
+}
+
+void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
+                                    struct btree_path *path,
+                                    struct btree *b)
+{
+       bch2_btree_node_lock_write(trans, path, b);
+       bch2_btree_node_prep_for_write(trans, path, b);
 }
 
 /* Inserting into a given leaf node (last stage of insert): */
 
 /* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_iter *iter,
+bool bch2_btree_bset_insert_key(struct btree_trans *trans,
+                               struct btree_path *path,
                                struct btree *b,
                                struct btree_node_iter *node_iter,
                                struct bkey_i *insert)
@@ -76,8 +95,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
        EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
        EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
        EBUG_ON(insert->k.u64s >
-               bch_btree_keys_u64s_remaining(iter->trans->c, b));
-       EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+               bch_btree_keys_u64s_remaining(trans->c, b));
 
        k = bch2_btree_node_iter_peek_all(node_iter, b);
        if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
@@ -96,7 +114,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                k->type = KEY_TYPE_deleted;
 
                if (k->needs_whiteout)
-                       push_whiteout(iter->trans->c, b, insert->k.p);
+                       push_whiteout(trans->c, b, insert->k.p);
                k->needs_whiteout = false;
 
                if (k >= btree_bset_last(b)->start) {
@@ -104,7 +122,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                        bch2_bset_delete(b, k, clobber_u64s);
                        goto fix_iter;
                } else {
-                       bch2_btree_iter_fix_key_modified(iter, b, k);
+                       bch2_btree_path_fix_key_modified(trans, b, k);
                }
 
                return true;
@@ -122,7 +140,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                        clobber_u64s = k->u64s;
                        goto overwrite;
                } else {
-                       bch2_btree_iter_fix_key_modified(iter, b, k);
+                       bch2_btree_path_fix_key_modified(trans, b, k);
                }
        }
 
@@ -132,7 +150,7 @@ overwrite:
        new_u64s = k->u64s;
 fix_iter:
        if (clobber_u64s != new_u64s)
-               bch2_btree_node_iter_fix(iter, b, node_iter, k,
+               bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
                                         clobber_u64s, new_u64s);
        return true;
 }
@@ -176,22 +194,21 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
  * btree_insert_key - insert a key one key into a leaf node
  */
 static bool btree_insert_key_leaf(struct btree_trans *trans,
-                                 struct btree_iter *iter,
-                                 struct bkey_i *insert)
+                                 struct btree_insert_entry *insert)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter_l(iter)->b;
+       struct btree *b = insert_l(insert)->b;
        struct bset_tree *t = bset_tree_last(b);
        struct bset *i = bset(b, t);
        int old_u64s = bset_u64s(t);
        int old_live_u64s = b->nr.live_u64s;
        int live_u64s_added, u64s_added;
 
-       EBUG_ON(!iter->level &&
+       EBUG_ON(!insert->level &&
                !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
 
-       if (unlikely(!bch2_btree_bset_insert_key(iter, b,
-                                       &iter_l(iter)->iter, insert)))
+       if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
+                                       &insert_l(insert)->iter, insert->k)))
                return false;
 
        i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
@@ -212,9 +229,8 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 
        if (u64s_added > live_u64s_added &&
            bch2_maybe_compact_whiteouts(c, b))
-               bch2_btree_iter_reinit_node(iter, b);
+               bch2_trans_node_reinit_iter(trans, b);
 
-       trace_btree_insert_key(c, b, insert);
        return true;
 }
 
@@ -225,9 +241,10 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 static inline void btree_insert_entry_checks(struct btree_trans *trans,
                                             struct btree_insert_entry *i)
 {
-       BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos));
-       BUG_ON(i->level         != i->iter->level);
-       BUG_ON(i->btree_id      != i->iter->btree_id);
+       BUG_ON(bpos_cmp(i->k->k.p, i->path->pos));
+       BUG_ON(i->cached        != i->path->cached);
+       BUG_ON(i->level         != i->path->level);
+       BUG_ON(i->btree_id      != i->path->btree_id);
 }
 
 static noinline int
@@ -267,13 +284,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
        return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
 }
 
-static enum btree_insert_ret
+static inline enum btree_insert_ret
 btree_key_can_insert(struct btree_trans *trans,
-                    struct btree_iter *iter,
+                    struct btree *b,
                     unsigned u64s)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter_l(iter)->b;
 
        if (!bch2_btree_node_insert_fits(c, b, u64s))
                return BTREE_INSERT_BTREE_NODE_FULL;
@@ -283,14 +299,14 @@ btree_key_can_insert(struct btree_trans *trans,
 
 static enum btree_insert_ret
 btree_key_can_insert_cached(struct btree_trans *trans,
-                           struct btree_iter *iter,
+                           struct btree_path *path,
                            unsigned u64s)
 {
-       struct bkey_cached *ck = (void *) iter->l[0].b;
+       struct bkey_cached *ck = (void *) path->l[0].b;
        unsigned new_u64s;
        struct bkey_i *new_k;
 
-       BUG_ON(iter->level);
+       EBUG_ON(path->level);
 
        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
            bch2_btree_key_cache_must_wait(trans->c) &&
@@ -328,9 +344,9 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
 
        i->k->k.needs_whiteout = false;
 
-       did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED)
-               ? btree_insert_key_leaf(trans, i->iter, i->k)
-               : bch2_btree_insert_key_cached(trans, i->iter, i->k);
+       did_work = !i->cached
+               ? btree_insert_key_leaf(trans, i)
+               : bch2_btree_insert_key_cached(trans, i->path, i->k);
        if (!did_work)
                return;
 
@@ -356,11 +372,12 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
        trans_for_each_update(trans, i) {
                /*
                 * XXX: synchronization of cached update triggers with gc
+                * XXX: synchronization of interior node updates with gc
                 */
-               BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED);
+               BUG_ON(i->cached || i->level);
 
-               if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
-                       bch2_mark_update(trans, i->iter, i->k,
+               if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b)))
+                       bch2_mark_update(trans, i->path, i->k,
                                         i->flags|BTREE_TRIGGER_GC);
        }
 }
@@ -405,9 +422,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                        u64s = 0;
 
                u64s += i->k->k.u64s;
-               ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED
-                       ? btree_key_can_insert(trans, i->iter, u64s)
-                       : btree_key_can_insert_cached(trans, i->iter, u64s);
+               ret = !i->cached
+                       ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
+                       : btree_key_can_insert_cached(trans, i->path, u64s);
                if (ret) {
                        *stopped_at = i;
                        return ret;
@@ -466,8 +483,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
 
        trans_for_each_update(trans, i)
                if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
-                       bch2_mark_update(trans, i->iter, i->k,
-                                        i->flags);
+                       bch2_mark_update(trans, i->path, i->k, i->flags);
 
        if (marking && trans->fs_usage_deltas)
                bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas);
@@ -485,42 +501,96 @@ err:
        return ret;
 }
 
-static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter)
+static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path)
 {
-       struct btree_insert_entry *i;
-       struct btree *b = iter_l(iter)->b;
-       struct bkey_s_c old;
-       int u64s_delta = 0;
-       int ret;
+       struct btree *b = path_l(path)->b;
 
-       /*
-        * Inserting directly into interior nodes is an uncommon operation with
-        * various weird edge cases: also, a lot of things about
-        * BTREE_ITER_NODES iters need to be audited
-        */
-       if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS))
-               return 0;
+       do {
+               if (path->nodes_locked &&
+                   path->nodes_locked != path->nodes_intent_locked)
+                       BUG_ON(!bch2_btree_path_upgrade(trans, path, path->level + 1));
+       } while ((path = prev_btree_path(trans, path)) &&
+                path_l(path)->b == b);
+}
+
+/*
+ * Check for nodes that we have both read and intent locks on, and upgrade the
+ * readers to intent:
+ */
+static inline void normalize_read_intent_locks(struct btree_trans *trans)
+{
+       struct btree_path *path;
+       unsigned i, nr_read = 0, nr_intent = 0;
+
+       trans_for_each_path_inorder(trans, path, i) {
+               struct btree_path *next = i + 1 < trans->nr_sorted
+                       ? trans->paths + trans->sorted[i + 1]
+                       : NULL;
+
+               if (path->nodes_locked) {
+                       if (path->nodes_intent_locked)
+                               nr_intent++;
+                       else
+                               nr_read++;
+               }
+
+               if (!next || path_l(path)->b != path_l(next)->b) {
+                       if (nr_read && nr_intent)
+                               upgrade_readers(trans, path);
+
+                       nr_read = nr_intent = 0;
+               }
+       }
+
+       bch2_trans_verify_locks(trans);
+}
+
+static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos)
+{
+       struct btree_path *path;
+       unsigned i;
 
-       BUG_ON(iter->level);
+       trans_for_each_path_inorder(trans, path, i) {
+               //if (path == pos)
+               //      break;
+
+               if (path->nodes_locked != path->nodes_intent_locked)
+                       return true;
+       }
+
+       return false;
+}
+
+static inline int trans_lock_write(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
 
        trans_for_each_update(trans, i) {
-               if (iter_l(i->iter)->b != b)
+               if (same_leaf_as_prev(trans, i))
                        continue;
 
-               old = bch2_btree_iter_peek_slot(i->iter);
-               ret = bkey_err(old);
-               if (ret)
-                       return ret;
+               if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
+                       if (have_conflicting_read_lock(trans, i->path))
+                               goto fail;
 
-               u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
-               u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+                       __btree_node_lock_type(trans->c, insert_l(i)->b,
+                                              SIX_LOCK_write);
+               }
+
+               bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
        }
 
-       if (u64s_delta > 0)
-               return 0;
+       return 0;
+fail:
+       while (--i >= trans->updates) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
+
+               bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
+       }
 
-       return bch2_foreground_maybe_merge(trans, iter,
-                               iter->level, trans->flags);
+       trace_trans_restart_would_deadlock_write(trans->ip);
+       return btree_trans_restart(trans);
 }
 
 /*
@@ -532,29 +602,55 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
-       struct btree_iter *iter;
-       int ret;
+       struct bkey_s_c old;
+       int ret, u64s_delta = 0;
 
        trans_for_each_update(trans, i) {
-               struct btree *b;
+               const char *invalid = bch2_bkey_invalid(c,
+                               bkey_i_to_s_c(i->k), i->bkey_type);
+               if (invalid) {
+                       char buf[200];
 
-               BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+                       bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n",
+                               buf, (void *) trans->ip,
+                               (void *) i->ip_allocated, invalid);
+                       bch2_fatal_error(c);
+                       return -EINVAL;
+               }
+               btree_insert_entry_checks(trans, i);
+       }
+
+       trans_for_each_update(trans, i) {
+               struct bkey u;
 
-               if (btree_iter_type(i->iter) == BTREE_ITER_CACHED)
+               /*
+                * peek_slot() doesn't yet work on iterators that point to
+                * interior nodes:
+                */
+               if (i->cached || i->level)
                        continue;
 
-               b = iter_l(i->iter)->b;
-               if (b->sib_u64s[0] < c->btree_foreground_merge_threshold ||
-                   b->sib_u64s[1] < c->btree_foreground_merge_threshold) {
-                       ret = maybe_do_btree_merge(trans, i->iter);
-                       if (unlikely(ret))
-                               return ret;
+               old = bch2_btree_path_peek_slot(i->path, &u);
+               ret = bkey_err(old);
+               if (unlikely(ret))
+                       return ret;
+
+               u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+               u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+
+               if (!same_leaf_as_next(trans, i)) {
+                       if (u64s_delta <= 0) {
+                               ret = bch2_foreground_maybe_merge(trans, i->path,
+                                                       i->level, trans->flags);
+                               if (unlikely(ret))
+                                       return ret;
+                       }
+
+                       u64s_delta = 0;
                }
        }
 
-       trans_for_each_update(trans, i)
-               BUG_ON(!btree_node_intent_locked(i->iter, i->level));
-
        ret = bch2_journal_preres_get(&c->journal,
                        &trans->journal_preres, trans->journal_preres_u64s,
                        JOURNAL_RES_GET_NONBLOCK|
@@ -566,52 +662,18 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        if (unlikely(ret))
                return ret;
 
-       /*
-        * Can't be holding any read locks when we go to take write locks:
-        * another thread could be holding an intent lock on the same node we
-        * have a read lock on, and it'll block trying to take a write lock
-        * (because we hold a read lock) and it could be blocking us by holding
-        * its own read lock (while we're trying to to take write locks).
-        *
-        * note - this must be done after bch2_trans_journal_preres_get_cold()
-        * or anything else that might call bch2_trans_relock(), since that
-        * would just retake the read locks:
-        */
-       trans_for_each_iter(trans, iter)
-               if (iter->nodes_locked != iter->nodes_intent_locked &&
-                   !bch2_btree_iter_upgrade(iter, 1)) {
-                       trace_trans_restart_upgrade(trans->ip, trace_ip,
-                                                   iter->btree_id,
-                                                   &iter->real_pos);
-                       trans->restarted = true;
-                       return -EINTR;
-               }
-
-       trans_for_each_update(trans, i) {
-               const char *invalid = bch2_bkey_invalid(c,
-                               bkey_i_to_s_c(i->k), i->bkey_type);
-               if (invalid) {
-                       char buf[200];
-
-                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
-                       bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid);
-                       bch2_fatal_error(c);
-               }
-               btree_insert_entry_checks(trans, i);
-       }
-       bch2_btree_trans_verify_locks(trans);
+       normalize_read_intent_locks(trans);
 
-       trans_for_each_update(trans, i)
-               if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_lock_for_insert(trans, i->iter,
-                                       iter_l(i->iter)->b);
+       ret = trans_lock_write(trans);
+       if (unlikely(ret))
+               return ret;
 
        ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
 
        trans_for_each_update(trans, i)
                if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
-                                                            i->iter);
+                       bch2_btree_node_unlock_write_inlined(trans, i->path,
+                                                       insert_l(i)->b);
 
        if (!ret && trans->journal_pin)
                bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
@@ -650,14 +712,13 @@ int bch2_trans_commit_error(struct btree_trans *trans,
 
        switch (ret) {
        case BTREE_INSERT_BTREE_NODE_FULL:
-               ret = bch2_btree_split_leaf(trans, i->iter, trans->flags);
+               ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
                if (!ret)
                        return 0;
 
                if (ret == -EINTR)
                        trace_trans_restart_btree_node_split(trans->ip, trace_ip,
-                                                            i->iter->btree_id,
-                                                            &i->iter->real_pos);
+                                               i->btree_id, &i->path->pos);
                break;
        case BTREE_INSERT_NEED_MARK_REPLICAS:
                bch2_trans_unlock(trans);
@@ -738,120 +799,9 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
        return 0;
 }
 
-static int extent_handle_overwrites(struct btree_trans *trans,
-                                   struct btree_insert_entry *i)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter *iter, *update_iter;
-       struct bpos start = bkey_start_pos(&i->k->k);
-       struct bkey_i *update;
-       struct bkey_s_c k;
-       int ret = 0, compressed_sectors;
-
-       iter = bch2_trans_get_iter(trans, i->btree_id, start,
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_WITH_UPDATES|
-                                  BTREE_ITER_NOT_EXTENTS);
-       k = bch2_btree_iter_peek(iter);
-       if (!k.k || (ret = bkey_err(k)))
-               goto out;
-
-       if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) {
-               update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       goto out;
-
-               bkey_reassemble(update, k);
-
-               if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) {
-                       update_iter = bch2_trans_copy_iter(trans, iter);
-                       ret = bch2_btree_delete_at(trans, update_iter, i->flags);
-                       bch2_trans_iter_put(trans, update_iter);
-
-                       if (ret)
-                               goto out;
-
-                       i->k = update;
-                       goto next;
-               }
-       }
-
-       if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k)))
-               goto next;
-
-       while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) {
-               /*
-                * If we're going to be splitting a compressed extent, note it
-                * so that __bch2_trans_commit() can increase our disk
-                * reservation:
-                */
-               if (bkey_cmp(bkey_start_pos(k.k), start) < 0 &&
-                   bkey_cmp(k.k->p, i->k->k.p) > 0 &&
-                   (compressed_sectors = bch2_bkey_sectors_compressed(k)))
-                       trans->extra_journal_res += compressed_sectors;
-
-               if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
-                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
-                       if ((ret = PTR_ERR_OR_ZERO(update)))
-                               goto out;
-
-                       bkey_reassemble(update, k);
-
-                       bch2_cut_back(start, update);
-
-                       update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p,
-                                                         BTREE_ITER_NOT_EXTENTS|
-                                                         BTREE_ITER_INTENT);
-                       ret = bch2_btree_iter_traverse(update_iter);
-                       if (ret) {
-                               bch2_trans_iter_put(trans, update_iter);
-                               goto out;
-                       }
-
-                       bch2_trans_update(trans, update_iter, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
-                                         i->flags);
-                       bch2_trans_iter_put(trans, update_iter);
-               }
-
-               if (bkey_cmp(k.k->p, i->k->k.p) <= 0) {
-                       update_iter = bch2_trans_copy_iter(trans, iter);
-                       ret = bch2_btree_delete_at(trans, update_iter,
-                                                  i->flags);
-                       bch2_trans_iter_put(trans, update_iter);
-
-                       if (ret)
-                               goto out;
-               }
-
-               if (bkey_cmp(k.k->p, i->k->k.p) > 0) {
-                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
-                       if ((ret = PTR_ERR_OR_ZERO(update)))
-                               goto out;
-
-                       bkey_reassemble(update, k);
-                       bch2_cut_front(i->k->k.p, update);
-
-                       bch2_trans_update(trans, iter, update, i->flags);
-                       goto out;
-               }
-next:
-               k = bch2_btree_iter_next(iter);
-               if (!k.k || (ret = bkey_err(k)))
-                       goto out;
-       }
-
-       bch2_bkey_merge(c, bkey_i_to_s(i->k), k);
-out:
-       bch2_trans_iter_put(trans, iter);
-
-       return ret;
-}
-
 int __bch2_trans_commit(struct btree_trans *trans)
 {
        struct btree_insert_entry *i = NULL;
-       struct btree_iter *iter;
        bool trans_trigger_run;
        unsigned u64s;
        int ret = 0;
@@ -876,8 +826,12 @@ int __bch2_trans_commit(struct btree_trans *trans)
        }
 
 #ifdef CONFIG_BCACHEFS_DEBUG
+       /*
+        * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
+        * from the key cache flush code:
+        */
        trans_for_each_update(trans, i)
-               if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
+               if (!i->cached &&
                    !(i->flags & BTREE_TRIGGER_NORUN))
                        bch2_btree_key_cache_verify_clean(trans,
                                        i->btree_id, i->k->k.p);
@@ -896,13 +850,12 @@ int __bch2_trans_commit(struct btree_trans *trans)
                                i->trans_triggers_run = true;
                                trans_trigger_run = true;
 
-                               ret = bch2_trans_mark_update(trans, i->iter,
+                               ret = bch2_trans_mark_update(trans, i->path,
                                                             i->k, i->flags);
                                if (unlikely(ret)) {
                                        if (ret == -EINTR)
                                                trace_trans_restart_mark(trans->ip, _RET_IP_,
-                                                                        i->iter->btree_id,
-                                                                        &i->iter->pos);
+                                                               i->btree_id, &i->path->pos);
                                        goto out;
                                }
                        }
@@ -910,21 +863,19 @@ int __bch2_trans_commit(struct btree_trans *trans)
        } while (trans_trigger_run);
 
        trans_for_each_update(trans, i) {
-               BUG_ON(!i->iter->should_be_locked);
+               BUG_ON(!i->path->should_be_locked);
 
-               if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) {
+               if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
                        trace_trans_restart_upgrade(trans->ip, _RET_IP_,
-                                                   i->iter->btree_id,
-                                                   &i->iter->pos);
-                       trans->restarted = true;
-                       ret = -EINTR;
+                                                   i->btree_id, &i->path->pos);
+                       ret = btree_trans_restart(trans);
                        goto out;
                }
 
-               BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+               BUG_ON(!btree_node_intent_locked(i->path, i->level));
 
                u64s = jset_u64s(i->k->k.u64s);
-               if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
+               if (i->cached &&
                    likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
                        trans->journal_preres_u64s += u64s;
                trans->journal_u64s += u64s;
@@ -945,21 +896,19 @@ retry:
        ret = do_bch2_trans_commit(trans, &i, _RET_IP_);
 
        /* make sure we didn't drop or screw up locks: */
-       bch2_btree_trans_verify_locks(trans);
+       bch2_trans_verify_locks(trans);
 
        if (ret)
                goto err;
-
-       trans_for_each_iter(trans, iter)
-               if (btree_iter_live(trans, iter) &&
-                   (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT))
-                       bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
 out:
        bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
 
        if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
                percpu_ref_put(&trans->c->writes);
 out_reset:
+       trans_for_each_update(trans, i)
+               bch2_path_put(trans, i->path, true);
+
        trans->extra_journal_res        = 0;
        trans->nr_updates               = 0;
        trans->hooks                    = NULL;
@@ -982,22 +931,154 @@ err:
        goto retry;
 }
 
+static int bch2_trans_update_extent(struct btree_trans *trans,
+                                   struct btree_iter *orig_iter,
+                                   struct bkey_i *insert,
+                                   enum btree_update_flags flags)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter, update_iter;
+       struct bpos start = bkey_start_pos(&insert->k);
+       struct bkey_i *update;
+       struct bkey_s_c k;
+       enum btree_id btree_id = orig_iter->btree_id;
+       int ret = 0, compressed_sectors;
+
+       bch2_trans_iter_init(trans, &iter, btree_id, start,
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES|
+                            BTREE_ITER_NOT_EXTENTS);
+       k = bch2_btree_iter_peek(&iter);
+       if ((ret = bkey_err(k)))
+               goto err;
+       if (!k.k)
+               goto out;
+
+       if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+               update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       goto err;
+
+               bkey_reassemble(update, k);
+
+               if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) {
+                       ret = bch2_btree_delete_at(trans, &iter, flags);
+                       if (ret)
+                               goto err;
+
+                       insert = update;
+                       goto next;
+               }
+       }
+
+       if (!bkey_cmp(k.k->p, bkey_start_pos(&insert->k)))
+               goto next;
+
+       while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) {
+               /*
+                * If we're going to be splitting a compressed extent, note it
+                * so that __bch2_trans_commit() can increase our disk
+                * reservation:
+                */
+               if (bkey_cmp(bkey_start_pos(k.k), start) < 0 &&
+                   bkey_cmp(k.k->p, insert->k.p) > 0 &&
+                   (compressed_sectors = bch2_bkey_sectors_compressed(k)))
+                       trans->extra_journal_res += compressed_sectors;
+
+               if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
+                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+                       if ((ret = PTR_ERR_OR_ZERO(update)))
+                               goto err;
+
+                       bkey_reassemble(update, k);
+
+                       bch2_cut_back(start, update);
+
+                       bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_INTENT);
+                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
+                               bch2_trans_update(trans, &update_iter, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                                 flags);
+                       bch2_trans_iter_exit(trans, &update_iter);
+                       if (ret)
+                               goto err;
+               }
+
+               if (bkey_cmp(k.k->p, insert->k.p) <= 0) {
+                       ret = bch2_btree_delete_at(trans, &iter, flags);
+                       if (ret)
+                               goto err;
+               }
+
+               if (bkey_cmp(k.k->p, insert->k.p) > 0) {
+                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+                       if ((ret = PTR_ERR_OR_ZERO(update)))
+                               goto err;
+
+                       bkey_reassemble(update, k);
+                       bch2_cut_front(insert->k.p, update);
+
+                       ret = bch2_trans_update(trans, &iter, update, flags);
+                       if (ret)
+                               goto err;
+
+                       goto out;
+               }
+next:
+               k = bch2_btree_iter_next(&iter);
+               if ((ret = bkey_err(k)))
+                       goto err;
+               if (!k.k)
+                       goto out;
+       }
+
+       bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+out:
+       if (!bkey_deleted(&insert->k)) {
+               /*
+                * Rewinding iterators is expensive: get a new one and the one
+                * that points to the start of insert will be cloned from:
+                */
+               bch2_trans_iter_exit(trans, &iter);
+               bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p,
+                                    BTREE_ITER_NOT_EXTENTS|
+                                    BTREE_ITER_INTENT);
+               ret   = bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(trans, &iter, insert, flags);
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
 int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                      struct bkey_i *k, enum btree_update_flags flags)
 {
-       struct btree_insert_entry *i, n = (struct btree_insert_entry) {
+       struct btree_insert_entry *i, n;
+
+       BUG_ON(!iter->path->should_be_locked);
+
+       if (iter->flags & BTREE_ITER_IS_EXTENTS)
+               return bch2_trans_update_extent(trans, iter, k, flags);
+
+       BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+       BUG_ON(bpos_cmp(k->k.p, iter->path->pos));
+
+       n = (struct btree_insert_entry) {
                .flags          = flags,
-               .bkey_type      = __btree_node_type(iter->level, iter->btree_id),
+               .bkey_type      = __btree_node_type(iter->path->level, iter->btree_id),
                .btree_id       = iter->btree_id,
-               .level          = iter->level,
-               .iter           = iter,
-               .k              = k
+               .level          = iter->path->level,
+               .cached         = iter->flags & BTREE_ITER_CACHED,
+               .path           = iter->path,
+               .k              = k,
+               .ip_allocated   = _RET_IP_,
        };
-       bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0;
-       int ret = 0;
 
-       BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
-       BUG_ON(!iter->should_be_locked);
+       __btree_path_get(n.path, true);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
        trans_for_each_update(trans, i)
@@ -1005,31 +1086,6 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                       btree_insert_entry_cmp(i - 1, i) >= 0);
 #endif
 
-       if (is_extent) {
-               ret = extent_handle_overwrites(trans, &n);
-               if (ret)
-                       return ret;
-
-               iter->pos_after_commit = k->k.p;
-               iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
-
-               if (bkey_deleted(&n.k->k))
-                       return 0;
-
-               n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p,
-                                            BTREE_ITER_INTENT|
-                                            BTREE_ITER_NOT_EXTENTS);
-               ret = bch2_btree_iter_traverse(n.iter);
-               bch2_trans_iter_put(trans, n.iter);
-
-               if (ret)
-                       return ret;
-       }
-
-       BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS);
-
-       n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
-
        /*
         * Pending updates are kept sorted: first, find position of new update,
         * then delete/trim any updates the new update overwrites:
@@ -1047,11 +1103,13 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                 * not the key cache, which helps with cache coherency issues in
                 * other areas:
                 */
-               if (btree_iter_type(n.iter) == BTREE_ITER_CACHED &&
-                   btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
+               if (n.cached && !i->cached) {
                        i->k = n.k;
                        i->flags = n.flags;
+
+                       __btree_path_get(n.path, false);
                } else {
+                       bch2_path_put(trans, i->path, true);
                        *i = n;
                }
        } else
@@ -1071,15 +1129,14 @@ void bch2_trans_commit_hook(struct btree_trans *trans,
 int __bch2_btree_insert(struct btree_trans *trans,
                        enum btree_id id, struct bkey_i *k)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
-                                  BTREE_ITER_INTENT);
-
-       ret   = bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(trans, iter, k, 0);
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, 0);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1117,16 +1174,16 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
                                  struct bpos start, struct bpos end,
                                  u64 *journal_seq)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
+       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
 retry:
        while ((bch2_trans_begin(trans),
-              (k = bch2_btree_iter_peek(iter)).k) &&
+              (k = bch2_btree_iter_peek(&iter)).k) &&
               !(ret = bkey_err(k)) &&
-              bkey_cmp(iter->pos, end) < 0) {
+              bkey_cmp(iter.pos, end) < 0) {
                struct bkey_i delete;
 
                bkey_init(&delete.k);
@@ -1145,9 +1202,9 @@ retry:
                 * (bch2_btree_iter_peek() does guarantee that iter.pos >=
                 * bkey_start_pos(k.k)).
                 */
-               delete.k.p = iter->pos;
+               delete.k.p = iter.pos;
 
-               if (btree_node_type_is_extents(iter->btree_id)) {
+               if (btree_node_type_is_extents(id)) {
                        unsigned max_sectors =
                                KEY_SIZE_MAX & (~0 << trans->c->block_bits);
 
@@ -1155,12 +1212,12 @@ retry:
                        bch2_key_resize(&delete.k, max_sectors);
                        bch2_cut_back(end, &delete);
 
-                       ret = bch2_extent_trim_atomic(&delete, iter);
+                       ret = bch2_extent_trim_atomic(trans, &iter, &delete);
                        if (ret)
                                break;
                }
 
-               ret   = bch2_trans_update(trans, iter, &delete, 0) ?:
+               ret   = bch2_trans_update(trans, &iter, &delete, 0) ?:
                        bch2_trans_commit(trans, NULL, journal_seq,
                                        BTREE_INSERT_NOFAIL);
                if (ret)
@@ -1174,7 +1231,7 @@ retry:
                goto retry;
        }
 
-       bch2_trans_iter_free(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
index 76945e50e4b15a4f660ed3080db79fe7d822722c..df12416eff8ecb041c67b8e64574a4fc901d8264 100644 (file)
@@ -662,8 +662,11 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 
 static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
 {
-       return p.crc.compression_type
-               ? DIV_ROUND_UP(sectors * p.crc.compressed_size,
+       EBUG_ON(sectors < 0);
+
+       return p.crc.compression_type &&
+               p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible
+               ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
                               p.crc.uncompressed_size)
                : sectors;
 }
@@ -925,9 +928,6 @@ static int bch2_mark_extent(struct bch_fs *c,
        BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
               (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
 
-       if (flags & BTREE_TRIGGER_OVERWRITE)
-               sectors = -sectors;
-
        r.e.data_type   = data_type;
        r.e.nr_devs     = 0;
        r.e.nr_required = 1;
@@ -935,6 +935,9 @@ static int bch2_mark_extent(struct bch_fs *c,
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                s64 disk_sectors = ptr_disk_sectors(sectors, p);
 
+               if (flags & BTREE_TRIGGER_OVERWRITE)
+                       disk_sectors = -disk_sectors;
+
                ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type,
                                        journal_seq, flags);
                if (ret < 0)
@@ -1215,38 +1218,23 @@ int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags)
        return ret;
 }
 
-int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter,
+int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
                     struct bkey_i *new, unsigned flags)
 {
        struct bch_fs           *c = trans->c;
        struct bkey             _deleted = KEY(0, 0, 0);
        struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
        struct bkey_s_c         old;
-       int iter_flags, ret;
+       struct bkey             unpacked;
+       int ret;
 
        if (unlikely(flags & BTREE_TRIGGER_NORUN))
                return 0;
 
-       if (!btree_node_type_needs_gc(iter->btree_id))
+       if (!btree_node_type_needs_gc(path->btree_id))
                return 0;
 
-       if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) {
-               iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES;
-               iter->flags &= ~BTREE_ITER_WITH_UPDATES;
-
-               old = bch2_btree_iter_peek_slot(iter);
-               iter->flags |= iter_flags;
-
-               ret = bkey_err(old);
-               if (ret)
-                       return ret;
-       } else {
-               /*
-                * If BTREE_ITER_CACHED_NOFILL was used, we better not be
-                * running triggers that do anything on removal (alloc btree):
-                */
-               old = deleted;
-       }
+       old = bch2_btree_path_peek_slot(path, &unpacked);
 
        if (old.k->type == new->k.type &&
            ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
@@ -1283,23 +1271,14 @@ void fs_usage_apply_warn(struct btree_trans *trans,
                pr_err("%s", buf);
                pr_err("overlapping with");
 
-               if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
-                       struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
-                       struct bkey_s_c k;
-                       int ret;
-
-                       for_each_btree_key_continue(copy, 0, k, ret) {
-                               if (btree_node_type_is_extents(i->iter->btree_id)
-                                   ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
-                                   : bkey_cmp(i->k->k.p, k.k->p))
-                                       break;
+               if (!i->cached) {
+                       struct bkey u;
+                       struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
 
-                               bch2_bkey_val_to_text(&PBUF(buf), c, k);
-                               pr_err("%s", buf);
-                       }
-                       bch2_trans_iter_put(trans, copy);
+                       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+                       pr_err("%s", buf);
                } else {
-                       struct bkey_cached *ck = (void *) i->iter->l[0].b;
+                       struct bkey_cached *ck = (void *) i->path->l[0].b;
 
                        if (ck->valid) {
                                bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
@@ -1378,31 +1357,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
 
 /* trans_mark: */
 
-static struct btree_iter *trans_get_update(struct btree_trans *trans,
-                           enum btree_id btree_id, struct bpos pos,
-                           struct bkey_s_c *k)
-{
-       struct btree_insert_entry *i;
-
-       trans_for_each_update(trans, i)
-               if (i->iter->btree_id == btree_id &&
-                   (btree_node_type_is_extents(btree_id)
-                    ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
-                      bkey_cmp(pos, i->k->k.p) < 0
-                    : !bkey_cmp(pos, i->iter->pos))) {
-                       *k = bkey_i_to_s_c(i->k);
-
-                       /* ugly hack.. */
-                       BUG_ON(btree_iter_live(trans, i->iter));
-                       trans->iters_live |= 1ULL << i->iter->idx;
-                       return i->iter;
-               }
-
-       return NULL;
-}
-
 static struct bkey_alloc_buf *
-bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
                              const struct bch_extent_ptr *ptr,
                              struct bkey_alloc_unpacked *u)
 {
@@ -1410,36 +1366,33 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_it
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
        struct bucket *g;
-       struct btree_iter *iter;
-       struct bkey_s_c k;
        struct bkey_alloc_buf *a;
+       struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos);
        int ret;
 
        a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
        if (IS_ERR(a))
                return a;
 
-       iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k);
-       if (iter) {
-               *u = bch2_alloc_unpack(k);
-       } else {
-               iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos,
-                                          BTREE_ITER_CACHED|
-                                          BTREE_ITER_CACHED_NOFILL|
-                                          BTREE_ITER_INTENT);
-               ret = bch2_btree_iter_traverse(iter);
-               if (ret) {
-                       bch2_trans_iter_put(trans, iter);
-                       return ERR_PTR(ret);
-               }
+       bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_CACHED_NOFILL|
+                            BTREE_ITER_INTENT);
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret) {
+               bch2_trans_iter_exit(trans, iter);
+               return ERR_PTR(ret);
+       }
 
+       if (update && !bpos_cmp(update->k.p, pos)) {
+               *u = bch2_alloc_unpack(bkey_i_to_s_c(update));
+       } else {
                percpu_down_read(&c->mark_lock);
                g = bucket(ca, pos.offset);
                *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
                percpu_up_read(&c->mark_lock);
        }
 
-       *_iter = iter;
        return a;
 }
 
@@ -1448,7 +1401,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
                        s64 sectors, enum bch_data_type data_type)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_alloc_unpacked u;
        struct bkey_alloc_buf *a;
        int ret;
@@ -1463,9 +1416,9 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
                goto out;
 
        bch2_alloc_pack(c, a, u);
-       bch2_trans_update(trans, iter, &a->k, 0);
+       bch2_trans_update(trans, &iter, &a->k, 0);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1474,16 +1427,16 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                        s64 sectors, enum bch_data_type data_type)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_stripe *s;
        struct bch_replicas_padded r;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, POS(0, p.ec.idx),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_WITH_UPDATES);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -1514,13 +1467,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
        stripe_blockcount_set(&s->v, p.ec.block,
                stripe_blockcount_get(&s->v, p.ec.block) +
                sectors);
-       bch2_trans_update(trans, iter, &s->k_i, 0);
+       bch2_trans_update(trans, &iter, &s->k_i, 0);
 
        bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
        r.e.data_type = data_type;
        update_replicas_list(trans, &r.e, sectors);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1545,9 +1498,6 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
        BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
               (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
 
-       if (flags & BTREE_TRIGGER_OVERWRITE)
-               sectors = -sectors;
-
        r.e.data_type   = data_type;
        r.e.nr_devs     = 0;
        r.e.nr_required = 1;
@@ -1555,6 +1505,9 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                s64 disk_sectors = ptr_disk_sectors(sectors, p);
 
+               if (flags & BTREE_TRIGGER_OVERWRITE)
+                       disk_sectors = -disk_sectors;
+
                ret = bch2_trans_mark_pointer(trans, k, p,
                                        disk_sectors, data_type);
                if (ret < 0)
@@ -1592,7 +1545,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
        struct bkey_alloc_buf *a;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_alloc_unpacked u;
        bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
        int ret = 0;
@@ -1616,7 +1569,7 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
        if (!deleting) {
                if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
                                "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
-                               iter->pos.inode, iter->pos.offset, u.gen,
+                               iter.pos.inode, iter.pos.offset, u.gen,
                                u.stripe, s.k->p.offset)) {
                        ret = -EIO;
                        goto err;
@@ -1630,9 +1583,9 @@ static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
        }
 
        bch2_alloc_pack(c, a, u);
-       bch2_trans_update(trans, iter, &a->k, 0);
+       bch2_trans_update(trans, &iter, &a->k, 0);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1737,17 +1690,17 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
                        u64 idx, unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i *n;
        __le64 *refcount;
        int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
        s64 ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_WITH_UPDATES);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, idx),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -1777,14 +1730,14 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
                set_bkey_val_u64s(&n->k, 0);
        }
 
-       bch2_btree_iter_set_pos_to_extent_start(iter);
-       ret = bch2_trans_update(trans, iter, n, 0);
+       bch2_btree_iter_set_pos_to_extent_start(&iter);
+       ret = bch2_trans_update(trans, &iter, n, 0);
        if (ret)
                goto err;
 
        ret = k.k->p.offset - idx;
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1836,39 +1789,23 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
 }
 
 int bch2_trans_mark_update(struct btree_trans *trans,
-                          struct btree_iter *iter,
+                          struct btree_path *path,
                           struct bkey_i *new,
                           unsigned flags)
 {
        struct bkey             _deleted = KEY(0, 0, 0);
        struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
        struct bkey_s_c         old;
-       int iter_flags, ret;
+       struct bkey             unpacked;
+       int ret;
 
        if (unlikely(flags & BTREE_TRIGGER_NORUN))
                return 0;
 
-       if (!btree_node_type_needs_gc(iter->btree_id))
+       if (!btree_node_type_needs_gc(path->btree_id))
                return 0;
 
-
-       if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) {
-               iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES;
-               iter->flags &= ~BTREE_ITER_WITH_UPDATES;
-
-               old = bch2_btree_iter_peek_slot(iter);
-               iter->flags |= iter_flags;
-
-               ret = bkey_err(old);
-               if (ret)
-                       return ret;
-       } else {
-               /*
-                * If BTREE_ITER_CACHED_NOFILL was used, we better not be
-                * running triggers that do anything on removal (alloc btree):
-                */
-               old = deleted;
-       }
+       old = bch2_btree_path_peek_slot(path, &unpacked);
 
        if (old.k->type == new->k.type &&
            ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
@@ -1890,7 +1827,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                                    unsigned sectors)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_alloc_unpacked u;
        struct bkey_alloc_buf *a;
        struct bch_extent_ptr ptr = {
@@ -1913,7 +1850,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
-                       iter->pos.inode, iter->pos.offset, u.gen,
+                       iter.pos.inode, iter.pos.offset, u.gen,
                        bch2_data_types[u.data_type],
                        bch2_data_types[type],
                        bch2_data_types[type]);
@@ -1925,9 +1862,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
        u.dirty_sectors = sectors;
 
        bch2_alloc_pack(c, a, u);
-       bch2_trans_update(trans, iter, &a->k, 0);
+       bch2_trans_update(trans, &iter, &a->k, 0);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
index 0f544b62fc908f3fea5fd54c866741277d89dbf6..61c2c0f9ff8f1d2e9cbd337539f88c45d4f7d292 100644 (file)
@@ -228,13 +228,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
 
 int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned);
 
-int bch2_mark_update(struct btree_trans *, struct btree_iter *,
+int bch2_mark_update(struct btree_trans *, struct btree_path *,
                     struct bkey_i *, unsigned);
 
 int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
                        struct bkey_s_c, unsigned);
-int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
-                          struct bkey_i *insert, unsigned);
+int bch2_trans_mark_update(struct btree_trans *, struct btree_path *,
+                          struct bkey_i *, unsigned);
 void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
 
 int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
index b0a8eb58a7a755c33097a54e252ec92e9b9d0579..9f14bf4cb49ae7e902fc2b4eb4e8516da4bad075 100644 (file)
@@ -243,7 +243,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 {
        struct dump_iter *i = file->private_data;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int err;
 
@@ -260,10 +260,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, i->id, i->from,
-                                  BTREE_ITER_PREFETCH|
-                                  BTREE_ITER_ALL_SNAPSHOTS);
-       k = bch2_btree_iter_peek(iter);
+       bch2_trans_iter_init(&trans, &iter, i->id, i->from,
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       k = bch2_btree_iter_peek(&iter);
 
        while (k.k && !(err = bkey_err(k))) {
                bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
@@ -272,8 +272,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
                i->buf[i->bytes] = '\n';
                i->bytes++;
 
-               k = bch2_btree_iter_next(iter);
-               i->from = iter->pos;
+               k = bch2_btree_iter_next(&iter);
+               i->from = iter.pos;
 
                err = flush_buf(i);
                if (err)
@@ -282,7 +282,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
                if (!i->size)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 
@@ -301,7 +301,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
 {
        struct dump_iter *i = file->private_data;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        int err;
 
@@ -336,7 +336,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
                if (!i->size)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 
@@ -355,7 +355,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
 {
        struct dump_iter *i = file->private_data;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct btree *prev_node = NULL;
        int err;
@@ -373,11 +373,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, i->id, i->from, BTREE_ITER_PREFETCH);
 
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(err = bkey_err(k))) {
-               struct btree_iter_level *l = &iter->l[0];
+               struct btree_path_level *l = &iter.path->l[0];
                struct bkey_packed *_k =
                        bch2_btree_node_iter_peek(&l->iter, l->b);
 
@@ -396,8 +396,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
                if (err)
                        break;
 
-               bch2_btree_iter_advance(iter);
-               i->from = iter->pos;
+               bch2_btree_iter_advance(&iter);
+               i->from = iter.pos;
 
                err = flush_buf(i);
                if (err)
index 02b29681f695e09c30dcf8d3cc50fbff1c938ce9..1d510f7728b6853bb89f6dd1bc60e25352f6273c 100644 (file)
@@ -183,7 +183,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
                       const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
                       enum bch_rename_mode mode)
 {
-       struct btree_iter *src_iter = NULL, *dst_iter = NULL;
+       struct btree_iter src_iter = { NULL };
+       struct btree_iter dst_iter = { NULL };
        struct bkey_s_c old_src, old_dst;
        struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
        struct bpos dst_pos =
@@ -199,17 +200,16 @@ int bch2_dirent_rename(struct btree_trans *trans,
         * the target already exists - we're relying on the VFS
         * to do that check for us for correctness:
         */
-       dst_iter = mode == BCH_RENAME
-               ? bch2_hash_hole(trans, bch2_dirent_hash_desc,
+       ret = mode == BCH_RENAME
+               ? bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
                                 dst_hash, dst_dir, dst_name)
-               : bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+               : bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
                                   dst_hash, dst_dir, dst_name,
                                   BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(dst_iter);
        if (ret)
                goto out;
 
-       old_dst = bch2_btree_iter_peek_slot(dst_iter);
+       old_dst = bch2_btree_iter_peek_slot(&dst_iter);
        ret = bkey_err(old_dst);
        if (ret)
                goto out;
@@ -217,17 +217,16 @@ int bch2_dirent_rename(struct btree_trans *trans,
        if (mode != BCH_RENAME)
                *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
        if (mode != BCH_RENAME_EXCHANGE)
-               *src_offset = dst_iter->pos.offset;
+               *src_offset = dst_iter.pos.offset;
 
        /* Lookup src: */
-       src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
-                                   src_hash, src_dir, src_name,
-                                   BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(src_iter);
+       ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+                              src_hash, src_dir, src_name,
+                              BTREE_ITER_INTENT);
        if (ret)
                goto out;
 
-       old_src = bch2_btree_iter_peek_slot(src_iter);
+       old_src = bch2_btree_iter_peek_slot(&src_iter);
        ret = bkey_err(old_src);
        if (ret)
                goto out;
@@ -241,7 +240,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
                goto out;
 
        dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-       new_dst->k.p = dst_iter->pos;
+       new_dst->k.p = dst_iter.pos;
 
        /* Create new src key: */
        if (mode == BCH_RENAME_EXCHANGE) {
@@ -251,7 +250,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
                        goto out;
 
                dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
-               new_src->k.p = src_iter->pos;
+               new_src->k.p = src_iter.pos;
        } else {
                new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
                ret = PTR_ERR_OR_ZERO(new_src);
@@ -259,10 +258,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
                        goto out;
 
                bkey_init(&new_src->k);
-               new_src->k.p = src_iter->pos;
+               new_src->k.p = src_iter.pos;
 
-               if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
-                   bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
+               if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+                   bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
                        /*
                         * We have a hash collision for the new dst key,
                         * and new_src - the key we're deleting - is between
@@ -275,8 +274,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
                                 * If we're not overwriting, we can just insert
                                 * new_dst at the src position:
                                 */
-                               new_dst->k.p = src_iter->pos;
-                               bch2_trans_update(trans, src_iter,
+                               new_dst->k.p = src_iter.pos;
+                               bch2_trans_update(trans, &src_iter,
                                                  &new_dst->k_i, 0);
                                goto out_set_offset;
                        } else {
@@ -290,7 +289,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
                } else {
                        /* Check if we need a whiteout to delete src: */
                        ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
-                                                      src_hash, src_iter);
+                                                      src_hash, &src_iter);
                        if (ret < 0)
                                goto out;
 
@@ -299,15 +298,15 @@ int bch2_dirent_rename(struct btree_trans *trans,
                }
        }
 
-       bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
-       bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
+       bch2_trans_update(trans, &src_iter, &new_src->k_i, 0);
+       bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
 out_set_offset:
        if (mode == BCH_RENAME_EXCHANGE)
                *src_offset = new_src->k.p.offset;
        *dst_offset = new_dst->k.p.offset;
 out:
-       bch2_trans_iter_put(trans, src_iter);
-       bch2_trans_iter_put(trans, dst_iter);
+       bch2_trans_iter_exit(trans, &src_iter);
+       bch2_trans_iter_exit(trans, &dst_iter);
        return ret;
 }
 
@@ -319,12 +318,13 @@ int bch2_dirent_delete_at(struct btree_trans *trans,
                                   hash_info, iter);
 }
 
-struct btree_iter *
-__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum,
-                          const struct bch_hash_info *hash_info,
-                          const struct qstr *name, unsigned flags)
+int __bch2_dirent_lookup_trans(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              u64 dir_inum,
+                              const struct bch_hash_info *hash_info,
+                              const struct qstr *name, unsigned flags)
 {
-       return bch2_hash_lookup(trans, bch2_dirent_hash_desc,
+       return bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
                                hash_info, dir_inum, name, flags);
 }
 
@@ -333,26 +333,25 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
                       const struct qstr *name)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        u64 inum = 0;
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = __bch2_dirent_lookup_trans(&trans, dir_inum,
-                                         hash_info, name, 0);
-       ret = PTR_ERR_OR_ZERO(iter);
+       ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum,
+                                        hash_info, name, 0);
        if (ret)
                goto out;
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto out;
 
        inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 out:
        BUG_ON(ret == -EINTR);
        bch2_trans_exit(&trans);
@@ -361,7 +360,7 @@ out:
 
 int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
@@ -375,7 +374,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
                        break;
                }
        }
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        return ret;
 }
@@ -383,7 +382,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
 int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_dirent dirent;
        int ret;
@@ -412,7 +411,7 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
                        break;
                ctx->pos = dirent.k->p.offset + 1;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
 
index e1d8ce377d43755cd5584edf1afa05d0ba65495e..c14f6029e1c98b7064440eab94564079f00a11bf 100644 (file)
@@ -50,8 +50,7 @@ int bch2_dirent_rename(struct btree_trans *,
                       const struct qstr *, u64 *, u64 *,
                       enum bch_rename_mode);
 
-struct btree_iter *
-__bch2_dirent_lookup_trans(struct btree_trans *, u64,
+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64,
                           const struct bch_hash_info *,
                           const struct qstr *, unsigned);
 u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
index 328e0429b5d77329d04ed061ebade45536648fcd..f0bdbdb2673d899956c1613c4fb4ffd9afe7db70 100644 (file)
@@ -429,13 +429,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
 static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes,
+                            POS(0, idx), BTREE_ITER_SLOTS);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -445,6 +446,7 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip
        }
        bkey_reassemble(&stripe->key.k_i, k);
 err:
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -552,19 +554,19 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
        return 0;
 }
 
-static int ec_stripe_mem_alloc(struct bch_fs *c,
+static int ec_stripe_mem_alloc(struct btree_trans *trans,
                               struct btree_iter *iter)
 {
        size_t idx = iter->pos.offset;
        int ret = 0;
 
-       if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
+       if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN))
                return ret;
 
-       bch2_trans_unlock(iter->trans);
+       bch2_trans_unlock(trans);
        ret = -EINTR;
 
-       if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
+       if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL))
                return ret;
 
        return -ENOMEM;
@@ -704,7 +706,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c,
                                 struct disk_reservation *res)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bpos min_pos = POS(0, 1);
        struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
@@ -719,7 +721,7 @@ retry:
                if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
                        if (start_pos.offset) {
                                start_pos = min_pos;
-                               bch2_btree_iter_set_pos(iter, start_pos);
+                               bch2_btree_iter_set_pos(&iter, start_pos);
                                continue;
                        }
 
@@ -733,19 +735,19 @@ retry:
 
        goto err;
 found_slot:
-       start_pos = iter->pos;
+       start_pos = iter.pos;
 
-       ret = ec_stripe_mem_alloc(c, iter);
+       ret = ec_stripe_mem_alloc(&trans, &iter);
        if (ret)
                goto err;
 
-       stripe->k.p = iter->pos;
+       stripe->k.p = iter.pos;
 
-       ret   = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?:
+       ret   = bch2_trans_update(&trans, &iter, &stripe->k_i, 0) ?:
                bch2_trans_commit(&trans, res, NULL,
                                BTREE_INSERT_NOFAIL);
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        if (ret == -EINTR)
                goto retry;
@@ -759,15 +761,15 @@ err:
 static int ec_stripe_bkey_update(struct btree_trans *trans,
                                 struct bkey_i_stripe *new)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        const struct bch_stripe *existing;
        unsigned i;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_stripes,
-                                  new->k.p, BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes,
+                            new->k.p, BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -790,9 +792,9 @@ static int ec_stripe_bkey_update(struct btree_trans *trans,
                stripe_blockcount_set(&new->v, i,
                        stripe_blockcount_get(existing, i));
 
-       ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+       ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -820,10 +822,11 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
                                 struct bkey *pos)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_extent e;
        struct bkey_buf sk;
+       struct bpos next_pos;
        int ret = 0, dev, block;
 
        bch2_bkey_buf_init(&sk);
@@ -831,23 +834,23 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
 
        /* XXX this doesn't support the reflink btree */
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  bkey_start_pos(pos),
-                                  BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            bkey_start_pos(pos),
+                            BTREE_ITER_INTENT);
 
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k)) &&
               bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
                struct bch_extent_ptr *ptr, *ec_ptr = NULL;
 
                if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
                block = bkey_matches_stripe(&s->key.v, k);
                if (block < 0) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
@@ -862,17 +865,21 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
 
                extent_stripe_ptr_add(e, s, ec_ptr, block);
 
-               bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
-               ret   = bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(&trans, iter, sk.k, 0) ?:
+               bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
+               next_pos = sk.k->k.p;
+
+               ret   = bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(&trans, &iter, sk.k, 0) ?:
                        bch2_trans_commit(&trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL);
+               if (!ret)
+                       bch2_btree_iter_set_pos(&iter, next_pos);
                if (ret == -EINTR)
                        ret = 0;
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
@@ -1593,7 +1600,7 @@ write:
 int bch2_stripes_write(struct bch_fs *c, unsigned flags)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct genradix_iter giter;
        struct bkey_i_stripe *new_key;
        struct stripe *m;
@@ -1604,8 +1611,8 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN,
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS_MIN,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        genradix_for_each(&c->stripes[0], giter, m) {
                if (!m->alive)
@@ -1613,13 +1620,13 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
 
                ret = __bch2_trans_do(&trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL|flags,
-                       __bch2_stripe_write_key(&trans, iter, m,
+                       __bch2_stripe_write_key(&trans, &iter, m,
                                        giter.pos, new_key));
 
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 
@@ -1654,19 +1661,19 @@ int bch2_stripes_read(struct bch_fs *c)
 int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        size_t i, idx = 0;
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, POS(0, U64_MAX), 0);
 
-       k = bch2_btree_iter_prev(iter);
+       k = bch2_btree_iter_prev(&iter);
        if (!IS_ERR_OR_NULL(k.k))
                idx = k.k->p.offset + 1;
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        ret = bch2_trans_exit(&trans);
        if (ret)
                return ret;
index 4a8dd085f7fb80b2650f5bab433cae077343d1e3..9d959b053defd6c23a43188325db20ad145c3ac3 100644 (file)
@@ -58,7 +58,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
                u64 idx = le64_to_cpu(p.v->idx);
                unsigned sectors = bpos_min(*end, p.k->p).offset -
                        bkey_start_offset(p.k);
-               struct btree_iter *iter;
+               struct btree_iter iter;
                struct bkey_s_c r_k;
 
                for_each_btree_key(trans, iter,
@@ -83,8 +83,8 @@ static int count_iters_for_insert(struct btree_trans *trans,
                                break;
                        }
                }
+               bch2_trans_iter_exit(trans, &iter);
 
-               bch2_trans_iter_put(trans, iter);
                break;
        }
        }
@@ -94,12 +94,12 @@ static int count_iters_for_insert(struct btree_trans *trans,
 
 #define EXTENT_ITERS_MAX       (BTREE_ITER_MAX / 3)
 
-int bch2_extent_atomic_end(struct btree_iter *iter,
+int bch2_extent_atomic_end(struct btree_trans *trans,
+                          struct btree_iter *iter,
                           struct bkey_i *insert,
                           struct bpos *end)
 {
-       struct btree_trans *trans = iter->trans;
-       struct btree_iter *copy;
+       struct btree_iter copy;
        struct bkey_s_c k;
        unsigned nr_iters = 0;
        int ret;
@@ -118,7 +118,7 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
        if (ret < 0)
                return ret;
 
-       copy = bch2_trans_copy_iter(trans, iter);
+       bch2_trans_copy_iter(&copy, iter);
 
        for_each_btree_key_continue(copy, 0, k, ret) {
                unsigned offset = 0;
@@ -149,31 +149,21 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
                        break;
        }
 
-       bch2_trans_iter_put(trans, copy);
+       bch2_trans_iter_exit(trans, &copy);
        return ret < 0 ? ret : 0;
 }
 
-int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+                           struct btree_iter *iter,
+                           struct bkey_i *k)
 {
        struct bpos end;
        int ret;
 
-       ret = bch2_extent_atomic_end(iter, k, &end);
+       ret = bch2_extent_atomic_end(trans, iter, k, &end);
        if (ret)
                return ret;
 
        bch2_cut_back(end, k);
        return 0;
 }
-
-int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
-{
-       struct bpos end;
-       int ret;
-
-       ret = bch2_extent_atomic_end(iter, k, &end);
-       if (ret)
-               return ret;
-
-       return !bkey_cmp(end, k->k.p);
-}
index 2fa4602967e04f5cf02033a19d5d134ba90dcf8c..6f5cf449361a7f1aa6661086c43110c8b2e15455 100644 (file)
@@ -4,9 +4,9 @@
 
 #include "bcachefs.h"
 
-int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
-                          struct bpos *);
-int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+                          struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
+                           struct bkey_i *);
 
 #endif /* _BCACHEFS_EXTENT_UPDATE_H */
index 563e13057f5f2411cef336f4f5644becec9f058a..f66640c2a5edd73ad8c17059caed61df37a241b4 100644 (file)
@@ -616,7 +616,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
                                unsigned nr_replicas, bool compressed)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bpos end = pos;
        struct bkey_s_c k;
        bool ret = true;
@@ -637,7 +637,7 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
                        break;
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 
index 2189a11ccad8d42e89a3749f7b19836d76f9a053..6bc82559c9b17e64866a3f11a74d9e246320b011 100644 (file)
@@ -19,16 +19,15 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
                      struct posix_acl *acl)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *dir_iter = NULL;
-       struct btree_iter *inode_iter = NULL;
+       struct btree_iter dir_iter = { NULL };
+       struct btree_iter inode_iter = { NULL };
        struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
        u64 now = bch2_current_time(c);
        u64 cpu = raw_smp_processor_id();
        u64 dir_offset = 0;
        int ret;
 
-       dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(dir_iter);
+       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
@@ -37,8 +36,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
        if (!name)
                new_inode->bi_flags |= BCH_INODE_UNLINKED;
 
-       inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
+       ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu);
        if (ret)
                goto err;
 
@@ -63,7 +61,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
                if (S_ISDIR(new_inode->bi_mode))
                        dir_u->bi_nlink++;
 
-               ret = bch2_inode_write(trans, dir_iter, dir_u);
+               ret = bch2_inode_write(trans, &dir_iter, dir_u);
                if (ret)
                        goto err;
 
@@ -82,14 +80,14 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
        }
 
        /* XXX use bch2_btree_iter_set_snapshot() */
-       inode_iter->snapshot = U32_MAX;
-       bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
+       inode_iter.snapshot = U32_MAX;
+       bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
 
-       ret   = bch2_btree_iter_traverse(inode_iter) ?:
-               bch2_inode_write(trans, inode_iter, new_inode);
+       ret   = bch2_btree_iter_traverse(&inode_iter) ?:
+               bch2_inode_write(trans, &inode_iter, new_inode);
 err:
-       bch2_trans_iter_put(trans, inode_iter);
-       bch2_trans_iter_put(trans, dir_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
+       bch2_trans_iter_exit(trans, &dir_iter);
        return ret;
 }
 
@@ -98,22 +96,21 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
                    struct bch_inode_unpacked *inode_u, const struct qstr *name)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
+       struct btree_iter dir_iter = { NULL };
+       struct btree_iter inode_iter = { NULL };
        struct bch_hash_info dir_hash;
        u64 now = bch2_current_time(c);
        u64 dir_offset = 0;
        int ret;
 
-       inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
+       ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        inode_u->bi_ctime = now;
        bch2_inode_nlink_inc(inode_u);
 
-       dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
-       ret = PTR_ERR_OR_ZERO(dir_iter);
+       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
@@ -133,11 +130,11 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
                inode_u->bi_dir_offset  = dir_offset;
        }
 
-       ret =   bch2_inode_write(trans, dir_iter, dir_u) ?:
-               bch2_inode_write(trans, inode_iter, inode_u);
+       ret =   bch2_inode_write(trans, &dir_iter, dir_u) ?:
+               bch2_inode_write(trans, &inode_iter, inode_u);
 err:
-       bch2_trans_iter_put(trans, dir_iter);
-       bch2_trans_iter_put(trans, inode_iter);
+       bch2_trans_iter_exit(trans, &dir_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
        return ret;
 }
 
@@ -147,35 +144,33 @@ int bch2_unlink_trans(struct btree_trans *trans,
                      const struct qstr *name)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
-                         *inode_iter = NULL;
+       struct btree_iter dir_iter = { NULL };
+       struct btree_iter dirent_iter = { NULL };
+       struct btree_iter inode_iter = { NULL };
        struct bch_hash_info dir_hash;
        u64 inum, now = bch2_current_time(c);
        struct bkey_s_c k;
        int ret;
 
-       dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(dir_iter);
+       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        dir_hash = bch2_hash_info_init(c, dir_u);
 
-       dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
-                                                name, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(dirent_iter);
+       ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash,
+                                        name, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
-       k = bch2_btree_iter_peek_slot(dirent_iter);
+       k = bch2_btree_iter_peek_slot(&dirent_iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
 
        inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
 
-       inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
+       ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
@@ -192,13 +187,13 @@ int bch2_unlink_trans(struct btree_trans *trans,
        ret =   (S_ISDIR(inode_u->bi_mode)
                 ? bch2_empty_dir_trans(trans, inum)
                 : 0) ?:
-               bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?:
-               bch2_inode_write(trans, dir_iter, dir_u) ?:
-               bch2_inode_write(trans, inode_iter, inode_u);
+               bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?:
+               bch2_inode_write(trans, &dir_iter, dir_u) ?:
+               bch2_inode_write(trans, &inode_iter, inode_u);
 err:
-       bch2_trans_iter_put(trans, inode_iter);
-       bch2_trans_iter_put(trans, dirent_iter);
-       bch2_trans_iter_put(trans, dir_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
+       bch2_trans_iter_exit(trans, &dirent_iter);
+       bch2_trans_iter_exit(trans, &dir_iter);
        return ret;
 }
 
@@ -236,25 +231,25 @@ int bch2_rename_trans(struct btree_trans *trans,
                      enum bch_rename_mode mode)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
-       struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
+       struct btree_iter src_dir_iter = { NULL };
+       struct btree_iter dst_dir_iter = { NULL };
+       struct btree_iter src_inode_iter = { NULL };
+       struct btree_iter dst_inode_iter = { NULL };
        struct bch_hash_info src_hash, dst_hash;
        u64 src_inode, src_offset, dst_inode, dst_offset;
        u64 now = bch2_current_time(c);
        int ret;
 
-       src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
-                                      BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(src_dir_iter);
+       ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
+                             BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        src_hash = bch2_hash_info_init(c, src_dir_u);
 
        if (dst_dir != src_dir) {
-               dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
-                                              BTREE_ITER_INTENT);
-               ret = PTR_ERR_OR_ZERO(dst_dir_iter);
+               ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
+                                     BTREE_ITER_INTENT);
                if (ret)
                        goto err;
 
@@ -273,16 +268,14 @@ int bch2_rename_trans(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode,
-                                        BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(src_inode_iter);
+       ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode,
+                             BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        if (dst_inode) {
-               dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode,
-                                                BTREE_ITER_INTENT);
-               ret = PTR_ERR_OR_ZERO(dst_inode_iter);
+               ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode,
+                                     BTREE_ITER_INTENT);
                if (ret)
                        goto err;
        }
@@ -357,18 +350,18 @@ int bch2_rename_trans(struct btree_trans *trans,
        if (dst_inode)
                dst_inode_u->bi_ctime   = now;
 
-       ret =   bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
+       ret =   bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
                (src_dir != dst_dir
-                ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u)
+                ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
                 : 0 ) ?:
-               bch2_inode_write(trans, src_inode_iter, src_inode_u) ?:
+               bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
                (dst_inode
-                ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u)
+                ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
                 : 0 );
 err:
-       bch2_trans_iter_put(trans, dst_inode_iter);
-       bch2_trans_iter_put(trans, src_inode_iter);
-       bch2_trans_iter_put(trans, dst_dir_iter);
-       bch2_trans_iter_put(trans, src_dir_iter);
+       bch2_trans_iter_exit(trans, &dst_inode_iter);
+       bch2_trans_iter_exit(trans, &src_inode_iter);
+       bch2_trans_iter_exit(trans, &dst_dir_iter);
+       bch2_trans_iter_exit(trans, &src_dir_iter);
        return ret;
 }
index 3333f6166bf2c13531b915b0f36024a8699f5de7..2921037713d12677d1e02ba7d2151411469aa64e 100644 (file)
@@ -884,7 +884,7 @@ void bch2_readahead(struct readahead_control *ractl)
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct page *page;
        struct readpages_iter readpages_iter;
        int ret;
@@ -893,8 +893,8 @@ void bch2_readahead(struct readahead_control *ractl)
        BUG_ON(ret);
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
-                                  BTREE_ITER_SLOTS);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
+                            BTREE_ITER_SLOTS);
 
        bch2_pagecache_add_get(&inode->ei_pagecache_lock);
 
@@ -915,13 +915,13 @@ void bch2_readahead(struct readahead_control *ractl)
                rbio->bio.bi_end_io = bch2_readpages_end_io;
                BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
 
-               bchfs_read(&trans, iter, rbio, inode->v.i_ino,
+               bchfs_read(&trans, &iter, rbio, inode->v.i_ino,
                           &readpages_iter);
        }
 
        bch2_pagecache_add_put(&inode->ei_pagecache_lock);
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        kfree(readpages_iter.pages);
 }
@@ -930,7 +930,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
                             u64 inum, struct page *page)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
 
        bch2_page_state_create(page, __GFP_NOFAIL);
 
@@ -940,12 +940,12 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
        BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
-                                  BTREE_ITER_SLOTS);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN,
+                            BTREE_ITER_SLOTS);
 
-       bchfs_read(&trans, iter, rbio, inum, NULL);
+       bchfs_read(&trans, &iter, rbio, inum, NULL);
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 }
 
@@ -2151,7 +2151,7 @@ static inline int range_has_data(struct bch_fs *c,
                                  struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
@@ -2166,7 +2166,7 @@ static inline int range_has_data(struct bch_fs *c,
                        break;
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        return bch2_trans_exit(&trans) ?: ret;
 }
@@ -2476,7 +2476,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        struct address_space *mapping = inode->v.i_mapping;
        struct bkey_buf copy;
        struct btree_trans trans;
-       struct btree_iter *src, *dst, *del;
+       struct btree_iter src, dst, del;
        loff_t shift, new_size;
        u64 src_start;
        int ret = 0;
@@ -2541,11 +2541,11 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 
        bch2_bkey_buf_init(&copy);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       src = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+       bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
                        POS(inode->v.i_ino, src_start >> 9),
                        BTREE_ITER_INTENT);
-       dst = bch2_trans_copy_iter(&trans, src);
-       del = bch2_trans_copy_iter(&trans, src);
+       bch2_trans_copy_iter(&dst, &src);
+       bch2_trans_copy_iter(&del, &src);
 
        while (ret == 0 || ret == -EINTR) {
                struct disk_reservation disk_res =
@@ -2560,8 +2560,8 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                bch2_trans_begin(&trans);
 
                k = insert
-                       ? bch2_btree_iter_peek_prev(src)
-                       : bch2_btree_iter_peek(src);
+                       ? bch2_btree_iter_peek_prev(&src)
+                       : bch2_btree_iter_peek(&src);
                if ((ret = bkey_err(k)))
                        continue;
 
@@ -2579,9 +2579,9 @@ reassemble:
                        bch2_cut_front(move_pos, copy.k);
 
                copy.k->k.p.offset += shift >> 9;
-               bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k));
+               bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
 
-               ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
+               ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
                if (ret)
                        continue;
 
@@ -2599,7 +2599,7 @@ reassemble:
                delete.k.p = copy.k->k.p;
                delete.k.size = copy.k->k.size;
                delete.k.p.offset -= shift >> 9;
-               bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k));
+               bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
 
                next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
 
@@ -2620,20 +2620,20 @@ reassemble:
                        BUG_ON(ret);
                }
 
-               ret =   bch2_btree_iter_traverse(del) ?:
-                       bch2_trans_update(&trans, del, &delete, trigger_flags) ?:
-                       bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
+               ret =   bch2_btree_iter_traverse(&del) ?:
+                       bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
+                       bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
                        bch2_trans_commit(&trans, &disk_res,
                                          &inode->ei_journal_seq,
                                          BTREE_INSERT_NOFAIL);
                bch2_disk_reservation_put(c, &disk_res);
 
                if (!ret)
-                       bch2_btree_iter_set_pos(src, next_pos);
+                       bch2_btree_iter_set_pos(&src, next_pos);
        }
-       bch2_trans_iter_put(&trans, del);
-       bch2_trans_iter_put(&trans, dst);
-       bch2_trans_iter_put(&trans, src);
+       bch2_trans_iter_exit(&trans, &del);
+       bch2_trans_iter_exit(&trans, &dst);
+       bch2_trans_iter_exit(&trans, &src);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&copy, c);
 
@@ -2658,18 +2658,18 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bpos end_pos = POS(inode->v.i_ino, end_sector);
        unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
        int ret = 0;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
                        POS(inode->v.i_ino, start_sector),
                        BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-       while (!ret && bkey_cmp(iter->pos, end_pos) < 0) {
+       while (!ret && bkey_cmp(iter.pos, end_pos) < 0) {
                s64 i_sectors_delta = 0;
                struct disk_reservation disk_res = { 0 };
                struct quota_res quota_res = { 0 };
@@ -2679,20 +2679,20 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 
                bch2_trans_begin(&trans);
 
-               k = bch2_btree_iter_peek_slot(iter);
+               k = bch2_btree_iter_peek_slot(&iter);
                if ((ret = bkey_err(k)))
                        goto bkey_err;
 
                /* already reserved */
                if (k.k->type == KEY_TYPE_reservation &&
                    bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
                if (bkey_extent_is_data(k.k) &&
                    !(mode & FALLOC_FL_ZERO_RANGE)) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
@@ -2701,7 +2701,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                reservation.k.p         = k.k->p;
                reservation.k.size      = k.k->size;
 
-               bch2_cut_front(iter->pos,       &reservation.k_i);
+               bch2_cut_front(iter.pos,        &reservation.k_i);
                bch2_cut_back(end_pos,          &reservation.k_i);
 
                sectors = reservation.k.size;
@@ -2725,7 +2725,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                        reservation.v.nr_replicas = disk_res.nr_replicas;
                }
 
-               ret = bch2_extent_update(&trans, iter, &reservation.k_i,
+               ret = bch2_extent_update(&trans, &iter, &reservation.k_i,
                                &disk_res, &inode->ei_journal_seq,
                                0, &i_sectors_delta, true);
                i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
@@ -2735,7 +2735,7 @@ bkey_err:
                if (ret == -EINTR)
                        ret = 0;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -3017,7 +3017,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        u64 isize, next_data = MAX_LFS_FILESIZE;
        int ret;
@@ -3038,7 +3038,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
                } else if (k.k->p.offset >> 9 > isize)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
@@ -3113,7 +3113,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        u64 isize, next_hole = MAX_LFS_FILESIZE;
        int ret;
@@ -3142,7 +3142,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
                        offset = max(offset, bkey_start_offset(k.k) << 9);
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
index 631fb87b81c9ca9866eaab3413fbe916a6fd4e5a..6cc56871d26d85bea6bcd72bd88cb3f289fd5942 100644 (file)
@@ -141,7 +141,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
                                  void *p, unsigned fields)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter = { NULL };
        struct bch_inode_unpacked inode_u;
        int ret;
 
@@ -149,11 +149,10 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 retry:
        bch2_trans_begin(&trans);
 
-       iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
-                              BTREE_ITER_INTENT);
-       ret   = PTR_ERR_OR_ZERO(iter) ?:
+       ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino,
+                               BTREE_ITER_INTENT) ?:
                (set ? set(inode, &inode_u, p) : 0) ?:
-               bch2_inode_write(&trans, iter, &inode_u) ?:
+               bch2_inode_write(&trans, &iter, &inode_u) ?:
                bch2_trans_commit(&trans, NULL,
                                  &inode->ei_journal_seq,
                                  BTREE_INSERT_NOFAIL);
@@ -165,7 +164,7 @@ retry:
        if (!ret)
                bch2_inode_update_after_write(c, inode, &inode_u, fields);
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        if (ret == -EINTR)
                goto retry;
@@ -686,7 +685,7 @@ int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_qid qid;
        struct btree_trans trans;
-       struct btree_iter *inode_iter;
+       struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
        struct posix_acl *acl = NULL;
        int ret;
@@ -712,9 +711,8 @@ retry:
        kfree(acl);
        acl = NULL;
 
-       inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
-                                    BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
+       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino,
+                             BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
 
@@ -726,12 +724,12 @@ retry:
                        goto btree_err;
        }
 
-       ret =   bch2_inode_write(&trans, inode_iter, &inode_u) ?:
+       ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
                bch2_trans_commit(&trans, NULL,
                                  &inode->ei_journal_seq,
                                  BTREE_INSERT_NOFAIL);
 btree_err:
-       bch2_trans_iter_put(&trans, inode_iter);
+       bch2_trans_iter_exit(&trans, &inode_iter);
 
        if (ret == -EINTR)
                goto retry;
@@ -881,7 +879,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *ei = to_bch_ei(vinode);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
        struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
@@ -900,23 +898,23 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        bch2_bkey_buf_init(&prev);
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  POS(ei->v.i_ino, start >> 9), 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            POS(ei->v.i_ino, start >> 9), 0);
 retry:
        bch2_trans_begin(&trans);
 
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k)) &&
-              bkey_cmp(iter->pos, end) < 0) {
+              bkey_cmp(iter.pos, end) < 0) {
                enum btree_id data_btree = BTREE_ID_extents;
 
                if (!bkey_extent_is_data(k.k) &&
                    k.k->type != KEY_TYPE_reservation) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
-               offset_into_extent      = iter->pos.offset -
+               offset_into_extent      = iter.pos.offset -
                        bkey_start_offset(k.k);
                sectors                 = k.k->size - offset_into_extent;
 
@@ -937,7 +935,7 @@ retry:
                                   offset_into_extent),
                               cur.k);
                bch2_key_resize(&cur.k->k, sectors);
-               cur.k->k.p = iter->pos;
+               cur.k->k.p = iter.pos;
                cur.k->k.p.offset += cur.k->k.size;
 
                if (have_extent) {
@@ -950,8 +948,8 @@ retry:
                bkey_copy(prev.k, cur.k);
                have_extent = true;
 
-               bch2_btree_iter_set_pos(iter,
-                       POS(iter->pos.inode, iter->pos.offset + sectors));
+               bch2_btree_iter_set_pos(&iter,
+                       POS(iter.pos.inode, iter.pos.offset + sectors));
        }
 
        if (ret == -EINTR)
@@ -961,7 +959,7 @@ retry:
                ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
                                       FIEMAP_EXTENT_LAST);
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        ret = bch2_trans_exit(&trans) ?: ret;
        bch2_bkey_buf_exit(&cur, c);
        bch2_bkey_buf_exit(&prev, c);
index 36eba46d566e351c1b50dec637bc02f1ff2a7e17..eb979e79eaac9c226ff90c70ef912395d4506448 100644 (file)
@@ -19,7 +19,7 @@
 
 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        u64 sectors = 0;
        int ret;
@@ -33,7 +33,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
                        sectors += k.k->size;
        }
 
-       bch2_trans_iter_free(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        return ret ?: sectors;
 }
@@ -42,24 +42,24 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
                          struct bch_inode_unpacked *inode,
                          u32 *snapshot)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
-                       POS(0, inode_nr), 0);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+                            POS(0, inode_nr), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
 
        if (snapshot)
-               *snapshot = iter->pos.snapshot;
+               *snapshot = iter.pos.snapshot;
        ret = k.k->type == KEY_TYPE_inode
                ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
                : -ENOENT;
 err:
-       bch2_trans_iter_free(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -74,13 +74,16 @@ static int __write_inode(struct btree_trans *trans,
                         struct bch_inode_unpacked *inode,
                         u32 snapshot)
 {
-       struct btree_iter *inode_iter =
-               bch2_trans_get_iter(trans, BTREE_ID_inodes,
-                                   SPOS(0, inode->bi_inum, snapshot),
-                                   BTREE_ITER_INTENT);
-       int ret = bch2_btree_iter_traverse(inode_iter) ?:
-               bch2_inode_write(trans, inode_iter, inode);
-       bch2_trans_iter_put(trans, inode_iter);
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+                           SPOS(0, inode->bi_inum, snapshot),
+                           BTREE_ITER_INTENT);
+
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_inode_write(trans, &iter, inode);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -100,7 +103,7 @@ static int write_inode(struct btree_trans *trans,
 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bch_inode_unpacked dir_inode;
        struct bch_hash_info dir_hash_info;
        int ret;
@@ -111,11 +114,11 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 
        dir_hash_info = bch2_hash_info_init(c, &dir_inode);
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
 
        ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-                                 &dir_hash_info, iter);
-       bch2_trans_iter_put(trans, iter);
+                                 &dir_hash_info, &iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -230,13 +233,13 @@ static int reattach_inode(struct btree_trans *trans,
 static int remove_backpointer(struct btree_trans *trans,
                              struct bch_inode_unpacked *inode)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
-                                  POS(inode->bi_dir, inode->bi_dir_offset), 0);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
+                            POS(inode->bi_dir, inode->bi_dir_offset), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto out;
@@ -247,7 +250,7 @@ static int remove_backpointer(struct btree_trans *trans,
 
        ret = remove_dirent(trans, k.k->p);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -343,7 +346,7 @@ static int hash_check_key(struct btree_trans *trans,
                          struct btree_iter *k_iter, struct bkey_s_c hash_k)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter = { NULL };
        char buf[200];
        struct bkey_s_c k;
        u64 hash;
@@ -378,12 +381,12 @@ static int hash_check_key(struct btree_trans *trans,
                }
 
                if (bkey_deleted(k.k)) {
-                       bch2_trans_iter_free(trans, iter);
+                       bch2_trans_iter_exit(trans, &iter);
                        goto bad_hash;
                }
 
        }
-       bch2_trans_iter_free(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 bad_hash:
        if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
@@ -513,7 +516,7 @@ noinline_for_stack
 static int check_inodes(struct bch_fs *c, bool full)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_inode inode;
        int ret;
@@ -532,12 +535,12 @@ static int check_inodes(struct bch_fs *c, bool full)
                    (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
                                          BCH_INODE_I_SECTORS_DIRTY|
                                          BCH_INODE_UNLINKED))) {
-                       ret = check_inode(&trans, iter, inode);
+                       ret = check_inode(&trans, &iter, inode);
                        if (ret)
                                break;
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        BUG_ON(ret == -EINTR);
 
@@ -547,7 +550,7 @@ static int check_inodes(struct bch_fs *c, bool full)
 static int fix_overlapping_extent(struct btree_trans *trans,
                                       struct bkey_s_c k, struct bpos cut_at)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_i *u;
        int ret;
 
@@ -567,29 +570,29 @@ static int fix_overlapping_extent(struct btree_trans *trans,
         * assume things about extent overwrites - we should be running the
         * triggers manually here
         */
-       iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p,
-                                  BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
+                            BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
 
-       BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
-       ret   = bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?:
+       BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_LAZY_RW);
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
 static int inode_backpointer_exists(struct btree_trans *trans,
                                    struct bch_inode_unpacked *inode)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
-                                  POS(inode->bi_dir, inode->bi_dir_offset), 0);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
+                            POS(inode->bi_dir, inode->bi_dir_offset), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto out;
@@ -598,7 +601,7 @@ static int inode_backpointer_exists(struct btree_trans *trans,
 
        ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
 out:
-       bch2_trans_iter_free(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -618,7 +621,7 @@ static int check_extents(struct bch_fs *c)
 {
        struct inode_walker w = inode_walker_init();
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf prev;
        u64 i_sectors = 0;
@@ -630,12 +633,12 @@ static int check_extents(struct bch_fs *c)
 
        bch_verbose(c, "checking extents");
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  POS(BCACHEFS_ROOT_INO, 0),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            POS(BCACHEFS_ROOT_INO, 0),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH);
 retry:
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k))) {
                if (w.have_inode &&
                    w.cur_inum != k.k->p.inode &&
@@ -700,12 +703,12 @@ retry:
                        i_sectors += k.k->size;
                bch2_bkey_buf_reassemble(&prev, c, k);
 
-               bch2_btree_iter_advance(iter);
+               bch2_btree_iter_advance(&iter);
        }
 fsck_err:
        if (ret == -EINTR)
                goto retry;
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_bkey_buf_exit(&prev, c);
        return bch2_trans_exit(&trans) ?: ret;
 }
@@ -890,7 +893,7 @@ static int check_dirents(struct bch_fs *c)
        struct inode_walker w = inode_walker_init();
        struct bch_hash_info hash_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        unsigned nr_subdirs = 0;
        int ret = 0;
 
@@ -898,18 +901,18 @@ static int check_dirents(struct bch_fs *c)
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
-                                  POS(BCACHEFS_ROOT_INO, 0),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
+                            POS(BCACHEFS_ROOT_INO, 0),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH);
 
        do {
                ret = lockrestart_do(&trans,
-                               check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs));
+                               check_dirent(&trans, &iter, &hash_info, &w, &nr_subdirs));
                if (ret)
                        break;
-       } while (bch2_btree_iter_advance(iter));
-       bch2_trans_iter_put(&trans, iter);
+       } while (bch2_btree_iter_advance(&iter));
+       bch2_trans_iter_exit(&trans, &iter);
 
        return bch2_trans_exit(&trans) ?: ret;
 }
@@ -923,7 +926,7 @@ static int check_xattrs(struct bch_fs *c)
        struct inode_walker w = inode_walker_init();
        struct bch_hash_info hash_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
@@ -931,12 +934,12 @@ static int check_xattrs(struct bch_fs *c)
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
-                                  POS(BCACHEFS_ROOT_INO, 0),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+                            POS(BCACHEFS_ROOT_INO, 0),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH);
 retry:
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k))) {
                ret = walk_inode(&trans, &w, k.k->p.inode);
                if (ret)
@@ -945,7 +948,7 @@ retry:
                if (fsck_err_on(!w.have_inode, c,
                                "xattr for missing inode %llu",
                                k.k->p.inode)) {
-                       ret = bch2_btree_delete_at(&trans, iter, 0);
+                       ret = bch2_btree_delete_at(&trans, &iter, 0);
                        if (ret)
                                break;
                        continue;
@@ -955,17 +958,17 @@ retry:
                        hash_info = bch2_hash_info_init(c, &w.inode);
 
                ret = hash_check_key(&trans, bch2_xattr_hash_desc,
-                                    &hash_info, iter, k);
+                                    &hash_info, &iter, k);
                if (ret)
                        break;
 
-               bch2_btree_iter_advance(iter);
+               bch2_btree_iter_advance(&iter);
        }
 fsck_err:
        if (ret == -EINTR)
                goto retry;
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        return bch2_trans_exit(&trans) ?: ret;
 }
 
@@ -1114,7 +1117,7 @@ fsck_err:
 static int check_directory_structure(struct bch_fs *c)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_inode_unpacked u;
        struct pathbuf path = { 0, 0, NULL };
@@ -1139,7 +1142,7 @@ static int check_directory_structure(struct bch_fs *c)
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        BUG_ON(ret == -EINTR);
 
@@ -1215,7 +1218,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
                                       u64 start, u64 *end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_inode inode;
        struct bch_inode_unpacked u;
@@ -1253,7 +1256,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
                }
 
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 
        if (ret)
@@ -1267,7 +1270,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
                                     u64 range_start, u64 range_end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_dirent d;
        int ret;
@@ -1289,7 +1292,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
 
                bch2_trans_cond_resched(&trans);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
@@ -1304,7 +1307,7 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                               u64 range_start, u64 range_end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_inode inode;
        struct bch_inode_unpacked u;
@@ -1346,14 +1349,14 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                        ret = __bch2_trans_do(&trans, NULL, NULL,
                                              BTREE_INSERT_NOFAIL|
                                              BTREE_INSERT_LAZY_RW,
-                                             bch2_btree_iter_traverse(iter) ?:
-                                       bch2_inode_write(&trans, iter, &u));
+                                             bch2_btree_iter_traverse(&iter) ?:
+                                       bch2_inode_write(&trans, &iter, &u));
                        if (ret)
                                bch_err(c, "error in fsck: error %i updating inode", ret);
                }
        }
 fsck_err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 
        if (ret)
index 3b671082cd1e31f65bcf34be04f9afacd444d578..14b0e8c031199aec57938c7224a0542c0c403de7 100644 (file)
@@ -292,18 +292,18 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
        return 0;
 }
 
-struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
-                                  struct bch_inode_unpacked *inode,
-                                  u64 inum, unsigned flags)
+int bch2_inode_peek(struct btree_trans *trans,
+                   struct btree_iter *iter,
+                   struct bch_inode_unpacked *inode,
+                   u64 inum, unsigned flags)
 {
-       struct btree_iter *iter;
        struct bkey_s_c k;
        int ret;
 
        if (trans->c->opts.inodes_use_key_cache)
                flags |= BTREE_ITER_CACHED;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags);
+       bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
@@ -317,10 +317,10 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       return iter;
+       return 0;
 err:
-       bch2_trans_iter_put(trans, iter);
-       return ERR_PTR(ret);
+       bch2_trans_iter_exit(trans, iter);
+       return ret;
 }
 
 int bch2_inode_write(struct btree_trans *trans,
@@ -482,12 +482,12 @@ static inline u32 bkey_generation(struct bkey_s_c k)
        }
 }
 
-struct btree_iter *bch2_inode_create(struct btree_trans *trans,
-                                    struct bch_inode_unpacked *inode_u,
-                                    u32 snapshot, u64 cpu)
+int bch2_inode_create(struct btree_trans *trans,
+                     struct btree_iter *iter,
+                     struct bch_inode_unpacked *inode_u,
+                     u32 snapshot, u64 cpu)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter = NULL;
        struct bkey_s_c k;
        u64 min, max, start, pos, *hint;
        int ret = 0;
@@ -513,9 +513,9 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans,
                start = min;
 
        pos = start;
-       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos),
-                                  BTREE_ITER_ALL_SNAPSHOTS|
-                                  BTREE_ITER_INTENT);
+       bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
+                            BTREE_ITER_ALL_SNAPSHOTS|
+                            BTREE_ITER_INTENT);
 again:
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
@@ -553,8 +553,8 @@ again:
                ret = -ENOSPC;
 
        if (ret) {
-               bch2_trans_iter_put(trans, iter);
-               return ERR_PTR(ret);
+               bch2_trans_iter_exit(trans, iter);
+               return ret;
        }
 
        /* Retry from start */
@@ -566,8 +566,8 @@ found_slot:
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret) {
-               bch2_trans_iter_put(trans, iter);
-               return ERR_PTR(ret);
+               bch2_trans_iter_exit(trans, iter);
+               return ret;
        }
 
        /* We may have raced while the iterator wasn't pointing at pos: */
@@ -578,13 +578,13 @@ found_slot:
        *hint                   = k.k->p.offset;
        inode_u->bi_inum        = k.k->p.offset;
        inode_u->bi_generation  = bkey_generation(k);
-       return iter;
+       return 0;
 }
 
 int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
 {
        struct btree_trans trans;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter = { NULL };
        struct bkey_i_inode_generation delete;
        struct bpos start = POS(inode_nr, 0);
        struct bpos end = POS(inode_nr + 1, 0);
@@ -617,9 +617,9 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
 retry:
        bch2_trans_begin(&trans);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes,
-                                  POS(0, inode_nr), iter_flags);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
+                            POS(0, inode_nr), iter_flags);
+       k = bch2_btree_iter_peek_slot(&iter);
 
        ret = bkey_err(k);
        if (ret)
@@ -636,14 +636,14 @@ retry:
        bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
 
        bkey_inode_generation_init(&delete.k_i);
-       delete.k.p = iter->pos;
+       delete.k.p = iter.pos;
        delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
 
-       ret   = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?:
+       ret   = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?:
                bch2_trans_commit(&trans, NULL, NULL,
                                BTREE_INSERT_NOFAIL);
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        if (ret == -EINTR)
                goto retry;
 
@@ -654,12 +654,11 @@ err:
 static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
                                         struct bch_inode_unpacked *inode)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter = { NULL };
        int ret;
 
-       iter = bch2_inode_peek(trans, inode, inode_nr, 0);
-       ret = PTR_ERR_OR_ZERO(iter);
-       bch2_trans_iter_put(trans, iter);
+       ret = bch2_inode_peek(trans, &iter, inode, inode_nr, 0);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
index d67af4f56f05b1f7e366e5f4bb036efd54751ffe..25bef104ebcc5a692a6dcc54c2b105697fadb294 100644 (file)
@@ -57,8 +57,8 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
 
 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
 
-struct btree_iter *bch2_inode_peek(struct btree_trans *,
-                       struct bch_inode_unpacked *, u64, unsigned);
+int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+                   struct bch_inode_unpacked *, u64, unsigned);
 int bch2_inode_write(struct btree_trans *, struct btree_iter *,
                     struct bch_inode_unpacked *);
 
@@ -71,8 +71,8 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
                     uid_t, gid_t, umode_t, dev_t,
                     struct bch_inode_unpacked *);
 
-struct btree_iter *bch2_inode_create(struct btree_trans *,
-                                    struct bch_inode_unpacked *, u32, u64);
+int bch2_inode_create(struct btree_trans *, struct btree_iter *,
+                     struct bch_inode_unpacked *, u32, u64);
 
 int bch2_inode_rm(struct bch_fs *, u64, bool);
 
index 4585a4036f1b9948aff47f5d83f2266a98d6c977..ccde9001aaf72487e8be759d2254267d1c7ecf20 100644 (file)
@@ -192,7 +192,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                               s64 *disk_sectors_delta)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c old;
        unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
        bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
@@ -203,7 +203,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
        *i_sectors_delta        = 0;
        *disk_sectors_delta     = 0;
 
-       iter = bch2_trans_copy_iter(trans, extent_iter);
+       bch2_trans_copy_iter(&iter, extent_iter);
 
        for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
                s64 sectors = min(new->k.p.offset, old.k->p.offset) -
@@ -236,7 +236,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                         * less:
                         */
                        if (!bkey_cmp(old.k->p, new->k.p)) {
-                               old = bch2_btree_iter_next(iter);
+                               old = bch2_btree_iter_next(&iter);
                                ret = bkey_err(old);
                                if (ret)
                                        break;
@@ -251,7 +251,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                }
        }
 
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -266,11 +266,22 @@ int bch2_extent_update(struct btree_trans *trans,
 {
        /* this must live until after bch2_trans_commit(): */
        struct bkey_inode_buf inode_p;
+       struct bpos next_pos;
        bool extending = false, usage_increasing;
        s64 i_sectors_delta = 0, disk_sectors_delta = 0;
        int ret;
 
-       ret = bch2_extent_trim_atomic(k, iter);
+       /*
+        * This traverses the iterator without changing iter->path->pos to
+        * search_key() (which is pos + 1 for extents): we want a path to
+        * already be traversed at iter->pos, because
+        * bch2_trans_extent_update() will use it to attempt extent merging
+        */
+       ret = __bch2_btree_iter_traverse(iter);
+       if (ret)
+               return ret;
+
+       ret = bch2_extent_trim_atomic(trans, iter, k);
        if (ret)
                return ret;
 
@@ -300,12 +311,11 @@ int bch2_extent_update(struct btree_trans *trans,
                : 0;
 
        if (i_sectors_delta || new_i_size) {
-               struct btree_iter *inode_iter;
+               struct btree_iter inode_iter;
                struct bch_inode_unpacked inode_u;
 
-               inode_iter = bch2_inode_peek(trans, &inode_u,
+               ret = bch2_inode_peek(trans, &inode_iter, &inode_u,
                                k->k.p.inode, BTREE_ITER_INTENT);
-               ret = PTR_ERR_OR_ZERO(inode_iter);
                if (ret)
                        return ret;
 
@@ -334,16 +344,18 @@ int bch2_extent_update(struct btree_trans *trans,
 
                        inode_p.inode.k.p.snapshot = iter->snapshot;
 
-                       ret = bch2_trans_update(trans, inode_iter,
+                       ret = bch2_trans_update(trans, &inode_iter,
                                          &inode_p.inode.k_i, 0);
                }
 
-               bch2_trans_iter_put(trans, inode_iter);
+               bch2_trans_iter_exit(trans, &inode_iter);
 
                if (ret)
                        return ret;
        }
 
+       next_pos = k->k.p;
+
        ret =   bch2_trans_update(trans, iter, k, 0) ?:
                bch2_trans_commit(trans, disk_res, journal_seq,
                                BTREE_INSERT_NOCHECK_RW|
@@ -352,6 +364,8 @@ int bch2_extent_update(struct btree_trans *trans,
        if (ret)
                return ret;
 
+       bch2_btree_iter_set_pos(iter, next_pos);
+
        if (i_sectors_delta_total)
                *i_sectors_delta_total += i_sectors_delta;
        return 0;
@@ -409,18 +423,18 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
                u64 *journal_seq, s64 *i_sectors_delta)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret = 0;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
                                   POS(inum, start),
                                   BTREE_ITER_INTENT);
 
-       ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
+       ret = bch2_fpunch_at(&trans, &iter, POS(inum, end),
                             journal_seq, i_sectors_delta);
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 
        if (ret == -EINTR)
@@ -436,28 +450,28 @@ int bch2_write_index_default(struct bch_write_op *op)
        struct keylist *keys = &op->insert_keys;
        struct bkey_i *k = bch2_keylist_front(keys);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  bkey_start_pos(&k->k),
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            bkey_start_pos(&k->k),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        do {
                bch2_trans_begin(&trans);
 
                k = bch2_keylist_front(keys);
 
-               k->k.p.snapshot = iter->snapshot;
+               k->k.p.snapshot = iter.snapshot;
 
                bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
                bkey_copy(sk.k, k);
-               bch2_cut_front(iter->pos, sk.k);
+               bch2_cut_front(iter.pos, sk.k);
 
-               ret = bch2_extent_update(&trans, iter, sk.k,
+               ret = bch2_extent_update(&trans, &iter, sk.k,
                                         &op->res, op_journal_seq(op),
                                         op->new_i_size, &op->i_sectors_delta,
                                         op->flags & BCH_WRITE_CHECK_ENOSPC);
@@ -466,11 +480,11 @@ int bch2_write_index_default(struct bch_write_op *op)
                if (ret)
                        break;
 
-               if (bkey_cmp(iter->pos, k->k.p) >= 0)
+               if (bkey_cmp(iter.pos, k->k.p) >= 0)
                        bch2_keylist_pop_front(keys);
        } while (!bch2_keylist_empty(keys));
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
@@ -1636,7 +1650,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
                                     unsigned flags)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_buf sk;
        struct bkey_s_c k;
        int ret;
@@ -1647,12 +1661,12 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, rbio->data_btree,
-                                  rbio->read_pos, BTREE_ITER_SLOTS);
+       bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
+                            rbio->read_pos, BTREE_ITER_SLOTS);
 retry:
        rbio->bio.bi_status = 0;
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
        if (bkey_err(k))
                goto err;
 
@@ -1679,7 +1693,7 @@ retry:
                goto err;
 out:
        bch2_rbio_done(rbio);
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
        return;
@@ -1745,7 +1759,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        struct bch_fs *c = rbio->c;
        u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
        struct bch_extent_crc_unpacked new_crc;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter;
        struct bkey_i *new;
        struct bkey_s_c k;
        int ret = 0;
@@ -1753,9 +1767,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        if (crc_is_compressed(rbio->pick.crc))
                return 0;
 
-       iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos,
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
        if ((ret = bkey_err(k)))
                goto out;
 
@@ -1790,9 +1804,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        if (!bch2_bkey_narrow_crcs(new, new_crc))
                goto out;
 
-       ret = bch2_trans_update(trans, iter, new, 0);
+       ret = bch2_trans_update(trans, &iter, new, 0);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1963,7 +1977,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
                                unsigned *offset_into_extent,
                                struct bkey_buf *orig_k)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        u64 reflink_offset;
        int ret;
@@ -1971,10 +1985,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
        reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
                *offset_into_extent;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_reflink,
-                                  POS(0, reflink_offset),
-                                  BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
+                            POS(0, reflink_offset),
+                            BTREE_ITER_SLOTS);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -1991,10 +2005,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
                goto err;
        }
 
-       *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+       *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
        bch2_bkey_buf_reassemble(orig_k, trans->c, k);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -2264,7 +2278,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
                 struct bch_io_failures *failed, unsigned flags)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_buf sk;
        struct bkey_s_c k;
        int ret;
@@ -2273,10 +2287,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
 
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  POS(inode, bvec_iter.bi_sector),
-                                  BTREE_ITER_SLOTS);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            POS(inode, bvec_iter.bi_sector),
+                            BTREE_ITER_SLOTS);
 retry:
        bch2_trans_begin(&trans);
 
@@ -2293,15 +2306,15 @@ retry:
                        break;
                }
 
-               bch2_btree_iter_set_pos(iter,
+               bch2_btree_iter_set_pos(&iter,
                                POS(inode, bvec_iter.bi_sector));
 
-               k = bch2_btree_iter_peek_slot(iter);
+               k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
                if (ret)
                        break;
 
-               offset_into_extent = iter->pos.offset -
+               offset_into_extent = iter.pos.offset -
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
@@ -2332,7 +2345,7 @@ retry:
                if (bvec_iter.bi_size == bytes)
                        flags |= BCH_READ_LAST_FRAGMENT;
 
-               ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos,
+               ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
                                         data_btree, k,
                                         offset_into_extent, failed, flags);
                if (ret)
@@ -2348,7 +2361,7 @@ retry:
        if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
                goto retry;
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
index f2060f903cbcf90489de1712511c2925b69d7198..68fb2ebd91ac14dc5988d972e99d213e7ba5c2bc 100644 (file)
@@ -250,7 +250,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
        bch2_trans_init(&trans, c, 0, 0);
 
        for (i = 0; i < BTREE_ID_NR; i++) {
-               struct btree_iter *iter;
+               struct btree_iter iter;
                struct btree *b;
 
                for_each_btree_node(&trans, iter, i, POS_MIN,
@@ -259,7 +259,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
                                bch2_trans_exit(&trans);
                                return;
                        }
-               bch2_trans_iter_free(&trans, iter);
+               bch2_trans_iter_exit(&trans, &iter);
        }
 
        ret = bch2_trans_exit(&trans);
index 1f65eca48c6ef48d20c033d119a04f0215f7a607..1899326d9754eeebdf9c850ace262b2eb25fe16a 100644 (file)
@@ -39,7 +39,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                                   enum btree_id btree_id)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf sk;
        int ret = 0;
@@ -47,13 +47,13 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
-                                  BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+                            BTREE_ITER_PREFETCH);
 
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k))) {
                if (!bch2_bkey_has_device(k, dev_idx)) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
@@ -71,10 +71,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                 */
                bch2_extent_normalize(c, bkey_i_to_s(sk.k));
 
-               bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
+               bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
 
-               ret   = bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(&trans, iter, sk.k, 0) ?:
+               ret   = bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(&trans, &iter, sk.k, 0) ?:
                        bch2_trans_commit(&trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL);
 
@@ -88,7 +88,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        bch2_bkey_buf_exit(&sk, c);
@@ -107,7 +107,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct closure cl;
        struct btree *b;
        struct bkey_buf k;
@@ -139,9 +139,9 @@ retry:
                                break;
                        }
 
-                       ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false);
+                       ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false);
                        if (ret == -EINTR) {
-                               b = bch2_btree_iter_peek_node(iter);
+                               b = bch2_btree_iter_peek_node(&iter);
                                ret = 0;
                                goto retry;
                        }
@@ -150,7 +150,7 @@ retry:
                                break;
                        }
                }
-               bch2_trans_iter_free(&trans, iter);
+               bch2_trans_iter_exit(&trans, &iter);
 
                if (ret)
                        goto err;
index ee0f155fda6c85628c800516506b28e1d375c6ff..fb7c0abd40b19370f31c3ee24a1fb5703c5cc254 100644 (file)
@@ -57,7 +57,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct migrate_write *m =
                container_of(op, struct migrate_write, op);
        struct keylist *keys = &op->insert_keys;
@@ -70,9 +70,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
-       iter = bch2_trans_get_iter(&trans, m->btree_id,
-                                  bkey_start_pos(&bch2_keylist_front(keys)->k),
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, m->btree_id,
+                            bkey_start_pos(&bch2_keylist_front(keys)->k),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        while (1) {
                struct bkey_s_c k;
@@ -80,13 +80,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                struct bkey_i_extent *new;
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
+               struct bpos next_pos;
                bool did_work = false;
                bool extending = false, should_check_enospc;
                s64 i_sectors_delta = 0, disk_sectors_delta = 0;
 
                bch2_trans_begin(&trans);
 
-               k = bch2_btree_iter_peek_slot(iter);
+               k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
                if (ret)
                        goto err;
@@ -102,9 +103,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 
                bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
                new = bkey_i_to_extent(_new.k);
-               bch2_cut_front(iter->pos, &new->k_i);
+               bch2_cut_front(iter.pos, &new->k_i);
 
-               bch2_cut_front(iter->pos,       insert);
+               bch2_cut_front(iter.pos,        insert);
                bch2_cut_back(new->k.p,         insert);
                bch2_cut_back(insert->k.p,      &new->k_i);
 
@@ -146,7 +147,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                               op->opts.background_target,
                                               op->opts.data_replicas);
 
-               ret = bch2_sum_sector_overwrites(&trans, iter, insert,
+               ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
                                                 &extending,
                                                 &should_check_enospc,
                                                 &i_sectors_delta,
@@ -163,20 +164,24 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                goto out;
                }
 
-               ret   = bch2_trans_update(&trans, iter, insert, 0) ?:
+               next_pos = insert->k.p;
+
+               ret   = bch2_trans_update(&trans, &iter, insert, 0) ?:
                        bch2_trans_commit(&trans, &op->res,
                                op_journal_seq(op),
                                BTREE_INSERT_NOFAIL|
                                m->data_opts.btree_insert_flags);
-err:
-               if (!ret)
+               if (!ret) {
+                       bch2_btree_iter_set_pos(&iter, next_pos);
                        atomic_long_inc(&c->extent_migrate_done);
+               }
+err:
                if (ret == -EINTR)
                        ret = 0;
                if (ret)
                        break;
 next:
-               while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) {
+               while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
                        bch2_keylist_pop_front(keys);
                        if (bch2_keylist_empty(keys))
                                goto out;
@@ -184,18 +189,18 @@ next:
                continue;
 nomatch:
                if (m->ctxt) {
-                       BUG_ON(k.k->p.offset <= iter->pos.offset);
+                       BUG_ON(k.k->p.offset <= iter.pos.offset);
                        atomic64_inc(&m->ctxt->stats->keys_raced);
-                       atomic64_add(k.k->p.offset - iter->pos.offset,
+                       atomic64_add(k.k->p.offset - iter.pos.offset,
                                     &m->ctxt->stats->sectors_raced);
                }
                atomic_long_inc(&c->extent_migrate_raced);
                trace_move_race(&new->k);
-               bch2_btree_iter_advance(iter);
+               bch2_btree_iter_advance(&iter);
                goto next;
        }
 out:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&_insert, c);
        bch2_bkey_buf_exit(&_new, c);
@@ -216,11 +221,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
        m->op.crc       = rbio->pick.crc;
        m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
 
-       if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
-               m->op.nonce     = m->op.crc.nonce + m->op.crc.offset;
-               m->op.csum_type = m->op.crc.csum_type;
-       }
-
        if (m->data_cmd == DATA_REWRITE)
                bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
 }
@@ -235,6 +235,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
+       struct bch_extent_crc_unpacked crc;
        struct extent_ptr_decoded p;
        int ret;
 
@@ -255,6 +256,18 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
        m->op.target    = data_opts.target,
        m->op.write_point = wp;
 
+       /*
+        * op->csum_type is normally initialized from the fs/file's current
+        * options - but if an extent is encrypted, we require that it stays
+        * encrypted:
+        */
+       bkey_for_each_crc(k.k, ptrs, crc, entry)
+               if (bch2_csum_type_is_encryption(crc.csum_type)) {
+                       m->op.nonce     = crc.nonce + m->op.crc.offset;
+                       m->op.csum_type = crc.csum_type;
+                       break;
+               }
+
        if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
                m->op.alloc_reserve = RESERVE_MOVINGGC;
                m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
@@ -511,13 +524,13 @@ err:
 static int lookup_inode(struct btree_trans *trans, struct bpos pos,
                        struct bch_inode_unpacked *inode)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos,
-                                  BTREE_ITER_ALL_SNAPSHOTS);
-       k = bch2_btree_iter_peek(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       k = bch2_btree_iter_peek(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -535,7 +548,7 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos,
        if (ret)
                goto err;
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -553,7 +566,7 @@ static int __bch2_move_data(struct bch_fs *c,
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct bkey_buf sk;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct data_opts data_opts;
        enum data_cmd data_cmd;
@@ -567,8 +580,8 @@ static int __bch2_move_data(struct bch_fs *c,
        stats->btree_id = btree_id;
        stats->pos      = start;
 
-       iter = bch2_trans_get_iter(&trans, btree_id, start,
-                                  BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, btree_id, start,
+                            BTREE_ITER_PREFETCH);
 
        if (rate)
                bch2_ratelimit_reset(rate);
@@ -599,9 +612,9 @@ static int __bch2_move_data(struct bch_fs *c,
 
                bch2_trans_begin(&trans);
 
-               k = bch2_btree_iter_peek(iter);
+               k = bch2_btree_iter_peek(&iter);
 
-               stats->pos = iter->pos;
+               stats->pos = iter.pos;
 
                if (!k.k)
                        break;
@@ -674,18 +687,42 @@ next:
                atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
                             &stats->sectors_seen);
 next_nondata:
-               bch2_btree_iter_advance(iter);
+               bch2_btree_iter_advance(&iter);
                bch2_trans_cond_resched(&trans);
        }
 out:
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        ret = bch2_trans_exit(&trans) ?: ret;
        bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
 
+inline void bch_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+       memset(stats, 0, sizeof(*stats));
+
+       scnprintf(stats->name, sizeof(stats->name),
+                       "%s", name);
+}
+
+static inline void progress_list_add(struct bch_fs *c,
+                                    struct bch_move_stats *stats)
+{
+       mutex_lock(&c->data_progress_lock);
+       list_add(&stats->list, &c->data_progress_list);
+       mutex_unlock(&c->data_progress_lock);
+}
+
+static inline void progress_list_del(struct bch_fs *c,
+                                    struct bch_move_stats *stats)
+{
+       mutex_lock(&c->data_progress_lock);
+       list_del(&stats->list);
+       mutex_unlock(&c->data_progress_lock);
+}
+
 int bch2_move_data(struct bch_fs *c,
                   enum btree_id start_btree_id, struct bpos start_pos,
                   enum btree_id end_btree_id,   struct bpos end_pos,
@@ -698,6 +735,7 @@ int bch2_move_data(struct bch_fs *c,
        enum btree_id id;
        int ret;
 
+       progress_list_add(c, stats);
        closure_init_stack(&ctxt.cl);
        INIT_LIST_HEAD(&ctxt.reads);
        init_waitqueue_head(&ctxt.wait);
@@ -731,6 +769,7 @@ int bch2_move_data(struct bch_fs *c,
                        atomic64_read(&stats->sectors_moved),
                        atomic64_read(&stats->keys_moved));
 
+       progress_list_del(c, stats);
        return ret;
 }
 
@@ -747,7 +786,7 @@ static int bch2_move_btree(struct bch_fs *c,
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        enum btree_id id;
        struct data_opts data_opts;
@@ -755,6 +794,7 @@ static int bch2_move_btree(struct bch_fs *c,
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
+       progress_list_add(c, stats);
 
        stats->data_type = BCH_DATA_btree;
 
@@ -773,7 +813,7 @@ static int bch2_move_btree(struct bch_fs *c,
                             bpos_cmp(b->key.k.p, end_pos)) > 0)
                                break;
 
-                       stats->pos = iter->pos;
+                       stats->pos = iter.pos;
 
                        switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
                        case DATA_SKIP:
@@ -787,13 +827,13 @@ static int bch2_move_btree(struct bch_fs *c,
                                BUG();
                        }
 
-                       ret = bch2_btree_node_rewrite(&trans, iter,
+                       ret = bch2_btree_node_rewrite(&trans, &iter,
                                        b->data->keys.seq, 0) ?: ret;
 next:
                        bch2_trans_cond_resched(&trans);
                }
+               bch2_trans_iter_exit(&trans, &iter);
 
-               ret = bch2_trans_iter_free(&trans, iter) ?: ret;
                if (kthread && kthread_should_stop())
                        break;
        }
@@ -803,6 +843,7 @@ next:
        if (ret)
                bch_err(c, "error %i in bch2_move_btree", ret);
 
+       progress_list_del(c, stats);
        return ret;
 }
 
@@ -944,6 +985,7 @@ int bch2_data_job(struct bch_fs *c,
 
        switch (op.op) {
        case BCH_DATA_OP_REREPLICATE:
+               bch_move_stats_init(stats, "rereplicate");
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
 
@@ -968,6 +1010,7 @@ int bch2_data_job(struct bch_fs *c,
                if (op.migrate.dev >= c->sb.nr_devices)
                        return -EINVAL;
 
+               bch_move_stats_init(stats, "migrate");
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
 
@@ -985,6 +1028,7 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_REWRITE_OLD_NODES:
+               bch_move_stats_init(stats, "rewrite_old_nodes");
                ret = bch2_scan_old_btree_nodes(c, stats);
                break;
        default:
index 5076153689d18bd3a55049eff957df4f376a2a19..2a789a1158ca22e4e7efb1cd4298b4115da3216d 100644 (file)
@@ -66,4 +66,8 @@ int bch2_data_job(struct bch_fs *,
                  struct bch_move_stats *,
                  struct bch_ioctl_data);
 
+inline void bch_move_stats_init(struct bch_move_stats *stats,
+                               char *name);
+
+
 #endif /* _BCACHEFS_MOVE_H */
index fc0de165af9fe354b246e24ca223904c87e7dc0b..9df6d18137a5e02655d6c34f10730b896f9d48d5 100644 (file)
@@ -6,6 +6,8 @@ struct bch_move_stats {
        enum bch_data_type      data_type;
        enum btree_id           btree_id;
        struct bpos             pos;
+       struct list_head        list;
+       char                    name[32];
 
        atomic64_t              keys_moved;
        atomic64_t              keys_raced;
index 2acca0ddb6fd64a140fffe7735a3239661425b6f..5c9eafc026c9f8288bc42aa8931c6aecf9effd95 100644 (file)
@@ -85,6 +85,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
                BUG_ON(i != j);
 #endif
                if (i >= 0 &&
+                   p.ptr.dev == h->data[i].dev &&
                    p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
                    p.ptr.gen == h->data[i].gen) {
                        /*
@@ -146,7 +147,8 @@ static int bch2_copygc(struct bch_fs *c)
        size_t b, heap_size = 0;
        int ret;
 
-       memset(&move_stats, 0, sizeof(move_stats));
+       bch_move_stats_init(&move_stats, "copygc");
+
        /*
         * Find buckets with lowest sector counts, skipping completely
         * empty buckets, by building a maxheap sorted by sector count,
index 003c00f2503730d0e3b792084ad5994689fa740e..147b4021fdaef0d3816cc6311eb895eff1f10242 100644 (file)
@@ -171,7 +171,7 @@ enum opt_type {
        x(shard_inode_numbers,          u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
          OPT_BOOL(),                                                   \
-         BCH_SB_SHARD_INUMS,           false,                          \
+         BCH_SB_SHARD_INUMS,           true,                           \
          NULL,         "Shard new inode numbers by CPU id")            \
        x(inodes_use_key_cache, u8,                                     \
          OPT_FORMAT|OPT_MOUNT,                                         \
index 7861781a4a7fea4de99f209070be22f61ec71b84..9b0f4d3f176d5fcbd7c0e0e189679c40416abefd 100644 (file)
@@ -357,7 +357,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
 static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
@@ -372,7 +372,7 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        return bch2_trans_exit(&trans) ?: ret;
 }
@@ -419,7 +419,7 @@ int bch2_fs_quota_read(struct bch_fs *c)
        unsigned i, qtypes = enabled_qtypes(c);
        struct bch_memquota_type *q;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bch_inode_unpacked u;
        struct bkey_s_c k;
        int ret;
@@ -450,7 +450,7 @@ int bch2_fs_quota_read(struct bch_fs *c)
                                        KEY_TYPE_QUOTA_NOCHECK);
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        return bch2_trans_exit(&trans) ?: ret;
 }
@@ -717,13 +717,13 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
                                struct bkey_i_quota *new_quota,
                                struct qc_dqblk *qdq)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p,
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
 
        ret = bkey_err(k);
        if (unlikely(ret))
@@ -742,8 +742,8 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
        if (qdq->d_fieldmask & QC_INO_HARD)
                new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
 
-       ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0);
-       bch2_trans_iter_put(trans, iter);
+       ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
index a0dbf41d1d3763c432c0c0ba90f9ecc498623f74..a573fede05b11fba7a5ada92b9bbfae322608612 100644 (file)
@@ -166,6 +166,7 @@ static int bch2_rebalance_thread(void *arg)
        struct bch_fs_rebalance *r = &c->rebalance;
        struct io_clock *clock = &c->io_clock[WRITE];
        struct rebalance_work w, p;
+       struct bch_move_stats move_stats;
        unsigned long start, prev_start;
        unsigned long prev_run_time, prev_run_cputime;
        unsigned long cputime, prev_cputime;
@@ -179,6 +180,7 @@ static int bch2_rebalance_thread(void *arg)
        prev_start      = jiffies;
        prev_cputime    = curr_cputime();
 
+       bch_move_stats_init(&move_stats, "rebalance");
        while (!kthread_wait_freezable(r->enabled)) {
                cond_resched();
 
@@ -235,7 +237,7 @@ static int bch2_rebalance_thread(void *arg)
                prev_cputime    = cputime;
 
                r->state = REBALANCE_RUNNING;
-               memset(&r->move_stats, 0, sizeof(r->move_stats));
+               memset(&move_stats, 0, sizeof(move_stats));
                rebalance_work_reset(c);
 
                bch2_move_data(c,
@@ -245,7 +247,7 @@ static int bch2_rebalance_thread(void *arg)
                               NULL, /*  &r->pd.rate, */
                               writepoint_ptr(&c->rebalance_write_point),
                               rebalance_pred, NULL,
-                              &r->move_stats);
+                              &move_stats);
        }
 
        return 0;
@@ -281,10 +283,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
                       h1);
                break;
        case REBALANCE_RUNNING:
-               pr_buf(out, "running\n"
-                      "pos ");
-               bch2_bpos_to_text(out, r->move_stats.pos);
-               pr_buf(out, "\n");
+               pr_buf(out, "running\n");
                break;
        }
 }
index 2f62a643c39fbb0c08f024fbf58a7f3325755875..7462a92e95985d91cdc454485d045659240dd0fc 100644 (file)
@@ -19,7 +19,6 @@ struct bch_fs_rebalance {
        enum rebalance_state    state;
        u64                     throttled_until_iotime;
        unsigned long           throttled_until_cputime;
-       struct bch_move_stats   move_stats;
 
        unsigned                enabled:1;
 };
index afb72648fe5416a0cba73f59548f1933ad78fc86..11208e83fabee044669a6bcc592a1e802d41dd40 100644 (file)
@@ -326,7 +326,7 @@ static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
               (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                bch2_bkey_buf_reassemble(&tmp, c, k);
 
-               bch2_btree_node_prefetch(c, NULL, tmp.k,
+               bch2_btree_node_prefetch(c, NULL, NULL, tmp.k,
                                        b->c.btree_id, b->c.level - 1);
 
                bch2_btree_and_journal_iter_advance(&iter);
@@ -518,16 +518,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
                                     enum btree_id id, unsigned level,
                                     struct bkey_i *k)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
-       iter = bch2_trans_get_node_iter(trans, id, k->k.p,
-                                       BTREE_MAX_DEPTH, level,
-                                       BTREE_ITER_INTENT|
-                                       BTREE_ITER_NOT_EXTENTS);
-       ret   = bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_node_iter_init(trans, &iter, id, k->k.p,
+                                 BTREE_MAX_DEPTH, level,
+                                 BTREE_ITER_INTENT|
+                                 BTREE_ITER_NOT_EXTENTS);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -545,16 +545,16 @@ static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
 
 static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p,
-                                  BTREE_ITER_CACHED|
-                                  BTREE_ITER_CACHED_NOFILL|
-                                  BTREE_ITER_INTENT);
-       ret   = bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, k->k.p,
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_CACHED_NOFILL|
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, BTREE_TRIGGER_NORUN);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1216,7 +1216,9 @@ use_clean:
 
        if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
            !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
-               struct bch_move_stats stats = { 0 };
+               struct bch_move_stats stats;
+
+               bch_move_stats_init(&stats, "recovery");
 
                bch_info(c, "scanning for old btree nodes");
                ret = bch2_fs_read_write(c);
index 3d9c5c5b0eba75a7e548e6b1e48f539bddbf4365..576cfbccf5b537b2d000935d739476b278fe58b7 100644 (file)
@@ -116,7 +116,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
                                     struct bkey_i *orig)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *reflink_iter;
+       struct btree_iter reflink_iter = { NULL };
        struct bkey_s_c k;
        struct bkey_i *r_v;
        struct bkey_i_reflink_p *r_p;
@@ -129,8 +129,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
        for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink,
                           POS(0, c->reflink_hint),
                           BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
-               if (reflink_iter->pos.inode) {
-                       bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
+               if (reflink_iter.pos.inode) {
+                       bch2_btree_iter_set_pos(&reflink_iter, POS_MIN);
                        continue;
                }
 
@@ -142,7 +142,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
                goto err;
 
        /* rewind iter to start of hole, if necessary: */
-       bch2_btree_iter_set_pos_to_extent_start(reflink_iter);
+       bch2_btree_iter_set_pos_to_extent_start(&reflink_iter);
 
        r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
        ret = PTR_ERR_OR_ZERO(r_v);
@@ -151,7 +151,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 
        bkey_init(&r_v->k);
        r_v->k.type     = bkey_type_to_indirect(&orig->k);
-       r_v->k.p        = reflink_iter->pos;
+       r_v->k.p        = reflink_iter.pos;
        bch2_key_resize(&r_v->k, orig->k.size);
        r_v->k.version  = orig->k.version;
 
@@ -161,7 +161,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
        *refcount       = 0;
        memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
 
-       ret = bch2_trans_update(trans, reflink_iter, r_v, 0);
+       ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
        if (ret)
                goto err;
 
@@ -172,9 +172,8 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 
        ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
 err:
-       if (!IS_ERR(reflink_iter))
-               c->reflink_hint = reflink_iter->pos.offset;
-       bch2_trans_iter_put(trans, reflink_iter);
+       c->reflink_hint = reflink_iter.pos.offset;
+       bch2_trans_iter_exit(trans, &reflink_iter);
 
        return ret;
 }
@@ -184,7 +183,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
        struct bkey_s_c k;
        int ret;
 
-       for_each_btree_key_continue(iter, 0, k, ret) {
+       for_each_btree_key_continue(*iter, 0, k, ret) {
                if (bkey_cmp(iter->pos, end) >= 0)
                        break;
 
@@ -203,7 +202,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                     u64 new_i_size, s64 *i_sectors_delta)
 {
        struct btree_trans trans;
-       struct btree_iter *dst_iter, *src_iter;
+       struct btree_iter dst_iter, src_iter;
        struct bkey_s_c src_k;
        struct bkey_buf new_dst, new_src;
        struct bpos dst_end = dst_start, src_end = src_start;
@@ -223,13 +222,13 @@ s64 bch2_remap_range(struct bch_fs *c,
        bch2_bkey_buf_init(&new_src);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
 
-       src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start,
-                                      BTREE_ITER_INTENT);
-       dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start,
-                                      BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start,
+                            BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
+                            BTREE_ITER_INTENT);
 
        while ((ret == 0 || ret == -EINTR) &&
-              bkey_cmp(dst_iter->pos, dst_end) < 0) {
+              bkey_cmp(dst_iter.pos, dst_end) < 0) {
                struct disk_reservation disk_res = { 0 };
 
                bch2_trans_begin(&trans);
@@ -239,31 +238,31 @@ s64 bch2_remap_range(struct bch_fs *c,
                        break;
                }
 
-               dst_done = dst_iter->pos.offset - dst_start.offset;
+               dst_done = dst_iter.pos.offset - dst_start.offset;
                src_want = POS(src_start.inode, src_start.offset + dst_done);
-               bch2_btree_iter_set_pos(src_iter, src_want);
+               bch2_btree_iter_set_pos(&src_iter, src_want);
 
-               src_k = get_next_src(src_iter, src_end);
+               src_k = get_next_src(&src_iter, src_end);
                ret = bkey_err(src_k);
                if (ret)
                        continue;
 
-               if (bkey_cmp(src_want, src_iter->pos) < 0) {
-                       ret = bch2_fpunch_at(&trans, dst_iter,
+               if (bkey_cmp(src_want, src_iter.pos) < 0) {
+                       ret = bch2_fpunch_at(&trans, &dst_iter,
                                        bpos_min(dst_end,
-                                                POS(dst_iter->pos.inode, dst_iter->pos.offset +
-                                                    src_iter->pos.offset - src_want.offset)),
+                                                POS(dst_iter.pos.inode, dst_iter.pos.offset +
+                                                    src_iter.pos.offset - src_want.offset)),
                                                 journal_seq, i_sectors_delta);
                        continue;
                }
 
                if (src_k.k->type != KEY_TYPE_reflink_p) {
-                       bch2_btree_iter_set_pos_to_extent_start(src_iter);
+                       bch2_btree_iter_set_pos_to_extent_start(&src_iter);
 
                        bch2_bkey_buf_reassemble(&new_src, c, src_k);
                        src_k = bkey_i_to_s_c(new_src.k);
 
-                       ret = bch2_make_extent_indirect(&trans, src_iter,
+                       ret = bch2_make_extent_indirect(&trans, &src_iter,
                                                new_src.k);
                        if (ret)
                                continue;
@@ -286,43 +285,42 @@ s64 bch2_remap_range(struct bch_fs *c,
                        BUG();
                }
 
-               new_dst.k->k.p = dst_iter->pos;
+               new_dst.k->k.p = dst_iter.pos;
                bch2_key_resize(&new_dst.k->k,
                                min(src_k.k->p.offset - src_want.offset,
-                                   dst_end.offset - dst_iter->pos.offset));
-               ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
+                                   dst_end.offset - dst_iter.pos.offset));
+               ret = bch2_extent_update(&trans, &dst_iter, new_dst.k,
                                         &disk_res, journal_seq,
                                         new_i_size, i_sectors_delta,
                                         true);
                bch2_disk_reservation_put(c, &disk_res);
        }
-       bch2_trans_iter_put(&trans, dst_iter);
-       bch2_trans_iter_put(&trans, src_iter);
+       bch2_trans_iter_exit(&trans, &dst_iter);
+       bch2_trans_iter_exit(&trans, &src_iter);
 
-       BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end));
-       BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
+       BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end));
+       BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0);
 
-       dst_done = dst_iter->pos.offset - dst_start.offset;
-       new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
+       dst_done = dst_iter.pos.offset - dst_start.offset;
+       new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
 
        do {
                struct bch_inode_unpacked inode_u;
-               struct btree_iter *inode_iter;
+               struct btree_iter inode_iter = { NULL };
 
                bch2_trans_begin(&trans);
 
-               inode_iter = bch2_inode_peek(&trans, &inode_u,
+               ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
                                dst_start.inode, BTREE_ITER_INTENT);
-               ret2 = PTR_ERR_OR_ZERO(inode_iter);
 
                if (!ret2 &&
                    inode_u.bi_size < new_i_size) {
                        inode_u.bi_size = new_i_size;
-                       ret2  = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
+                       ret2  = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
                                bch2_trans_commit(&trans, NULL, journal_seq, 0);
                }
 
-               bch2_trans_iter_put(&trans, inode_iter);
+               bch2_trans_iter_exit(&trans, &inode_iter);
        } while (ret2 == -EINTR);
 
        ret = bch2_trans_exit(&trans) ?: ret;
index 23602349419161d7e7bd422f931a81dad44ba4e4..c6a132b3c5bb2eb24cf112f11fd23d254e01b499 100644 (file)
@@ -139,18 +139,18 @@ struct bch_hash_desc {
        bool            (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
 };
 
-static __always_inline struct btree_iter *
+static __always_inline int
 bch2_hash_lookup(struct btree_trans *trans,
+                struct btree_iter *iter,
                 const struct bch_hash_desc desc,
                 const struct bch_hash_info *info,
                 u64 inode, const void *key,
                 unsigned flags)
 {
-       struct btree_iter *iter;
        struct bkey_s_c k;
        int ret;
 
-       for_each_btree_key(trans, iter, desc.btree_id,
+       for_each_btree_key(trans, *iter, desc.btree_id,
                           POS(inode, desc.hash_key(info, key)),
                           BTREE_ITER_SLOTS|flags, k, ret) {
                if (iter->pos.inode != inode)
@@ -158,7 +158,7 @@ bch2_hash_lookup(struct btree_trans *trans,
 
                if (k.k->type == desc.key_type) {
                        if (!desc.cmp_key(k, key))
-                               return iter;
+                               return 0;
                } else if (k.k->type == KEY_TYPE_hash_whiteout) {
                        ;
                } else {
@@ -166,35 +166,33 @@ bch2_hash_lookup(struct btree_trans *trans,
                        break;
                }
        }
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, iter);
 
-       return ERR_PTR(ret ?: -ENOENT);
+       return ret ?: -ENOENT;
 }
 
-static __always_inline struct btree_iter *
+static __always_inline int
 bch2_hash_hole(struct btree_trans *trans,
+              struct btree_iter *iter,
               const struct bch_hash_desc desc,
               const struct bch_hash_info *info,
               u64 inode, const void *key)
 {
-       struct btree_iter *iter;
        struct bkey_s_c k;
        int ret;
 
-       for_each_btree_key(trans, iter, desc.btree_id,
+       for_each_btree_key(trans, *iter, desc.btree_id,
                           POS(inode, desc.hash_key(info, key)),
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
                if (iter->pos.inode != inode)
                        break;
 
                if (k.k->type != desc.key_type)
-                       return iter;
+                       return 0;
        }
+       bch2_trans_iter_exit(trans, iter);
 
-       iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
-       bch2_trans_iter_put(trans, iter);
-
-       return ERR_PTR(ret ?: -ENOSPC);
+       return ret ?: -ENOSPC;
 }
 
 static __always_inline
@@ -203,13 +201,13 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
                             const struct bch_hash_info *info,
                             struct btree_iter *start)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_copy_iter(trans, start);
+       bch2_trans_copy_iter(&iter, start);
 
-       bch2_btree_iter_advance(iter);
+       bch2_btree_iter_advance(&iter);
 
        for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
                if (k.k->type != desc.key_type &&
@@ -218,13 +216,12 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
 
                if (k.k->type == desc.key_type &&
                    desc.hash_bkey(info, k) <= start->pos.offset) {
-                       iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
                        ret = 1;
                        break;
                }
        }
 
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -234,7 +231,7 @@ int bch2_hash_set(struct btree_trans *trans,
                  const struct bch_hash_info *info,
                  u64 inode, struct bkey_i *insert, int flags)
 {
-       struct btree_iter *iter, *slot = NULL;
+       struct btree_iter iter, slot = { NULL };
        struct bkey_s_c k;
        bool found = false;
        int ret;
@@ -242,7 +239,7 @@ int bch2_hash_set(struct btree_trans *trans,
        for_each_btree_key(trans, iter, desc.btree_id,
                           POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               if (iter->pos.inode != inode)
+               if (iter.pos.inode != inode)
                        break;
 
                if (k.k->type == desc.key_type) {
@@ -253,9 +250,9 @@ int bch2_hash_set(struct btree_trans *trans,
                        continue;
                }
 
-               if (!slot &&
+               if (!slot.path &&
                    !(flags & BCH_HASH_SET_MUST_REPLACE))
-                       slot = bch2_trans_copy_iter(trans, iter);
+                       bch2_trans_copy_iter(&slot, &iter);
 
                if (k.k->type != KEY_TYPE_hash_whiteout)
                        goto not_found;
@@ -264,8 +261,8 @@ int bch2_hash_set(struct btree_trans *trans,
        if (!ret)
                ret = -ENOSPC;
 out:
-       bch2_trans_iter_put(trans, slot);
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &slot);
+       bch2_trans_iter_exit(trans, &iter);
 
        return ret;
 found:
@@ -277,11 +274,11 @@ not_found:
        } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
                ret = -EEXIST;
        } else {
-               if (!found && slot)
+               if (!found && slot.path)
                        swap(iter, slot);
 
-               insert->k.p = iter->pos;
-               ret = bch2_trans_update(trans, iter, insert, 0);
+               insert->k.p = iter.pos;
+               ret = bch2_trans_update(trans, &iter, insert, 0);
        }
 
        goto out;
@@ -318,16 +315,16 @@ int bch2_hash_delete(struct btree_trans *trans,
                     const struct bch_hash_info *info,
                     u64 inode, const void *key)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
-       iter = bch2_hash_lookup(trans, desc, info, inode, key,
+       ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key,
                                BTREE_ITER_INTENT);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
+       if (ret)
+               return ret;
 
-       ret = bch2_hash_delete_at(trans, desc, info, iter);
-       bch2_trans_iter_put(trans, iter);
+       ret = bch2_hash_delete_at(trans, desc, info, &iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
index ce8e5d4843d0e8f4bd9a596d0421b9a04860bd4b..8f847661359498c71cb203faa682b7e87f1ec336 100644 (file)
@@ -486,12 +486,12 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_journal_entries_free(&c->journal_entries);
        percpu_free_rwsem(&c->mark_lock);
 
-       if (c->btree_iters_bufs)
+       if (c->btree_paths_bufs)
                for_each_possible_cpu(cpu)
-                       kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
+                       kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
 
        free_percpu(c->online_reserved);
-       free_percpu(c->btree_iters_bufs);
+       free_percpu(c->btree_paths_bufs);
        free_percpu(c->pcpu);
        mempool_exit(&c->large_bkey_pool);
        mempool_exit(&c->btree_bounce_pool);
@@ -704,6 +704,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        INIT_LIST_HEAD(&c->ec_stripe_new_list);
        mutex_init(&c->ec_stripe_new_lock);
 
+       INIT_LIST_HEAD(&c->data_progress_list);
+       mutex_init(&c->data_progress_lock);
+
        spin_lock_init(&c->ec_stripes_heap_lock);
 
        seqcount_init(&c->gc_pos_lock);
@@ -771,7 +774,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
-           !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
+           !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
            !(c->online_reserved = alloc_percpu(u64)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
index 9b1ffbf96e14784a63527ff6128ee925564eee8a..92e58f5c6bbfabb078cefd292b214cff5883d0e8 100644 (file)
@@ -203,6 +203,8 @@ read_attribute(new_stripes);
 read_attribute(io_timers_read);
 read_attribute(io_timers_write);
 
+read_attribute(data_op_data_progress);
+
 #ifdef CONFIG_BCACHEFS_TESTS
 write_attribute(perf_test);
 #endif /* CONFIG_BCACHEFS_TESTS */
@@ -239,6 +241,37 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c)
        return nr ? div64_u64(sectors, nr) : 0;
 }
 
+static long stats_to_text(struct printbuf *out, struct bch_fs *c,
+                         struct bch_move_stats *stats)
+{
+       pr_buf(out, "%s: data type %s btree_id %s position: ",
+               stats->name,
+               bch2_data_types[stats->data_type],
+               bch2_btree_ids[stats->btree_id]);
+       bch2_bpos_to_text(out, stats->pos);
+       pr_buf(out, "%s", "\n");
+
+       return 0;
+}
+
+static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       long ret = 0;
+       struct bch_move_stats *iter;
+
+       mutex_lock(&c->data_progress_lock);
+
+       if (list_empty(&c->data_progress_list))
+               pr_buf(out, "%s", "no progress to report\n");
+       else
+               list_for_each_entry(iter, &c->data_progress_list, list) {
+                       stats_to_text(out, c, iter);
+               }
+
+       mutex_unlock(&c->data_progress_lock);
+       return ret;
+}
+
 static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
 {
        struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c);
@@ -257,7 +290,7 @@ static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
 static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
            nr_compressed_extents = 0,
@@ -292,6 +325,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
                                break;
                        }
                }
+       bch2_trans_iter_exit(&trans, &iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
@@ -434,6 +468,11 @@ SHOW(bch2_fs)
                return out.pos - buf;
        }
 
+       if (attr == &sysfs_data_op_data_progress) {
+               data_progress_to_text(&out, c);
+               return out.pos - buf;
+       }
+
        return 0;
 }
 
@@ -596,6 +635,8 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_io_timers_read,
        &sysfs_io_timers_write,
 
+       &sysfs_data_op_data_progress,
+
        &sysfs_internal_uuid,
        NULL
 };
index 4d8d50fd76428520595fdab1a38dc0886b3a7af7..d5a74f4db64d115149a3c81cb087bf6571569faa 100644 (file)
@@ -29,7 +29,7 @@ static void delete_test_keys(struct bch_fs *c)
 static int test_delete(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_i_cookie k;
        int ret;
 
@@ -37,13 +37,12 @@ static int test_delete(struct bch_fs *c, u64 nr)
        k.k.p.snapshot = U32_MAX;
 
        bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
-                                  BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+                            BTREE_ITER_INTENT);
 
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(&trans, iter, &k.k_i, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(&trans, &iter, &k.k_i, 0));
        if (ret) {
                bch_err(c, "update error in test_delete: %i", ret);
                goto err;
@@ -51,8 +50,8 @@ static int test_delete(struct bch_fs *c, u64 nr)
 
        pr_info("deleting once");
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_btree_delete_at(&trans, iter, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_btree_delete_at(&trans, &iter, 0));
        if (ret) {
                bch_err(c, "delete error (first) in test_delete: %i", ret);
                goto err;
@@ -60,14 +59,14 @@ static int test_delete(struct bch_fs *c, u64 nr)
 
        pr_info("deleting twice");
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_btree_delete_at(&trans, iter, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_btree_delete_at(&trans, &iter, 0));
        if (ret) {
                bch_err(c, "delete error (second) in test_delete: %i", ret);
                goto err;
        }
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -75,7 +74,7 @@ err:
 static int test_delete_written(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_i_cookie k;
        int ret;
 
@@ -84,12 +83,12 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
-                                  BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+                            BTREE_ITER_INTENT);
 
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(&trans, iter, &k.k_i, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(&trans, &iter, &k.k_i, 0));
        if (ret) {
                bch_err(c, "update error in test_delete_written: %i", ret);
                goto err;
@@ -99,14 +98,14 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
        bch2_journal_flush_all_pins(&c->journal);
 
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_btree_delete_at(&trans, iter, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_btree_delete_at(&trans, &iter, 0));
        if (ret) {
                bch_err(c, "delete error in test_delete_written: %i", ret);
                goto err;
        }
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -114,7 +113,7 @@ err:
 static int test_iterate(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -156,12 +155,12 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
        pr_info("iterating backwards");
 
-       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k))
+       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
                BUG_ON(k.k->p.offset != --i);
 
        BUG_ON(i);
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -169,7 +168,7 @@ err:
 static int test_iterate_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -210,14 +209,14 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
        pr_info("iterating backwards");
 
-       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) {
+       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
                BUG_ON(k.k->p.offset != i);
                i = bkey_start_offset(k.k);
        }
 
        BUG_ON(i);
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -225,7 +224,7 @@ err:
 static int test_iterate_slots(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -263,7 +262,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
                BUG_ON(k.k->p.offset != i);
                i += 2;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        BUG_ON(i != nr * 2);
 
@@ -280,7 +279,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
                if (i == nr * 2)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 err:
        bch2_trans_exit(&trans);
        return ret;
@@ -289,7 +288,7 @@ err:
 static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -326,7 +325,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                BUG_ON(k.k->size != 8);
                i += 16;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        BUG_ON(i != nr);
 
@@ -345,7 +344,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                if (i == nr)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 err:
        bch2_trans_exit(&trans);
        return 0;
@@ -358,21 +357,19 @@ err:
 static int test_peek_end(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
 
        bch2_trans_init(&trans, c, 0, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
-
-       k = bch2_btree_iter_peek(iter);
+       k = bch2_btree_iter_peek(&iter);
        BUG_ON(k.k);
 
-       k = bch2_btree_iter_peek(iter);
+       k = bch2_btree_iter_peek(&iter);
        BUG_ON(k.k);
 
-       bch2_trans_iter_put(&trans, iter);
-
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return 0;
 }
@@ -380,21 +377,19 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
 static int test_peek_end_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
 
        bch2_trans_init(&trans, c, 0, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS_MIN, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0);
-
-       k = bch2_btree_iter_peek(iter);
+       k = bch2_btree_iter_peek(&iter);
        BUG_ON(k.k);
 
-       k = bch2_btree_iter_peek(iter);
+       k = bch2_btree_iter_peek(&iter);
        BUG_ON(k.k);
 
-       bch2_trans_iter_put(&trans, iter);
-
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return 0;
 }
@@ -540,18 +535,18 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr)
 static int rand_lookup(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
        u64 i;
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0);
 
        for (i = 0; i < nr; i++) {
-               bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
+               bch2_btree_iter_set_pos(&iter, POS(0, test_rand()));
 
-               k = bch2_btree_iter_peek(iter);
+               k = bch2_btree_iter_peek(&iter);
                ret = bkey_err(k);
                if (ret) {
                        bch_err(c, "error in rand_lookup: %i", ret);
@@ -559,63 +554,73 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
                }
        }
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
 
+static int rand_mixed_trans(struct btree_trans *trans,
+                           struct btree_iter *iter,
+                           struct bkey_i_cookie *cookie,
+                           u64 i, u64 pos)
+{
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_btree_iter_set_pos(iter, POS(0, pos));
+
+       k = bch2_btree_iter_peek(iter);
+       ret = bkey_err(k);
+       if (ret && ret != -EINTR)
+               bch_err(trans->c, "lookup error in rand_mixed: %i", ret);
+       if (ret)
+               return ret;
+
+       if (!(i & 3) && k.k) {
+               bkey_cookie_init(&cookie->k_i);
+               cookie->k.p = iter->pos;
+               bch2_trans_update(trans, iter, &cookie->k_i, 0);
+       }
+
+       return 0;
+}
+
 static int rand_mixed(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bkey_s_c k;
+       struct btree_iter iter;
+       struct bkey_i_cookie cookie;
        int ret = 0;
-       u64 i;
+       u64 i, rand;
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, POS_MIN, 0);
 
        for (i = 0; i < nr; i++) {
-               bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
-
-               k = bch2_btree_iter_peek(iter);
-               ret = bkey_err(k);
+               rand = test_rand();
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       rand_mixed_trans(&trans, &iter, &cookie, i, rand));
                if (ret) {
-                       bch_err(c, "lookup error in rand_mixed: %i", ret);
+                       bch_err(c, "update error in rand_mixed: %i", ret);
                        break;
                }
-
-               if (!(i & 3) && k.k) {
-                       struct bkey_i_cookie k;
-
-                       bkey_cookie_init(&k.k_i);
-                       k.k.p = iter->pos;
-
-                       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                               bch2_btree_iter_traverse(iter) ?:
-                               bch2_trans_update(&trans, iter, &k.k_i, 0));
-                       if (ret) {
-                               bch_err(c, "update error in rand_mixed: %i", ret);
-                               break;
-                       }
-               }
        }
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
 
 static int __do_delete(struct btree_trans *trans, struct bpos pos)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_i delete;
        struct bkey_s_c k;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos,
-                                  BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -626,9 +631,9 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
        bkey_init(&delete.k);
        delete.k.p = k.k->p;
 
-       ret = bch2_trans_update(trans, iter, &delete, 0);
+       ret = bch2_trans_update(trans, &iter, &delete, 0);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -658,7 +663,7 @@ static int rand_delete(struct bch_fs *c, u64 nr)
 static int seq_insert(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_cookie insert;
        int ret = 0;
@@ -670,11 +675,11 @@ static int seq_insert(struct bch_fs *c, u64 nr)
 
        for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               insert.k.p = iter->pos;
+               insert.k.p = iter.pos;
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(&trans, iter, &insert.k_i, 0));
+                       bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(&trans, &iter, &insert.k_i, 0));
                if (ret) {
                        bch_err(c, "error in seq_insert: %i", ret);
                        break;
@@ -683,7 +688,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
                if (++i == nr)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        return ret;
@@ -692,7 +697,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
 static int seq_lookup(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
@@ -700,7 +705,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
 
        for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
                ;
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        return ret;
@@ -709,7 +714,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
 static int seq_overwrite(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
@@ -722,14 +727,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
                bkey_reassemble(&u.k_i, k);
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(&trans, iter, &u.k_i, 0));
+                       bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(&trans, &iter, &u.k_i, 0));
                if (ret) {
                        bch_err(c, "error in seq_overwrite: %i", ret);
                        break;
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        return ret;
@@ -778,7 +783,7 @@ static int btree_perf_test_thread(void *data)
                wait_event(j->ready_wait, !atomic_read(&j->ready));
        }
 
-       ret = j->fn(j->c, j->nr / j->nr_threads);
+       ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
        if (ret)
                j->ret = ret;
 
@@ -854,11 +859,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
 
        scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
        bch2_hprint(&PBUF(nr_buf), nr);
-       bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time);
+       bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time));
        printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
                name_buf, nr_buf, nr_threads,
-               time / NSEC_PER_SEC,
-               time * nr_threads / nr,
+               div_u64(time, NSEC_PER_SEC),
+               div_u64(time * nr_threads, nr),
                per_sec_buf);
        return j.ret;
 }
index e6a041541792676d8936fdbc44dfac95314de076..752179b26a1eaac8db7c5203eed07face0306db9 100644 (file)
@@ -96,7 +96,7 @@ int bch2_varint_encode_fast(u8 *out, u64 v)
 int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
 {
        u64 v = get_unaligned_le64(in);
-       unsigned bytes = ffz(v & 255) + 1;
+       unsigned bytes = ffz(*in) + 1;
 
        if (unlikely(in + bytes > end))
                return -1;
index e4d400b16dbaf8da85a211c55ddc9b74195d86e0..ef6ae97e0df58886d89785bfe7cd522e3d1b6409 100644 (file)
@@ -122,23 +122,22 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
                                const char *name, void *buffer, size_t size, int type)
 {
        struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c_xattr xattr;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash,
-                               inode->v.i_ino,
-                               &X_SEARCH(type, name, strlen(name)),
-                               0);
-       ret = PTR_ERR_OR_ZERO(iter);
+       ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+                              inode->v.i_ino,
+                              &X_SEARCH(type, name, strlen(name)),
+                              0);
        if (ret)
-               goto err;
+               goto err1;
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
-               goto err;
+               goto err2;
 
        xattr = bkey_s_c_to_xattr(k);
        ret = le16_to_cpu(xattr.v->x_val_len);
@@ -148,8 +147,9 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
                else
                        memcpy(buffer, xattr_val(xattr.v), ret);
        }
-       bch2_trans_iter_put(trans, iter);
-err:
+err2:
+       bch2_trans_iter_exit(trans, &iter);
+err1:
        return ret == -ENOENT ? -ENODATA : ret;
 }
 
@@ -279,7 +279,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
        struct bch_fs *c = dentry->d_sb->s_fs_info;
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
        u64 inum = dentry->d_inode->i_ino;
@@ -301,7 +301,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        ret = bch2_trans_exit(&trans) ?: ret;