]> git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/snapshot.c
Disable pristine-tar option in gbp.conf, since there is no pristine-tar branch.
[bcachefs-tools-debian] / libbcachefs / snapshot.c
index 03ae280aee3a668e67179a9d8a675291996020e7..45f67e8b29eb67f188e5cfb32aa39e0b1ad1d625 100644 (file)
@@ -30,17 +30,18 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
                   le32_to_cpu(t.v->root_snapshot));
 }
 
-int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
                               enum bkey_invalid_flags flags,
                               struct printbuf *err)
 {
-       if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
-           bkey_lt(k.k->p, POS(0, 1))) {
-               prt_printf(err, "bad pos");
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
 
-       return 0;
+       bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+                        bkey_lt(k.k->p, POS(0, 1)), c, err,
+                        snapshot_tree_pos_bad,
+                        "bad pos");
+fsck_err:
+       return ret;
 }
 
 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
@@ -122,7 +123,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
        struct snapshot_table *t;
        bool ret;
 
-       EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
+       EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
 
        rcu_read_lock();
        t = rcu_dereference(c->snapshots);
@@ -163,8 +164,7 @@ static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
 
        rcu_assign_pointer(c->snapshots, new);
        c->snapshot_table_size = new_size;
-       if (old)
-               kvfree_rcu(old);
+       kvfree_rcu_mightsleep(old);
 
        return &rcu_dereference_protected(c->snapshots, true)->s[idx];
 }
@@ -203,68 +203,60 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
                           le32_to_cpu(s.v->skip[2]));
 }
 
-int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
 {
        struct bkey_s_c_snapshot s;
        u32 i, id;
+       int ret = 0;
 
-       if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
-           bkey_lt(k.k->p, POS(0, 1))) {
-               prt_printf(err, "bad pos");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+                        bkey_lt(k.k->p, POS(0, 1)), c, err,
+                        snapshot_pos_bad,
+                        "bad pos");
 
        s = bkey_s_c_to_snapshot(k);
 
        id = le32_to_cpu(s.v->parent);
-       if (id && id <= k.k->p.offset) {
-               prt_printf(err, "bad parent node (%u <= %llu)",
-                      id, k.k->p.offset);
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
+                        snapshot_parent_bad,
+                        "bad parent node (%u <= %llu)",
+                        id, k.k->p.offset);
 
-       if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
-               prt_printf(err, "children not normalized");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
+                        snapshot_children_not_normalized,
+                        "children not normalized");
 
-       if (s.v->children[0] &&
-           s.v->children[0] == s.v->children[1]) {
-               prt_printf(err, "duplicate child nodes");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
+                        snapshot_child_duplicate,
+                        "duplicate child nodes");
 
        for (i = 0; i < 2; i++) {
                id = le32_to_cpu(s.v->children[i]);
 
-               if (id >= k.k->p.offset) {
-                       prt_printf(err, "bad child node (%u >= %llu)",
-                              id, k.k->p.offset);
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(id >= k.k->p.offset, c, err,
+                                snapshot_child_bad,
+                                "bad child node (%u >= %llu)",
+                                id, k.k->p.offset);
        }
 
        if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
-               if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
-                   le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
-                       prt_printf(err, "skiplist not normalized");
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
+                                le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
+                                snapshot_skiplist_not_normalized,
+                                "skiplist not normalized");
 
                for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
                        id = le32_to_cpu(s.v->skip[i]);
 
-                       if ((id && !s.v->parent) ||
-                           (id && id <= k.k->p.offset)) {
-                               prt_printf(err, "bad skiplist node %u", id);
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
+                                        snapshot_skiplist_bad,
+                                        "bad skiplist node %u", id);
                }
        }
-
-       return 0;
+fsck_err:
+       return ret;
 }
 
 static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
@@ -284,7 +276,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
        mutex_unlock(&c->snapshot_table_lock);
 }
 
-int bch2_mark_snapshot(struct btree_trans *trans,
+static int __bch2_mark_snapshot(struct btree_trans *trans,
                       enum btree_id btree, unsigned level,
                       struct bkey_s_c old, struct bkey_s_c new,
                       unsigned flags)
@@ -326,8 +318,9 @@ int bch2_mark_snapshot(struct btree_trans *trans,
                __set_is_ancestor_bitmap(c, id);
 
                if (BCH_SNAPSHOT_DELETED(s.v)) {
-                       set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
-                       c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
+                       set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
+                       if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
+                               bch2_delete_dead_snapshots_async(c);
                }
        } else {
                memset(t, 0, sizeof(*t));
@@ -337,6 +330,14 @@ err:
        return ret;
 }
 
+int bch2_mark_snapshot(struct btree_trans *trans,
+                      enum btree_id btree, unsigned level,
+                      struct bkey_s_c old, struct bkey_s new,
+                      unsigned flags)
+{
+       return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
+}
+
 int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
                         struct bch_snapshot *s)
 {
@@ -344,7 +345,7 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
                                       BTREE_ITER_WITH_UPDATES, snapshot, s);
 }
 
-int bch2_snapshot_live(struct btree_trans *trans, u32 id)
+static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
 {
        struct bch_snapshot v;
        int ret;
@@ -371,7 +372,7 @@ int bch2_snapshot_live(struct btree_trans *trans, u32 id)
  * it's part of such a linear chain: this correctly sets equivalence classes on
  * startup if we run leaf to root (i.e. in natural key order).
  */
-int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
        unsigned i, nr_live = 0, live_idx = 0;
@@ -466,7 +467,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct bkey_s_c_subvolume s;
        bool found = false;
        int ret;
 
@@ -475,7 +475,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
                if (k.k->type != KEY_TYPE_subvolume)
                        continue;
 
-               s = bkey_s_c_to_subvolume(k);
+               struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
                if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
                        continue;
                if (!BCH_SUBVOLUME_SNAP(s.v)) {
@@ -488,18 +488,18 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
        bch2_trans_iter_exit(trans, &iter);
 
        if (!ret && !found) {
-               struct bkey_i_subvolume *s;
+               struct bkey_i_subvolume *u;
 
                *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
 
-               s = bch2_bkey_get_mut_typed(trans, &iter,
+               u = bch2_bkey_get_mut_typed(trans, &iter,
                                            BTREE_ID_subvolumes, POS(0, *subvol_id),
                                            0, subvolume);
-               ret = PTR_ERR_OR_ZERO(s);
+               ret = PTR_ERR_OR_ZERO(u);
                if (ret)
                        return ret;
 
-               SET_BCH_SUBVOLUME_SNAP(&s->v, false);
+               SET_BCH_SUBVOLUME_SNAP(&u->v, false);
        }
 
        return ret;
@@ -530,7 +530,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
        if (fsck_err_on(ret ||
                        root_id != bch2_snapshot_root(c, root_id) ||
                        st.k->p.offset != le32_to_cpu(s.tree),
-                       c,
+                       c, snapshot_tree_to_missing_snapshot,
                        "snapshot tree points to missing/incorrect snapshot:\n  %s",
                        (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
                ret = bch2_btree_delete_at(trans, iter, 0);
@@ -542,17 +542,20 @@ static int check_snapshot_tree(struct btree_trans *trans,
        if (ret && !bch2_err_matches(ret, ENOENT))
                goto err;
 
-       if (fsck_err_on(ret, c,
+       if (fsck_err_on(ret,
+                       c, snapshot_tree_to_missing_subvol,
                        "snapshot tree points to missing subvolume:\n  %s",
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
            fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
                                                le32_to_cpu(subvol.snapshot),
-                                               root_id), c,
+                                               root_id),
+                       c, snapshot_tree_to_wrong_subvol,
                        "snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
-           fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
+           fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
+                       c, snapshot_tree_to_snapshot_subvol,
                        "snapshot tree points to snapshot subvolume:\n  %s",
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
@@ -586,19 +589,13 @@ fsck_err:
  */
 int bch2_check_snapshot_trees(struct bch_fs *c)
 {
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+       int ret = bch2_trans_run(c,
+               for_each_btree_key_commit(trans, iter,
                        BTREE_ID_snapshot_trees, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_snapshot_tree(&trans, &iter, k)));
-
-       if (ret)
-               bch_err(c, "error %i checking snapshot trees", ret);
+                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+               check_snapshot_tree(trans, &iter, k)));
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -788,7 +785,9 @@ static int check_snapshot(struct btree_trans *trans,
                        goto err;
                }
        } else {
-               if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n  %s",
+               if (fsck_err_on(s.subvol,
+                               c, snapshot_should_not_have_subvol,
+                               "snapshot should not point to subvol:\n  %s",
                                (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                        u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
                        ret = PTR_ERR_OR_ZERO(u);
@@ -804,7 +803,8 @@ static int check_snapshot(struct btree_trans *trans,
        if (ret < 0)
                goto err;
 
-       if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n  %s",
+       if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
+                       "snapshot points to missing/incorrect tree:\n  %s",
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
                if (ret)
@@ -814,10 +814,10 @@ static int check_snapshot(struct btree_trans *trans,
 
        real_depth = bch2_snapshot_depth(c, parent_id);
 
-       if (le32_to_cpu(s.depth) != real_depth &&
-           (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
-            fsck_err(c, "snapshot with incorrect depth field, should be %u:\n  %s",
-                     real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+       if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
+                       c, snapshot_bad_depth,
+                       "snapshot with incorrect depth field, should be %u:\n  %s",
+                       real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
                ret = PTR_ERR_OR_ZERO(u);
                if (ret)
@@ -831,10 +831,9 @@ static int check_snapshot(struct btree_trans *trans,
        if (ret < 0)
                goto err;
 
-       if (!ret &&
-           (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
-            fsck_err(c, "snapshot with bad skiplist field:\n  %s",
-                     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+       if (fsck_err_on(!ret, c, snapshot_bad_skiplist,
+                       "snapshot with bad skiplist field:\n  %s",
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
                ret = PTR_ERR_OR_ZERO(u);
                if (ret)
@@ -855,22 +854,17 @@ fsck_err:
 
 int bch2_check_snapshots(struct bch_fs *c)
 {
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
        /*
         * We iterate backwards as checking/fixing the depth field requires that
         * the parent's depth already be correct:
         */
-       ret = bch2_trans_run(c,
-               for_each_btree_key_reverse_commit(&trans, iter,
-                       BTREE_ID_snapshots, POS_MAX,
-                       BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_snapshot(&trans, &iter, k)));
-       if (ret)
-               bch_err_fn(c, ret);
+       int ret = bch2_trans_run(c,
+               for_each_btree_key_reverse_commit(trans, iter,
+                               BTREE_ID_snapshots, POS_MAX,
+                               BTREE_ITER_PREFETCH, k,
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                       check_snapshot(trans, &iter, k)));
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -911,7 +905,7 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
                swap(s->children[0], s->children[1]);
 }
 
-int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
@@ -958,7 +952,7 @@ int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
                                        parent_id, id))
                        goto err;
 
-               parent->v.children[i] = le32_to_cpu(child_id);
+               parent->v.children[i] = cpu_to_le32(child_id);
 
                normalize_snapshot_child_pointers(&parent->v);
        }
@@ -1059,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
                n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
                n->v.tree       = cpu_to_le32(tree);
                n->v.depth      = cpu_to_le32(depth);
+               n->v.btime.lo   = cpu_to_le64(bch2_current_time(c));
+               n->v.btime.hi   = 0;
 
                for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
                        n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
@@ -1066,12 +1062,16 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
                bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
                SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
 
-               ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+               ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
                                         bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
                if (ret)
                        goto err;
 
                new_snapids[i]  = iter.pos.offset;
+
+               mutex_lock(&c->snapshot_table_lock);
+               snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
+               mutex_unlock(&c->snapshot_table_lock);
        }
 err:
        bch2_trans_iter_exit(trans, &iter);
@@ -1248,13 +1248,7 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans,
        return 0;
 }
 
-/*
- * For a given snapshot, if it doesn't have a subvolume that points to it, and
- * it doesn't have child snapshot nodes - it's now redundant and we can mark it
- * as deleted.
- */
-static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
-                                         struct bkey_s_c k)
+static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
 {
        struct bkey_s_c_snapshot snap;
        u32 children[2];
@@ -1275,16 +1269,30 @@ static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btre
                bch2_snapshot_live(trans, children[1]);
        if (ret < 0)
                return ret;
+       return !ret;
+}
 
-       if (!ret)
-               return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
-       return 0;
+/*
+ * For a given snapshot, if it doesn't have a subvolume that points to it, and
+ * it doesn't have child snapshot nodes - it's now redundant and we can mark it
+ * as deleted.
+ */
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
+{
+       int ret = bch2_snapshot_needs_delete(trans, k);
+
+       return ret <= 0
+               ? ret
+               : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
 }
 
 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
                                                snapshot_id_list *skip)
 {
        rcu_read_lock();
+       while (snapshot_list_has_id(skip, id))
+               id = __bch2_snapshot_parent(c, id);
+
        while (n--) {
                do {
                        id = __bch2_snapshot_parent(c, id);
@@ -1302,7 +1310,6 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        u32 nr_deleted_ancestors = 0;
        struct bkey_i_snapshot *s;
-       u32 *i;
        int ret;
 
        if (k.k->type != KEY_TYPE_snapshot)
@@ -1336,12 +1343,12 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
                        u32 id = le32_to_cpu(s->v.skip[j]);
 
                        if (snapshot_list_has_id(deleted, id)) {
-                               id = depth > 1
-                                       ? bch2_snapshot_nth_parent_skip(c,
+                               id = bch2_snapshot_nth_parent_skip(c,
                                                        parent,
-                                                       get_random_u32_below(depth - 1),
-                                                       deleted)
-                                       : parent;
+                                                       depth > 1
+                                                       ? get_random_u32_below(depth - 1)
+                                                       : 0,
+                                                       deleted);
                                s->v.skip[j] = cpu_to_le32(id);
                        }
                }
@@ -1354,64 +1361,55 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
 
 int bch2_delete_dead_snapshots(struct bch_fs *c)
 {
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_s_c_snapshot snap;
+       struct btree_trans *trans;
        snapshot_id_list deleted = { 0 };
        snapshot_id_list deleted_interior = { 0 };
-       u32 *i, id;
+       u32 id;
        int ret = 0;
 
-       if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+       if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
+               return 0;
+
+       if (!test_bit(BCH_FS_started, &c->flags)) {
                ret = bch2_fs_read_write_early(c);
-               if (ret) {
-                       bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+               if (ret)
                        return ret;
-               }
        }
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
        /*
         * For every snapshot node: If we have no live children and it's not
         * pointed to by a subvolume, delete it:
         */
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
                        POS_MIN, 0, k,
                        NULL, NULL, 0,
-               bch2_delete_redundant_snapshot(&trans, &iter, k));
-       if (ret) {
-               bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
+               bch2_delete_redundant_snapshot(trans, k));
+       bch_err_msg(c, ret, "deleting redundant snapshots");
+       if (ret)
                goto err;
-       }
 
-       for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k,
-               bch2_snapshot_set_equiv(&trans, k));
-       if (ret) {
-               bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
+       ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+                                POS_MIN, 0, k,
+               bch2_snapshot_set_equiv(trans, k));
+       bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
+       if (ret)
                goto err;
-       }
 
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
+       ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+                                POS_MIN, 0, k, ({
                if (k.k->type != KEY_TYPE_snapshot)
                        continue;
 
-               snap = bkey_s_c_to_snapshot(k);
-               if (BCH_SNAPSHOT_DELETED(snap.v)) {
-                       ret = snapshot_list_add(c, &deleted, k.k->p.offset);
-                       if (ret)
-                               break;
-               }
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-
-       if (ret) {
-               bch_err_msg(c, ret, "walking snapshots");
+               BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
+                       ? snapshot_list_add(c, &deleted, k.k->p.offset)
+                       : 0;
+       }));
+       bch_err_msg(c, ret, "walking snapshots");
+       if (ret)
                goto err;
-       }
 
        for (id = 0; id < BTREE_ID_NR; id++) {
                struct bpos last_pos = POS_MIN;
@@ -1421,73 +1419,85 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
                if (!btree_type_has_snapshots(id))
                        continue;
 
-               ret = for_each_btree_key_commit(&trans, iter,
+               /*
+                * deleted inodes btree is maintained by a trigger on the inodes
+                * btree - no work for us to do here, and it's not safe to scan
+                * it because we'll see out of date keys due to the btree write
+                * buffer:
+                */
+               if (id == BTREE_ID_deleted_inodes)
+                       continue;
+
+               ret = for_each_btree_key_commit(trans, iter,
                                id, POS_MIN,
                                BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                               &res, NULL, BTREE_INSERT_NOFAIL,
-                       snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
-                     for_each_btree_key_commit(&trans, iter,
+                               &res, NULL, BCH_TRANS_COMMIT_no_enospc,
+                       snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
+                     for_each_btree_key_commit(trans, iter,
                                id, POS_MIN,
                                BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                               &res, NULL, BTREE_INSERT_NOFAIL,
-                       move_key_to_correct_snapshot(&trans, &iter, k));
+                               &res, NULL, BCH_TRANS_COMMIT_no_enospc,
+                       move_key_to_correct_snapshot(trans, &iter, k));
 
                bch2_disk_reservation_put(c, &res);
                darray_exit(&equiv_seen);
 
-               if (ret) {
-                       bch_err_msg(c, ret, "deleting keys from dying snapshots");
+               bch_err_msg(c, ret, "deleting keys from dying snapshots");
+               if (ret)
                        goto err;
-               }
        }
 
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
+       bch2_trans_unlock(trans);
+       down_write(&c->snapshot_create_lock);
+
+       ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+                                POS_MIN, 0, k, ({
                u32 snapshot = k.k->p.offset;
                u32 equiv = bch2_snapshot_equiv(c, snapshot);
 
-               if (equiv != snapshot)
-                       snapshot_list_add(c, &deleted_interior, snapshot);
-       }
-       bch2_trans_iter_exit(&trans, &iter);
+               equiv != snapshot
+                       ? snapshot_list_add(c, &deleted_interior, snapshot)
+                       : 0;
+       }));
+
+       bch_err_msg(c, ret, "walking snapshots");
+       if (ret)
+               goto err_create_lock;
 
        /*
         * Fixing children of deleted snapshots can't be done completely
         * atomically, if we crash between here and when we delete the interior
         * nodes some depth fields will be off:
         */
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
                                  BTREE_ITER_INTENT, k,
-                                 NULL, NULL, BTREE_INSERT_NOFAIL,
-               bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior));
+                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+               bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
        if (ret)
-               goto err;
+               goto err_create_lock;
 
        darray_for_each(deleted, i) {
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_snapshot_node_delete(&trans, *i));
-               if (ret) {
-                       bch_err_msg(c, ret, "deleting snapshot %u", *i);
-                       goto err;
-               }
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(trans, *i));
+               bch_err_msg(c, ret, "deleting snapshot %u", *i);
+               if (ret)
+                       goto err_create_lock;
        }
 
        darray_for_each(deleted_interior, i) {
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_snapshot_node_delete(&trans, *i));
-               if (ret) {
-                       bch_err_msg(c, ret, "deleting snapshot %u", *i);
-                       goto err;
-               }
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(trans, *i));
+               bch_err_msg(c, ret, "deleting snapshot %u", *i);
+               if (ret)
+                       goto err_create_lock;
        }
-
-       clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+err_create_lock:
+       up_write(&c->snapshot_create_lock);
 err:
        darray_exit(&deleted_interior);
        darray_exit(&deleted);
-       bch2_trans_exit(&trans);
-       if (ret)
-               bch_err_fn(c, ret);
+       bch2_trans_put(trans);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1495,8 +1505,7 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work)
 {
        struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
 
-       if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
-               bch2_delete_dead_snapshots(c);
+       bch2_delete_dead_snapshots(c);
        bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
 }
 
@@ -1507,20 +1516,6 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c)
                bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
 }
 
-int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
-                                   struct btree_trans_commit_hook *h)
-{
-       struct bch_fs *c = trans->c;
-
-       set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
-
-       if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
-               return 0;
-
-       bch2_delete_dead_snapshots_async(c);
-       return 0;
-}
-
 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
                                       enum btree_id id,
                                       struct bpos pos)
@@ -1618,7 +1613,8 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct bkey_buf sk;
-       int ret;
+       u32 restart_count = trans->restart_count;
+       int ret = 0;
 
        bch2_bkey_buf_init(&sk);
        bch2_bkey_buf_reassemble(&sk, c, k);
@@ -1632,37 +1628,60 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
                if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
                    !bch2_snapshot_is_leaf(c, id))
                        continue;
+again:
+               ret =   btree_trans_too_many_iters(trans) ?:
+                       bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
+                       bch2_trans_commit(trans, NULL, NULL, 0);
+               if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+                       bch2_trans_begin(trans);
+                       goto again;
+               }
 
-               ret = commit_do(trans, NULL, NULL, 0,
-                               bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos));
                if (ret)
                        break;
        }
 
        bch2_bkey_buf_exit(&sk, c);
-       return ret;
+
+       return ret ?: trans_was_restarted(trans, restart_count);
 }
 
-int bch2_snapshots_read(struct bch_fs *c)
+static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
 {
-       struct btree_iter iter;
-       struct bkey_s_c k;
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c_snapshot snap;
        int ret = 0;
 
-       ret = bch2_trans_run(c,
-               for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k,
-                       bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
-                       bch2_snapshot_set_equiv(&trans, k)) ?:
-               for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k,
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
+
+       snap = bkey_s_c_to_snapshot(k);
+       if (BCH_SNAPSHOT_DELETED(snap.v) ||
+           bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
+           (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
+               set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
+               return 0;
+       }
+
+       return ret;
+}
+
+int bch2_snapshots_read(struct bch_fs *c)
+{
+       int ret = bch2_trans_run(c,
+               for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+                                  POS_MIN, 0, k,
+                       __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+                       bch2_snapshot_set_equiv(trans, k) ?:
+                       bch2_check_snapshot_needs_deletion(trans, k)) ?:
+               for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+                                  POS_MIN, 0, k,
                           (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
 void bch2_fs_snapshots_exit(struct bch_fs *c)
 {
-       kfree(rcu_dereference_protected(c->snapshots, true));
+       kvfree(rcu_dereference_protected(c->snapshots, true));
 }