]> git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/subvolume.c
New upstream release
[bcachefs-tools-debian] / libbcachefs / subvolume.c
index 69603327d93df6587f4e8713d249c240c8bc1fde..8c98bacca290b8301f421a16707eebf811e32126 100644 (file)
@@ -3,21 +3,19 @@
 #include "bcachefs.h"
 #include "btree_key_cache.h"
 #include "btree_update.h"
+#include "errcode.h"
 #include "error.h"
 #include "fs.h"
 #include "subvolume.h"
 
 /* Snapshot tree: */
 
-static void bch2_delete_dead_snapshots_work(struct work_struct *);
-static void bch2_delete_dead_snapshots(struct bch_fs *);
-
 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
                           struct bkey_s_c k)
 {
        struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
 
-       pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
+       prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u",
               BCH_SNAPSHOT_SUBVOL(s.v),
               BCH_SNAPSHOT_DELETED(s.v),
               le32_to_cpu(s.v->parent),
@@ -26,39 +24,55 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
               le32_to_cpu(s.v->subvol));
 }
 
-const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                         int rw, struct printbuf *err)
 {
        struct bkey_s_c_snapshot s;
        u32 i, id;
 
        if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
-           bkey_cmp(k.k->p, POS(0, 1)) < 0)
-               return "bad pos";
+           bkey_cmp(k.k->p, POS(0, 1)) < 0) {
+               prt_printf(err, "bad pos");
+               return -EINVAL;
+       }
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
-               return "bad val size";
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) {
+               prt_printf(err, "bad val size (%zu != %zu)",
+                      bkey_val_bytes(k.k), sizeof(struct bch_snapshot));
+               return -EINVAL;
+       }
 
        s = bkey_s_c_to_snapshot(k);
 
        id = le32_to_cpu(s.v->parent);
-       if (id && id <= k.k->p.offset)
-               return "bad parent node";
+       if (id && id <= k.k->p.offset) {
+               prt_printf(err, "bad parent node (%u <= %llu)",
+                      id, k.k->p.offset);
+               return -EINVAL;
+       }
 
-       if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
-               return "children not normalized";
+       if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
+               prt_printf(err, "children not normalized");
+               return -EINVAL;
+       }
 
        if (s.v->children[0] &&
-           s.v->children[0] == s.v->children[1])
-               return "duplicate child nodes";
+           s.v->children[0] == s.v->children[1]) {
+               prt_printf(err, "duplicate child nodes");
+               return -EINVAL;
+       }
 
        for (i = 0; i < 2; i++) {
                id = le32_to_cpu(s.v->children[i]);
 
-               if (id >= k.k->p.offset)
-                       return "bad child node";
+               if (id >= k.k->p.offset) {
+                       prt_printf(err, "bad child node (%u >= %llu)",
+                              id, k.k->p.offset);
+                       return -EINVAL;
+               }
        }
 
-       return NULL;
+       return 0;
 }
 
 int bch2_mark_snapshot(struct btree_trans *trans,
@@ -118,7 +132,7 @@ static int snapshot_live(struct btree_trans *trans, u32 id)
        if (!id)
                return 0;
 
-       ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+       ret = snapshot_lookup(trans, id, &v);
        if (ret == -ENOENT)
                bch_err(trans->c, "snapshot node %u not found", id);
        if (ret)
@@ -127,156 +141,206 @@ static int snapshot_live(struct btree_trans *trans, u32 id)
        return !BCH_SNAPSHOT_DELETED(&v);
 }
 
-static int bch2_snapshots_set_equiv(struct btree_trans *trans)
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
+       unsigned i, nr_live = 0, live_idx = 0;
        struct bkey_s_c_snapshot snap;
-       unsigned i;
-       int ret;
+       u32 id = k.k->p.offset, child[2];
 
-       for_each_btree_key(trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
-               u32 id = k.k->p.offset, child[2];
-               unsigned nr_live = 0, live_idx;
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
 
-               if (k.k->type != KEY_TYPE_snapshot)
-                       continue;
+       snap = bkey_s_c_to_snapshot(k);
 
-               snap = bkey_s_c_to_snapshot(k);
-               child[0] = le32_to_cpu(snap.v->children[0]);
-               child[1] = le32_to_cpu(snap.v->children[1]);
+       child[0] = le32_to_cpu(snap.v->children[0]);
+       child[1] = le32_to_cpu(snap.v->children[1]);
 
-               for (i = 0; i < 2; i++) {
-                       ret = snapshot_live(trans, child[i]);
-                       if (ret < 0)
-                               break;
-
-                       if (ret)
-                               live_idx = i;
-                       nr_live += ret;
-               }
+       for (i = 0; i < 2; i++) {
+               int ret = snapshot_live(trans, child[i]);
+               if (ret < 0)
+                       return ret;
 
-               snapshot_t(c, id)->equiv = nr_live == 1
-                       ? snapshot_t(c, child[live_idx])->equiv
-                       : id;
+               if (ret)
+                       live_idx = i;
+               nr_live += ret;
        }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (ret)
-               bch_err(c, "error walking snapshots: %i", ret);
 
-       return ret;
+       snapshot_t(c, id)->equiv = nr_live == 1
+               ? snapshot_t(c, child[live_idx])->equiv
+               : id;
+       return 0;
 }
 
 /* fsck: */
-static int bch2_snapshot_check(struct btree_trans *trans,
-                              struct bkey_s_c_snapshot s)
+static int check_snapshot(struct btree_trans *trans,
+                         struct btree_iter *iter,
+                         struct bkey_s_c k)
 {
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c_snapshot s;
        struct bch_subvolume subvol;
        struct bch_snapshot v;
+       struct printbuf buf = PRINTBUF;
+       bool should_have_subvol;
        u32 i, id;
-       int ret;
-
-       id = le32_to_cpu(s.v->subvol);
-       ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol));
-       if (ret == -ENOENT)
-               bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
-                       s.k->p.offset, id);
-       if (ret)
-               return ret;
+       int ret = 0;
 
-       if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
-               bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
-                       s.k->p.offset);
-               return -EINVAL;
-       }
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
 
+       s = bkey_s_c_to_snapshot(k);
        id = le32_to_cpu(s.v->parent);
        if (id) {
-               ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+               ret = snapshot_lookup(trans, id, &v);
                if (ret == -ENOENT)
-                       bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
-                               s.k->p.offset, id);
+                       bch_err(c, "snapshot with nonexistent parent:\n  %s",
+                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
                if (ret)
-                       return ret;
+                       goto err;
 
                if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
                    le32_to_cpu(v.children[1]) != s.k->p.offset) {
-                       bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
+                       bch_err(c, "snapshot parent %u missing pointer to child %llu",
                                id, s.k->p.offset);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto err;
                }
        }
 
        for (i = 0; i < 2 && s.v->children[i]; i++) {
                id = le32_to_cpu(s.v->children[i]);
 
-               ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+               ret = snapshot_lookup(trans, id, &v);
                if (ret == -ENOENT)
-                       bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
+                       bch_err(c, "snapshot node %llu has nonexistent child %u",
                                s.k->p.offset, id);
                if (ret)
-                       return ret;
+                       goto err;
 
                if (le32_to_cpu(v.parent) != s.k->p.offset) {
-                       bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
+                       bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
                                id, le32_to_cpu(v.parent), s.k->p.offset);
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto err;
                }
        }
 
-       return 0;
+       should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) &&
+               !BCH_SNAPSHOT_DELETED(s.v);
+
+       if (should_have_subvol) {
+               id = le32_to_cpu(s.v->subvol);
+               ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+               if (ret == -ENOENT)
+                       bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
+                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
+               if (ret)
+                       goto err;
+
+               if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
+                       bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+                               s.k->p.offset);
+                       ret = -EINVAL;
+                       goto err;
+               }
+       } else {
+               if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n  %s",
+                               (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+                       struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u));
+
+                       ret = PTR_ERR_OR_ZERO(u);
+                       if (ret)
+                               goto err;
+
+                       bkey_reassemble(&u->k_i, s.s_c);
+                       u->v.subvol = 0;
+                       ret = bch2_trans_update(trans, iter, &u->k_i, 0);
+                       if (ret)
+                               goto err;
+               }
+       }
+
+       if (BCH_SNAPSHOT_DELETED(s.v))
+               set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+err:
+fsck_err:
+       printbuf_exit(&buf);
+       return ret;
 }
 
-int bch2_fs_snapshots_check(struct bch_fs *c)
+int bch2_fs_check_snapshots(struct bch_fs *c)
 {
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       struct bch_snapshot s;
-       unsigned id;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
-               if (k.k->type != KEY_TYPE_snapshot)
-                       continue;
+       ret = for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_snapshots, POS_MIN,
+                       BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+               check_snapshot(&trans, &iter, k));
+
+       if (ret)
+               bch_err(c, "error %i checking snapshots", ret);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
 
-               ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
+static int check_subvol(struct btree_trans *trans,
+                       struct btree_iter *iter,
+                       struct bkey_s_c k)
+{
+       struct bkey_s_c_subvolume subvol;
+       struct bch_snapshot snapshot;
+       unsigned snapid;
+       int ret;
+
+       if (k.k->type != KEY_TYPE_subvolume)
+               return 0;
+
+       subvol = bkey_s_c_to_subvolume(k);
+       snapid = le32_to_cpu(subvol.v->snapshot);
+       ret = snapshot_lookup(trans, snapid, &snapshot);
+
+       if (ret == -ENOENT)
+               bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u",
+                       k.k->p.offset, snapid);
+       if (ret)
+               return ret;
+
+       if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+               ret = bch2_subvolume_delete(trans, iter->pos.offset);
+               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       bch_err(trans->c, "error deleting subvolume %llu: %s",
+                               iter->pos.offset, bch2_err_str(ret));
                if (ret)
-                       break;
+                       return ret;
        }
-       bch2_trans_iter_exit(&trans, &iter);
 
-       if (ret) {
-               bch_err(c, "error %i checking snapshots", ret);
-               goto err;
-       }
+       return 0;
+}
+
+int bch2_fs_check_subvols(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       ret = for_each_btree_key_commit(&trans, iter,
+                       BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+               check_subvol(&trans, &iter, k));
 
-       for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
-                          POS_MIN, 0, k, ret) {
-               if (k.k->type != KEY_TYPE_subvolume)
-                       continue;
-again_2:
-               id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
-               ret = snapshot_lookup(&trans, id, &s);
-
-               if (ret == -EINTR) {
-                       k = bch2_btree_iter_peek(&iter);
-                       goto again_2;
-               } else if (ret == -ENOENT)
-                       bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
-                               k.k->p.offset, id);
-               else if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-err:
        bch2_trans_exit(&trans);
+
        return ret;
 }
 
@@ -290,49 +354,19 @@ int bch2_fs_snapshots_start(struct bch_fs *c)
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
-       bool have_deleted = false;
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
-              if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
-                      break;
-
-               if (k.k->type != KEY_TYPE_snapshot) {
-                       bch_err(c, "found wrong key type %u in snapshot node table",
-                               k.k->type);
-                       continue;
-               }
-
-               if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
-                       have_deleted = true;
-
-               ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0);
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(&trans, &iter);
+       for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k,
+               bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?:
+               bch2_snapshot_set_equiv(&trans, k));
 
-       if (ret)
-               goto err;
-
-       ret = bch2_snapshots_set_equiv(&trans);
-       if (ret)
-               goto err;
-err:
        bch2_trans_exit(&trans);
 
-       if (!ret && have_deleted) {
-               bch_info(c, "restarting deletion of dead snapshots");
-               if (c->opts.fsck) {
-                       bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
-               } else {
-                       bch2_delete_dead_snapshots(c);
-               }
-       }
-
+       if (ret)
+               bch_err(c, "error starting snapshots: %s", bch2_err_str(ret));
        return ret;
 }
 
@@ -369,8 +403,10 @@ static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
                goto err;
 
        bkey_reassemble(&s->k_i, k);
-
        SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+       SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
+       s->v.subvol = 0;
+
        ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
        if (ret)
                goto err;
@@ -481,7 +517,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
                        goto err;
 
                if (!k.k || !k.k->p.offset) {
-                       ret = -ENOSPC;
+                       ret = -BCH_ERR_ENOSPC_snapshot_create;
                        goto err;
                }
 
@@ -534,6 +570,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
 
                n->v.children[0] = cpu_to_le32(new_snapids[0]);
                n->v.children[1] = cpu_to_le32(new_snapids[1]);
+               n->v.subvol = 0;
                SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
                ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
                if (ret)
@@ -544,141 +581,100 @@ err:
        return ret;
 }
 
-static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
+static int snapshot_delete_key(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              struct bkey_s_c k,
+                              snapshot_id_list *deleted,
+                              snapshot_id_list *equiv_seen,
+                              struct bpos *last_pos)
 {
-       BUG_ON(snapshot_list_has_id(s, id));
-
-       if (s->nr == s->size) {
-               size_t new_size = max(8U, s->size * 2);
-               void *n = krealloc(s->d,
-                                  new_size * sizeof(s->d[0]),
-                                  GFP_KERNEL);
-               if (!n) {
-                       pr_err("error allocating snapshot ID list");
-                       return -ENOMEM;
-               }
+       struct bch_fs *c = trans->c;
+       u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
 
-               s->d    = n;
-               s->size = new_size;
-       };
+       if (bkey_cmp(k.k->p, *last_pos))
+               equiv_seen->nr = 0;
+       *last_pos = k.k->p;
 
-       s->d[s->nr++] = id;
-       return 0;
+       if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+           snapshot_list_has_id(equiv_seen, equiv)) {
+               return bch2_btree_delete_at(trans, iter,
+                                           BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+       } else {
+               return snapshot_list_add(c, equiv_seen, equiv);
+       }
 }
 
-static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
-                                          struct snapshot_id_list *deleted,
-                                          enum btree_id btree_id)
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
+                                         struct bkey_s_c k)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct snapshot_id_list equiv_seen = { 0 };
-       struct bpos last_pos = POS_MIN;
-       int ret = 0;
+       struct bkey_s_c_snapshot snap;
+       u32 children[2];
+       int ret;
 
-       /*
-        * XXX: We should also delete whiteouts that no longer overwrite
-        * anything
-        */
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
 
-       bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_PREFETCH|
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-
-       while ((bch2_trans_begin(trans),
-               (k = bch2_btree_iter_peek(&iter)).k) &&
-              !(ret = bkey_err(k))) {
-               u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
-
-               if (bkey_cmp(k.k->p, last_pos))
-                       equiv_seen.nr = 0;
-               last_pos = k.k->p;
-
-               if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
-                   snapshot_list_has_id(&equiv_seen, equiv)) {
-                       if (btree_id == BTREE_ID_inodes &&
-                           bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
-                               continue;
-
-                       ret = __bch2_trans_do(trans, NULL, NULL,
-                                             BTREE_INSERT_NOFAIL,
-                               bch2_btree_iter_traverse(&iter) ?:
-                               bch2_btree_delete_at(trans, &iter,
-                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
-                       if (ret)
-                               break;
-               } else {
-                       ret = snapshot_id_add(&equiv_seen, equiv);
-                       if (ret)
-                               break;
-               }
+       snap = bkey_s_c_to_snapshot(k);
+       if (BCH_SNAPSHOT_DELETED(snap.v) ||
+           BCH_SNAPSHOT_SUBVOL(snap.v))
+               return 0;
 
-               bch2_btree_iter_advance(&iter);
-       }
-       bch2_trans_iter_exit(trans, &iter);
+       children[0] = le32_to_cpu(snap.v->children[0]);
+       children[1] = le32_to_cpu(snap.v->children[1]);
 
-       kfree(equiv_seen.d);
+       ret   = snapshot_live(trans, children[0]) ?:
+               snapshot_live(trans, children[1]);
+       if (ret < 0)
+               return ret;
 
-       return ret;
+       if (!ret)
+               return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+       return 0;
 }
 
-static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+int bch2_delete_dead_snapshots(struct bch_fs *c)
 {
-       struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_snapshot snap;
-       struct snapshot_id_list deleted = { 0 };
-       u32 i, id, children[2];
+       snapshot_id_list deleted = { 0 };
+       u32 i, id;
        int ret = 0;
 
+       if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
+               return 0;
+
+       if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+               ret = bch2_fs_read_write_early(c);
+               if (ret) {
+                       bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
+                       return ret;
+               }
+       }
+
        bch2_trans_init(&trans, c, 0, 0);
 
        /*
         * For every snapshot node: If we have no live children and it's not
         * pointed to by a subvolume, delete it:
         */
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
-               if (k.k->type != KEY_TYPE_snapshot)
-                       continue;
-
-               snap = bkey_s_c_to_snapshot(k);
-               if (BCH_SNAPSHOT_DELETED(snap.v) ||
-                   BCH_SNAPSHOT_SUBVOL(snap.v))
-                       continue;
-
-               children[0] = le32_to_cpu(snap.v->children[0]);
-               children[1] = le32_to_cpu(snap.v->children[1]);
-
-               ret   = snapshot_live(&trans, children[0]) ?:
-                       snapshot_live(&trans, children[1]);
-               if (ret < 0)
-                       break;
-               if (ret)
-                       continue;
-
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
-               if (ret) {
-                       bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-
+       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
+                       POS_MIN, 0, k,
+                       NULL, NULL, 0,
+               bch2_delete_redundant_snapshot(&trans, &iter, k));
        if (ret) {
-               bch_err(c, "error walking snapshots: %i", ret);
+               bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
                goto err;
        }
 
-       ret = bch2_snapshots_set_equiv(&trans);
-       if (ret)
+       for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k,
+               bch2_snapshot_set_equiv(&trans, k));
+       if (ret) {
+               bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
                goto err;
+       }
 
        for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
                           POS_MIN, 0, k, ret) {
@@ -687,7 +683,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
 
                snap = bkey_s_c_to_snapshot(k);
                if (BCH_SNAPSHOT_DELETED(snap.v)) {
-                       ret = snapshot_id_add(&deleted, k.k->p.offset);
+                       ret = snapshot_list_add(c, &deleted, k.k->p.offset);
                        if (ret)
                                break;
                }
@@ -695,39 +691,59 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
        bch2_trans_iter_exit(&trans, &iter);
 
        if (ret) {
-               bch_err(c, "error walking snapshots: %i", ret);
+               bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
                goto err;
        }
 
        for (id = 0; id < BTREE_ID_NR; id++) {
+               struct bpos last_pos = POS_MIN;
+               snapshot_id_list equiv_seen = { 0 };
+
                if (!btree_type_has_snapshots(id))
                        continue;
 
-               ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
+               ret = for_each_btree_key_commit(&trans, iter,
+                               id, POS_MIN,
+                               BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                               NULL, NULL, BTREE_INSERT_NOFAIL,
+                       snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
+
+               darray_exit(&equiv_seen);
+
                if (ret) {
-                       bch_err(c, "error deleting snapshot keys: %i", ret);
+                       bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
                        goto err;
                }
        }
 
        for (i = 0; i < deleted.nr; i++) {
-               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_snapshot_node_delete(&trans, deleted.d[i]));
+               ret = commit_do(&trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(&trans, deleted.data[i]));
                if (ret) {
-                       bch_err(c, "error deleting snapshot %u: %i",
-                               deleted.d[i], ret);
+                       bch_err(c, "error deleting snapshot %u: %s",
+                               deleted.data[i], bch2_err_str(ret));
                        goto err;
                }
        }
+
+       clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
 err:
-       kfree(deleted.d);
+       darray_exit(&deleted);
        bch2_trans_exit(&trans);
+       return ret;
+}
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+
+       bch2_delete_dead_snapshots(c);
        percpu_ref_put(&c->writes);
 }
 
-static void bch2_delete_dead_snapshots(struct bch_fs *c)
+void bch2_delete_dead_snapshots_async(struct bch_fs *c)
 {
-       if (unlikely(!percpu_ref_tryget(&c->writes)))
+       if (!percpu_ref_tryget_live(&c->writes))
                return;
 
        if (!queue_work(system_long_wq, &c->snapshot_delete_work))
@@ -737,24 +753,35 @@ static void bch2_delete_dead_snapshots(struct bch_fs *c)
 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
                                           struct btree_trans_commit_hook *h)
 {
-       bch2_delete_dead_snapshots(trans->c);
+       struct bch_fs *c = trans->c;
+
+       set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
+
+       if (!test_bit(BCH_FS_FSCK_DONE, &c->flags))
+               return 0;
+
+       bch2_delete_dead_snapshots_async(c);
        return 0;
 }
 
 /* Subvolumes: */
 
-const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                          int rw, struct printbuf *err)
 {
-       if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
-               return "invalid pos";
-
-       if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
-               return "invalid pos";
+       if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 ||
+           bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) {
+               prt_printf(err, "invalid pos");
+               return -EINVAL;
+       }
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
-               return "bad val size";
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) {
+               prt_printf(err, "incorrect value size (%zu != %zu)",
+                      bkey_val_bytes(k.k), sizeof(struct bch_subvolume));
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
@@ -762,7 +789,7 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
 
-       pr_buf(out, "root %llu snapshot id %u",
+       prt_printf(out, "root %llu snapshot id %u",
               le64_to_cpu(s.v->inode),
               le32_to_cpu(s.v->snapshot));
 }
@@ -824,7 +851,6 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
        struct bkey_s_c k;
        struct bkey_s_c_subvolume subvol;
        struct btree_trans_commit_hook *h;
-       struct bkey_i *delete;
        u32 snapid;
        int ret = 0;
 
@@ -846,19 +872,14 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
        subvol = bkey_s_c_to_subvolume(k);
        snapid = le32_to_cpu(subvol.v->snapshot);
 
-       delete = bch2_trans_kmalloc(trans, sizeof(*delete));
-       ret = PTR_ERR_OR_ZERO(delete);
+       ret = bch2_btree_delete_at(trans, &iter, 0);
        if (ret)
                goto err;
 
-       bkey_init(&delete->k);
-       delete->k.p = iter.pos;
-       ret = bch2_trans_update(trans, &iter, delete, 0);
+       ret = bch2_snapshot_node_set_deleted(trans, snapid);
        if (ret)
                goto err;
 
-       ret = bch2_snapshot_node_set_deleted(trans, snapid);
-
        h = bch2_trans_kmalloc(trans, sizeof(*h));
        ret = PTR_ERR_OR_ZERO(h);
        if (ret)
@@ -875,14 +896,14 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
 {
        struct bch_fs *c = container_of(work, struct bch_fs,
                                snapshot_wait_for_pagecache_and_delete_work);
-       struct snapshot_id_list s;
+       snapshot_id_list s;
        u32 *id;
        int ret = 0;
 
        while (!ret) {
                mutex_lock(&c->snapshots_unlinked_lock);
                s = c->snapshots_unlinked;
-               memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+               darray_init(&c->snapshots_unlinked);
                mutex_unlock(&c->snapshots_unlinked_lock);
 
                if (!s.nr)
@@ -890,16 +911,16 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
 
                bch2_evict_subvolume_inodes(c, &s);
 
-               for (id = s.d; id < s.d + s.nr; id++) {
+               for (id = s.data; id < s.data + s.nr; id++) {
                        ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
                                      bch2_subvolume_delete(&trans, *id));
                        if (ret) {
-                               bch_err(c, "error %i deleting subvolume %u", ret, *id);
+                               bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
                                break;
                        }
                }
 
-               kfree(s.d);
+               darray_exit(&s);
        }
 
        percpu_ref_put(&c->writes);
@@ -919,13 +940,13 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
 
        mutex_lock(&c->snapshots_unlinked_lock);
        if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
-               ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol);
+               ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
        mutex_unlock(&c->snapshots_unlinked_lock);
 
        if (ret)
                return ret;
 
-       if (unlikely(!percpu_ref_tryget(&c->writes)))
+       if (unlikely(!percpu_ref_tryget_live(&c->writes)))
                return -EROFS;
 
        if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
@@ -1010,7 +1031,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
        }
 
        if (!ret)
-               ret = -ENOSPC;
+               ret = -BCH_ERR_ENOSPC_subvolume_create;
        goto err;
 found_slot:
        snapshot_subvols[0] = dst_iter.pos.offset;