]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/subvolume.c
Update bcachefs sources to a8115093df bcachefs: Fix divide by zero in rebalance_work()
[bcachefs-tools-debian] / libbcachefs / subvolume.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_update.h"
6 #include "errcode.h"
7 #include "error.h"
8 #include "fs.h"
9 #include "snapshot.h"
10 #include "subvolume.h"
11
12 #include <linux/random.h>
13
14 static int bch2_subvolume_delete(struct btree_trans *, u32);
15
16 static int check_subvol(struct btree_trans *trans,
17                         struct btree_iter *iter,
18                         struct bkey_s_c k)
19 {
20         struct bch_fs *c = trans->c;
21         struct bkey_s_c_subvolume subvol;
22         struct bch_snapshot snapshot;
23         unsigned snapid;
24         int ret = 0;
25
26         if (k.k->type != KEY_TYPE_subvolume)
27                 return 0;
28
29         subvol = bkey_s_c_to_subvolume(k);
30         snapid = le32_to_cpu(subvol.v->snapshot);
31         ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
32
33         if (bch2_err_matches(ret, ENOENT))
34                 bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
35                         k.k->p.offset, snapid);
36         if (ret)
37                 return ret;
38
39         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
40                 bch2_fs_lazy_rw(c);
41
42                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
43                 if (ret)
44                         bch_err(c, "error deleting subvolume %llu: %s",
45                                 iter->pos.offset, bch2_err_str(ret));
46                 return ret ?: -BCH_ERR_transaction_restart_nested;
47         }
48
49         if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
50                 u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
51                 u32 snapshot_tree;
52                 struct bch_snapshot_tree st;
53
54                 rcu_read_lock();
55                 snapshot_tree = snapshot_t(c, snapshot_root)->tree;
56                 rcu_read_unlock();
57
58                 ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
59
60                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
61                                 "%s: snapshot tree %u not found", __func__, snapshot_tree);
62
63                 if (ret)
64                         return ret;
65
66                 if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c,
67                                 "subvolume %llu is not set as snapshot but is not master subvolume",
68                                 k.k->p.offset)) {
69                         struct bkey_i_subvolume *s =
70                                 bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
71                         ret = PTR_ERR_OR_ZERO(s);
72                         if (ret)
73                                 return ret;
74
75                         SET_BCH_SUBVOLUME_SNAP(&s->v, true);
76                 }
77         }
78
79 fsck_err:
80         return ret;
81 }
82
83 int bch2_check_subvols(struct bch_fs *c)
84 {
85         struct btree_iter iter;
86         struct bkey_s_c k;
87         int ret;
88
89         ret = bch2_trans_run(c,
90                 for_each_btree_key_commit(&trans, iter,
91                         BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
92                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
93                 check_subvol(&trans, &iter, k)));
94         if (ret)
95                 bch_err_fn(c, ret);
96         return ret;
97 }
98
99 /*
100  * Mark a snapshot as deleted, for future cleanup:
101  */
102 static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
103 {
104         struct btree_iter iter;
105         struct bkey_i_snapshot *s;
106         int ret = 0;
107
108         s = bch2_bkey_get_mut_typed(trans, &iter,
109                                     BTREE_ID_snapshots, POS(0, id),
110                                     0, snapshot);
111         ret = PTR_ERR_OR_ZERO(s);
112         if (unlikely(ret)) {
113                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
114                                         trans->c, "missing snapshot %u", id);
115                 return ret;
116         }
117
118         /* already deleted? */
119         if (BCH_SNAPSHOT_DELETED(&s->v))
120                 goto err;
121
122         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
123         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
124         s->v.subvol = 0;
125 err:
126         bch2_trans_iter_exit(trans, &iter);
127         return ret;
128 }
129
130 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
131 {
132         struct bch_fs *c = trans->c;
133         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
134         struct btree_iter tree_iter = (struct btree_iter) { NULL };
135         struct bkey_s_c_snapshot s;
136         u32 parent_id;
137         unsigned i;
138         int ret = 0;
139
140         s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
141                                      BTREE_ITER_INTENT, snapshot);
142         ret = bkey_err(s);
143         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
144                                 "missing snapshot %u", id);
145
146         if (ret)
147                 goto err;
148
149         BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
150         parent_id = le32_to_cpu(s.v->parent);
151
152         if (parent_id) {
153                 struct bkey_i_snapshot *parent;
154
155                 parent = bch2_bkey_get_mut_typed(trans, &p_iter,
156                                      BTREE_ID_snapshots, POS(0, parent_id),
157                                      0, snapshot);
158                 ret = PTR_ERR_OR_ZERO(parent);
159                 if (unlikely(ret)) {
160                         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
161                                                 "missing snapshot %u", parent_id);
162                         goto err;
163                 }
164
165                 for (i = 0; i < 2; i++)
166                         if (le32_to_cpu(parent->v.children[i]) == id)
167                                 break;
168
169                 if (i == 2)
170                         bch_err(c, "snapshot %u missing child pointer to %u",
171                                 parent_id, id);
172                 else
173                         parent->v.children[i] = 0;
174
175                 if (le32_to_cpu(parent->v.children[0]) <
176                     le32_to_cpu(parent->v.children[1]))
177                         swap(parent->v.children[0],
178                              parent->v.children[1]);
179         } else {
180                 /*
181                  * We're deleting the root of a snapshot tree: update the
182                  * snapshot_tree entry to point to the new root, or delete it if
183                  * this is the last snapshot ID in this tree:
184                  */
185                 struct bkey_i_snapshot_tree *s_t;
186
187                 BUG_ON(s.v->children[1]);
188
189                 s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
190                                 BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
191                                 0, snapshot_tree);
192                 ret = PTR_ERR_OR_ZERO(s_t);
193                 if (ret)
194                         goto err;
195
196                 if (s.v->children[0]) {
197                         s_t->v.root_snapshot = s.v->children[0];
198                 } else {
199                         s_t->k.type = KEY_TYPE_deleted;
200                         set_bkey_val_u64s(&s_t->k, 0);
201                 }
202         }
203
204         ret = bch2_btree_delete_at(trans, &iter, 0);
205 err:
206         bch2_trans_iter_exit(trans, &tree_iter);
207         bch2_trans_iter_exit(trans, &p_iter);
208         bch2_trans_iter_exit(trans, &iter);
209         return ret;
210 }
211
212 static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
213                           u32 *new_snapids,
214                           u32 *snapshot_subvols,
215                           unsigned nr_snapids)
216 {
217         struct bch_fs *c = trans->c;
218         struct btree_iter iter;
219         struct bkey_i_snapshot *n;
220         struct bkey_s_c k;
221         unsigned i, j;
222         u32 depth = bch2_snapshot_depth(c, parent);
223         int ret;
224
225         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
226                              POS_MIN, BTREE_ITER_INTENT);
227         k = bch2_btree_iter_peek(&iter);
228         ret = bkey_err(k);
229         if (ret)
230                 goto err;
231
232         for (i = 0; i < nr_snapids; i++) {
233                 k = bch2_btree_iter_prev_slot(&iter);
234                 ret = bkey_err(k);
235                 if (ret)
236                         goto err;
237
238                 if (!k.k || !k.k->p.offset) {
239                         ret = -BCH_ERR_ENOSPC_snapshot_create;
240                         goto err;
241                 }
242
243                 n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
244                 ret = PTR_ERR_OR_ZERO(n);
245                 if (ret)
246                         goto err;
247
248                 n->v.flags      = 0;
249                 n->v.parent     = cpu_to_le32(parent);
250                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
251                 n->v.tree       = cpu_to_le32(tree);
252                 n->v.depth      = cpu_to_le32(depth);
253
254                 for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
255                         n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
256
257                 bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
258                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
259
260                 ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
261                                          bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
262                 if (ret)
263                         goto err;
264
265                 new_snapids[i]  = iter.pos.offset;
266         }
267 err:
268         bch2_trans_iter_exit(trans, &iter);
269         return ret;
270 }
271
272 /*
273  * Create new snapshot IDs as children of an existing snapshot ID:
274  */
275 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
276                               u32 *new_snapids,
277                               u32 *snapshot_subvols,
278                               unsigned nr_snapids)
279 {
280         struct btree_iter iter;
281         struct bkey_i_snapshot *n_parent;
282         int ret = 0;
283
284         n_parent = bch2_bkey_get_mut_typed(trans, &iter,
285                         BTREE_ID_snapshots, POS(0, parent),
286                         0, snapshot);
287         ret = PTR_ERR_OR_ZERO(n_parent);
288         if (unlikely(ret)) {
289                 if (bch2_err_matches(ret, ENOENT))
290                         bch_err(trans->c, "snapshot %u not found", parent);
291                 return ret;
292         }
293
294         if (n_parent->v.children[0] || n_parent->v.children[1]) {
295                 bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
296                 ret = -EINVAL;
297                 goto err;
298         }
299
300         ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
301                              new_snapids, snapshot_subvols, nr_snapids);
302         if (ret)
303                 goto err;
304
305         n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
306         n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
307         n_parent->v.subvol = 0;
308         SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
309 err:
310         bch2_trans_iter_exit(trans, &iter);
311         return ret;
312 }
313
314 /*
315  * Create a snapshot node that is the root of a new tree:
316  */
317 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
318                               u32 *new_snapids,
319                               u32 *snapshot_subvols,
320                               unsigned nr_snapids)
321 {
322         struct bkey_i_snapshot_tree *n_tree;
323         int ret;
324
325         n_tree = __bch2_snapshot_tree_create(trans);
326         ret =   PTR_ERR_OR_ZERO(n_tree) ?:
327                 create_snapids(trans, 0, n_tree->k.p.offset,
328                              new_snapids, snapshot_subvols, nr_snapids);
329         if (ret)
330                 return ret;
331
332         n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
333         n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
334         return 0;
335 }
336
337 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
338                               u32 *new_snapids,
339                               u32 *snapshot_subvols,
340                               unsigned nr_snapids)
341 {
342         BUG_ON((parent == 0) != (nr_snapids == 1));
343         BUG_ON((parent != 0) != (nr_snapids == 2));
344
345         return parent
346                 ? bch2_snapshot_node_create_children(trans, parent,
347                                 new_snapids, snapshot_subvols, nr_snapids)
348                 : bch2_snapshot_node_create_tree(trans,
349                                 new_snapids, snapshot_subvols, nr_snapids);
350
351 }
352
353 /*
354  * If we have an unlinked inode in an internal snapshot node, and the inode
355  * really has been deleted in all child snapshots, how does this get cleaned up?
356  *
357  * first there is the problem of how keys that have been overwritten in all
358  * child snapshots get deleted (unimplemented?), but inodes may perhaps be
359  * special?
360  *
361  * also: unlinked inode in internal snapshot appears to not be getting deleted
362  * correctly if inode doesn't exist in leaf snapshots
363  */
364
365 static int snapshot_delete_key(struct btree_trans *trans,
366                                struct btree_iter *iter,
367                                struct bkey_s_c k,
368                                snapshot_id_list *deleted,
369                                snapshot_id_list *equiv_seen,
370                                struct bpos *last_pos)
371 {
372         struct bch_fs *c = trans->c;
373         u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
374
375         if (!bkey_eq(k.k->p, *last_pos))
376                 equiv_seen->nr = 0;
377         *last_pos = k.k->p;
378
379         if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
380             snapshot_list_has_id(equiv_seen, equiv)) {
381                 return bch2_btree_delete_at(trans, iter,
382                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
383         } else {
384                 return snapshot_list_add(c, equiv_seen, equiv);
385         }
386 }
387
388 /*
389  * For a given snapshot, if it doesn't have a subvolume that points to it, and
390  * it doesn't have child snapshot nodes - it's now redundant and we can mark it
391  * as deleted.
392  */
393 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
394                                           struct bkey_s_c k)
395 {
396         struct bkey_s_c_snapshot snap;
397         u32 children[2];
398         int ret;
399
400         if (k.k->type != KEY_TYPE_snapshot)
401                 return 0;
402
403         snap = bkey_s_c_to_snapshot(k);
404         if (BCH_SNAPSHOT_DELETED(snap.v) ||
405             BCH_SNAPSHOT_SUBVOL(snap.v))
406                 return 0;
407
408         children[0] = le32_to_cpu(snap.v->children[0]);
409         children[1] = le32_to_cpu(snap.v->children[1]);
410
411         ret   = bch2_snapshot_live(trans, children[0]) ?:
412                 bch2_snapshot_live(trans, children[1]);
413         if (ret < 0)
414                 return ret;
415
416         if (!ret)
417                 return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
418         return 0;
419 }
420
421 int bch2_delete_dead_snapshots(struct bch_fs *c)
422 {
423         struct btree_trans trans;
424         struct btree_iter iter;
425         struct bkey_s_c k;
426         struct bkey_s_c_snapshot snap;
427         snapshot_id_list deleted = { 0 };
428         u32 i, id;
429         int ret = 0;
430
431         if (!test_bit(BCH_FS_STARTED, &c->flags)) {
432                 ret = bch2_fs_read_write_early(c);
433                 if (ret) {
434                         bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
435                         return ret;
436                 }
437         }
438
439         bch2_trans_init(&trans, c, 0, 0);
440
441         /*
442          * For every snapshot node: If we have no live children and it's not
443          * pointed to by a subvolume, delete it:
444          */
445         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
446                         POS_MIN, 0, k,
447                         NULL, NULL, 0,
448                 bch2_delete_redundant_snapshot(&trans, &iter, k));
449         if (ret) {
450                 bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
451                 goto err;
452         }
453
454         for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
455                            POS_MIN, 0, k,
456                 bch2_snapshot_set_equiv(&trans, k));
457         if (ret) {
458                 bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
459                 goto err;
460         }
461
462         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
463                            POS_MIN, 0, k, ret) {
464                 if (k.k->type != KEY_TYPE_snapshot)
465                         continue;
466
467                 snap = bkey_s_c_to_snapshot(k);
468                 if (BCH_SNAPSHOT_DELETED(snap.v)) {
469                         ret = snapshot_list_add(c, &deleted, k.k->p.offset);
470                         if (ret)
471                                 break;
472                 }
473         }
474         bch2_trans_iter_exit(&trans, &iter);
475
476         if (ret) {
477                 bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
478                 goto err;
479         }
480
481         for (id = 0; id < BTREE_ID_NR; id++) {
482                 struct bpos last_pos = POS_MIN;
483                 snapshot_id_list equiv_seen = { 0 };
484
485                 if (!btree_type_has_snapshots(id))
486                         continue;
487
488                 ret = for_each_btree_key_commit(&trans, iter,
489                                 id, POS_MIN,
490                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
491                                 NULL, NULL, BTREE_INSERT_NOFAIL,
492                         snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
493
494                 darray_exit(&equiv_seen);
495
496                 if (ret) {
497                         bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
498                         goto err;
499                 }
500         }
501
502         for (i = 0; i < deleted.nr; i++) {
503                 ret = commit_do(&trans, NULL, NULL, 0,
504                         bch2_snapshot_node_delete(&trans, deleted.data[i]));
505                 if (ret) {
506                         bch_err(c, "error deleting snapshot %u: %s",
507                                 deleted.data[i], bch2_err_str(ret));
508                         goto err;
509                 }
510         }
511
512         clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
513 err:
514         darray_exit(&deleted);
515         bch2_trans_exit(&trans);
516         if (ret)
517                 bch_err_fn(c, ret);
518         return ret;
519 }
520
521 static void bch2_delete_dead_snapshots_work(struct work_struct *work)
522 {
523         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
524
525         if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
526                 bch2_delete_dead_snapshots(c);
527         bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
528 }
529
530 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
531 {
532         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
533             !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
534                 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
535 }
536
537 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
538                                            struct btree_trans_commit_hook *h)
539 {
540         struct bch_fs *c = trans->c;
541
542         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
543
544         if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
545                 return 0;
546
547         bch2_delete_dead_snapshots_async(c);
548         return 0;
549 }
550
551 /* Subvolumes: */
552
553 int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
554                            unsigned flags, struct printbuf *err)
555 {
556         if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
557             bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
558                 prt_printf(err, "invalid pos");
559                 return -BCH_ERR_invalid_bkey;
560         }
561
562         return 0;
563 }
564
565 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
566                             struct bkey_s_c k)
567 {
568         struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
569
570         prt_printf(out, "root %llu snapshot id %u",
571                    le64_to_cpu(s.v->inode),
572                    le32_to_cpu(s.v->snapshot));
573
574         if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
575                 prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
576 }
577
578 static __always_inline int
579 bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
580                            bool inconsistent_if_not_found,
581                            int iter_flags,
582                            struct bch_subvolume *s)
583 {
584         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
585                                           iter_flags, subvolume, s);
586         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
587                                 inconsistent_if_not_found,
588                                 trans->c, "missing subvolume %u", subvol);
589         return ret;
590 }
591
592 int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
593                        bool inconsistent_if_not_found,
594                        int iter_flags,
595                        struct bch_subvolume *s)
596 {
597         return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
598 }
599
600 int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
601                              struct bch_subvolume *subvol)
602 {
603         struct bch_snapshot snap;
604
605         return  bch2_snapshot_lookup(trans, snapshot, &snap) ?:
606                 bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
607 }
608
609 int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
610                                 u32 *snapid)
611 {
612         struct btree_iter iter;
613         struct bkey_s_c_subvolume subvol;
614         int ret;
615
616         subvol = bch2_bkey_get_iter_typed(trans, &iter,
617                                           BTREE_ID_subvolumes, POS(0, subvolid),
618                                           BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
619                                           subvolume);
620         ret = bkey_err(subvol);
621         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
622                                 "missing subvolume %u", subvolid);
623
624         if (likely(!ret))
625                 *snapid = le32_to_cpu(subvol.v->snapshot);
626         bch2_trans_iter_exit(trans, &iter);
627         return ret;
628 }
629
630 static int bch2_subvolume_reparent(struct btree_trans *trans,
631                                    struct btree_iter *iter,
632                                    struct bkey_s_c k,
633                                    u32 old_parent, u32 new_parent)
634 {
635         struct bkey_i_subvolume *s;
636         int ret;
637
638         if (k.k->type != KEY_TYPE_subvolume)
639                 return 0;
640
641         if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
642             le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
643                 return 0;
644
645         s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
646         ret = PTR_ERR_OR_ZERO(s);
647         if (ret)
648                 return ret;
649
650         s->v.parent = cpu_to_le32(new_parent);
651         return 0;
652 }
653
654 /*
655  * Separate from the snapshot tree in the snapshots btree, we record the tree
656  * structure of how snapshot subvolumes were created - the parent subvolume of
657  * each snapshot subvolume.
658  *
659  * When a subvolume is deleted, we scan for child subvolumes and reparant them,
660  * to avoid dangling references:
661  */
662 static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
663 {
664         struct btree_iter iter;
665         struct bkey_s_c k;
666         struct bch_subvolume s;
667
668         return lockrestart_do(trans,
669                         bch2_subvolume_get(trans, subvolid_to_delete, true,
670                                    BTREE_ITER_CACHED, &s)) ?:
671                 for_each_btree_key_commit(trans, iter,
672                                 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
673                                 NULL, NULL, BTREE_INSERT_NOFAIL,
674                         bch2_subvolume_reparent(trans, &iter, k,
675                                         subvolid_to_delete, le32_to_cpu(s.parent)));
676 }
677
678 /*
679  * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
680  * deletion/cleanup:
681  */
682 static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
683 {
684         struct btree_iter iter;
685         struct bkey_s_c_subvolume subvol;
686         struct btree_trans_commit_hook *h;
687         u32 snapid;
688         int ret = 0;
689
690         subvol = bch2_bkey_get_iter_typed(trans, &iter,
691                                 BTREE_ID_subvolumes, POS(0, subvolid),
692                                 BTREE_ITER_CACHED|BTREE_ITER_INTENT,
693                                 subvolume);
694         ret = bkey_err(subvol);
695         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
696                                 "missing subvolume %u", subvolid);
697         if (ret)
698                 return ret;
699
700         snapid = le32_to_cpu(subvol.v->snapshot);
701
702         ret = bch2_btree_delete_at(trans, &iter, 0);
703         if (ret)
704                 goto err;
705
706         ret = bch2_snapshot_node_set_deleted(trans, snapid);
707         if (ret)
708                 goto err;
709
710         h = bch2_trans_kmalloc(trans, sizeof(*h));
711         ret = PTR_ERR_OR_ZERO(h);
712         if (ret)
713                 goto err;
714
715         h->fn = bch2_delete_dead_snapshots_hook;
716         bch2_trans_commit_hook(trans, h);
717 err:
718         bch2_trans_iter_exit(trans, &iter);
719         return ret;
720 }
721
722 static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
723 {
724         return bch2_subvolumes_reparent(trans, subvolid) ?:
725                 commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
726                           __bch2_subvolume_delete(trans, subvolid));
727 }
728
729 static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
730 {
731         struct bch_fs *c = container_of(work, struct bch_fs,
732                                 snapshot_wait_for_pagecache_and_delete_work);
733         snapshot_id_list s;
734         u32 *id;
735         int ret = 0;
736
737         while (!ret) {
738                 mutex_lock(&c->snapshots_unlinked_lock);
739                 s = c->snapshots_unlinked;
740                 darray_init(&c->snapshots_unlinked);
741                 mutex_unlock(&c->snapshots_unlinked_lock);
742
743                 if (!s.nr)
744                         break;
745
746                 bch2_evict_subvolume_inodes(c, &s);
747
748                 for (id = s.data; id < s.data + s.nr; id++) {
749                         ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id));
750                         if (ret) {
751                                 bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
752                                 break;
753                         }
754                 }
755
756                 darray_exit(&s);
757         }
758
759         bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
760 }
761
762 struct subvolume_unlink_hook {
763         struct btree_trans_commit_hook  h;
764         u32                             subvol;
765 };
766
767 static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
768                                                       struct btree_trans_commit_hook *_h)
769 {
770         struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
771         struct bch_fs *c = trans->c;
772         int ret = 0;
773
774         mutex_lock(&c->snapshots_unlinked_lock);
775         if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
776                 ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
777         mutex_unlock(&c->snapshots_unlinked_lock);
778
779         if (ret)
780                 return ret;
781
782         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
783                 return -EROFS;
784
785         if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
786                 bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
787         return 0;
788 }
789
790 int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
791 {
792         struct btree_iter iter;
793         struct bkey_i_subvolume *n;
794         struct subvolume_unlink_hook *h;
795         int ret = 0;
796
797         h = bch2_trans_kmalloc(trans, sizeof(*h));
798         ret = PTR_ERR_OR_ZERO(h);
799         if (ret)
800                 return ret;
801
802         h->h.fn         = bch2_subvolume_wait_for_pagecache_and_delete_hook;
803         h->subvol       = subvolid;
804         bch2_trans_commit_hook(trans, &h->h);
805
806         n = bch2_bkey_get_mut_typed(trans, &iter,
807                         BTREE_ID_subvolumes, POS(0, subvolid),
808                         BTREE_ITER_CACHED, subvolume);
809         ret = PTR_ERR_OR_ZERO(n);
810         if (unlikely(ret)) {
811                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
812                                         "missing subvolume %u", subvolid);
813                 return ret;
814         }
815
816         SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
817         bch2_trans_iter_exit(trans, &iter);
818         return ret;
819 }
820
821 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
822                           u32 src_subvolid,
823                           u32 *new_subvolid,
824                           u32 *new_snapshotid,
825                           bool ro)
826 {
827         struct bch_fs *c = trans->c;
828         struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
829         struct bkey_i_subvolume *new_subvol = NULL;
830         struct bkey_i_subvolume *src_subvol = NULL;
831         u32 parent = 0, new_nodes[2], snapshot_subvols[2];
832         int ret = 0;
833
834         ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
835                                 BTREE_ID_subvolumes, POS(0, U32_MAX));
836         if (ret == -BCH_ERR_ENOSPC_btree_slot)
837                 ret = -BCH_ERR_ENOSPC_subvolume_create;
838         if (ret)
839                 return ret;
840
841         snapshot_subvols[0] = dst_iter.pos.offset;
842         snapshot_subvols[1] = src_subvolid;
843
844         if (src_subvolid) {
845                 /* Creating a snapshot: */
846
847                 src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
848                                 BTREE_ID_subvolumes, POS(0, src_subvolid),
849                                 BTREE_ITER_CACHED, subvolume);
850                 ret = PTR_ERR_OR_ZERO(src_subvol);
851                 if (unlikely(ret)) {
852                         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
853                                                 "subvolume %u not found", src_subvolid);
854                         goto err;
855                 }
856
857                 parent = le32_to_cpu(src_subvol->v.snapshot);
858         }
859
860         ret = bch2_snapshot_node_create(trans, parent, new_nodes,
861                                         snapshot_subvols,
862                                         src_subvolid ? 2 : 1);
863         if (ret)
864                 goto err;
865
866         if (src_subvolid) {
867                 src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
868                 ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
869                 if (ret)
870                         goto err;
871         }
872
873         new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
874         ret = PTR_ERR_OR_ZERO(new_subvol);
875         if (ret)
876                 goto err;
877
878         new_subvol->v.flags     = 0;
879         new_subvol->v.snapshot  = cpu_to_le32(new_nodes[0]);
880         new_subvol->v.inode     = cpu_to_le64(inode);
881         new_subvol->v.parent    = cpu_to_le32(src_subvolid);
882         new_subvol->v.otime.lo  = cpu_to_le64(bch2_current_time(c));
883         new_subvol->v.otime.hi  = 0;
884
885         SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
886         SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
887
888         *new_subvolid   = new_subvol->k.p.offset;
889         *new_snapshotid = new_nodes[0];
890 err:
891         bch2_trans_iter_exit(trans, &src_iter);
892         bch2_trans_iter_exit(trans, &dst_iter);
893         return ret;
894 }
895
896 int bch2_fs_subvolumes_init(struct bch_fs *c)
897 {
898         INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
899         INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
900                   bch2_subvolume_wait_for_pagecache_and_delete);
901         mutex_init(&c->snapshots_unlinked_lock);
902         return 0;
903 }