bcachefs-tools-debian: libbcachefs/subvolume.c (bcachefs sources at e14d7c7195)
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_update.h"
6 #include "errcode.h"
7 #include "error.h"
8 #include "fs.h"
9 #include "subvolume.h"
10
11 #include <linux/random.h>
12
13 static int bch2_subvolume_delete(struct btree_trans *, u32);
14
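/*
 * Snapshot nodes form trees in which every node's ID is lower than its
 * parent's. bch2_snapshot_is_ancestor() walks from @id towards the root
 * until it reaches or passes @ancestor; get_ancestor_below() uses the
 * skip[] entries cached in the in-memory snapshot table as a skiplist, so
 * the walk can jump several levels per step instead of following ->parent
 * one link at a time.
 */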
15 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
16 {
17         const struct snapshot_t *s = __snapshot_t(t, id);
18
19         if (s->skip[2] <= ancestor)
20                 return s->skip[2];
21         if (s->skip[1] <= ancestor)
22                 return s->skip[1];
23         if (s->skip[0] <= ancestor)
24                 return s->skip[0];
25         return s->parent;
26 }
27
28 bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
29 {
30         struct snapshot_table *t;
31
32         EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
33
34         rcu_read_lock();
35         t = rcu_dereference(c->snapshots);
36
37         while (id && id < ancestor)
38                 id = get_ancestor_below(t, id, ancestor);
39         rcu_read_unlock();
40
41         return id == ancestor;
42 }
43
44 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
45 {
46         struct snapshot_table *t;
47
48         rcu_read_lock();
49         t = rcu_dereference(c->snapshots);
50
51         while (id && id < ancestor)
52                 id = __snapshot_t(t, id)->parent;
53         rcu_read_unlock();
54
55         return id == ancestor;
56 }
57
58 static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
59 {
60         u32 depth;
61
62         rcu_read_lock();
63         depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
64         rcu_read_unlock();
65
66         return depth;
67 }
68
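/*
 * Grow the in-memory snapshot table so it has a slot for @id. Snapshot IDs
 * are allocated from the top of the keyspace down, so the table is indexed
 * by U32_MAX - id; the old table is copied into the new one and freed via
 * RCU so that lockless readers stay safe. Caller holds snapshot_table_lock.
 */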
69 static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
70 {
71         size_t idx = U32_MAX - id;
72         size_t new_size;
73         struct snapshot_table *new, *old;
74
75         new_size = max(16UL, roundup_pow_of_two(idx + 1));
76
77         new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
78         if (!new)
79                 return NULL;
80
81         old = c->snapshots;
82         if (old)
83                 memcpy(new->s,
84                        rcu_dereference_protected(c->snapshots, true)->s,
85                        sizeof(new->s[0]) * c->snapshot_table_size);
86
87         rcu_assign_pointer(c->snapshots, new);
88         c->snapshot_table_size = new_size;
89         if (old)
90                 kvfree_rcu(old);
91
92         return &rcu_dereference_protected(c->snapshots, true)->s[idx];
93 }
94
95 static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
96 {
97         size_t idx = U32_MAX - id;
98
99         lockdep_assert_held(&c->snapshot_table_lock);
100
101         if (likely(idx < c->snapshot_table_size))
102                 return &rcu_dereference_protected(c->snapshots, true)->s[idx];
103
104         return __snapshot_t_mut(c, id);
105 }
106
107 /* Snapshot tree: */
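/*
 * Each KEY_TYPE_snapshot_tree key describes one tree of snapshot nodes,
 * recording the tree's root snapshot ID and its master subvolume, i.e. the
 * subvolume in the tree that is not itself a snapshot.
 */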
108
109 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
110                                 struct bkey_s_c k)
111 {
112         struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
113
114         prt_printf(out, "subvol %u root snapshot %u",
115                    le32_to_cpu(t.v->master_subvol),
116                    le32_to_cpu(t.v->root_snapshot));
117 }
118
119 int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
120                                enum bkey_invalid_flags flags,
121                                struct printbuf *err)
122 {
123         if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
124             bkey_lt(k.k->p, POS(0, 1))) {
125                 prt_printf(err, "bad pos");
126                 return -BCH_ERR_invalid_bkey;
127         }
128
129         return 0;
130 }
131
132 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
133                               struct bch_snapshot_tree *s)
134 {
135         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
136                                           BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
137
138         if (bch2_err_matches(ret, ENOENT))
139                 ret = -BCH_ERR_ENOENT_snapshot_tree;
140         return ret;
141 }
142
143 static struct bkey_i_snapshot_tree *
144 __snapshot_tree_create(struct btree_trans *trans)
145 {
146         struct btree_iter iter;
147         int ret = bch2_bkey_get_empty_slot(trans, &iter,
148                         BTREE_ID_snapshot_trees, POS(0, U32_MAX));
149         struct bkey_i_snapshot_tree *s_t;
150
151         if (ret == -BCH_ERR_ENOSPC_btree_slot)
152                 ret = -BCH_ERR_ENOSPC_snapshot_tree;
153         if (ret)
154                 return ERR_PTR(ret);
155
156         s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
157         ret = PTR_ERR_OR_ZERO(s_t);
158         bch2_trans_iter_exit(trans, &iter);
159         return ret ? ERR_PTR(ret) : s_t;
160 }
161
162 static int snapshot_tree_create(struct btree_trans *trans,
163                                 u32 root_id, u32 subvol_id, u32 *tree_id)
164 {
165         struct bkey_i_snapshot_tree *n_tree =
166                 __snapshot_tree_create(trans);
167
168         if (IS_ERR(n_tree))
169                 return PTR_ERR(n_tree);
170
171         n_tree->v.master_subvol = cpu_to_le32(subvol_id);
172         n_tree->v.root_snapshot = cpu_to_le32(root_id);
173         *tree_id = n_tree->k.p.offset;
174         return 0;
175 }
176
177 /* Snapshot nodes: */
178
179 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
180                            struct bkey_s_c k)
181 {
182         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
183
184         prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
185                BCH_SNAPSHOT_SUBVOL(s.v),
186                BCH_SNAPSHOT_DELETED(s.v),
187                le32_to_cpu(s.v->parent),
188                le32_to_cpu(s.v->children[0]),
189                le32_to_cpu(s.v->children[1]),
190                le32_to_cpu(s.v->subvol),
191                le32_to_cpu(s.v->tree));
192 }
193
194 int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
195                           enum bkey_invalid_flags flags,
196                           struct printbuf *err)
197 {
198         struct bkey_s_c_snapshot s;
199         u32 i, id;
200
201         if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
202             bkey_lt(k.k->p, POS(0, 1))) {
203                 prt_printf(err, "bad pos");
204                 return -BCH_ERR_invalid_bkey;
205         }
206
207         s = bkey_s_c_to_snapshot(k);
208
209         id = le32_to_cpu(s.v->parent);
210         if (id && id <= k.k->p.offset) {
211                 prt_printf(err, "bad parent node (%u <= %llu)",
212                        id, k.k->p.offset);
213                 return -BCH_ERR_invalid_bkey;
214         }
215
216         if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
217                 prt_printf(err, "children not normalized");
218                 return -BCH_ERR_invalid_bkey;
219         }
220
221         if (s.v->children[0] &&
222             s.v->children[0] == s.v->children[1]) {
223                 prt_printf(err, "duplicate child nodes");
224                 return -BCH_ERR_invalid_bkey;
225         }
226
227         for (i = 0; i < 2; i++) {
228                 id = le32_to_cpu(s.v->children[i]);
229
230                 if (id >= k.k->p.offset) {
231                         prt_printf(err, "bad child node (%u >= %llu)",
232                                id, k.k->p.offset);
233                         return -BCH_ERR_invalid_bkey;
234                 }
235         }
236
237         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
238                 if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
239                     le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
240                         prt_printf(err, "skiplist not normalized");
241                         return -BCH_ERR_invalid_bkey;
242                 }
243
244                 for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
245                         id = le32_to_cpu(s.v->skip[i]);
246
247                         if (!id != !s.v->parent ||
248                             (s.v->parent &&
249                              id <= k.k->p.offset)) {
250                                 prt_printf(err, "bad skiplist node %u", id);
251                                 return -BCH_ERR_invalid_bkey;
252                         }
253                 }
254         }
255
256         return 0;
257 }
258
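/*
 * Mark function for KEY_TYPE_snapshot keys: mirrors the key into the
 * in-memory snapshot table so lockless lookups stay in sync with the
 * snapshots btree, and flags the filesystem for deferred cleanup when a
 * node marked BCH_SNAPSHOT_DELETED is seen.
 */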
259 int bch2_mark_snapshot(struct btree_trans *trans,
260                        enum btree_id btree, unsigned level,
261                        struct bkey_s_c old, struct bkey_s_c new,
262                        unsigned flags)
263 {
264         struct bch_fs *c = trans->c;
265         struct snapshot_t *t;
266         int ret = 0;
267
268         mutex_lock(&c->snapshot_table_lock);
269
270         t = snapshot_t_mut(c, new.k->p.offset);
271         if (!t) {
272                 ret = -BCH_ERR_ENOMEM_mark_snapshot;
273                 goto err;
274         }
275
276         if (new.k->type == KEY_TYPE_snapshot) {
277                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
278
279                 t->parent       = le32_to_cpu(s.v->parent);
280                 t->skip[0]      = le32_to_cpu(s.v->skip[0]);
281                 t->skip[1]      = le32_to_cpu(s.v->skip[1]);
282                 t->skip[2]      = le32_to_cpu(s.v->skip[2]);
283                 t->depth        = le32_to_cpu(s.v->depth);
284                 t->children[0]  = le32_to_cpu(s.v->children[0]);
285                 t->children[1]  = le32_to_cpu(s.v->children[1]);
286                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
287                 t->tree         = le32_to_cpu(s.v->tree);
288
289                 if (BCH_SNAPSHOT_DELETED(s.v))
290                         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
291         } else {
292                 t->parent       = 0;
293                 t->children[0]  = 0;
294                 t->children[1]  = 0;
295                 t->subvol       = 0;
296                 t->tree         = 0;
297         }
298 err:
299         mutex_unlock(&c->snapshot_table_lock);
300         return ret;
301 }
302
303 static int snapshot_lookup(struct btree_trans *trans, u32 id,
304                            struct bch_snapshot *s)
305 {
306         return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
307                                        BTREE_ITER_WITH_UPDATES, snapshot, s);
308 }
309
310 static int snapshot_live(struct btree_trans *trans, u32 id)
311 {
312         struct bch_snapshot v;
313         int ret;
314
315         if (!id)
316                 return 0;
317
318         ret = snapshot_lookup(trans, id, &v);
319         if (bch2_err_matches(ret, ENOENT))
320                 bch_err(trans->c, "snapshot node %u not found", id);
321         if (ret)
322                 return ret;
323
324         return !BCH_SNAPSHOT_DELETED(&v);
325 }
326
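/*
 * Compute a snapshot node's equivalence class: a node with exactly one live
 * (non-deleted) child is put in the same class as that child, otherwise the
 * node is its own class. bch2_delete_dead_snapshots() uses equivalence
 * classes to find keys that are redundant copies of each other.
 */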
327 static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
328 {
329         struct bch_fs *c = trans->c;
330         unsigned i, nr_live = 0, live_idx = 0;
331         struct bkey_s_c_snapshot snap;
332         u32 id = k.k->p.offset, child[2];
333
334         if (k.k->type != KEY_TYPE_snapshot)
335                 return 0;
336
337         snap = bkey_s_c_to_snapshot(k);
338
339         child[0] = le32_to_cpu(snap.v->children[0]);
340         child[1] = le32_to_cpu(snap.v->children[1]);
341
342         for (i = 0; i < 2; i++) {
343                 int ret = snapshot_live(trans, child[i]);
344
345                 if (ret < 0)
346                         return ret;
347
348                 if (ret)
349                         live_idx = i;
350                 nr_live += ret;
351         }
352
353         mutex_lock(&c->snapshot_table_lock);
354
355         snapshot_t_mut(c, id)->equiv = nr_live == 1
356                 ? snapshot_t_mut(c, child[live_idx])->equiv
357                 : id;
358
359         mutex_unlock(&c->snapshot_table_lock);
360
361         return 0;
362 }
363
364 /* fsck: */
365
366 static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
367 {
368         return snapshot_t(c, id)->children[child];
369 }
370
371 static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
372 {
373         return bch2_snapshot_child(c, id, 0);
374 }
375
376 static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
377 {
378         return bch2_snapshot_child(c, id, 1);
379 }
380
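/*
 * Advance to the next node in a depth-first walk of the snapshot tree, using
 * only the parent/child pointers in the in-memory snapshot table (no stack):
 * descend to the left child if there is one, otherwise walk back up until we
 * find an ancestor whose right child hasn't been visited yet. Returns 0 when
 * the walk is complete.
 */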
381 static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
382 {
383         u32 n, parent;
384
385         n = bch2_snapshot_left_child(c, id);
386         if (n)
387                 return n;
388
389         while ((parent = bch2_snapshot_parent(c, id))) {
390                 n = bch2_snapshot_right_child(c, parent);
391                 if (n && n != id)
392                         return n;
393                 id = parent;
394         }
395
396         return 0;
397 }
398
399 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
400 {
401         u32 id = snapshot_root;
402         u32 subvol = 0, s;
403
404         while (id) {
405                 s = snapshot_t(c, id)->subvol;
406
407                 if (s && (!subvol || s < subvol))
408                         subvol = s;
409
410                 id = bch2_snapshot_tree_next(c, id);
411         }
412
413         return subvol;
414 }
415
416 static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
417                                             u32 snapshot_root, u32 *subvol_id)
418 {
419         struct bch_fs *c = trans->c;
420         struct btree_iter iter;
421         struct bkey_s_c k;
422         struct bkey_s_c_subvolume s;
423         bool found = false;
424         int ret;
425
426         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
427                                      0, k, ret) {
428                 if (k.k->type != KEY_TYPE_subvolume)
429                         continue;
430
431                 s = bkey_s_c_to_subvolume(k);
432                 if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
433                         continue;
434                 if (!BCH_SUBVOLUME_SNAP(s.v)) {
435                         *subvol_id = s.k->p.offset;
436                         found = true;
437                         break;
438                 }
439         }
440
441         bch2_trans_iter_exit(trans, &iter);
442
443         if (!ret && !found) {
444                 struct bkey_i_subvolume *s;
445
446                 *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
447
448                 s = bch2_bkey_get_mut_typed(trans, &iter,
449                                             BTREE_ID_subvolumes, POS(0, *subvol_id),
450                                             0, subvolume);
451                 ret = PTR_ERR_OR_ZERO(s);
452                 if (ret)
453                         return ret;
454
455                 SET_BCH_SUBVOLUME_SNAP(&s->v, false);
456         }
457
458         return ret;
459 }
460
461 static int check_snapshot_tree(struct btree_trans *trans,
462                                struct btree_iter *iter,
463                                struct bkey_s_c k)
464 {
465         struct bch_fs *c = trans->c;
466         struct bkey_s_c_snapshot_tree st;
467         struct bch_snapshot s;
468         struct bch_subvolume subvol;
469         struct printbuf buf = PRINTBUF;
470         u32 root_id;
471         int ret;
472
473         if (k.k->type != KEY_TYPE_snapshot_tree)
474                 return 0;
475
476         st = bkey_s_c_to_snapshot_tree(k);
477         root_id = le32_to_cpu(st.v->root_snapshot);
478
479         ret = snapshot_lookup(trans, root_id, &s);
480         if (ret && !bch2_err_matches(ret, ENOENT))
481                 goto err;
482
483         if (fsck_err_on(ret ||
484                         root_id != bch2_snapshot_root(c, root_id) ||
485                         st.k->p.offset != le32_to_cpu(s.tree),
486                         c,
487                         "snapshot tree points to missing/incorrect snapshot:\n  %s",
488                         (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
489                 ret = bch2_btree_delete_at(trans, iter, 0);
490                 goto err;
491         }
492
493         ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
494                                  false, 0, &subvol);
495         if (ret && !bch2_err_matches(ret, ENOENT))
496                 goto err;
497
498         if (fsck_err_on(ret, c,
499                         "snapshot tree points to missing subvolume:\n  %s",
500                         (printbuf_reset(&buf),
501                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
502             fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
503                                                 le32_to_cpu(subvol.snapshot),
504                                                 root_id), c,
505                         "snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
506                         (printbuf_reset(&buf),
507                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
508             fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
509                         "snapshot tree points to snapshot subvolume:\n  %s",
510                         (printbuf_reset(&buf),
511                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
512                 struct bkey_i_snapshot_tree *u;
513                 u32 subvol_id;
514
515                 ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
516                 if (ret)
517                         goto err;
518
519                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
520                 ret = PTR_ERR_OR_ZERO(u);
521                 if (ret)
522                         goto err;
523
524                 u->v.master_subvol = cpu_to_le32(subvol_id);
525                 st = snapshot_tree_i_to_s_c(u);
526         }
527 err:
528 fsck_err:
529         printbuf_exit(&buf);
530         return ret;
531 }
532
533 /*
534  * For each snapshot_tree, make sure it points to the root of a snapshot tree
535  * and that snapshot entry points back to it, or delete it.
536  *
537  * And, make sure it points to a subvolume within that snapshot tree, or correct
538  * it to point to the oldest subvolume within that snapshot tree.
539  */
540 int bch2_check_snapshot_trees(struct bch_fs *c)
541 {
542         struct btree_iter iter;
543         struct bkey_s_c k;
544         int ret;
545
546         ret = bch2_trans_run(c,
547                 for_each_btree_key_commit(&trans, iter,
548                         BTREE_ID_snapshot_trees, POS_MIN,
549                         BTREE_ITER_PREFETCH, k,
550                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
551                 check_snapshot_tree(&trans, &iter, k)));
552
553         if (ret)
554                 bch_err(c, "error checking snapshot trees: %s", bch2_err_str(ret));
555         return ret;
556 }
557
558 /*
559  * Look up snapshot tree for @tree_id and find root,
560  * make sure @snap_id is a descendant:
561  */
562 static int snapshot_tree_ptr_good(struct btree_trans *trans,
563                                   u32 snap_id, u32 tree_id)
564 {
565         struct bch_snapshot_tree s_t;
566         int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
567
568         if (bch2_err_matches(ret, ENOENT))
569                 return 0;
570         if (ret)
571                 return ret;
572
573         return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
574 }
575
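/*
 * Pick @id or one of its ancestors at random, for filling in a new child
 * node's skip[] array: these are the skiplist entries get_ancestor_below()
 * uses to take large steps when walking towards a potential ancestor.
 */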
576 static u32 snapshot_rand_ancestor_get(struct bch_fs *c, u32 id)
577 {
578         const struct snapshot_t *s;
579
580         if (!id)
581                 return 0;
582
583         rcu_read_lock();
584         s = snapshot_t(c, id);
585         if (s->parent)
586                 id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
587         rcu_read_unlock();
588
589         return id;
590 }
591
592 static int snapshot_rand_ancestor_good(struct btree_trans *trans,
593                                        struct bch_snapshot s)
594 {
595         struct bch_snapshot a;
596         unsigned i;
597         int ret;
598
599         for (i = 0; i < 3; i++) {
600                 if (!s.parent != !s.skip[i])
601                         return false;
602
603                 if (!s.parent)
604                         continue;
605
606                 ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a);
607                 if (bch2_err_matches(ret, ENOENT))
608                         return false;
609                 if (ret)
610                         return ret;
611
612                 if (a.tree != s.tree)
613                         return false;
614         }
615
616         return true;
617 }
618
619 /*
620  * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
621  * its snapshot_tree pointer is correct (allocate new one if necessary), then
622  * update this node's pointer to root node's pointer:
623  */
624 static int snapshot_tree_ptr_repair(struct btree_trans *trans,
625                                     struct btree_iter *iter,
626                                     struct bkey_s_c k,
627                                     struct bch_snapshot *s)
628 {
629         struct bch_fs *c = trans->c;
630         struct btree_iter root_iter;
631         struct bch_snapshot_tree s_t;
632         struct bkey_s_c_snapshot root;
633         struct bkey_i_snapshot *u;
634         u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
635         int ret;
636
637         root = bch2_bkey_get_iter_typed(trans, &root_iter,
638                                BTREE_ID_snapshots, POS(0, root_id),
639                                BTREE_ITER_WITH_UPDATES, snapshot);
640         ret = bkey_err(root);
641         if (ret)
642                 goto err;
643
644         tree_id = le32_to_cpu(root.v->tree);
645
646         ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
647         if (ret && !bch2_err_matches(ret, ENOENT))
648                 return ret;
649
650         if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
651                 u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
652                 ret =   PTR_ERR_OR_ZERO(u) ?:
653                         snapshot_tree_create(trans, root_id,
654                                 bch2_snapshot_tree_oldest_subvol(c, root_id),
655                                 &tree_id);
656                 if (ret)
657                         goto err;
658
659                 u->v.tree = cpu_to_le32(tree_id);
660                 if (k.k->p.offset == root_id)
661                         *s = u->v;
662         }
663
664         if (k.k->p.offset != root_id) {
665                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
666                 ret = PTR_ERR_OR_ZERO(u);
667                 if (ret)
668                         goto err;
669
670                 u->v.tree = cpu_to_le32(tree_id);
671                 *s = u->v;
672         }
673 err:
674         bch2_trans_iter_exit(trans, &root_iter);
675         return ret;
676 }
677
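/*
 * fsck for a single snapshot node: check that parent and child pointers are
 * mutually consistent, that the subvol field agrees with BCH_SNAPSHOT_SUBVOL,
 * and that the tree, depth and skip[] fields are correct, repairing the
 * fields that can safely be rewritten in place.
 */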
678 static int check_snapshot(struct btree_trans *trans,
679                           struct btree_iter *iter,
680                           struct bkey_s_c k)
681 {
682         struct bch_fs *c = trans->c;
683         struct bch_snapshot s;
684         struct bch_subvolume subvol;
685         struct bch_snapshot v;
686         struct bkey_i_snapshot *u;
687         u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
688         u32 real_depth;
689         struct printbuf buf = PRINTBUF;
690         bool should_have_subvol;
691         u32 i, id;
692         int ret = 0;
693
694         if (k.k->type != KEY_TYPE_snapshot)
695                 return 0;
696
697         memset(&s, 0, sizeof(s));
698         memcpy(&s, k.v, bkey_val_bytes(k.k));
699
700         id = le32_to_cpu(s.parent);
701         if (id) {
702                 ret = snapshot_lookup(trans, id, &v);
703                 if (bch2_err_matches(ret, ENOENT))
704                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
705                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
706                 if (ret)
707                         goto err;
708
709                 if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
710                     le32_to_cpu(v.children[1]) != k.k->p.offset) {
711                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
712                                 id, k.k->p.offset);
713                         ret = -EINVAL;
714                         goto err;
715                 }
716         }
717
718         for (i = 0; i < 2 && s.children[i]; i++) {
719                 id = le32_to_cpu(s.children[i]);
720
721                 ret = snapshot_lookup(trans, id, &v);
722                 if (bch2_err_matches(ret, ENOENT))
723                         bch_err(c, "snapshot node %llu has nonexistent child %u",
724                                 k.k->p.offset, id);
725                 if (ret)
726                         goto err;
727
728                 if (le32_to_cpu(v.parent) != k.k->p.offset) {
729                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
730                                 id, le32_to_cpu(v.parent), k.k->p.offset);
731                         ret = -EINVAL;
732                         goto err;
733                 }
734         }
735
736         should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
737                 !BCH_SNAPSHOT_DELETED(&s);
738
739         if (should_have_subvol) {
740                 id = le32_to_cpu(s.subvol);
741                 ret = bch2_subvolume_get(trans, id, false, 0, &subvol);
742                 if (bch2_err_matches(ret, ENOENT))
743                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
744                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
745                 if (ret)
746                         goto err;
747
748                 if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
749                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
750                                 k.k->p.offset);
751                         ret = -EINVAL;
752                         goto err;
753                 }
754         } else {
755                 if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n  %s",
756                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
757                         u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
758                         ret = PTR_ERR_OR_ZERO(u);
759                         if (ret)
760                                 goto err;
761
762                         u->v.subvol = 0;
763                         s = u->v;
764                 }
765         }
766
767         ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
768         if (ret < 0)
769                 goto err;
770
771         if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n  %s",
772                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
773                 ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
774                 if (ret)
775                         goto err;
776         }
777         ret = 0;
778
779         real_depth = bch2_snapshot_depth(c, parent_id);
780
781         if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, c,
782                         "snapshot with incorrect depth fields, should be %u:\n  %s",
783                         real_depth,
784                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
785                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
786                 ret = PTR_ERR_OR_ZERO(u);
787                 if (ret)
788                         goto err;
789
790                 u->v.depth = cpu_to_le32(real_depth);
791                 s = u->v;
792         }
793
794         ret = snapshot_rand_ancestor_good(trans, s);
795         if (ret < 0)
796                 goto err;
797
798         if (fsck_err_on(!ret, c, "snapshot with bad rand_ancestor field:\n  %s",
799                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
800                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
801                 ret = PTR_ERR_OR_ZERO(u);
802                 if (ret)
803                         goto err;
804
805                 for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
806                         u->v.skip[i] = cpu_to_le32(snapshot_rand_ancestor_get(c, parent_id));
807
808                 bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_int);
809                 s = u->v;
810         }
811         ret = 0;
812 err:
813 fsck_err:
814         printbuf_exit(&buf);
815         return ret;
816 }
817
818 int bch2_check_snapshots(struct bch_fs *c)
819 {
820         struct btree_iter iter;
821         struct bkey_s_c k;
822         int ret;
823
824         /*
825          * We iterate backwards as checking/fixing the depth field requires that
826          * the parent's depth already be correct:
827          */
828         ret = bch2_trans_run(c,
829                 for_each_btree_key_reverse_commit(&trans, iter,
830                         BTREE_ID_snapshots, POS_MAX,
831                         BTREE_ITER_PREFETCH, k,
832                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
833                 check_snapshot(&trans, &iter, k)));
834         if (ret)
835                 bch_err_fn(c, ret);
836         return ret;
837 }
838
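/*
 * fsck for a single subvolume: make sure it points at an existing snapshot,
 * finish deleting it if it was left unlinked, and verify that a subvolume
 * not flagged as a snapshot really is the master subvolume of its snapshot
 * tree (setting BCH_SUBVOLUME_SNAP otherwise).
 */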
839 static int check_subvol(struct btree_trans *trans,
840                         struct btree_iter *iter,
841                         struct bkey_s_c k)
842 {
843         struct bch_fs *c = trans->c;
844         struct bkey_s_c_subvolume subvol;
845         struct bch_snapshot snapshot;
846         unsigned snapid;
847         int ret = 0;
848
849         if (k.k->type != KEY_TYPE_subvolume)
850                 return 0;
851
852         subvol = bkey_s_c_to_subvolume(k);
853         snapid = le32_to_cpu(subvol.v->snapshot);
854         ret = snapshot_lookup(trans, snapid, &snapshot);
855
856         if (bch2_err_matches(ret, ENOENT))
857                 bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
858                         k.k->p.offset, snapid);
859         if (ret)
860                 return ret;
861
862         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
863                 bch2_fs_lazy_rw(c);
864
865                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
866                 if (ret)
867                         bch_err(c, "error deleting subvolume %llu: %s",
868                                 iter->pos.offset, bch2_err_str(ret));
869                 return ret ?: -BCH_ERR_transaction_restart_nested;
870         }
871
872         if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
873                 u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
874                 u32 snapshot_tree;
875                 struct bch_snapshot_tree st;
876
877                 rcu_read_lock();
878                 snapshot_tree = snapshot_t(c, snapshot_root)->tree;
879                 rcu_read_unlock();
880
881                 ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
882
883                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
884                                 "%s: snapshot tree %u not found", __func__, snapshot_tree);
885
886                 if (ret)
887                         return ret;
888
889                 if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c,
890                                 "subvolume %llu is not set as snapshot but is not master subvolume",
891                                 k.k->p.offset)) {
892                         struct bkey_i_subvolume *s =
893                                 bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
894                         ret = PTR_ERR_OR_ZERO(s);
895                         if (ret)
896                                 return ret;
897
898                         SET_BCH_SUBVOLUME_SNAP(&s->v, true);
899                 }
900         }
901
902 fsck_err:
903         return ret;
904 }
905
906 int bch2_check_subvols(struct bch_fs *c)
907 {
908         struct btree_iter iter;
909         struct bkey_s_c k;
910         int ret;
911
912         ret = bch2_trans_run(c,
913                 for_each_btree_key_commit(&trans, iter,
914                         BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
915                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
916                 check_subvol(&trans, &iter, k)));
917         if (ret)
918                 bch_err_fn(c, ret);
919         return ret;
920 }
921
922 void bch2_fs_snapshots_exit(struct bch_fs *c)
923 {
924         kvfree(c->snapshots);
925 }
926
927 int bch2_snapshots_read(struct bch_fs *c)
928 {
929         struct btree_iter iter;
930         struct bkey_s_c k;
931         int ret = 0;
932
933         ret = bch2_trans_run(c,
934                 for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
935                            POS_MIN, 0, k,
936                         bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
937                         bch2_snapshot_set_equiv(&trans, k)));
938         if (ret)
939                 bch_err_fn(c, ret);
940         return ret;
941 }
942
943 /*
944  * Mark a snapshot as deleted, for future cleanup:
945  */
946 static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
947 {
948         struct btree_iter iter;
949         struct bkey_i_snapshot *s;
950         int ret = 0;
951
952         s = bch2_bkey_get_mut_typed(trans, &iter,
953                                     BTREE_ID_snapshots, POS(0, id),
954                                     0, snapshot);
955         ret = PTR_ERR_OR_ZERO(s);
956         if (unlikely(ret)) {
957                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
958                                         trans->c, "missing snapshot %u", id);
959                 return ret;
960         }
961
962         /* already deleted? */
963         if (BCH_SNAPSHOT_DELETED(&s->v))
964                 goto err;
965
966         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
967         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
968         s->v.subvol = 0;
969 err:
970         bch2_trans_iter_exit(trans, &iter);
971         return ret;
972 }
973
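/*
 * Remove a snapshot node that has already been marked deleted: unlink it from
 * its parent's children[] (keeping them normalized), or, if it was the root,
 * point the snapshot_tree entry at the remaining child or delete the tree
 * entry entirely, then delete the snapshot key itself.
 */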
974 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
975 {
976         struct bch_fs *c = trans->c;
977         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
978         struct btree_iter tree_iter = (struct btree_iter) { NULL };
979         struct bkey_s_c_snapshot s;
980         u32 parent_id;
981         unsigned i;
982         int ret = 0;
983
984         s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
985                                      BTREE_ITER_INTENT, snapshot);
986         ret = bkey_err(s);
987         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
988                                 "missing snapshot %u", id);
989
990         if (ret)
991                 goto err;
992
993         BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
994         parent_id = le32_to_cpu(s.v->parent);
995
996         if (parent_id) {
997                 struct bkey_i_snapshot *parent;
998
999                 parent = bch2_bkey_get_mut_typed(trans, &p_iter,
1000                                      BTREE_ID_snapshots, POS(0, parent_id),
1001                                      0, snapshot);
1002                 ret = PTR_ERR_OR_ZERO(parent);
1003                 if (unlikely(ret)) {
1004                         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1005                                                 "missing snapshot %u", parent_id);
1006                         goto err;
1007                 }
1008
1009                 for (i = 0; i < 2; i++)
1010                         if (le32_to_cpu(parent->v.children[i]) == id)
1011                                 break;
1012
1013                 if (i == 2)
1014                         bch_err(c, "snapshot %u missing child pointer to %u",
1015                                 parent_id, id);
1016                 else
1017                         parent->v.children[i] = 0;
1018
1019                 if (le32_to_cpu(parent->v.children[0]) <
1020                     le32_to_cpu(parent->v.children[1]))
1021                         swap(parent->v.children[0],
1022                              parent->v.children[1]);
1023         } else {
1024                 /*
1025                  * We're deleting the root of a snapshot tree: update the
1026                  * snapshot_tree entry to point to the new root, or delete it if
1027                  * this is the last snapshot ID in this tree:
1028                  */
1029                 struct bkey_i_snapshot_tree *s_t;
1030
1031                 BUG_ON(s.v->children[1]);
1032
1033                 s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
1034                                 BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
1035                                 0, snapshot_tree);
1036                 ret = PTR_ERR_OR_ZERO(s_t);
1037                 if (ret)
1038                         goto err;
1039
1040                 if (s.v->children[0]) {
1041                         s_t->v.root_snapshot = s.v->children[0];
1042                 } else {
1043                         s_t->k.type = KEY_TYPE_deleted;
1044                         set_bkey_val_u64s(&s_t->k, 0);
1045                 }
1046         }
1047
1048         ret = bch2_btree_delete_at(trans, &iter, 0);
1049 err:
1050         bch2_trans_iter_exit(trans, &tree_iter);
1051         bch2_trans_iter_exit(trans, &p_iter);
1052         bch2_trans_iter_exit(trans, &iter);
1053         return ret;
1054 }
1055
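/*
 * Allocate @nr_snapids new snapshot nodes as children of @parent (0 when
 * creating the root of a new tree): new IDs are taken just below the lowest
 * existing snapshot ID by stepping backwards one slot at a time, and each
 * node is initialized with its tree, depth and randomized skip[] fields and
 * flagged as owning a subvolume.
 */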
1056 static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
1057                           u32 *new_snapids,
1058                           u32 *snapshot_subvols,
1059                           unsigned nr_snapids)
1060 {
1061         struct bch_fs *c = trans->c;
1062         struct btree_iter iter;
1063         struct bkey_i_snapshot *n;
1064         struct bkey_s_c k;
1065         unsigned i, j;
1066         u32 depth = bch2_snapshot_depth(c, parent);
1067         int ret;
1068
1069         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
1070                              POS_MIN, BTREE_ITER_INTENT);
1071         k = bch2_btree_iter_peek(&iter);
1072         ret = bkey_err(k);
1073         if (ret)
1074                 goto err;
1075
1076         for (i = 0; i < nr_snapids; i++) {
1077                 k = bch2_btree_iter_prev_slot(&iter);
1078                 ret = bkey_err(k);
1079                 if (ret)
1080                         goto err;
1081
1082                 if (!k.k || !k.k->p.offset) {
1083                         ret = -BCH_ERR_ENOSPC_snapshot_create;
1084                         goto err;
1085                 }
1086
1087                 n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
1088                 ret = PTR_ERR_OR_ZERO(n);
1089                 if (ret)
1090                         goto err;
1091
1092                 n->v.flags      = 0;
1093                 n->v.parent     = cpu_to_le32(parent);
1094                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
1095                 n->v.tree       = cpu_to_le32(tree);
1096                 n->v.depth      = cpu_to_le32(depth);
1097
1098                 for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
1099                         n->v.skip[j] = cpu_to_le32(snapshot_rand_ancestor_get(c, parent));
1100
1101                 bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_int);
1102                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
1103
1104                 ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
1105                                          bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
1106                 if (ret)
1107                         goto err;
1108
1109                 new_snapids[i]  = iter.pos.offset;
1110         }
1111 err:
1112         bch2_trans_iter_exit(trans, &iter);
1113         return ret;
1114 }
1115
1116 /*
1117  * Create new snapshot IDs as children of an existing snapshot ID:
1118  */
1119 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
1120                               u32 *new_snapids,
1121                               u32 *snapshot_subvols,
1122                               unsigned nr_snapids)
1123 {
1124         struct btree_iter iter;
1125         struct bkey_i_snapshot *n_parent;
1126         int ret = 0;
1127
1128         n_parent = bch2_bkey_get_mut_typed(trans, &iter,
1129                         BTREE_ID_snapshots, POS(0, parent),
1130                         0, snapshot);
1131         ret = PTR_ERR_OR_ZERO(n_parent);
1132         if (unlikely(ret)) {
1133                 if (bch2_err_matches(ret, ENOENT))
1134                         bch_err(trans->c, "snapshot %u not found", parent);
1135                 return ret;
1136         }
1137
1138         if (n_parent->v.children[0] || n_parent->v.children[1]) {
1139                 bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
1140                 ret = -EINVAL;
1141                 goto err;
1142         }
1143
1144         ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
1145                              new_snapids, snapshot_subvols, nr_snapids);
1146         if (ret)
1147                 goto err;
1148
1149         n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
1150         n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
1151         n_parent->v.subvol = 0;
1152         SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
1153 err:
1154         bch2_trans_iter_exit(trans, &iter);
1155         return ret;
1156 }
1157
1158 /*
1159  * Create a snapshot node that is the root of a new tree:
1160  */
1161 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
1162                               u32 *new_snapids,
1163                               u32 *snapshot_subvols,
1164                               unsigned nr_snapids)
1165 {
1166         struct bkey_i_snapshot_tree *n_tree;
1167         int ret;
1168
1169         n_tree = __snapshot_tree_create(trans);
1170         ret =   PTR_ERR_OR_ZERO(n_tree) ?:
1171                 create_snapids(trans, 0, n_tree->k.p.offset,
1172                              new_snapids, snapshot_subvols, nr_snapids);
1173         if (ret)
1174                 return ret;
1175
1176         n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
1177         n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
1178         return 0;
1179 }
1180
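/*
 * Create new snapshot node(s): with a nonzero @parent exactly two children
 * are created (the parent becomes an interior node and stops owning a
 * subvolume); with @parent == 0 a single node is created as the root of a
 * new snapshot tree.
 */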
1181 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1182                               u32 *new_snapids,
1183                               u32 *snapshot_subvols,
1184                               unsigned nr_snapids)
1185 {
1186         BUG_ON((parent == 0) != (nr_snapids == 1));
1187         BUG_ON((parent != 0) != (nr_snapids == 2));
1188
1189         return parent
1190                 ? bch2_snapshot_node_create_children(trans, parent,
1191                                 new_snapids, snapshot_subvols, nr_snapids)
1192                 : bch2_snapshot_node_create_tree(trans,
1193                                 new_snapids, snapshot_subvols, nr_snapids);
1194
1195 }
1196
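/*
 * Called for every key in a snapshot-aware btree while deleting dead
 * snapshots: drop the key if its snapshot ID is in the deleted list, or if
 * we've already kept a key at the same position in an equivalent snapshot
 * ID, making this copy redundant.
 */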
1197 static int snapshot_delete_key(struct btree_trans *trans,
1198                                struct btree_iter *iter,
1199                                struct bkey_s_c k,
1200                                snapshot_id_list *deleted,
1201                                snapshot_id_list *equiv_seen,
1202                                struct bpos *last_pos)
1203 {
1204         struct bch_fs *c = trans->c;
1205         u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
1206
1207         if (!bkey_eq(k.k->p, *last_pos))
1208                 equiv_seen->nr = 0;
1209         *last_pos = k.k->p;
1210
1211         if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
1212             snapshot_list_has_id(equiv_seen, equiv)) {
1213                 return bch2_btree_delete_at(trans, iter,
1214                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1215         } else {
1216                 return snapshot_list_add(c, equiv_seen, equiv);
1217         }
1218 }
1219
1220 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
1221                                           struct bkey_s_c k)
1222 {
1223         struct bkey_s_c_snapshot snap;
1224         u32 children[2];
1225         int ret;
1226
1227         if (k.k->type != KEY_TYPE_snapshot)
1228                 return 0;
1229
1230         snap = bkey_s_c_to_snapshot(k);
1231         if (BCH_SNAPSHOT_DELETED(snap.v) ||
1232             BCH_SNAPSHOT_SUBVOL(snap.v))
1233                 return 0;
1234
1235         children[0] = le32_to_cpu(snap.v->children[0]);
1236         children[1] = le32_to_cpu(snap.v->children[1]);
1237
1238         ret   = snapshot_live(trans, children[0]) ?:
1239                 snapshot_live(trans, children[1]);
1240         if (ret < 0)
1241                 return ret;
1242
1243         if (!ret)
1244                 return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
1245         return 0;
1246 }
1247
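/*
 * Top level of deferred snapshot deletion: mark interior nodes with no live
 * children and no subvolume as deleted, recompute equivalence classes,
 * collect the set of deleted IDs, sweep every snapshot-aware btree dropping
 * keys that belong to deleted or redundant snapshot IDs, and finally delete
 * the snapshot nodes themselves.
 */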
1248 int bch2_delete_dead_snapshots(struct bch_fs *c)
1249 {
1250         struct btree_trans trans;
1251         struct btree_iter iter;
1252         struct bkey_s_c k;
1253         struct bkey_s_c_snapshot snap;
1254         snapshot_id_list deleted = { 0 };
1255         u32 i, id;
1256         int ret = 0;
1257
1258         if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
1259                 return 0;
1260
1261         if (!test_bit(BCH_FS_STARTED, &c->flags)) {
1262                 ret = bch2_fs_read_write_early(c);
1263                 if (ret) {
1264                         bch_err(c, "error deleting dead snapshots: error going rw: %s", bch2_err_str(ret));
1265                         return ret;
1266                 }
1267         }
1268
1269         bch2_trans_init(&trans, c, 0, 0);
1270
1271         /*
1272          * For every snapshot node: If we have no live children and it's not
1273          * pointed to by a subvolume, delete it:
1274          */
1275         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
1276                         POS_MIN, 0, k,
1277                         NULL, NULL, 0,
1278                 bch2_delete_redundant_snapshot(&trans, &iter, k));
1279         if (ret) {
1280                 bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
1281                 goto err;
1282         }
1283
1284         ret = for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
1285                            POS_MIN, 0, k,
1286                 bch2_snapshot_set_equiv(&trans, k));
1287         if (ret) {
1288                 bch_err(c, "error in bch2_snapshot_set_equiv: %s", bch2_err_str(ret));
1289                 goto err;
1290         }
1291
1292         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
1293                            POS_MIN, 0, k, ret) {
1294                 if (k.k->type != KEY_TYPE_snapshot)
1295                         continue;
1296
1297                 snap = bkey_s_c_to_snapshot(k);
1298                 if (BCH_SNAPSHOT_DELETED(snap.v)) {
1299                         ret = snapshot_list_add(c, &deleted, k.k->p.offset);
1300                         if (ret)
1301                                 break;
1302                 }
1303         }
1304         bch2_trans_iter_exit(&trans, &iter);
1305
1306         if (ret) {
1307                 bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
1308                 goto err;
1309         }
1310
1311         for (id = 0; id < BTREE_ID_NR; id++) {
1312                 struct bpos last_pos = POS_MIN;
1313                 snapshot_id_list equiv_seen = { 0 };
1314
1315                 if (!btree_type_has_snapshots(id))
1316                         continue;
1317
1318                 ret = for_each_btree_key_commit(&trans, iter,
1319                                 id, POS_MIN,
1320                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1321                                 NULL, NULL, BTREE_INSERT_NOFAIL,
1322                         snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
1323
1324                 darray_exit(&equiv_seen);
1325
1326                 if (ret) {
1327                         bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
1328                         goto err;
1329                 }
1330         }
1331
1332         for (i = 0; i < deleted.nr; i++) {
1333                 ret = commit_do(&trans, NULL, NULL, 0,
1334                         bch2_snapshot_node_delete(&trans, deleted.data[i]));
1335                 if (ret) {
1336                         bch_err(c, "error deleting snapshot %u: %s",
1337                                 deleted.data[i], bch2_err_str(ret));
1338                         goto err;
1339                 }
1340         }
1341
1342         clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
1343 err:
1344         darray_exit(&deleted);
1345         bch2_trans_exit(&trans);
1346         if (ret)
1347                 bch_err_fn(c, ret);
1348         return ret;
1349 }
1350
1351 static void bch2_delete_dead_snapshots_work(struct work_struct *work)
1352 {
1353         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1354
1355         bch2_delete_dead_snapshots(c);
1356         bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1357 }
1358
1359 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1360 {
1361         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
1362             !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
1363                 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1364 }
1365
1366 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
1367                                            struct btree_trans_commit_hook *h)
1368 {
1369         struct bch_fs *c = trans->c;
1370
1371         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
1372
1373         if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
1374                 return 0;
1375
1376         bch2_delete_dead_snapshots_async(c);
1377         return 0;
1378 }
1379
1380 /* Subvolumes: */
1381
1382 int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
1383                            unsigned flags, struct printbuf *err)
1384 {
1385         if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
1386             bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
1387                 prt_printf(err, "invalid pos");
1388                 return -BCH_ERR_invalid_bkey;
1389         }
1390
1391         return 0;
1392 }
1393
1394 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
1395                             struct bkey_s_c k)
1396 {
1397         struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1398
1399         prt_printf(out, "root %llu snapshot id %u",
1400                    le64_to_cpu(s.v->inode),
1401                    le32_to_cpu(s.v->snapshot));
1402
1403         if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
1404                 prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
1405 }
1406
1407 static __always_inline int
1408 bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
1409                            bool inconsistent_if_not_found,
1410                            int iter_flags,
1411                            struct bch_subvolume *s)
1412 {
1413         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
1414                                           iter_flags, subvolume, s);
1415         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
1416                                 inconsistent_if_not_found,
1417                                 trans->c, "missing subvolume %u", subvol);
1418         return ret;
1419 }
1420
1421 int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
1422                        bool inconsistent_if_not_found,
1423                        int iter_flags,
1424                        struct bch_subvolume *s)
1425 {
1426         return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
1427 }
1428
1429 int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
1430                              struct bch_subvolume *subvol)
1431 {
1432         struct bch_snapshot snap;
1433
1434         return  snapshot_lookup(trans, snapshot, &snap) ?:
1435                 bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
1436 }
1437
1438 int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
1439                                 u32 *snapid)
1440 {
1441         struct btree_iter iter;
1442         struct bkey_s_c k;
1443         int ret;
1444
1445         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
1446                                BTREE_ITER_CACHED|
1447                                BTREE_ITER_WITH_UPDATES);
1448         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -BCH_ERR_ENOENT_subvolume;
1449
1450         if (likely(!ret))
1451                 *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
1452         else if (bch2_err_matches(ret, ENOENT))
1453                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
1454         bch2_trans_iter_exit(trans, &iter);
1455         return ret;
1456 }
1457
1458 static int bch2_subvolume_reparent(struct btree_trans *trans,
1459                                    struct btree_iter *iter,
1460                                    struct bkey_s_c k,
1461                                    u32 old_parent, u32 new_parent)
1462 {
1463         struct bkey_i_subvolume *s;
1464         int ret;
1465
1466         if (k.k->type != KEY_TYPE_subvolume)
1467                 return 0;
1468
1469         if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
1470             le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
1471                 return 0;
1472
1473         s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
1474         ret = PTR_ERR_OR_ZERO(s);
1475         if (ret)
1476                 return ret;
1477
1478         s->v.parent = cpu_to_le32(new_parent);
1479         return 0;
1480 }
1481
1482 /*
1483  * Scan for subvolumes with parent @subvolid_to_delete, reparent:
1484  */
1485 static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
1486 {
1487         struct btree_iter iter;
1488         struct bkey_s_c k;
1489         struct bch_subvolume s;
1490
1491         return lockrestart_do(trans,
1492                         bch2_subvolume_get(trans, subvolid_to_delete, true,
1493                                    BTREE_ITER_CACHED, &s)) ?:
1494                 for_each_btree_key_commit(trans, iter,
1495                                 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
1496                                 NULL, NULL, BTREE_INSERT_NOFAIL,
1497                         bch2_subvolume_reparent(trans, &iter, k,
1498                                         subvolid_to_delete, le32_to_cpu(s.parent)));
1499 }
1500
1501 /*
1502  * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
1503  * deletion/cleanup:
1504  */
1505 static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
1506 {
1507         struct btree_iter iter;
1508         struct bkey_s_c_subvolume subvol;
1509         struct btree_trans_commit_hook *h;
1510         u32 snapid;
1511         int ret = 0;
1512
1513         subvol = bch2_bkey_get_iter_typed(trans, &iter,
1514                                 BTREE_ID_subvolumes, POS(0, subvolid),
1515                                 BTREE_ITER_CACHED|BTREE_ITER_INTENT,
1516                                 subvolume);
1517         ret = bkey_err(subvol);
1518         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
1519                                 "missing subvolume %u", subvolid);
1520         if (ret)
1521                 return ret;
1522
1523         snapid = le32_to_cpu(subvol.v->snapshot);
1524
1525         ret = bch2_btree_delete_at(trans, &iter, 0);
1526         if (ret)
1527                 goto err;
1528
1529         ret = bch2_snapshot_node_set_deleted(trans, snapid);
1530         if (ret)
1531                 goto err;
1532
1533         h = bch2_trans_kmalloc(trans, sizeof(*h));
1534         ret = PTR_ERR_OR_ZERO(h);
1535         if (ret)
1536                 goto err;
1537
1538         h->fn = bch2_delete_dead_snapshots_hook;
1539         bch2_trans_commit_hook(trans, h);
1540 err:
1541         bch2_trans_iter_exit(trans, &iter);
1542         return ret;
1543 }
1544
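/*
 * Reparent any subvolumes still pointing at @subvolid, then delete @subvolid
 * (and mark its snapshot for deletion) in its own NOFAIL transaction commit.
 */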
1545 static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
1546 {
1547         return bch2_subvolumes_reparent(trans, subvolid) ?:
1548                 commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
1549                           __bch2_subvolume_delete(trans, subvolid));
1550 }
1551
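/*
 * Worker: repeatedly drain c->snapshots_unlinked, evict the affected
 * subvolumes' inodes from the pagecache, then delete each subvolume.
 * Drops the BCH_WRITE_REF_snapshot_delete_pagecache ref taken by the
 * commit hook before returning.
 */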
1552 static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
1553 {
1554         struct bch_fs *c = container_of(work, struct bch_fs,
1555                                 snapshot_wait_for_pagecache_and_delete_work);
1556         snapshot_id_list s;
1557         u32 *id;
1558         int ret = 0;
1559
1560         while (!ret) {
1561                 mutex_lock(&c->snapshots_unlinked_lock);
1562                 s = c->snapshots_unlinked;
1563                 darray_init(&c->snapshots_unlinked);
1564                 mutex_unlock(&c->snapshots_unlinked_lock);
1565
1566                 if (!s.nr)
1567                         break;
1568
1569                 bch2_evict_subvolume_inodes(c, &s);
1570
1571                 for (id = s.data; id < s.data + s.nr; id++) {
1572                         ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id));
1573                         if (ret) {
1574                                 bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
1575                                 break;
1576                         }
1577                 }
1578
1579                 darray_exit(&s);
1580         }
1581
1582         bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
1583 }
1584
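/* Commit-hook payload: which subvolume to queue for deferred deletion. */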
1585 struct subvolume_unlink_hook {
1586         struct btree_trans_commit_hook  h;
1587         u32                             subvol;
1588 };
1589
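/*
 * Transaction commit hook: add the unlinked subvolume to
 * c->snapshots_unlinked and, if the filesystem is still writable, kick off
 * the pagecache-flush-and-delete worker.
 */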
1590 static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
1591                                                       struct btree_trans_commit_hook *_h)
1592 {
1593         struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
1594         struct bch_fs *c = trans->c;
1595         int ret = 0;
1596
1597         mutex_lock(&c->snapshots_unlinked_lock);
1598         if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
1599                 ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
1600         mutex_unlock(&c->snapshots_unlinked_lock);
1601
1602         if (ret)
1603                 return ret;
1604
1605         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
1606                 return -EROFS;
1607
1608         if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
1609                 bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
1610         return 0;
1611 }
1612
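/*
 * Mark @subvolid as unlinked; actual deletion is deferred to a commit hook,
 * so it only happens if (and after) this transaction commits.
 */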
1613 int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
1614 {
1615         struct btree_iter iter;
1616         struct bkey_i_subvolume *n;
1617         struct subvolume_unlink_hook *h;
1618         int ret = 0;
1619
1620         h = bch2_trans_kmalloc(trans, sizeof(*h));
1621         ret = PTR_ERR_OR_ZERO(h);
1622         if (ret)
1623                 return ret;
1624
1625         h->h.fn         = bch2_subvolume_wait_for_pagecache_and_delete_hook;
1626         h->subvol       = subvolid;
1627         bch2_trans_commit_hook(trans, &h->h);
1628
1629         n = bch2_bkey_get_mut_typed(trans, &iter,
1630                         BTREE_ID_subvolumes, POS(0, subvolid),
1631                         BTREE_ITER_CACHED, subvolume);
1632         ret = PTR_ERR_OR_ZERO(n);
1633         if (unlikely(ret)) {
1634                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
1635                                         "missing subvolume %u", subvolid);
1636                 return ret;
1637         }
1638
1639         SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
1640         bch2_trans_iter_exit(trans, &iter);
1641         return ret;
1642 }
1643
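/*
 * Create a new subvolume rooted at @inode. If @src_subvolid is nonzero this is
 * a snapshot: the source subvolume is moved to a fresh snapshot node and the
 * new subvolume becomes its sibling. Returns the new subvolume and snapshot
 * IDs via @new_subvolid and @new_snapshotid.
 */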
1644 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
1645                           u32 src_subvolid,
1646                           u32 *new_subvolid,
1647                           u32 *new_snapshotid,
1648                           bool ro)
1649 {
1650         struct bch_fs *c = trans->c;
1651         struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
1652         struct bkey_i_subvolume *new_subvol = NULL;
1653         struct bkey_i_subvolume *src_subvol = NULL;
1654         u32 parent = 0, new_nodes[2], snapshot_subvols[2];
1655         int ret = 0;
1656
1657         ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
1658                                 BTREE_ID_subvolumes, POS(0, U32_MAX));
1659         if (ret == -BCH_ERR_ENOSPC_btree_slot)
1660                 ret = -BCH_ERR_ENOSPC_subvolume_create;
1661         if (ret)
1662                 return ret;
1663
1664         snapshot_subvols[0] = dst_iter.pos.offset;
1665         snapshot_subvols[1] = src_subvolid;
1666
1667         if (src_subvolid) {
1668                 /* Creating a snapshot: */
1669
1670                 src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
1671                                 BTREE_ID_subvolumes, POS(0, src_subvolid),
1672                                 BTREE_ITER_CACHED, subvolume);
1673                 ret = PTR_ERR_OR_ZERO(src_subvol);
1674                 if (unlikely(ret)) {
1675                         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1676                                                 "subvolume %u not found", src_subvolid);
1677                         goto err;
1678                 }
1679
1680                 parent = le32_to_cpu(src_subvol->v.snapshot);
1681         }
1682
1683         ret = bch2_snapshot_node_create(trans, parent, new_nodes,
1684                                         snapshot_subvols,
1685                                         src_subvolid ? 2 : 1);
1686         if (ret)
1687                 goto err;
1688
1689         if (src_subvolid) {
1690                 src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
1691                 ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
1692                 if (ret)
1693                         goto err;
1694         }
1695
1696         new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
1697         ret = PTR_ERR_OR_ZERO(new_subvol);
1698         if (ret)
1699                 goto err;
1700
1701         new_subvol->v.flags     = 0;
1702         new_subvol->v.snapshot  = cpu_to_le32(new_nodes[0]);
1703         new_subvol->v.inode     = cpu_to_le64(inode);
1704         new_subvol->v.parent    = cpu_to_le32(src_subvolid);
1705         new_subvol->v.otime.lo  = cpu_to_le64(bch2_current_time(c));
1706         new_subvol->v.otime.hi  = 0;
1707
1708         SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
1709         SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
1710
1711         *new_subvolid   = new_subvol->k.p.offset;
1712         *new_snapshotid = new_nodes[0];
1713 err:
1714         bch2_trans_iter_exit(trans, &src_iter);
1715         bch2_trans_iter_exit(trans, &dst_iter);
1716         return ret;
1717 }
1718
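/* Set up the snapshot/subvolume deletion work items and locks at fs init. */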
1719 int bch2_fs_subvolumes_init(struct bch_fs *c)
1720 {
1721         INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
1722         INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
1723                   bch2_subvolume_wait_for_pagecache_and_delete);
1724         mutex_init(&c->snapshots_unlinked_lock);
1725         return 0;
1726 }