]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/subvolume.c
Update bcachefs sources to 070ec8d07b bcachefs: Snapshot depth, skiplist fields
[bcachefs-tools-debian] / libbcachefs / subvolume.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_update.h"
6 #include "errcode.h"
7 #include "error.h"
8 #include "fs.h"
9 #include "subvolume.h"
10
11 #include <linux/random.h>
12
13 static int bch2_subvolume_delete(struct btree_trans *, u32);
14
15 static inline u32 get_ancestor_below(struct bch_fs *c, u32 id, u32 ancestor)
16 {
17         struct snapshot_t *s = snapshot_t(c, id);
18
19         if (s->skip[2] <= ancestor)
20                 return s->skip[2];
21         if (s->skip[1] <= ancestor)
22                 return s->skip[1];
23         if (s->skip[0] <= ancestor)
24                 return s->skip[0];
25         return s->parent;
26 }
27
28 bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
29 {
30         EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
31
32         while (id && id < ancestor)
33                 id = get_ancestor_below(c, id, ancestor);
34
35         return id == ancestor;
36 }
37
38 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
39 {
40         while (id && id < ancestor)
41                 id = snapshot_t(c, id)->parent;
42
43         return id == ancestor;
44 }
45
46 /* Snapshot tree: */
47
48 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
49                                 struct bkey_s_c k)
50 {
51         struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
52
53         prt_printf(out, "subvol %u root snapshot %u",
54                    le32_to_cpu(t.v->master_subvol),
55                    le32_to_cpu(t.v->root_snapshot));
56 }
57
58 int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
59                                enum bkey_invalid_flags flags,
60                                struct printbuf *err)
61 {
62         if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
63             bkey_lt(k.k->p, POS(0, 1))) {
64                 prt_printf(err, "bad pos");
65                 return -BCH_ERR_invalid_bkey;
66         }
67
68         return 0;
69 }
70
71 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
72                               struct bch_snapshot_tree *s)
73 {
74         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
75                                           BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
76
77         if (bch2_err_matches(ret, ENOENT))
78                 ret = -BCH_ERR_ENOENT_snapshot_tree;
79         return ret;
80 }
81
82 static struct bkey_i_snapshot_tree *
83 __snapshot_tree_create(struct btree_trans *trans)
84 {
85         struct btree_iter iter;
86         int ret = bch2_bkey_get_empty_slot(trans, &iter,
87                         BTREE_ID_snapshot_trees, POS(0, U32_MAX));
88         struct bkey_i_snapshot_tree *s_t;
89
90         if (ret == -BCH_ERR_ENOSPC_btree_slot)
91                 ret = -BCH_ERR_ENOSPC_snapshot_tree;
92         if (ret)
93                 return ERR_PTR(ret);
94
95         s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
96         ret = PTR_ERR_OR_ZERO(s_t);
97         bch2_trans_iter_exit(trans, &iter);
98         return ret ? ERR_PTR(ret) : s_t;
99 }
100
101 static int snapshot_tree_create(struct btree_trans *trans,
102                                 u32 root_id, u32 subvol_id, u32 *tree_id)
103 {
104         struct bkey_i_snapshot_tree *n_tree =
105                 __snapshot_tree_create(trans);
106
107         if (IS_ERR(n_tree))
108                 return PTR_ERR(n_tree);
109
110         n_tree->v.master_subvol = cpu_to_le32(subvol_id);
111         n_tree->v.root_snapshot = cpu_to_le32(root_id);
112         *tree_id = n_tree->k.p.offset;
113         return 0;
114 }
115
116 /* Snapshot nodes: */
117
118 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
119                            struct bkey_s_c k)
120 {
121         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
122
123         prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
124                BCH_SNAPSHOT_SUBVOL(s.v),
125                BCH_SNAPSHOT_DELETED(s.v),
126                le32_to_cpu(s.v->parent),
127                le32_to_cpu(s.v->children[0]),
128                le32_to_cpu(s.v->children[1]),
129                le32_to_cpu(s.v->subvol),
130                le32_to_cpu(s.v->tree));
131 }
132
133 int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
134                           enum bkey_invalid_flags flags,
135                           struct printbuf *err)
136 {
137         struct bkey_s_c_snapshot s;
138         u32 i, id;
139
140         if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
141             bkey_lt(k.k->p, POS(0, 1))) {
142                 prt_printf(err, "bad pos");
143                 return -BCH_ERR_invalid_bkey;
144         }
145
146         s = bkey_s_c_to_snapshot(k);
147
148         id = le32_to_cpu(s.v->parent);
149         if (id && id <= k.k->p.offset) {
150                 prt_printf(err, "bad parent node (%u <= %llu)",
151                        id, k.k->p.offset);
152                 return -BCH_ERR_invalid_bkey;
153         }
154
155         if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
156                 prt_printf(err, "children not normalized");
157                 return -BCH_ERR_invalid_bkey;
158         }
159
160         if (s.v->children[0] &&
161             s.v->children[0] == s.v->children[1]) {
162                 prt_printf(err, "duplicate child nodes");
163                 return -BCH_ERR_invalid_bkey;
164         }
165
166         for (i = 0; i < 2; i++) {
167                 id = le32_to_cpu(s.v->children[i]);
168
169                 if (id >= k.k->p.offset) {
170                         prt_printf(err, "bad child node (%u >= %llu)",
171                                id, k.k->p.offset);
172                         return -BCH_ERR_invalid_bkey;
173                 }
174         }
175
176         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
177                 if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
178                     le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
179                         prt_printf(err, "skiplist not normalized");
180                         return -BCH_ERR_invalid_bkey;
181                 }
182
183                 for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
184                         id = le32_to_cpu(s.v->skip[i]);
185
186                         if (!id != !s.v->parent ||
187                             (s.v->parent &&
188                              id <= k.k->p.offset)) {
189                                 prt_printf(err, "bad skiplist node %u)", id);
190                                 return -BCH_ERR_invalid_bkey;
191                         }
192                 }
193         }
194
195         return 0;
196 }
197
198 int bch2_mark_snapshot(struct btree_trans *trans,
199                        enum btree_id btree, unsigned level,
200                        struct bkey_s_c old, struct bkey_s_c new,
201                        unsigned flags)
202 {
203         struct bch_fs *c = trans->c;
204         struct snapshot_t *t;
205
206         t = genradix_ptr_alloc(&c->snapshots,
207                                U32_MAX - new.k->p.offset,
208                                GFP_KERNEL);
209         if (!t)
210                 return -BCH_ERR_ENOMEM_mark_snapshot;
211
212         if (new.k->type == KEY_TYPE_snapshot) {
213                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
214
215                 t->parent       = le32_to_cpu(s.v->parent);
216                 t->skip[0]      = le32_to_cpu(s.v->skip[0]);
217                 t->skip[1]      = le32_to_cpu(s.v->skip[1]);
218                 t->skip[2]      = le32_to_cpu(s.v->skip[2]);
219                 t->depth        = le32_to_cpu(s.v->depth);
220                 t->children[0]  = le32_to_cpu(s.v->children[0]);
221                 t->children[1]  = le32_to_cpu(s.v->children[1]);
222                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
223                 t->tree         = le32_to_cpu(s.v->tree);
224
225                 if (BCH_SNAPSHOT_DELETED(s.v))
226                         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
227         } else {
228                 t->parent       = 0;
229                 t->children[0]  = 0;
230                 t->children[1]  = 0;
231                 t->subvol       = 0;
232                 t->tree         = 0;
233         }
234
235         return 0;
236 }
237
238 static int snapshot_lookup(struct btree_trans *trans, u32 id,
239                            struct bch_snapshot *s)
240 {
241         return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
242                                        BTREE_ITER_WITH_UPDATES, snapshot, s);
243 }
244
245 static int snapshot_live(struct btree_trans *trans, u32 id)
246 {
247         struct bch_snapshot v;
248         int ret;
249
250         if (!id)
251                 return 0;
252
253         ret = snapshot_lookup(trans, id, &v);
254         if (bch2_err_matches(ret, ENOENT))
255                 bch_err(trans->c, "snapshot node %u not found", id);
256         if (ret)
257                 return ret;
258
259         return !BCH_SNAPSHOT_DELETED(&v);
260 }
261
262 static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
263 {
264         struct bch_fs *c = trans->c;
265         unsigned i, nr_live = 0, live_idx = 0;
266         struct bkey_s_c_snapshot snap;
267         u32 id = k.k->p.offset, child[2];
268
269         if (k.k->type != KEY_TYPE_snapshot)
270                 return 0;
271
272         snap = bkey_s_c_to_snapshot(k);
273
274         child[0] = le32_to_cpu(snap.v->children[0]);
275         child[1] = le32_to_cpu(snap.v->children[1]);
276
277         for (i = 0; i < 2; i++) {
278                 int ret = snapshot_live(trans, child[i]);
279
280                 if (ret < 0)
281                         return ret;
282
283                 if (ret)
284                         live_idx = i;
285                 nr_live += ret;
286         }
287
288         snapshot_t(c, id)->equiv = nr_live == 1
289                 ? snapshot_t(c, child[live_idx])->equiv
290                 : id;
291         return 0;
292 }
293
294 /* fsck: */
295
296 static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
297 {
298         return snapshot_t(c, id)->children[child];
299 }
300
301 static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
302 {
303         return bch2_snapshot_child(c, id, 0);
304 }
305
306 static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
307 {
308         return bch2_snapshot_child(c, id, 1);
309 }
310
311 static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
312 {
313         u32 n, parent;
314
315         n = bch2_snapshot_left_child(c, id);
316         if (n)
317                 return n;
318
319         while ((parent = bch2_snapshot_parent(c, id))) {
320                 n = bch2_snapshot_right_child(c, parent);
321                 if (n && n != id)
322                         return n;
323                 id = parent;
324         }
325
326         return 0;
327 }
328
329 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
330 {
331         u32 id = snapshot_root;
332         u32 subvol = 0, s;
333
334         while (id) {
335                 s = snapshot_t(c, id)->subvol;
336
337                 if (s && (!subvol || s < subvol))
338                         subvol = s;
339
340                 id = bch2_snapshot_tree_next(c, id);
341         }
342
343         return subvol;
344 }
345
346 static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
347                                             u32 snapshot_root, u32 *subvol_id)
348 {
349         struct bch_fs *c = trans->c;
350         struct btree_iter iter;
351         struct bkey_s_c k;
352         struct bkey_s_c_subvolume s;
353         bool found = false;
354         int ret;
355
356         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
357                                      0, k, ret) {
358                 if (k.k->type != KEY_TYPE_subvolume)
359                         continue;
360
361                 s = bkey_s_c_to_subvolume(k);
362                 if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
363                         continue;
364                 if (!BCH_SUBVOLUME_SNAP(s.v)) {
365                         *subvol_id = s.k->p.offset;
366                         found = true;
367                         break;
368                 }
369         }
370
371         bch2_trans_iter_exit(trans, &iter);
372
373         if (!ret && !found) {
374                 struct bkey_i_subvolume *s;
375
376                 *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
377
378                 s = bch2_bkey_get_mut_typed(trans, &iter,
379                                             BTREE_ID_subvolumes, POS(0, *subvol_id),
380                                             0, subvolume);
381                 ret = PTR_ERR_OR_ZERO(s);
382                 if (ret)
383                         return ret;
384
385                 SET_BCH_SUBVOLUME_SNAP(&s->v, false);
386         }
387
388         return ret;
389 }
390
391 static int check_snapshot_tree(struct btree_trans *trans,
392                                struct btree_iter *iter,
393                                struct bkey_s_c k)
394 {
395         struct bch_fs *c = trans->c;
396         struct bkey_s_c_snapshot_tree st;
397         struct bch_snapshot s;
398         struct bch_subvolume subvol;
399         struct printbuf buf = PRINTBUF;
400         u32 root_id;
401         int ret;
402
403         if (k.k->type != KEY_TYPE_snapshot_tree)
404                 return 0;
405
406         st = bkey_s_c_to_snapshot_tree(k);
407         root_id = le32_to_cpu(st.v->root_snapshot);
408
409         ret = snapshot_lookup(trans, root_id, &s);
410         if (ret && !bch2_err_matches(ret, ENOENT))
411                 goto err;
412
413         if (fsck_err_on(ret ||
414                         root_id != bch2_snapshot_root(c, root_id) ||
415                         st.k->p.offset != le32_to_cpu(s.tree),
416                         c,
417                         "snapshot tree points to missing/incorrect snapshot:\n  %s",
418                         (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
419                 ret = bch2_btree_delete_at(trans, iter, 0);
420                 goto err;
421         }
422
423         ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
424                                  false, 0, &subvol);
425         if (ret && !bch2_err_matches(ret, ENOENT))
426                 goto err;
427
428         if (fsck_err_on(ret, c,
429                         "snapshot tree points to missing subvolume:\n  %s",
430                         (printbuf_reset(&buf),
431                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
432             fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
433                                                 le32_to_cpu(subvol.snapshot),
434                                                 root_id), c,
435                         "snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
436                         (printbuf_reset(&buf),
437                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
438             fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
439                         "snapshot tree points to snapshot subvolume:\n  %s",
440                         (printbuf_reset(&buf),
441                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
442                 struct bkey_i_snapshot_tree *u;
443                 u32 subvol_id;
444
445                 ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
446                 if (ret)
447                         goto err;
448
449                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
450                 ret = PTR_ERR_OR_ZERO(u);
451                 if (ret)
452                         goto err;
453
454                 u->v.master_subvol = cpu_to_le32(subvol_id);
455                 st = snapshot_tree_i_to_s_c(u);
456         }
457 err:
458 fsck_err:
459         printbuf_exit(&buf);
460         return ret;
461 }
462
463 /*
464  * For each snapshot_tree, make sure it points to the root of a snapshot tree
465  * and that snapshot entry points back to it, or delete it.
466  *
467  * And, make sure it points to a subvolume within that snapshot tree, or correct
468  * it to point to the oldest subvolume within that snapshot tree.
469  */
470 int bch2_check_snapshot_trees(struct bch_fs *c)
471 {
472         struct btree_iter iter;
473         struct bkey_s_c k;
474         int ret;
475
476         ret = bch2_trans_run(c,
477                 for_each_btree_key_commit(&trans, iter,
478                         BTREE_ID_snapshot_trees, POS_MIN,
479                         BTREE_ITER_PREFETCH, k,
480                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
481                 check_snapshot_tree(&trans, &iter, k)));
482
483         if (ret)
484                 bch_err(c, "error %i checking snapshot trees", ret);
485         return ret;
486 }
487
488 /*
489  * Look up snapshot tree for @tree_id and find root,
490  * make sure @snap_id is a descendent:
491  */
492 static int snapshot_tree_ptr_good(struct btree_trans *trans,
493                                   u32 snap_id, u32 tree_id)
494 {
495         struct bch_snapshot_tree s_t;
496         int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
497
498         if (bch2_err_matches(ret, ENOENT))
499                 return 0;
500         if (ret)
501                 return ret;
502
503         return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
504 }
505
506 static u32 snapshot_rand_ancestor_get(struct bch_fs *c, u32 id)
507 {
508         struct snapshot_t *s;
509
510         if (!id)
511                 return 0;
512
513         s = snapshot_t(c, id);
514         if (!s->parent)
515                 return id;
516
517         return bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
518 }
519
520 static int snapshot_rand_ancestor_good(struct btree_trans *trans,
521                                        struct bch_snapshot s)
522 {
523         struct bch_snapshot a;
524         unsigned i;
525         int ret;
526
527         for (i = 0; i < 3; i++) {
528                 if (!s.parent != !s.skip[i])
529                         return false;
530
531                 if (!s.parent)
532                         continue;
533
534                 ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a);
535                 if (bch2_err_matches(ret, ENOENT))
536                         return false;
537                 if (ret)
538                         return ret;
539
540                 if (a.tree != s.tree)
541                         return false;
542         }
543
544         return true;
545 }
546
547 /*
548  * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
549  * its snapshot_tree pointer is correct (allocate new one if necessary), then
550  * update this node's pointer to root node's pointer:
551  */
552 static int snapshot_tree_ptr_repair(struct btree_trans *trans,
553                                     struct btree_iter *iter,
554                                     struct bkey_s_c k,
555                                     struct bch_snapshot *s)
556 {
557         struct bch_fs *c = trans->c;
558         struct btree_iter root_iter;
559         struct bch_snapshot_tree s_t;
560         struct bkey_s_c_snapshot root;
561         struct bkey_i_snapshot *u;
562         u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
563         int ret;
564
565         root = bch2_bkey_get_iter_typed(trans, &root_iter,
566                                BTREE_ID_snapshots, POS(0, root_id),
567                                BTREE_ITER_WITH_UPDATES, snapshot);
568         ret = bkey_err(root);
569         if (ret)
570                 goto err;
571
572         tree_id = le32_to_cpu(root.v->tree);
573
574         ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
575         if (ret && !bch2_err_matches(ret, ENOENT))
576                 return ret;
577
578         if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
579                 u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
580                 ret =   PTR_ERR_OR_ZERO(u) ?:
581                         snapshot_tree_create(trans, root_id,
582                                 bch2_snapshot_tree_oldest_subvol(c, root_id),
583                                 &tree_id);
584                 if (ret)
585                         goto err;
586
587                 u->v.tree = cpu_to_le32(tree_id);
588                 if (k.k->p.offset == root_id)
589                         *s = u->v;
590         }
591
592         if (k.k->p.offset != root_id) {
593                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
594                 ret = PTR_ERR_OR_ZERO(u);
595                 if (ret)
596                         goto err;
597
598                 u->v.tree = cpu_to_le32(tree_id);
599                 *s = u->v;
600         }
601 err:
602         bch2_trans_iter_exit(trans, &root_iter);
603         return ret;
604 }
605
606 static int check_snapshot(struct btree_trans *trans,
607                           struct btree_iter *iter,
608                           struct bkey_s_c k)
609 {
610         struct bch_fs *c = trans->c;
611         struct bch_snapshot s;
612         struct bch_subvolume subvol;
613         struct bch_snapshot v;
614         struct bkey_i_snapshot *u;
615         u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
616         struct snapshot_t *parent = parent_id
617                 ? snapshot_t(c, parent_id)
618                 : NULL;
619         struct printbuf buf = PRINTBUF;
620         bool should_have_subvol;
621         u32 i, id;
622         int ret = 0;
623
624         if (k.k->type != KEY_TYPE_snapshot)
625                 return 0;
626
627         memset(&s, 0, sizeof(s));
628         memcpy(&s, k.v, bkey_val_bytes(k.k));
629
630         id = le32_to_cpu(s.parent);
631         if (id) {
632                 ret = snapshot_lookup(trans, id, &v);
633                 if (bch2_err_matches(ret, ENOENT))
634                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
635                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
636                 if (ret)
637                         goto err;
638
639                 if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
640                     le32_to_cpu(v.children[1]) != k.k->p.offset) {
641                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
642                                 id, k.k->p.offset);
643                         ret = -EINVAL;
644                         goto err;
645                 }
646         }
647
648         for (i = 0; i < 2 && s.children[i]; i++) {
649                 id = le32_to_cpu(s.children[i]);
650
651                 ret = snapshot_lookup(trans, id, &v);
652                 if (bch2_err_matches(ret, ENOENT))
653                         bch_err(c, "snapshot node %llu has nonexistent child %u",
654                                 k.k->p.offset, id);
655                 if (ret)
656                         goto err;
657
658                 if (le32_to_cpu(v.parent) != k.k->p.offset) {
659                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
660                                 id, le32_to_cpu(v.parent), k.k->p.offset);
661                         ret = -EINVAL;
662                         goto err;
663                 }
664         }
665
666         should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
667                 !BCH_SNAPSHOT_DELETED(&s);
668
669         if (should_have_subvol) {
670                 id = le32_to_cpu(s.subvol);
671                 ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
672                 if (bch2_err_matches(ret, ENOENT))
673                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
674                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
675                 if (ret)
676                         goto err;
677
678                 if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
679                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
680                                 k.k->p.offset);
681                         ret = -EINVAL;
682                         goto err;
683                 }
684         } else {
685                 if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n  %s",
686                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
687                         u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
688                         ret = PTR_ERR_OR_ZERO(u);
689                         if (ret)
690                                 goto err;
691
692                         u->v.subvol = 0;
693                         s = u->v;
694                 }
695         }
696
697         ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
698         if (ret < 0)
699                 goto err;
700
701         if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n  %s",
702                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
703                 ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
704                 if (ret)
705                         goto err;
706         }
707         ret = 0;
708
709         if (fsck_err_on(le32_to_cpu(s.depth) != (parent ? parent->depth + 1 : 0), c,
710                         "snapshot with incorrect depth fields, should be %u:\n  %s",
711                         parent->depth + 1,
712                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
713                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
714                 ret = PTR_ERR_OR_ZERO(u);
715                 if (ret)
716                         goto err;
717
718                 u->v.depth = cpu_to_le32(parent ? parent->depth + 1 : 0);
719                 s = u->v;
720         }
721
722         ret = snapshot_rand_ancestor_good(trans, s);
723         if (ret < 0)
724                 goto err;
725
726         if (fsck_err_on(!ret, c, "snapshot with bad rand_ancestor field:\n  %s",
727                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
728                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
729                 ret = PTR_ERR_OR_ZERO(u);
730                 if (ret)
731                         goto err;
732
733                 for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
734                         u->v.skip[i] = cpu_to_le32(snapshot_rand_ancestor_get(c, parent_id));
735
736                 bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_int);
737                 s = u->v;
738         }
739         ret = 0;
740 err:
741 fsck_err:
742         printbuf_exit(&buf);
743         return ret;
744 }
745
746 int bch2_check_snapshots(struct bch_fs *c)
747 {
748         struct btree_iter iter;
749         struct bkey_s_c k;
750         int ret;
751
752         /*
753          * We iterate backwards as checking/fixing the depth field requires that
754          * the parent's depth already be correct:
755          */
756         ret = bch2_trans_run(c,
757                 for_each_btree_key_reverse_commit(&trans, iter,
758                         BTREE_ID_snapshots, POS_MAX,
759                         BTREE_ITER_PREFETCH, k,
760                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
761                 check_snapshot(&trans, &iter, k)));
762         if (ret)
763                 bch_err_fn(c, ret);
764         return ret;
765 }
766
767 static int check_subvol(struct btree_trans *trans,
768                         struct btree_iter *iter,
769                         struct bkey_s_c k)
770 {
771         struct bch_fs *c = trans->c;
772         struct bkey_s_c_subvolume subvol;
773         struct bch_snapshot snapshot;
774         unsigned snapid;
775         int ret = 0;
776
777         if (k.k->type != KEY_TYPE_subvolume)
778                 return 0;
779
780         subvol = bkey_s_c_to_subvolume(k);
781         snapid = le32_to_cpu(subvol.v->snapshot);
782         ret = snapshot_lookup(trans, snapid, &snapshot);
783
784         if (bch2_err_matches(ret, ENOENT))
785                 bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
786                         k.k->p.offset, snapid);
787         if (ret)
788                 return ret;
789
790         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
791                 bch2_fs_lazy_rw(c);
792
793                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
794                 if (ret)
795                         bch_err(c, "error deleting subvolume %llu: %s",
796                                 iter->pos.offset, bch2_err_str(ret));
797                 return ret ?: -BCH_ERR_transaction_restart_nested;
798         }
799
800         if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
801                 u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
802                 u32 snapshot_tree = snapshot_t(c, snapshot_root)->tree;
803                 struct bch_snapshot_tree st;
804
805                 ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
806
807                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
808                                 "%s: snapshot tree %u not found", __func__, snapshot_tree);
809
810                 if (ret)
811                         return ret;
812
813                 if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c,
814                                 "subvolume %llu is not set as snapshot but is not master subvolume",
815                                 k.k->p.offset)) {
816                         struct bkey_i_subvolume *s =
817                                 bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
818                         ret = PTR_ERR_OR_ZERO(s);
819                         if (ret)
820                                 return ret;
821
822                         SET_BCH_SUBVOLUME_SNAP(&s->v, true);
823                 }
824         }
825
826 fsck_err:
827         return ret;
828 }
829
830 int bch2_check_subvols(struct bch_fs *c)
831 {
832         struct btree_iter iter;
833         struct bkey_s_c k;
834         int ret;
835
836         ret = bch2_trans_run(c,
837                 for_each_btree_key_commit(&trans, iter,
838                         BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
839                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
840                 check_subvol(&trans, &iter, k)));
841         if (ret)
842                 bch_err_fn(c, ret);
843         return ret;
844 }
845
846 void bch2_fs_snapshots_exit(struct bch_fs *c)
847 {
848         genradix_free(&c->snapshots);
849 }
850
851 int bch2_snapshots_read(struct bch_fs *c)
852 {
853         struct btree_iter iter;
854         struct bkey_s_c k;
855         int ret = 0;
856
857         ret = bch2_trans_run(c,
858                 for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
859                            POS_MIN, 0, k,
860                         bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
861                         bch2_snapshot_set_equiv(&trans, k)));
862         if (ret)
863                 bch_err_fn(c, ret);
864         return ret;
865 }
866
867 /*
868  * Mark a snapshot as deleted, for future cleanup:
869  */
870 static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
871 {
872         struct btree_iter iter;
873         struct bkey_i_snapshot *s;
874         int ret = 0;
875
876         s = bch2_bkey_get_mut_typed(trans, &iter,
877                                     BTREE_ID_snapshots, POS(0, id),
878                                     0, snapshot);
879         ret = PTR_ERR_OR_ZERO(s);
880         if (unlikely(ret)) {
881                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
882                                         trans->c, "missing snapshot %u", id);
883                 return ret;
884         }
885
886         /* already deleted? */
887         if (BCH_SNAPSHOT_DELETED(&s->v))
888                 goto err;
889
890         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
891         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
892         s->v.subvol = 0;
893 err:
894         bch2_trans_iter_exit(trans, &iter);
895         return ret;
896 }
897
898 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
899 {
900         struct bch_fs *c = trans->c;
901         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
902         struct btree_iter tree_iter = (struct btree_iter) { NULL };
903         struct bkey_s_c_snapshot s;
904         u32 parent_id;
905         unsigned i;
906         int ret = 0;
907
908         s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
909                                      BTREE_ITER_INTENT, snapshot);
910         ret = bkey_err(s);
911         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
912                                 "missing snapshot %u", id);
913
914         if (ret)
915                 goto err;
916
917         BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
918         parent_id = le32_to_cpu(s.v->parent);
919
920         if (parent_id) {
921                 struct bkey_i_snapshot *parent;
922
923                 parent = bch2_bkey_get_mut_typed(trans, &p_iter,
924                                      BTREE_ID_snapshots, POS(0, parent_id),
925                                      0, snapshot);
926                 ret = PTR_ERR_OR_ZERO(parent);
927                 if (unlikely(ret)) {
928                         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
929                                                 "missing snapshot %u", parent_id);
930                         goto err;
931                 }
932
933                 for (i = 0; i < 2; i++)
934                         if (le32_to_cpu(parent->v.children[i]) == id)
935                                 break;
936
937                 if (i == 2)
938                         bch_err(c, "snapshot %u missing child pointer to %u",
939                                 parent_id, id);
940                 else
941                         parent->v.children[i] = 0;
942
943                 if (le32_to_cpu(parent->v.children[0]) <
944                     le32_to_cpu(parent->v.children[1]))
945                         swap(parent->v.children[0],
946                              parent->v.children[1]);
947         } else {
948                 /*
949                  * We're deleting the root of a snapshot tree: update the
950                  * snapshot_tree entry to point to the new root, or delete it if
951                  * this is the last snapshot ID in this tree:
952                  */
953                 struct bkey_i_snapshot_tree *s_t;
954
955                 BUG_ON(s.v->children[1]);
956
957                 s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
958                                 BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
959                                 0, snapshot_tree);
960                 ret = PTR_ERR_OR_ZERO(s_t);
961                 if (ret)
962                         goto err;
963
964                 if (s.v->children[0]) {
965                         s_t->v.root_snapshot = s.v->children[0];
966                 } else {
967                         s_t->k.type = KEY_TYPE_deleted;
968                         set_bkey_val_u64s(&s_t->k, 0);
969                 }
970         }
971
972         ret = bch2_btree_delete_at(trans, &iter, 0);
973 err:
974         bch2_trans_iter_exit(trans, &tree_iter);
975         bch2_trans_iter_exit(trans, &p_iter);
976         bch2_trans_iter_exit(trans, &iter);
977         return ret;
978 }
979
980 static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
981                           u32 *new_snapids,
982                           u32 *snapshot_subvols,
983                           unsigned nr_snapids)
984 {
985         struct bch_fs *c = trans->c;
986         struct btree_iter iter;
987         struct bkey_i_snapshot *n;
988         struct bkey_s_c k;
989         unsigned i, j;
990         u32 depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
991         int ret;
992
993         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
994                              POS_MIN, BTREE_ITER_INTENT);
995         k = bch2_btree_iter_peek(&iter);
996         ret = bkey_err(k);
997         if (ret)
998                 goto err;
999
1000         for (i = 0; i < nr_snapids; i++) {
1001                 k = bch2_btree_iter_prev_slot(&iter);
1002                 ret = bkey_err(k);
1003                 if (ret)
1004                         goto err;
1005
1006                 if (!k.k || !k.k->p.offset) {
1007                         ret = -BCH_ERR_ENOSPC_snapshot_create;
1008                         goto err;
1009                 }
1010
1011                 n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
1012                 ret = PTR_ERR_OR_ZERO(n);
1013                 if (ret)
1014                         goto err;
1015
1016                 n->v.flags      = 0;
1017                 n->v.parent     = cpu_to_le32(parent);
1018                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
1019                 n->v.tree       = cpu_to_le32(tree);
1020                 n->v.depth      = cpu_to_le32(depth);
1021
1022                 for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
1023                         n->v.skip[j] = cpu_to_le32(snapshot_rand_ancestor_get(c, parent));
1024
1025                 bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_int);
1026                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
1027
1028                 ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
1029                                          bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
1030                 if (ret)
1031                         goto err;
1032
1033                 new_snapids[i]  = iter.pos.offset;
1034         }
1035 err:
1036         bch2_trans_iter_exit(trans, &iter);
1037         return ret;
1038 }
1039
1040 /*
1041  * Create new snapshot IDs as children of an existing snapshot ID:
1042  */
1043 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
1044                               u32 *new_snapids,
1045                               u32 *snapshot_subvols,
1046                               unsigned nr_snapids)
1047 {
1048         struct btree_iter iter;
1049         struct bkey_i_snapshot *n_parent;
1050         int ret = 0;
1051
1052         n_parent = bch2_bkey_get_mut_typed(trans, &iter,
1053                         BTREE_ID_snapshots, POS(0, parent),
1054                         0, snapshot);
1055         ret = PTR_ERR_OR_ZERO(n_parent);
1056         if (unlikely(ret)) {
1057                 if (bch2_err_matches(ret, ENOENT))
1058                         bch_err(trans->c, "snapshot %u not found", parent);
1059                 return ret;
1060         }
1061
1062         if (n_parent->v.children[0] || n_parent->v.children[1]) {
1063                 bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
1064                 ret = -EINVAL;
1065                 goto err;
1066         }
1067
1068         ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
1069                              new_snapids, snapshot_subvols, nr_snapids);
1070         if (ret)
1071                 goto err;
1072
1073         n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
1074         n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
1075         n_parent->v.subvol = 0;
1076         SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
1077 err:
1078         bch2_trans_iter_exit(trans, &iter);
1079         return ret;
1080 }
1081
1082 /*
1083  * Create a snapshot node that is the root of a new tree:
1084  */
1085 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
1086                               u32 *new_snapids,
1087                               u32 *snapshot_subvols,
1088                               unsigned nr_snapids)
1089 {
1090         struct bkey_i_snapshot_tree *n_tree;
1091         int ret;
1092
1093         n_tree = __snapshot_tree_create(trans);
1094         ret =   PTR_ERR_OR_ZERO(n_tree) ?:
1095                 create_snapids(trans, 0, n_tree->k.p.offset,
1096                              new_snapids, snapshot_subvols, nr_snapids);
1097         if (ret)
1098                 return ret;
1099
1100         n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
1101         n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
1102         return 0;
1103 }
1104
1105 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1106                               u32 *new_snapids,
1107                               u32 *snapshot_subvols,
1108                               unsigned nr_snapids)
1109 {
1110         BUG_ON((parent == 0) != (nr_snapids == 1));
1111         BUG_ON((parent != 0) != (nr_snapids == 2));
1112
1113         return parent
1114                 ? bch2_snapshot_node_create_children(trans, parent,
1115                                 new_snapids, snapshot_subvols, nr_snapids)
1116                 : bch2_snapshot_node_create_tree(trans,
1117                                 new_snapids, snapshot_subvols, nr_snapids);
1118
1119 }
1120
1121 static int snapshot_delete_key(struct btree_trans *trans,
1122                                struct btree_iter *iter,
1123                                struct bkey_s_c k,
1124                                snapshot_id_list *deleted,
1125                                snapshot_id_list *equiv_seen,
1126                                struct bpos *last_pos)
1127 {
1128         struct bch_fs *c = trans->c;
1129         u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
1130
1131         if (!bkey_eq(k.k->p, *last_pos))
1132                 equiv_seen->nr = 0;
1133         *last_pos = k.k->p;
1134
1135         if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
1136             snapshot_list_has_id(equiv_seen, equiv)) {
1137                 return bch2_btree_delete_at(trans, iter,
1138                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1139         } else {
1140                 return snapshot_list_add(c, equiv_seen, equiv);
1141         }
1142 }
1143
1144 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
1145                                           struct bkey_s_c k)
1146 {
1147         struct bkey_s_c_snapshot snap;
1148         u32 children[2];
1149         int ret;
1150
1151         if (k.k->type != KEY_TYPE_snapshot)
1152                 return 0;
1153
1154         snap = bkey_s_c_to_snapshot(k);
1155         if (BCH_SNAPSHOT_DELETED(snap.v) ||
1156             BCH_SNAPSHOT_SUBVOL(snap.v))
1157                 return 0;
1158
1159         children[0] = le32_to_cpu(snap.v->children[0]);
1160         children[1] = le32_to_cpu(snap.v->children[1]);
1161
1162         ret   = snapshot_live(trans, children[0]) ?:
1163                 snapshot_live(trans, children[1]);
1164         if (ret < 0)
1165                 return ret;
1166
1167         if (!ret)
1168                 return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
1169         return 0;
1170 }
1171
1172 int bch2_delete_dead_snapshots(struct bch_fs *c)
1173 {
1174         struct btree_trans trans;
1175         struct btree_iter iter;
1176         struct bkey_s_c k;
1177         struct bkey_s_c_snapshot snap;
1178         snapshot_id_list deleted = { 0 };
1179         u32 i, id;
1180         int ret = 0;
1181
1182         if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
1183                 return 0;
1184
1185         if (!test_bit(BCH_FS_STARTED, &c->flags)) {
1186                 ret = bch2_fs_read_write_early(c);
1187                 if (ret) {
1188                         bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
1189                         return ret;
1190                 }
1191         }
1192
1193         bch2_trans_init(&trans, c, 0, 0);
1194
1195         /*
1196          * For every snapshot node: If we have no live children and it's not
1197          * pointed to by a subvolume, delete it:
1198          */
1199         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
1200                         POS_MIN, 0, k,
1201                         NULL, NULL, 0,
1202                 bch2_delete_redundant_snapshot(&trans, &iter, k));
1203         if (ret) {
1204                 bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
1205                 goto err;
1206         }
1207
1208         for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
1209                            POS_MIN, 0, k,
1210                 bch2_snapshot_set_equiv(&trans, k));
1211         if (ret) {
1212                 bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
1213                 goto err;
1214         }
1215
1216         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
1217                            POS_MIN, 0, k, ret) {
1218                 if (k.k->type != KEY_TYPE_snapshot)
1219                         continue;
1220
1221                 snap = bkey_s_c_to_snapshot(k);
1222                 if (BCH_SNAPSHOT_DELETED(snap.v)) {
1223                         ret = snapshot_list_add(c, &deleted, k.k->p.offset);
1224                         if (ret)
1225                                 break;
1226                 }
1227         }
1228         bch2_trans_iter_exit(&trans, &iter);
1229
1230         if (ret) {
1231                 bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
1232                 goto err;
1233         }
1234
1235         for (id = 0; id < BTREE_ID_NR; id++) {
1236                 struct bpos last_pos = POS_MIN;
1237                 snapshot_id_list equiv_seen = { 0 };
1238
1239                 if (!btree_type_has_snapshots(id))
1240                         continue;
1241
1242                 ret = for_each_btree_key_commit(&trans, iter,
1243                                 id, POS_MIN,
1244                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1245                                 NULL, NULL, BTREE_INSERT_NOFAIL,
1246                         snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
1247
1248                 darray_exit(&equiv_seen);
1249
1250                 if (ret) {
1251                         bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
1252                         goto err;
1253                 }
1254         }
1255
1256         for (i = 0; i < deleted.nr; i++) {
1257                 ret = commit_do(&trans, NULL, NULL, 0,
1258                         bch2_snapshot_node_delete(&trans, deleted.data[i]));
1259                 if (ret) {
1260                         bch_err(c, "error deleting snapshot %u: %s",
1261                                 deleted.data[i], bch2_err_str(ret));
1262                         goto err;
1263                 }
1264         }
1265
1266         clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
1267 err:
1268         darray_exit(&deleted);
1269         bch2_trans_exit(&trans);
1270         if (ret)
1271                 bch_err_fn(c, ret);
1272         return ret;
1273 }
1274
1275 static void bch2_delete_dead_snapshots_work(struct work_struct *work)
1276 {
1277         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1278
1279         bch2_delete_dead_snapshots(c);
1280         bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1281 }
1282
1283 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1284 {
1285         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
1286             !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
1287                 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1288 }
1289
1290 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
1291                                            struct btree_trans_commit_hook *h)
1292 {
1293         struct bch_fs *c = trans->c;
1294
1295         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
1296
1297         if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
1298                 return 0;
1299
1300         bch2_delete_dead_snapshots_async(c);
1301         return 0;
1302 }
1303
1304 /* Subvolumes: */
1305
1306 int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
1307                            unsigned flags, struct printbuf *err)
1308 {
1309         if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
1310             bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
1311                 prt_printf(err, "invalid pos");
1312                 return -BCH_ERR_invalid_bkey;
1313         }
1314
1315         return 0;
1316 }
1317
1318 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
1319                             struct bkey_s_c k)
1320 {
1321         struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1322
1323         prt_printf(out, "root %llu snapshot id %u",
1324                    le64_to_cpu(s.v->inode),
1325                    le32_to_cpu(s.v->snapshot));
1326
1327         if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
1328                 prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
1329 }
1330
1331 static __always_inline int
1332 bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
1333                            bool inconsistent_if_not_found,
1334                            int iter_flags,
1335                            struct bch_subvolume *s)
1336 {
1337         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
1338                                           iter_flags, subvolume, s);
1339         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
1340                                 inconsistent_if_not_found,
1341                                 trans->c, "missing subvolume %u", subvol);
1342         return ret;
1343 }
1344
1345 int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
1346                        bool inconsistent_if_not_found,
1347                        int iter_flags,
1348                        struct bch_subvolume *s)
1349 {
1350         return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
1351 }
1352
1353 int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
1354                              struct bch_subvolume *subvol)
1355 {
1356         struct bch_snapshot snap;
1357
1358         return  snapshot_lookup(trans, snapshot, &snap) ?:
1359                 bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
1360 }
1361
1362 int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
1363                                 u32 *snapid)
1364 {
1365         struct btree_iter iter;
1366         struct bkey_s_c k;
1367         int ret;
1368
1369         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
1370                                BTREE_ITER_CACHED|
1371                                BTREE_ITER_WITH_UPDATES);
1372         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -BCH_ERR_ENOENT_subvolume;
1373
1374         if (likely(!ret))
1375                 *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
1376         else if (bch2_err_matches(ret, ENOENT))
1377                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
1378         bch2_trans_iter_exit(trans, &iter);
1379         return ret;
1380 }
1381
1382 static int bch2_subvolume_reparent(struct btree_trans *trans,
1383                                    struct btree_iter *iter,
1384                                    struct bkey_s_c k,
1385                                    u32 old_parent, u32 new_parent)
1386 {
1387         struct bkey_i_subvolume *s;
1388         int ret;
1389
1390         if (k.k->type != KEY_TYPE_subvolume)
1391                 return 0;
1392
1393         if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
1394             le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
1395                 return 0;
1396
1397         s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
1398         ret = PTR_ERR_OR_ZERO(s);
1399         if (ret)
1400                 return ret;
1401
1402         s->v.parent = cpu_to_le32(new_parent);
1403         return 0;
1404 }
1405
1406 /*
1407  * Scan for subvolumes with parent @subvolid_to_delete, reparent:
1408  */
1409 static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
1410 {
1411         struct btree_iter iter;
1412         struct bkey_s_c k;
1413         struct bch_subvolume s;
1414
1415         return lockrestart_do(trans,
1416                         bch2_subvolume_get(trans, subvolid_to_delete, true,
1417                                    BTREE_ITER_CACHED, &s)) ?:
1418                 for_each_btree_key_commit(trans, iter,
1419                                 BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
1420                                 NULL, NULL, BTREE_INSERT_NOFAIL,
1421                         bch2_subvolume_reparent(trans, &iter, k,
1422                                         subvolid_to_delete, le32_to_cpu(s.parent)));
1423 }
1424
1425 /*
1426  * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
1427  * deletion/cleanup:
1428  */
1429 static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
1430 {
1431         struct btree_iter iter;
1432         struct bkey_s_c_subvolume subvol;
1433         struct btree_trans_commit_hook *h;
1434         u32 snapid;
1435         int ret = 0;
1436
1437         subvol = bch2_bkey_get_iter_typed(trans, &iter,
1438                                 BTREE_ID_subvolumes, POS(0, subvolid),
1439                                 BTREE_ITER_CACHED|BTREE_ITER_INTENT,
1440                                 subvolume);
1441         ret = bkey_err(subvol);
1442         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
1443                                 "missing subvolume %u", subvolid);
1444         if (ret)
1445                 return ret;
1446
1447         snapid = le32_to_cpu(subvol.v->snapshot);
1448
1449         ret = bch2_btree_delete_at(trans, &iter, 0);
1450         if (ret)
1451                 goto err;
1452
1453         ret = bch2_snapshot_node_set_deleted(trans, snapid);
1454         if (ret)
1455                 goto err;
1456
1457         h = bch2_trans_kmalloc(trans, sizeof(*h));
1458         ret = PTR_ERR_OR_ZERO(h);
1459         if (ret)
1460                 goto err;
1461
1462         h->fn = bch2_delete_dead_snapshots_hook;
1463         bch2_trans_commit_hook(trans, h);
1464 err:
1465         bch2_trans_iter_exit(trans, &iter);
1466         return ret;
1467 }
1468
1469 static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
1470 {
1471         return bch2_subvolumes_reparent(trans, subvolid) ?:
1472                 commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
1473                           __bch2_subvolume_delete(trans, subvolid));
1474 }
1475
1476 static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
1477 {
1478         struct bch_fs *c = container_of(work, struct bch_fs,
1479                                 snapshot_wait_for_pagecache_and_delete_work);
1480         snapshot_id_list s;
1481         u32 *id;
1482         int ret = 0;
1483
1484         while (!ret) {
1485                 mutex_lock(&c->snapshots_unlinked_lock);
1486                 s = c->snapshots_unlinked;
1487                 darray_init(&c->snapshots_unlinked);
1488                 mutex_unlock(&c->snapshots_unlinked_lock);
1489
1490                 if (!s.nr)
1491                         break;
1492
1493                 bch2_evict_subvolume_inodes(c, &s);
1494
1495                 for (id = s.data; id < s.data + s.nr; id++) {
1496                         ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id));
1497                         if (ret) {
1498                                 bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
1499                                 break;
1500                         }
1501                 }
1502
1503                 darray_exit(&s);
1504         }
1505
1506         bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
1507 }
1508
1509 struct subvolume_unlink_hook {
1510         struct btree_trans_commit_hook  h;
1511         u32                             subvol;
1512 };
1513
1514 static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
1515                                                       struct btree_trans_commit_hook *_h)
1516 {
1517         struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
1518         struct bch_fs *c = trans->c;
1519         int ret = 0;
1520
1521         mutex_lock(&c->snapshots_unlinked_lock);
1522         if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
1523                 ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
1524         mutex_unlock(&c->snapshots_unlinked_lock);
1525
1526         if (ret)
1527                 return ret;
1528
1529         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
1530                 return -EROFS;
1531
1532         if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
1533                 bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
1534         return 0;
1535 }
1536
1537 int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
1538 {
1539         struct btree_iter iter;
1540         struct bkey_i_subvolume *n;
1541         struct subvolume_unlink_hook *h;
1542         int ret = 0;
1543
1544         h = bch2_trans_kmalloc(trans, sizeof(*h));
1545         ret = PTR_ERR_OR_ZERO(h);
1546         if (ret)
1547                 return ret;
1548
1549         h->h.fn         = bch2_subvolume_wait_for_pagecache_and_delete_hook;
1550         h->subvol       = subvolid;
1551         bch2_trans_commit_hook(trans, &h->h);
1552
1553         n = bch2_bkey_get_mut_typed(trans, &iter,
1554                         BTREE_ID_subvolumes, POS(0, subvolid),
1555                         BTREE_ITER_CACHED, subvolume);
1556         ret = PTR_ERR_OR_ZERO(n);
1557         if (unlikely(ret)) {
1558                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
1559                                         "missing subvolume %u", subvolid);
1560                 return ret;
1561         }
1562
1563         SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
1564         bch2_trans_iter_exit(trans, &iter);
1565         return ret;
1566 }
1567
1568 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
1569                           u32 src_subvolid,
1570                           u32 *new_subvolid,
1571                           u32 *new_snapshotid,
1572                           bool ro)
1573 {
1574         struct bch_fs *c = trans->c;
1575         struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
1576         struct bkey_i_subvolume *new_subvol = NULL;
1577         struct bkey_i_subvolume *src_subvol = NULL;
1578         u32 parent = 0, new_nodes[2], snapshot_subvols[2];
1579         int ret = 0;
1580
1581         ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
1582                                 BTREE_ID_subvolumes, POS(0, U32_MAX));
1583         if (ret == -BCH_ERR_ENOSPC_btree_slot)
1584                 ret = -BCH_ERR_ENOSPC_subvolume_create;
1585         if (ret)
1586                 return ret;
1587
1588         snapshot_subvols[0] = dst_iter.pos.offset;
1589         snapshot_subvols[1] = src_subvolid;
1590
1591         if (src_subvolid) {
1592                 /* Creating a snapshot: */
1593
1594                 src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter,
1595                                 BTREE_ID_subvolumes, POS(0, src_subvolid),
1596                                 BTREE_ITER_CACHED, subvolume);
1597                 ret = PTR_ERR_OR_ZERO(src_subvol);
1598                 if (unlikely(ret)) {
1599                         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1600                                                 "subvolume %u not found", src_subvolid);
1601                         goto err;
1602                 }
1603
1604                 parent = le32_to_cpu(src_subvol->v.snapshot);
1605         }
1606
1607         ret = bch2_snapshot_node_create(trans, parent, new_nodes,
1608                                         snapshot_subvols,
1609                                         src_subvolid ? 2 : 1);
1610         if (ret)
1611                 goto err;
1612
1613         if (src_subvolid) {
1614                 src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
1615                 ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
1616                 if (ret)
1617                         goto err;
1618         }
1619
1620         new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume);
1621         ret = PTR_ERR_OR_ZERO(new_subvol);
1622         if (ret)
1623                 goto err;
1624
1625         new_subvol->v.flags     = 0;
1626         new_subvol->v.snapshot  = cpu_to_le32(new_nodes[0]);
1627         new_subvol->v.inode     = cpu_to_le64(inode);
1628         new_subvol->v.parent    = cpu_to_le32(src_subvolid);
1629         new_subvol->v.otime.lo  = cpu_to_le64(bch2_current_time(c));
1630         new_subvol->v.otime.hi  = 0;
1631
1632         SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
1633         SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
1634
1635         *new_subvolid   = new_subvol->k.p.offset;
1636         *new_snapshotid = new_nodes[0];
1637 err:
1638         bch2_trans_iter_exit(trans, &src_iter);
1639         bch2_trans_iter_exit(trans, &dst_iter);
1640         return ret;
1641 }
1642
1643 int bch2_fs_subvolumes_init(struct bch_fs *c)
1644 {
1645         INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
1646         INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
1647                   bch2_subvolume_wait_for_pagecache_and_delete);
1648         mutex_init(&c->snapshots_unlinked_lock);
1649         return 0;
1650 }