]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/snapshot.c
Update bcachefs sources to bed61fae3b bcachefs: Delete a faulty assertion
[bcachefs-tools-debian] / libbcachefs / snapshot.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_update.h"
6 #include "errcode.h"
7 #include "error.h"
8 #include "fs.h"
9 #include "snapshot.h"
10
11 #include <linux/random.h>
12
13 /*
14  * Snapshot trees:
15  *
16  * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
17  * exist to provide a stable identifier for the whole lifetime of a snapshot
18  * tree.
19  */
20
21 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
22                                 struct bkey_s_c k)
23 {
24         struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
25
26         prt_printf(out, "subvol %u root snapshot %u",
27                    le32_to_cpu(t.v->master_subvol),
28                    le32_to_cpu(t.v->root_snapshot));
29 }
30
31 int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
32                                enum bkey_invalid_flags flags,
33                                struct printbuf *err)
34 {
35         if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
36             bkey_lt(k.k->p, POS(0, 1))) {
37                 prt_printf(err, "bad pos");
38                 return -BCH_ERR_invalid_bkey;
39         }
40
41         return 0;
42 }
43
44 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
45                               struct bch_snapshot_tree *s)
46 {
47         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
48                                           BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
49
50         if (bch2_err_matches(ret, ENOENT))
51                 ret = -BCH_ERR_ENOENT_snapshot_tree;
52         return ret;
53 }
54
55 struct bkey_i_snapshot_tree *
56 __bch2_snapshot_tree_create(struct btree_trans *trans)
57 {
58         struct btree_iter iter;
59         int ret = bch2_bkey_get_empty_slot(trans, &iter,
60                         BTREE_ID_snapshot_trees, POS(0, U32_MAX));
61         struct bkey_i_snapshot_tree *s_t;
62
63         if (ret == -BCH_ERR_ENOSPC_btree_slot)
64                 ret = -BCH_ERR_ENOSPC_snapshot_tree;
65         if (ret)
66                 return ERR_PTR(ret);
67
68         s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
69         ret = PTR_ERR_OR_ZERO(s_t);
70         bch2_trans_iter_exit(trans, &iter);
71         return ret ? ERR_PTR(ret) : s_t;
72 }
73
74 static int bch2_snapshot_tree_create(struct btree_trans *trans,
75                                 u32 root_id, u32 subvol_id, u32 *tree_id)
76 {
77         struct bkey_i_snapshot_tree *n_tree =
78                 __bch2_snapshot_tree_create(trans);
79
80         if (IS_ERR(n_tree))
81                 return PTR_ERR(n_tree);
82
83         n_tree->v.master_subvol = cpu_to_le32(subvol_id);
84         n_tree->v.root_snapshot = cpu_to_le32(root_id);
85         *tree_id = n_tree->k.p.offset;
86         return 0;
87 }
88
89 /* Snapshot nodes: */
90
91 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
92 {
93         const struct snapshot_t *s = __snapshot_t(t, id);
94
95         if (s->skip[2] <= ancestor)
96                 return s->skip[2];
97         if (s->skip[1] <= ancestor)
98                 return s->skip[1];
99         if (s->skip[0] <= ancestor)
100                 return s->skip[0];
101         return s->parent;
102 }
103
104 bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
105 {
106         struct snapshot_table *t;
107         bool ret;
108
109         EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
110
111         rcu_read_lock();
112         t = rcu_dereference(c->snapshots);
113
114         while (id && id < ancestor - IS_ANCESTOR_BITMAP)
115                 id = get_ancestor_below(t, id, ancestor);
116
117         ret = id && id < ancestor
118                 ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor)
119                 : id == ancestor;
120         rcu_read_unlock();
121
122         return ret;
123 }
124
125 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
126 {
127         struct snapshot_table *t;
128
129         rcu_read_lock();
130         t = rcu_dereference(c->snapshots);
131
132         while (id && id < ancestor)
133                 id = __snapshot_t(t, id)->parent;
134         rcu_read_unlock();
135
136         return id == ancestor;
137 }
138
139 static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
140 {
141         size_t idx = U32_MAX - id;
142         size_t new_size;
143         struct snapshot_table *new, *old;
144
145         new_size = max(16UL, roundup_pow_of_two(idx + 1));
146
147         new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
148         if (!new)
149                 return NULL;
150
151         old = rcu_dereference_protected(c->snapshots, true);
152         if (old)
153                 memcpy(new->s,
154                        rcu_dereference_protected(c->snapshots, true)->s,
155                        sizeof(new->s[0]) * c->snapshot_table_size);
156
157         rcu_assign_pointer(c->snapshots, new);
158         c->snapshot_table_size = new_size;
159         if (old)
160                 kvfree_rcu(old);
161
162         return &rcu_dereference_protected(c->snapshots, true)->s[idx];
163 }
164
165 static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
166 {
167         size_t idx = U32_MAX - id;
168
169         lockdep_assert_held(&c->snapshot_table_lock);
170
171         if (likely(idx < c->snapshot_table_size))
172                 return &rcu_dereference_protected(c->snapshots, true)->s[idx];
173
174         return __snapshot_t_mut(c, id);
175 }
176
177 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
178                            struct bkey_s_c k)
179 {
180         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
181
182         prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
183                BCH_SNAPSHOT_SUBVOL(s.v),
184                BCH_SNAPSHOT_DELETED(s.v),
185                le32_to_cpu(s.v->parent),
186                le32_to_cpu(s.v->children[0]),
187                le32_to_cpu(s.v->children[1]),
188                le32_to_cpu(s.v->subvol),
189                le32_to_cpu(s.v->tree));
190
191         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
192                 prt_printf(out, " depth %u skiplist %u %u %u",
193                            le32_to_cpu(s.v->depth),
194                            le32_to_cpu(s.v->skip[0]),
195                            le32_to_cpu(s.v->skip[1]),
196                            le32_to_cpu(s.v->skip[2]));
197 }
198
199 int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
200                           enum bkey_invalid_flags flags,
201                           struct printbuf *err)
202 {
203         struct bkey_s_c_snapshot s;
204         u32 i, id;
205
206         if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
207             bkey_lt(k.k->p, POS(0, 1))) {
208                 prt_printf(err, "bad pos");
209                 return -BCH_ERR_invalid_bkey;
210         }
211
212         s = bkey_s_c_to_snapshot(k);
213
214         id = le32_to_cpu(s.v->parent);
215         if (id && id <= k.k->p.offset) {
216                 prt_printf(err, "bad parent node (%u <= %llu)",
217                        id, k.k->p.offset);
218                 return -BCH_ERR_invalid_bkey;
219         }
220
221         if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
222                 prt_printf(err, "children not normalized");
223                 return -BCH_ERR_invalid_bkey;
224         }
225
226         if (s.v->children[0] &&
227             s.v->children[0] == s.v->children[1]) {
228                 prt_printf(err, "duplicate child nodes");
229                 return -BCH_ERR_invalid_bkey;
230         }
231
232         for (i = 0; i < 2; i++) {
233                 id = le32_to_cpu(s.v->children[i]);
234
235                 if (id >= k.k->p.offset) {
236                         prt_printf(err, "bad child node (%u >= %llu)",
237                                id, k.k->p.offset);
238                         return -BCH_ERR_invalid_bkey;
239                 }
240         }
241
242         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
243                 if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
244                     le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
245                         prt_printf(err, "skiplist not normalized");
246                         return -BCH_ERR_invalid_bkey;
247                 }
248
249                 for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
250                         id = le32_to_cpu(s.v->skip[i]);
251
252                         if (!id != !s.v->parent ||
253                             (s.v->parent &&
254                              id <= k.k->p.offset)) {
255                                 prt_printf(err, "bad skiplist node %u)", id);
256                                 return -BCH_ERR_invalid_bkey;
257                         }
258                 }
259         }
260
261         return 0;
262 }
263
264 int bch2_mark_snapshot(struct btree_trans *trans,
265                        enum btree_id btree, unsigned level,
266                        struct bkey_s_c old, struct bkey_s_c new,
267                        unsigned flags)
268 {
269         struct bch_fs *c = trans->c;
270         struct snapshot_t *t;
271         u32 id = new.k->p.offset;
272         int ret = 0;
273
274         mutex_lock(&c->snapshot_table_lock);
275
276         t = snapshot_t_mut(c, id);
277         if (!t) {
278                 ret = -BCH_ERR_ENOMEM_mark_snapshot;
279                 goto err;
280         }
281
282         if (new.k->type == KEY_TYPE_snapshot) {
283                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
284                 u32 parent = id;
285
286                 t->parent       = le32_to_cpu(s.v->parent);
287                 t->children[0]  = le32_to_cpu(s.v->children[0]);
288                 t->children[1]  = le32_to_cpu(s.v->children[1]);
289                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
290                 t->tree         = le32_to_cpu(s.v->tree);
291
292                 if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
293                         t->depth        = le32_to_cpu(s.v->depth);
294                         t->skip[0]      = le32_to_cpu(s.v->skip[0]);
295                         t->skip[1]      = le32_to_cpu(s.v->skip[1]);
296                         t->skip[2]      = le32_to_cpu(s.v->skip[2]);
297                 } else {
298                         t->depth        = 0;
299                         t->skip[0]      = 0;
300                         t->skip[1]      = 0;
301                         t->skip[2]      = 0;
302                 }
303
304                 while ((parent = bch2_snapshot_parent_early(c, parent)) &&
305                        parent - id - 1 < IS_ANCESTOR_BITMAP)
306                         __set_bit(parent - id - 1, t->is_ancestor);
307
308                 if (BCH_SNAPSHOT_DELETED(s.v)) {
309                         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
310                         c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
311                 }
312         } else {
313                 memset(t, 0, sizeof(*t));
314         }
315 err:
316         mutex_unlock(&c->snapshot_table_lock);
317         return ret;
318 }
319
320 int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
321                          struct bch_snapshot *s)
322 {
323         return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
324                                        BTREE_ITER_WITH_UPDATES, snapshot, s);
325 }
326
327 int bch2_snapshot_live(struct btree_trans *trans, u32 id)
328 {
329         struct bch_snapshot v;
330         int ret;
331
332         if (!id)
333                 return 0;
334
335         ret = bch2_snapshot_lookup(trans, id, &v);
336         if (bch2_err_matches(ret, ENOENT))
337                 bch_err(trans->c, "snapshot node %u not found", id);
338         if (ret)
339                 return ret;
340
341         return !BCH_SNAPSHOT_DELETED(&v);
342 }
343
344 /*
345  * If @k is a snapshot with just one live child, it's part of a linear chain,
346  * which we consider to be an equivalence class: and then after snapshot
347  * deletion cleanup, there should only be a single key at a given position in
348  * this equivalence class.
349  *
350  * This sets the equivalence class of @k to be the child's equivalence class, if
351  * it's part of such a linear chain: this correctly sets equivalence classes on
352  * startup if we run leaf to root (i.e. in natural key order).
353  */
354 int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
355 {
356         struct bch_fs *c = trans->c;
357         unsigned i, nr_live = 0, live_idx = 0;
358         struct bkey_s_c_snapshot snap;
359         u32 id = k.k->p.offset, child[2];
360
361         if (k.k->type != KEY_TYPE_snapshot)
362                 return 0;
363
364         snap = bkey_s_c_to_snapshot(k);
365
366         child[0] = le32_to_cpu(snap.v->children[0]);
367         child[1] = le32_to_cpu(snap.v->children[1]);
368
369         for (i = 0; i < 2; i++) {
370                 int ret = bch2_snapshot_live(trans, child[i]);
371
372                 if (ret < 0)
373                         return ret;
374
375                 if (ret)
376                         live_idx = i;
377                 nr_live += ret;
378         }
379
380         mutex_lock(&c->snapshot_table_lock);
381
382         snapshot_t_mut(c, id)->equiv = nr_live == 1
383                 ? snapshot_t_mut(c, child[live_idx])->equiv
384                 : id;
385
386         mutex_unlock(&c->snapshot_table_lock);
387
388         return 0;
389 }
390
391 /* fsck: */
392
393 static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
394 {
395         return snapshot_t(c, id)->children[child];
396 }
397
398 static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
399 {
400         return bch2_snapshot_child(c, id, 0);
401 }
402
403 static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
404 {
405         return bch2_snapshot_child(c, id, 1);
406 }
407
408 static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
409 {
410         u32 n, parent;
411
412         n = bch2_snapshot_left_child(c, id);
413         if (n)
414                 return n;
415
416         while ((parent = bch2_snapshot_parent(c, id))) {
417                 n = bch2_snapshot_right_child(c, parent);
418                 if (n && n != id)
419                         return n;
420                 id = parent;
421         }
422
423         return 0;
424 }
425
426 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
427 {
428         u32 id = snapshot_root;
429         u32 subvol = 0, s;
430
431         while (id) {
432                 s = snapshot_t(c, id)->subvol;
433
434                 if (s && (!subvol || s < subvol))
435                         subvol = s;
436
437                 id = bch2_snapshot_tree_next(c, id);
438         }
439
440         return subvol;
441 }
442
443 static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
444                                             u32 snapshot_root, u32 *subvol_id)
445 {
446         struct bch_fs *c = trans->c;
447         struct btree_iter iter;
448         struct bkey_s_c k;
449         struct bkey_s_c_subvolume s;
450         bool found = false;
451         int ret;
452
453         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
454                                      0, k, ret) {
455                 if (k.k->type != KEY_TYPE_subvolume)
456                         continue;
457
458                 s = bkey_s_c_to_subvolume(k);
459                 if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
460                         continue;
461                 if (!BCH_SUBVOLUME_SNAP(s.v)) {
462                         *subvol_id = s.k->p.offset;
463                         found = true;
464                         break;
465                 }
466         }
467
468         bch2_trans_iter_exit(trans, &iter);
469
470         if (!ret && !found) {
471                 struct bkey_i_subvolume *s;
472
473                 *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
474
475                 s = bch2_bkey_get_mut_typed(trans, &iter,
476                                             BTREE_ID_subvolumes, POS(0, *subvol_id),
477                                             0, subvolume);
478                 ret = PTR_ERR_OR_ZERO(s);
479                 if (ret)
480                         return ret;
481
482                 SET_BCH_SUBVOLUME_SNAP(&s->v, false);
483         }
484
485         return ret;
486 }
487
488 static int check_snapshot_tree(struct btree_trans *trans,
489                                struct btree_iter *iter,
490                                struct bkey_s_c k)
491 {
492         struct bch_fs *c = trans->c;
493         struct bkey_s_c_snapshot_tree st;
494         struct bch_snapshot s;
495         struct bch_subvolume subvol;
496         struct printbuf buf = PRINTBUF;
497         u32 root_id;
498         int ret;
499
500         if (k.k->type != KEY_TYPE_snapshot_tree)
501                 return 0;
502
503         st = bkey_s_c_to_snapshot_tree(k);
504         root_id = le32_to_cpu(st.v->root_snapshot);
505
506         ret = bch2_snapshot_lookup(trans, root_id, &s);
507         if (ret && !bch2_err_matches(ret, ENOENT))
508                 goto err;
509
510         if (fsck_err_on(ret ||
511                         root_id != bch2_snapshot_root(c, root_id) ||
512                         st.k->p.offset != le32_to_cpu(s.tree),
513                         c,
514                         "snapshot tree points to missing/incorrect snapshot:\n  %s",
515                         (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
516                 ret = bch2_btree_delete_at(trans, iter, 0);
517                 goto err;
518         }
519
520         ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
521                                  false, 0, &subvol);
522         if (ret && !bch2_err_matches(ret, ENOENT))
523                 goto err;
524
525         if (fsck_err_on(ret, c,
526                         "snapshot tree points to missing subvolume:\n  %s",
527                         (printbuf_reset(&buf),
528                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
529             fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
530                                                 le32_to_cpu(subvol.snapshot),
531                                                 root_id), c,
532                         "snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
533                         (printbuf_reset(&buf),
534                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
535             fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
536                         "snapshot tree points to snapshot subvolume:\n  %s",
537                         (printbuf_reset(&buf),
538                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
539                 struct bkey_i_snapshot_tree *u;
540                 u32 subvol_id;
541
542                 ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
543                 if (ret)
544                         goto err;
545
546                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
547                 ret = PTR_ERR_OR_ZERO(u);
548                 if (ret)
549                         goto err;
550
551                 u->v.master_subvol = cpu_to_le32(subvol_id);
552                 st = snapshot_tree_i_to_s_c(u);
553         }
554 err:
555 fsck_err:
556         printbuf_exit(&buf);
557         return ret;
558 }
559
560 /*
561  * For each snapshot_tree, make sure it points to the root of a snapshot tree
562  * and that snapshot entry points back to it, or delete it.
563  *
564  * And, make sure it points to a subvolume within that snapshot tree, or correct
565  * it to point to the oldest subvolume within that snapshot tree.
566  */
567 int bch2_check_snapshot_trees(struct bch_fs *c)
568 {
569         struct btree_iter iter;
570         struct bkey_s_c k;
571         int ret;
572
573         ret = bch2_trans_run(c,
574                 for_each_btree_key_commit(&trans, iter,
575                         BTREE_ID_snapshot_trees, POS_MIN,
576                         BTREE_ITER_PREFETCH, k,
577                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
578                 check_snapshot_tree(&trans, &iter, k)));
579
580         if (ret)
581                 bch_err(c, "error %i checking snapshot trees", ret);
582         return ret;
583 }
584
585 /*
586  * Look up snapshot tree for @tree_id and find root,
587  * make sure @snap_id is a descendent:
588  */
589 static int snapshot_tree_ptr_good(struct btree_trans *trans,
590                                   u32 snap_id, u32 tree_id)
591 {
592         struct bch_snapshot_tree s_t;
593         int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
594
595         if (bch2_err_matches(ret, ENOENT))
596                 return 0;
597         if (ret)
598                 return ret;
599
600         return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
601 }
602
603 u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
604 {
605         const struct snapshot_t *s;
606
607         if (!id)
608                 return 0;
609
610         rcu_read_lock();
611         s = snapshot_t(c, id);
612         if (s->parent)
613                 id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
614         rcu_read_unlock();
615
616         return id;
617 }
618
619 static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s)
620 {
621         struct bch_snapshot a;
622         unsigned i;
623         int ret;
624
625         for (i = 0; i < 3; i++) {
626                 if (!s.parent != !s.skip[i])
627                         return false;
628
629                 if (!s.parent)
630                         continue;
631
632                 ret = bch2_snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a);
633                 if (bch2_err_matches(ret, ENOENT))
634                         return false;
635                 if (ret)
636                         return ret;
637
638                 if (a.tree != s.tree)
639                         return false;
640         }
641
642         return true;
643 }
644
645 /*
646  * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
647  * its snapshot_tree pointer is correct (allocate new one if necessary), then
648  * update this node's pointer to root node's pointer:
649  */
650 static int snapshot_tree_ptr_repair(struct btree_trans *trans,
651                                     struct btree_iter *iter,
652                                     struct bkey_s_c k,
653                                     struct bch_snapshot *s)
654 {
655         struct bch_fs *c = trans->c;
656         struct btree_iter root_iter;
657         struct bch_snapshot_tree s_t;
658         struct bkey_s_c_snapshot root;
659         struct bkey_i_snapshot *u;
660         u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
661         int ret;
662
663         root = bch2_bkey_get_iter_typed(trans, &root_iter,
664                                BTREE_ID_snapshots, POS(0, root_id),
665                                BTREE_ITER_WITH_UPDATES, snapshot);
666         ret = bkey_err(root);
667         if (ret)
668                 goto err;
669
670         tree_id = le32_to_cpu(root.v->tree);
671
672         ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
673         if (ret && !bch2_err_matches(ret, ENOENT))
674                 return ret;
675
676         if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
677                 u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
678                 ret =   PTR_ERR_OR_ZERO(u) ?:
679                         bch2_snapshot_tree_create(trans, root_id,
680                                 bch2_snapshot_tree_oldest_subvol(c, root_id),
681                                 &tree_id);
682                 if (ret)
683                         goto err;
684
685                 u->v.tree = cpu_to_le32(tree_id);
686                 if (k.k->p.offset == root_id)
687                         *s = u->v;
688         }
689
690         if (k.k->p.offset != root_id) {
691                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
692                 ret = PTR_ERR_OR_ZERO(u);
693                 if (ret)
694                         goto err;
695
696                 u->v.tree = cpu_to_le32(tree_id);
697                 *s = u->v;
698         }
699 err:
700         bch2_trans_iter_exit(trans, &root_iter);
701         return ret;
702 }
703
704 static int check_snapshot(struct btree_trans *trans,
705                           struct btree_iter *iter,
706                           struct bkey_s_c k)
707 {
708         struct bch_fs *c = trans->c;
709         struct bch_snapshot s;
710         struct bch_subvolume subvol;
711         struct bch_snapshot v;
712         struct bkey_i_snapshot *u;
713         u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
714         u32 real_depth;
715         struct printbuf buf = PRINTBUF;
716         bool should_have_subvol;
717         u32 i, id;
718         int ret = 0;
719
720         if (k.k->type != KEY_TYPE_snapshot)
721                 return 0;
722
723         memset(&s, 0, sizeof(s));
724         memcpy(&s, k.v, bkey_val_bytes(k.k));
725
726         id = le32_to_cpu(s.parent);
727         if (id) {
728                 ret = bch2_snapshot_lookup(trans, id, &v);
729                 if (bch2_err_matches(ret, ENOENT))
730                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
731                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
732                 if (ret)
733                         goto err;
734
735                 if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
736                     le32_to_cpu(v.children[1]) != k.k->p.offset) {
737                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
738                                 id, k.k->p.offset);
739                         ret = -EINVAL;
740                         goto err;
741                 }
742         }
743
744         for (i = 0; i < 2 && s.children[i]; i++) {
745                 id = le32_to_cpu(s.children[i]);
746
747                 ret = bch2_snapshot_lookup(trans, id, &v);
748                 if (bch2_err_matches(ret, ENOENT))
749                         bch_err(c, "snapshot node %llu has nonexistent child %u",
750                                 k.k->p.offset, id);
751                 if (ret)
752                         goto err;
753
754                 if (le32_to_cpu(v.parent) != k.k->p.offset) {
755                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
756                                 id, le32_to_cpu(v.parent), k.k->p.offset);
757                         ret = -EINVAL;
758                         goto err;
759                 }
760         }
761
762         should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
763                 !BCH_SNAPSHOT_DELETED(&s);
764
765         if (should_have_subvol) {
766                 id = le32_to_cpu(s.subvol);
767                 ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
768                 if (bch2_err_matches(ret, ENOENT))
769                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
770                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
771                 if (ret)
772                         goto err;
773
774                 if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
775                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
776                                 k.k->p.offset);
777                         ret = -EINVAL;
778                         goto err;
779                 }
780         } else {
781                 if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n  %s",
782                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
783                         u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
784                         ret = PTR_ERR_OR_ZERO(u);
785                         if (ret)
786                                 goto err;
787
788                         u->v.subvol = 0;
789                         s = u->v;
790                 }
791         }
792
793         ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
794         if (ret < 0)
795                 goto err;
796
797         if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n  %s",
798                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
799                 ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
800                 if (ret)
801                         goto err;
802         }
803         ret = 0;
804
805         real_depth = bch2_snapshot_depth(c, parent_id);
806
807         if (le32_to_cpu(s.depth) != real_depth &&
808             (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
809              fsck_err(c, "snapshot with incorrect depth field, should be %u:\n  %s",
810                       real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
811                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
812                 ret = PTR_ERR_OR_ZERO(u);
813                 if (ret)
814                         goto err;
815
816                 u->v.depth = cpu_to_le32(real_depth);
817                 s = u->v;
818         }
819
820         ret = snapshot_skiplist_good(trans, s);
821         if (ret < 0)
822                 goto err;
823
824         if (!ret &&
825             (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
826              fsck_err(c, "snapshot with bad skiplist field:\n  %s",
827                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
828                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
829                 ret = PTR_ERR_OR_ZERO(u);
830                 if (ret)
831                         goto err;
832
833                 for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
834                         u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
835
836                 bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
837                 s = u->v;
838         }
839         ret = 0;
840 err:
841 fsck_err:
842         printbuf_exit(&buf);
843         return ret;
844 }
845
846 int bch2_check_snapshots(struct bch_fs *c)
847 {
848         struct btree_iter iter;
849         struct bkey_s_c k;
850         int ret;
851
852         /*
853          * We iterate backwards as checking/fixing the depth field requires that
854          * the parent's depth already be correct:
855          */
856         ret = bch2_trans_run(c,
857                 for_each_btree_key_reverse_commit(&trans, iter,
858                         BTREE_ID_snapshots, POS_MAX,
859                         BTREE_ITER_PREFETCH, k,
860                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
861                 check_snapshot(&trans, &iter, k)));
862         if (ret)
863                 bch_err_fn(c, ret);
864         return ret;
865 }
866
867 int bch2_snapshots_read(struct bch_fs *c)
868 {
869         struct btree_iter iter;
870         struct bkey_s_c k;
871         int ret = 0;
872
873         ret = bch2_trans_run(c,
874                 for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
875                            POS_MIN, 0, k,
876                         bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
877                         bch2_snapshot_set_equiv(&trans, k)));
878         if (ret)
879                 bch_err_fn(c, ret);
880         return ret;
881 }
882
883 void bch2_fs_snapshots_exit(struct bch_fs *c)
884 {
885         kfree(rcu_dereference_protected(c->snapshots, true));
886 }