]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/snapshot.c
3ecc17b8d6fcaea206cb166d5ed58a5cad889f9c
[bcachefs-tools-debian] / libbcachefs / snapshot.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_key_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "errcode.h"
9 #include "error.h"
10 #include "fs.h"
11 #include "snapshot.h"
12
13 #include <linux/random.h>
14
15 /*
16  * Snapshot trees:
17  *
18  * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
19  * exist to provide a stable identifier for the whole lifetime of a snapshot
20  * tree.
21  */
22
23 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
24                                 struct bkey_s_c k)
25 {
26         struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
27
28         prt_printf(out, "subvol %u root snapshot %u",
29                    le32_to_cpu(t.v->master_subvol),
30                    le32_to_cpu(t.v->root_snapshot));
31 }
32
33 int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
34                                enum bkey_invalid_flags flags,
35                                struct printbuf *err)
36 {
37         if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
38             bkey_lt(k.k->p, POS(0, 1))) {
39                 prt_printf(err, "bad pos");
40                 return -BCH_ERR_invalid_bkey;
41         }
42
43         return 0;
44 }
45
46 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
47                               struct bch_snapshot_tree *s)
48 {
49         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
50                                           BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
51
52         if (bch2_err_matches(ret, ENOENT))
53                 ret = -BCH_ERR_ENOENT_snapshot_tree;
54         return ret;
55 }
56
/*
 * Allocate a new snapshot_tree key in an empty slot, searching downward
 * from U32_MAX.
 *
 * Returns the newly allocated key (position already set) on success, or an
 * ERR_PTR(); a full btree maps to -BCH_ERR_ENOSPC_snapshot_tree.
 */
struct bkey_i_snapshot_tree *
__bch2_snapshot_tree_create(struct btree_trans *trans)
{
	struct btree_iter iter;
	int ret = bch2_bkey_get_empty_slot(trans, &iter,
			BTREE_ID_snapshot_trees, POS(0, U32_MAX));
	struct bkey_i_snapshot_tree *s_t;

	/* translate the generic "no free slot" error to the specific one */
	if (ret == -BCH_ERR_ENOSPC_btree_slot)
		ret = -BCH_ERR_ENOSPC_snapshot_tree;
	if (ret)
		return ERR_PTR(ret);

	s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
	ret = PTR_ERR_OR_ZERO(s_t);
	/* iter is no longer needed whether or not the alloc succeeded */
	bch2_trans_iter_exit(trans, &iter);
	return ret ? ERR_PTR(ret) : s_t;
}
75
76 static int bch2_snapshot_tree_create(struct btree_trans *trans,
77                                 u32 root_id, u32 subvol_id, u32 *tree_id)
78 {
79         struct bkey_i_snapshot_tree *n_tree =
80                 __bch2_snapshot_tree_create(trans);
81
82         if (IS_ERR(n_tree))
83                 return PTR_ERR(n_tree);
84
85         n_tree->v.master_subvol = cpu_to_le32(subvol_id);
86         n_tree->v.root_snapshot = cpu_to_le32(root_id);
87         *tree_id = n_tree->k.p.offset;
88         return 0;
89 }
90
91 /* Snapshot nodes: */
92
/*
 * Slow ancestor check that only uses parent pointers: safe to call before
 * the skiplist/is_ancestor bitmap metadata has been initialized (i.e.
 * during early recovery).
 */
static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
	struct snapshot_table *t;

	rcu_read_lock();
	t = rcu_dereference(c->snapshots);

	/*
	 * Parent ids are strictly greater than their children's (enforced by
	 * bch2_snapshot_invalid()), so walk up until we reach or pass
	 * @ancestor:
	 */
	while (id && id < ancestor)
		id = __snapshot_t(t, id)->parent;
	rcu_read_unlock();

	return id == ancestor;
}
106
107 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
108 {
109         const struct snapshot_t *s = __snapshot_t(t, id);
110
111         if (s->skip[2] <= ancestor)
112                 return s->skip[2];
113         if (s->skip[1] <= ancestor)
114                 return s->skip[1];
115         if (s->skip[0] <= ancestor)
116                 return s->skip[0];
117         return s->parent;
118 }
119
/*
 * Fast ancestor check: uses the skiplist to jump to within
 * IS_ANCESTOR_BITMAP of @ancestor, then answers from that node's
 * is_ancestor bitmap.
 *
 * Only valid once the in-memory snapshot table is fully built, i.e. after
 * the check_snapshots recovery pass.
 */
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
	struct snapshot_table *t;
	bool ret;

	EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);

	rcu_read_lock();
	t = rcu_dereference(c->snapshots);

	/* skiplist walk: stop once @ancestor is within bitmap range of @id */
	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
		id = get_ancestor_below(t, id, ancestor);

	if (id && id < ancestor) {
		ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);

		/* debug builds cross-check against the slow parent walk */
		EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
	} else {
		ret = id == ancestor;
	}

	rcu_read_unlock();

	return ret;
}
145
146 static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
147 {
148         size_t idx = U32_MAX - id;
149         size_t new_size;
150         struct snapshot_table *new, *old;
151
152         new_size = max(16UL, roundup_pow_of_two(idx + 1));
153
154         new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
155         if (!new)
156                 return NULL;
157
158         old = rcu_dereference_protected(c->snapshots, true);
159         if (old)
160                 memcpy(new->s,
161                        rcu_dereference_protected(c->snapshots, true)->s,
162                        sizeof(new->s[0]) * c->snapshot_table_size);
163
164         rcu_assign_pointer(c->snapshots, new);
165         c->snapshot_table_size = new_size;
166         kvfree_rcu_mightsleep(old);
167
168         return &rcu_dereference_protected(c->snapshots, true)->s[idx];
169 }
170
/*
 * Return a mutable pointer to the in-memory table entry for snapshot @id,
 * growing the table if necessary; may return NULL on allocation failure.
 *
 * Caller must hold snapshot_table_lock.
 */
static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
{
	size_t idx = U32_MAX - id;

	lockdep_assert_held(&c->snapshot_table_lock);

	if (likely(idx < c->snapshot_table_size))
		return &rcu_dereference_protected(c->snapshots, true)->s[idx];

	return __snapshot_t_mut(c, id);
}
182
/* Print a snapshot key's fields for debugging/fsck output. */
void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
			   struct bkey_s_c k)
{
	struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);

	prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
	       BCH_SNAPSHOT_SUBVOL(s.v),
	       BCH_SNAPSHOT_DELETED(s.v),
	       le32_to_cpu(s.v->parent),
	       le32_to_cpu(s.v->children[0]),
	       le32_to_cpu(s.v->children[1]),
	       le32_to_cpu(s.v->subvol),
	       le32_to_cpu(s.v->tree));

	/* depth/skiplist fields only exist on keys written by newer versions */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
		prt_printf(out, " depth %u skiplist %u %u %u",
			   le32_to_cpu(s.v->depth),
			   le32_to_cpu(s.v->skip[0]),
			   le32_to_cpu(s.v->skip[1]),
			   le32_to_cpu(s.v->skip[2]));
}
204
/*
 * Validate a snapshot key: position range, parent/child id ordering, and
 * (when present) skiplist normalization.
 *
 * Invariants checked:
 *  - position is inode 0, offset 1..U32_MAX
 *  - parent id, if set, is strictly greater than this node's id
 *  - children[0] >= children[1], no duplicate nonzero children, and both
 *    are strictly less than this node's id
 *  - skip[] is sorted ascending, entries only set when a parent exists,
 *    and every entry is strictly greater than this node's id
 */
int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  enum bkey_invalid_flags flags,
			  struct printbuf *err)
{
	struct bkey_s_c_snapshot s;
	u32 i, id;

	if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
	    bkey_lt(k.k->p, POS(0, 1))) {
		prt_printf(err, "bad pos");
		return -BCH_ERR_invalid_bkey;
	}

	s = bkey_s_c_to_snapshot(k);

	id = le32_to_cpu(s.v->parent);
	if (id && id <= k.k->p.offset) {
		prt_printf(err, "bad parent node (%u <= %llu)",
		       id, k.k->p.offset);
		return -BCH_ERR_invalid_bkey;
	}

	if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
		prt_printf(err, "children not normalized");
		return -BCH_ERR_invalid_bkey;
	}

	if (s.v->children[0] &&
	    s.v->children[0] == s.v->children[1]) {
		prt_printf(err, "duplicate child nodes");
		return -BCH_ERR_invalid_bkey;
	}

	for (i = 0; i < 2; i++) {
		id = le32_to_cpu(s.v->children[i]);

		if (id >= k.k->p.offset) {
			prt_printf(err, "bad child node (%u >= %llu)",
			       id, k.k->p.offset);
			return -BCH_ERR_invalid_bkey;
		}
	}

	/* skiplist fields only exist on keys written by newer versions */
	if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
		if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
		    le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
			prt_printf(err, "skiplist not normalized");
			return -BCH_ERR_invalid_bkey;
		}

		for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
			id = le32_to_cpu(s.v->skip[i]);

			/* skip entries point to ancestors, so ids must be greater */
			if ((id && !s.v->parent) ||
			    (id && id <= k.k->p.offset)) {
				prt_printf(err, "bad skiplist node %u", id);
				return -BCH_ERR_invalid_bkey;
			}
		}
	}

	return 0;
}
268
269 static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
270 {
271         struct snapshot_t *t = snapshot_t_mut(c, id);
272         u32 parent = id;
273
274         while ((parent = bch2_snapshot_parent_early(c, parent)) &&
275                parent - id - 1 < IS_ANCESTOR_BITMAP)
276                 __set_bit(parent - id - 1, t->is_ancestor);
277 }
278
/* Locked wrapper around __set_is_ancestor_bitmap(). */
static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
{
	mutex_lock(&c->snapshot_table_lock);
	__set_is_ancestor_bitmap(c, id);
	mutex_unlock(&c->snapshot_table_lock);
}
285
/*
 * Trigger for snapshot keys: keeps the in-memory snapshot table
 * (c->snapshots) in sync with the btree.
 *
 * On insert/update of a snapshot key, copies its fields into the table
 * entry and refreshes the is_ancestor bitmap; on deletion (new key is not
 * KEY_TYPE_snapshot) the entry is zeroed.  A deleted-flagged snapshot
 * schedules the delete_dead_snapshots recovery pass.
 */
int bch2_mark_snapshot(struct btree_trans *trans,
		       enum btree_id btree, unsigned level,
		       struct bkey_s_c old, struct bkey_s_c new,
		       unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct snapshot_t *t;
	u32 id = new.k->p.offset;
	int ret = 0;

	mutex_lock(&c->snapshot_table_lock);

	t = snapshot_t_mut(c, id);
	if (!t) {
		ret = -BCH_ERR_ENOMEM_mark_snapshot;
		goto err;
	}

	if (new.k->type == KEY_TYPE_snapshot) {
		struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);

		t->parent	= le32_to_cpu(s.v->parent);
		t->children[0]	= le32_to_cpu(s.v->children[0]);
		t->children[1]	= le32_to_cpu(s.v->children[1]);
		t->subvol	= BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
		t->tree		= le32_to_cpu(s.v->tree);

		/* depth/skip fields only exist on keys written by newer versions */
		if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
			t->depth	= le32_to_cpu(s.v->depth);
			t->skip[0]	= le32_to_cpu(s.v->skip[0]);
			t->skip[1]	= le32_to_cpu(s.v->skip[1]);
			t->skip[2]	= le32_to_cpu(s.v->skip[2]);
		} else {
			t->depth	= 0;
			t->skip[0]	= 0;
			t->skip[1]	= 0;
			t->skip[2]	= 0;
		}

		__set_is_ancestor_bitmap(c, id);

		if (BCH_SNAPSHOT_DELETED(s.v)) {
			set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
			c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
		}
	} else {
		/* key deleted: clear the table entry */
		memset(t, 0, sizeof(*t));
	}
err:
	mutex_unlock(&c->snapshot_table_lock);
	return ret;
}
338
339 int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
340                          struct bch_snapshot *s)
341 {
342         return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
343                                        BTREE_ITER_WITH_UPDATES, snapshot, s);
344 }
345
346 static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
347 {
348         struct bch_snapshot v;
349         int ret;
350
351         if (!id)
352                 return 0;
353
354         ret = bch2_snapshot_lookup(trans, id, &v);
355         if (bch2_err_matches(ret, ENOENT))
356                 bch_err(trans->c, "snapshot node %u not found", id);
357         if (ret)
358                 return ret;
359
360         return !BCH_SNAPSHOT_DELETED(&v);
361 }
362
/*
 * If @k is a snapshot with just one live child, it's part of a linear chain,
 * which we consider to be an equivalence class: and then after snapshot
 * deletion cleanup, there should only be a single key at a given position in
 * this equivalence class.
 *
 * This sets the equivalence class of @k to be the child's equivalence class, if
 * it's part of such a linear chain: this correctly sets equivalence classes on
 * startup if we run leaf to root (i.e. in natural key order).
 */
static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	unsigned i, nr_live = 0, live_idx = 0;
	struct bkey_s_c_snapshot snap;
	u32 id = k.k->p.offset, child[2];

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	snap = bkey_s_c_to_snapshot(k);

	child[0] = le32_to_cpu(snap.v->children[0]);
	child[1] = le32_to_cpu(snap.v->children[1]);

	/* count live children; remember the index of the last live one */
	for (i = 0; i < 2; i++) {
		int ret = bch2_snapshot_live(trans, child[i]);

		if (ret < 0)
			return ret;

		if (ret)
			live_idx = i;
		nr_live += ret;
	}

	mutex_lock(&c->snapshot_table_lock);

	/*
	 * Exactly one live child => linear chain: inherit the child's
	 * equivalence class; otherwise this node is its own class.
	 */
	snapshot_t_mut(c, id)->equiv = nr_live == 1
		? snapshot_t_mut(c, child[live_idx])->equiv
		: id;

	mutex_unlock(&c->snapshot_table_lock);

	return 0;
}
409
410 /* fsck: */
411
412 static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
413 {
414         return snapshot_t(c, id)->children[child];
415 }
416
417 static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
418 {
419         return bch2_snapshot_child(c, id, 0);
420 }
421
422 static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
423 {
424         return bch2_snapshot_child(c, id, 1);
425 }
426
/*
 * Return the node that follows @id in a depth-first traversal of its
 * snapshot tree, or 0 when the traversal is complete.
 */
static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
{
	u32 n, parent;

	/* descend first */
	n = bch2_snapshot_left_child(c, id);
	if (n)
		return n;

	/* no child: climb until we find an unvisited right sibling */
	while ((parent = bch2_snapshot_parent(c, id))) {
		n = bch2_snapshot_right_child(c, parent);
		if (n && n != id)
			return n;
		id = parent;
	}

	return 0;
}
444
445 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
446 {
447         u32 id = snapshot_root;
448         u32 subvol = 0, s;
449
450         while (id) {
451                 s = snapshot_t(c, id)->subvol;
452
453                 if (s && (!subvol || s < subvol))
454                         subvol = s;
455
456                 id = bch2_snapshot_tree_next(c, id);
457         }
458
459         return subvol;
460 }
461
/*
 * Find a subvolume suitable to be the master subvolume for the snapshot
 * tree rooted at @snapshot_root: the first non-snapshot subvolume whose
 * snapshot is in this tree.  If none exists, fall back to the tree's
 * oldest subvolume and clear its SNAP flag so it becomes the master.
 */
static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
					    u32 snapshot_root, u32 *subvol_id)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_subvolume s;
	bool found = false;
	int ret;

	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
				     0, k, ret) {
		if (k.k->type != KEY_TYPE_subvolume)
			continue;

		s = bkey_s_c_to_subvolume(k);
		/* only consider subvolumes belonging to this snapshot tree */
		if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
			continue;
		if (!BCH_SUBVOLUME_SNAP(s.v)) {
			*subvol_id = s.k->p.offset;
			found = true;
			break;
		}
	}

	bch2_trans_iter_exit(trans, &iter);

	if (!ret && !found) {
		struct bkey_i_subvolume *u;

		*subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);

		/*
		 * NOTE(review): @iter is reinitialized here but not exited on
		 * this path before return — presumably cleaned up when the
		 * transaction is torn down; verify against trans lifetime.
		 */
		u = bch2_bkey_get_mut_typed(trans, &iter,
					    BTREE_ID_subvolumes, POS(0, *subvol_id),
					    0, subvolume);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			return ret;

		/* promote it: it's no longer a snapshot subvolume */
		SET_BCH_SUBVOLUME_SNAP(&u->v, false);
	}

	return ret;
}
506
/*
 * fsck for one snapshot_tree key: verify that it points at the root of a
 * real snapshot tree whose root points back at it (else delete it), and
 * that its master subvolume exists, belongs to this tree, and is not a
 * snapshot subvolume (else repoint it at a better master).
 */
static int check_snapshot_tree(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c_snapshot_tree st;
	struct bch_snapshot s;
	struct bch_subvolume subvol;
	struct printbuf buf = PRINTBUF;
	u32 root_id;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot_tree)
		return 0;

	st = bkey_s_c_to_snapshot_tree(k);
	root_id = le32_to_cpu(st.v->root_snapshot);

	ret = bch2_snapshot_lookup(trans, root_id, &s);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	/* missing root, non-root root, or root not pointing back: delete */
	if (fsck_err_on(ret ||
			root_id != bch2_snapshot_root(c, root_id) ||
			st.k->p.offset != le32_to_cpu(s.tree),
			c,
			"snapshot tree points to missing/incorrect snapshot:\n  %s",
			(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto err;
	}

	ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
				 false, 0, &subvol);
	if (ret && !bch2_err_matches(ret, ENOENT))
		goto err;

	/*
	 * NOTE(review): if the subvolume lookup ENOENT'd and the first
	 * fsck_err_on() is not acted on, the later conditions read an
	 * uninitialized @subvol — verify fsck_err semantics here.
	 */
	if (fsck_err_on(ret, c,
			"snapshot tree points to missing subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
						le32_to_cpu(subvol.snapshot),
						root_id), c,
			"snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
	    fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
			"snapshot tree points to snapshot subvolume:\n  %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
		struct bkey_i_snapshot_tree *u;
		u32 subvol_id;

		/* pick a valid master subvolume and repoint the tree at it */
		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
		if (ret)
			goto err;

		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.master_subvol = cpu_to_le32(subvol_id);
		st = snapshot_tree_i_to_s_c(u);
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}
578
/*
 * For each snapshot_tree, make sure it points to the root of a snapshot tree
 * and that snapshot entry points back to it, or delete it.
 *
 * And, make sure it points to a subvolume within that snapshot tree, or correct
 * it to point to the oldest subvolume within that snapshot tree.
 */
int bch2_check_snapshot_trees(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	/* iterate every snapshot_tree key, committing fixes as we go */
	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter,
			BTREE_ID_snapshot_trees, POS_MIN,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
		check_snapshot_tree(trans, &iter, k)));

	if (ret)
		bch_err(c, "error %i checking snapshot trees", ret);
	return ret;
}
603
604 /*
605  * Look up snapshot tree for @tree_id and find root,
606  * make sure @snap_id is a descendent:
607  */
608 static int snapshot_tree_ptr_good(struct btree_trans *trans,
609                                   u32 snap_id, u32 tree_id)
610 {
611         struct bch_snapshot_tree s_t;
612         int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
613
614         if (bch2_err_matches(ret, ENOENT))
615                 return 0;
616         if (ret)
617                 return ret;
618
619         return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
620 }
621
/*
 * Pick a random ancestor of @id for use as a skiplist entry: the nth
 * parent for n uniform in [0, depth).  Returns @id itself for a root
 * node, 0 if @id is 0.
 */
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
	const struct snapshot_t *s;

	if (!id)
		return 0;

	rcu_read_lock();
	s = snapshot_t(c, id);
	if (s->parent)
		id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
	rcu_read_unlock();

	return id;
}
637
638 static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
639 {
640         unsigned i;
641
642         for (i = 0; i < 3; i++)
643                 if (!s.parent) {
644                         if (s.skip[i])
645                                 return false;
646                 } else {
647                         if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
648                                 return false;
649                 }
650
651         return true;
652 }
653
/*
 * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
 * its snapshot_tree pointer is correct (allocate new one if necessary), then
 * update this node's pointer to root node's pointer:
 */
static int snapshot_tree_ptr_repair(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bkey_s_c k,
				    struct bch_snapshot *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter root_iter;
	struct bch_snapshot_tree s_t;
	struct bkey_s_c_snapshot root;
	struct bkey_i_snapshot *u;
	u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
	int ret;

	root = bch2_bkey_get_iter_typed(trans, &root_iter,
			       BTREE_ID_snapshots, POS(0, root_id),
			       BTREE_ITER_WITH_UPDATES, snapshot);
	ret = bkey_err(root);
	if (ret)
		goto err;

	tree_id = le32_to_cpu(root.v->tree);

	ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
	if (ret && !bch2_err_matches(ret, ENOENT))
		return ret;

	/* root's tree pointer is also bad: create a fresh snapshot_tree */
	if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
		u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
		ret =	PTR_ERR_OR_ZERO(u) ?:
			bch2_snapshot_tree_create(trans, root_id,
				bch2_snapshot_tree_oldest_subvol(c, root_id),
				&tree_id);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		/* if @k *is* the root, reflect the fix in the caller's copy */
		if (k.k->p.offset == root_id)
			*s = u->v;
	}

	/* propagate the (possibly new) tree id down to @k itself */
	if (k.k->p.offset != root_id) {
		u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto err;

		u->v.tree = cpu_to_le32(tree_id);
		*s = u->v;
	}
err:
	bch2_trans_iter_exit(trans, &root_iter);
	return ret;
}
712
713 static int check_snapshot(struct btree_trans *trans,
714                           struct btree_iter *iter,
715                           struct bkey_s_c k)
716 {
717         struct bch_fs *c = trans->c;
718         struct bch_snapshot s;
719         struct bch_subvolume subvol;
720         struct bch_snapshot v;
721         struct bkey_i_snapshot *u;
722         u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
723         u32 real_depth;
724         struct printbuf buf = PRINTBUF;
725         bool should_have_subvol;
726         u32 i, id;
727         int ret = 0;
728
729         if (k.k->type != KEY_TYPE_snapshot)
730                 return 0;
731
732         memset(&s, 0, sizeof(s));
733         memcpy(&s, k.v, bkey_val_bytes(k.k));
734
735         id = le32_to_cpu(s.parent);
736         if (id) {
737                 ret = bch2_snapshot_lookup(trans, id, &v);
738                 if (bch2_err_matches(ret, ENOENT))
739                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
740                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
741                 if (ret)
742                         goto err;
743
744                 if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
745                     le32_to_cpu(v.children[1]) != k.k->p.offset) {
746                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
747                                 id, k.k->p.offset);
748                         ret = -EINVAL;
749                         goto err;
750                 }
751         }
752
753         for (i = 0; i < 2 && s.children[i]; i++) {
754                 id = le32_to_cpu(s.children[i]);
755
756                 ret = bch2_snapshot_lookup(trans, id, &v);
757                 if (bch2_err_matches(ret, ENOENT))
758                         bch_err(c, "snapshot node %llu has nonexistent child %u",
759                                 k.k->p.offset, id);
760                 if (ret)
761                         goto err;
762
763                 if (le32_to_cpu(v.parent) != k.k->p.offset) {
764                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
765                                 id, le32_to_cpu(v.parent), k.k->p.offset);
766                         ret = -EINVAL;
767                         goto err;
768                 }
769         }
770
771         should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
772                 !BCH_SNAPSHOT_DELETED(&s);
773
774         if (should_have_subvol) {
775                 id = le32_to_cpu(s.subvol);
776                 ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
777                 if (bch2_err_matches(ret, ENOENT))
778                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
779                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
780                 if (ret)
781                         goto err;
782
783                 if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
784                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
785                                 k.k->p.offset);
786                         ret = -EINVAL;
787                         goto err;
788                 }
789         } else {
790                 if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n  %s",
791                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
792                         u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
793                         ret = PTR_ERR_OR_ZERO(u);
794                         if (ret)
795                                 goto err;
796
797                         u->v.subvol = 0;
798                         s = u->v;
799                 }
800         }
801
802         ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
803         if (ret < 0)
804                 goto err;
805
806         if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n  %s",
807                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
808                 ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
809                 if (ret)
810                         goto err;
811         }
812         ret = 0;
813
814         real_depth = bch2_snapshot_depth(c, parent_id);
815
816         if (le32_to_cpu(s.depth) != real_depth &&
817             (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
818              fsck_err(c, "snapshot with incorrect depth field, should be %u:\n  %s",
819                       real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
820                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
821                 ret = PTR_ERR_OR_ZERO(u);
822                 if (ret)
823                         goto err;
824
825                 u->v.depth = cpu_to_le32(real_depth);
826                 s = u->v;
827         }
828
829         ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
830         if (ret < 0)
831                 goto err;
832
833         if (!ret &&
834             (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
835              fsck_err(c, "snapshot with bad skiplist field:\n  %s",
836                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
837                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
838                 ret = PTR_ERR_OR_ZERO(u);
839                 if (ret)
840                         goto err;
841
842                 for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
843                         u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
844
845                 bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
846                 s = u->v;
847         }
848         ret = 0;
849 err:
850 fsck_err:
851         printbuf_exit(&buf);
852         return ret;
853 }
854
855 int bch2_check_snapshots(struct bch_fs *c)
856 {
857         struct btree_iter iter;
858         struct bkey_s_c k;
859         int ret;
860
861         /*
862          * We iterate backwards as checking/fixing the depth field requires that
863          * the parent's depth already be correct:
864          */
865         ret = bch2_trans_run(c,
866                 for_each_btree_key_reverse_commit(trans, iter,
867                         BTREE_ID_snapshots, POS_MAX,
868                         BTREE_ITER_PREFETCH, k,
869                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
870                 check_snapshot(trans, &iter, k)));
871         if (ret)
872                 bch_err_fn(c, ret);
873         return ret;
874 }
875
876 /*
877  * Mark a snapshot as deleted, for future cleanup:
878  */
879 int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
880 {
881         struct btree_iter iter;
882         struct bkey_i_snapshot *s;
883         int ret = 0;
884
885         s = bch2_bkey_get_mut_typed(trans, &iter,
886                                     BTREE_ID_snapshots, POS(0, id),
887                                     0, snapshot);
888         ret = PTR_ERR_OR_ZERO(s);
889         if (unlikely(ret)) {
890                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
891                                         trans->c, "missing snapshot %u", id);
892                 return ret;
893         }
894
895         /* already deleted? */
896         if (BCH_SNAPSHOT_DELETED(&s->v))
897                 goto err;
898
899         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
900         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
901         s->v.subvol = 0;
902 err:
903         bch2_trans_iter_exit(trans, &iter);
904         return ret;
905 }
906
907 static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
908 {
909         if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
910                 swap(s->children[0], s->children[1]);
911 }
912
913 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
914 {
915         struct bch_fs *c = trans->c;
916         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
917         struct btree_iter c_iter = (struct btree_iter) { NULL };
918         struct btree_iter tree_iter = (struct btree_iter) { NULL };
919         struct bkey_s_c_snapshot s;
920         u32 parent_id, child_id;
921         unsigned i;
922         int ret = 0;
923
924         s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
925                                      BTREE_ITER_INTENT, snapshot);
926         ret = bkey_err(s);
927         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
928                                 "missing snapshot %u", id);
929
930         if (ret)
931                 goto err;
932
933         BUG_ON(s.v->children[1]);
934
935         parent_id = le32_to_cpu(s.v->parent);
936         child_id = le32_to_cpu(s.v->children[0]);
937
938         if (parent_id) {
939                 struct bkey_i_snapshot *parent;
940
941                 parent = bch2_bkey_get_mut_typed(trans, &p_iter,
942                                      BTREE_ID_snapshots, POS(0, parent_id),
943                                      0, snapshot);
944                 ret = PTR_ERR_OR_ZERO(parent);
945                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
946                                         "missing snapshot %u", parent_id);
947                 if (unlikely(ret))
948                         goto err;
949
950                 /* find entry in parent->children for node being deleted */
951                 for (i = 0; i < 2; i++)
952                         if (le32_to_cpu(parent->v.children[i]) == id)
953                                 break;
954
955                 if (bch2_fs_inconsistent_on(i == 2, c,
956                                         "snapshot %u missing child pointer to %u",
957                                         parent_id, id))
958                         goto err;
959
960                 parent->v.children[i] = le32_to_cpu(child_id);
961
962                 normalize_snapshot_child_pointers(&parent->v);
963         }
964
965         if (child_id) {
966                 struct bkey_i_snapshot *child;
967
968                 child = bch2_bkey_get_mut_typed(trans, &c_iter,
969                                      BTREE_ID_snapshots, POS(0, child_id),
970                                      0, snapshot);
971                 ret = PTR_ERR_OR_ZERO(child);
972                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
973                                         "missing snapshot %u", child_id);
974                 if (unlikely(ret))
975                         goto err;
976
977                 child->v.parent = cpu_to_le32(parent_id);
978
979                 if (!child->v.parent) {
980                         child->v.skip[0] = 0;
981                         child->v.skip[1] = 0;
982                         child->v.skip[2] = 0;
983                 }
984         }
985
986         if (!parent_id) {
987                 /*
988                  * We're deleting the root of a snapshot tree: update the
989                  * snapshot_tree entry to point to the new root, or delete it if
990                  * this is the last snapshot ID in this tree:
991                  */
992                 struct bkey_i_snapshot_tree *s_t;
993
994                 BUG_ON(s.v->children[1]);
995
996                 s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
997                                 BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
998                                 0, snapshot_tree);
999                 ret = PTR_ERR_OR_ZERO(s_t);
1000                 if (ret)
1001                         goto err;
1002
1003                 if (s.v->children[0]) {
1004                         s_t->v.root_snapshot = s.v->children[0];
1005                 } else {
1006                         s_t->k.type = KEY_TYPE_deleted;
1007                         set_bkey_val_u64s(&s_t->k, 0);
1008                 }
1009         }
1010
1011         ret = bch2_btree_delete_at(trans, &iter, 0);
1012 err:
1013         bch2_trans_iter_exit(trans, &tree_iter);
1014         bch2_trans_iter_exit(trans, &p_iter);
1015         bch2_trans_iter_exit(trans, &c_iter);
1016         bch2_trans_iter_exit(trans, &iter);
1017         return ret;
1018 }
1019
/*
 * Allocate @nr_snapids new snapshot nodes as children of @parent (pass
 * parent == 0 for new tree roots), members of tree @tree; the IDs picked
 * are returned in @new_snapids.
 *
 * IDs are found by walking backwards from the first existing key in the
 * snapshots btree, taking empty slots.
 */
static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
			  u32 *new_snapids,
			  u32 *snapshot_subvols,
			  unsigned nr_snapids)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_snapshot *n;
	struct bkey_s_c k;
	unsigned i, j;
	u32 depth = bch2_snapshot_depth(c, parent);
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
			     POS_MIN, BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	for (i = 0; i < nr_snapids; i++) {
		/* step back one slot to the empty position before the last key: */
		k = bch2_btree_iter_prev_slot(&iter);
		ret = bkey_err(k);
		if (ret)
			goto err;

		/* out of IDs (offset 0 is not a valid snapshot ID): */
		if (!k.k || !k.k->p.offset) {
			ret = -BCH_ERR_ENOSPC_snapshot_create;
			goto err;
		}

		n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
		ret = PTR_ERR_OR_ZERO(n);
		if (ret)
			goto err;

		n->v.flags	= 0;
		n->v.parent	= cpu_to_le32(parent);
		n->v.subvol	= cpu_to_le32(snapshot_subvols[i]);
		n->v.tree	= cpu_to_le32(tree);
		n->v.depth	= cpu_to_le32(depth);

		for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
			n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));

		bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
		SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);

		/* update the in-memory snapshot table for the new node: */
		ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
					 bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
		if (ret)
			goto err;

		new_snapids[i]	= iter.pos.offset;

		/* a freshly created node is its own equivalence class: */
		mutex_lock(&c->snapshot_table_lock);
		snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
		mutex_unlock(&c->snapshot_table_lock);
	}
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
1083
1084 /*
1085  * Create new snapshot IDs as children of an existing snapshot ID:
1086  */
1087 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
1088                               u32 *new_snapids,
1089                               u32 *snapshot_subvols,
1090                               unsigned nr_snapids)
1091 {
1092         struct btree_iter iter;
1093         struct bkey_i_snapshot *n_parent;
1094         int ret = 0;
1095
1096         n_parent = bch2_bkey_get_mut_typed(trans, &iter,
1097                         BTREE_ID_snapshots, POS(0, parent),
1098                         0, snapshot);
1099         ret = PTR_ERR_OR_ZERO(n_parent);
1100         if (unlikely(ret)) {
1101                 if (bch2_err_matches(ret, ENOENT))
1102                         bch_err(trans->c, "snapshot %u not found", parent);
1103                 return ret;
1104         }
1105
1106         if (n_parent->v.children[0] || n_parent->v.children[1]) {
1107                 bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
1108                 ret = -EINVAL;
1109                 goto err;
1110         }
1111
1112         ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
1113                              new_snapids, snapshot_subvols, nr_snapids);
1114         if (ret)
1115                 goto err;
1116
1117         n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
1118         n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
1119         n_parent->v.subvol = 0;
1120         SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
1121 err:
1122         bch2_trans_iter_exit(trans, &iter);
1123         return ret;
1124 }
1125
1126 /*
1127  * Create a snapshot node that is the root of a new tree:
1128  */
1129 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
1130                               u32 *new_snapids,
1131                               u32 *snapshot_subvols,
1132                               unsigned nr_snapids)
1133 {
1134         struct bkey_i_snapshot_tree *n_tree;
1135         int ret;
1136
1137         n_tree = __bch2_snapshot_tree_create(trans);
1138         ret =   PTR_ERR_OR_ZERO(n_tree) ?:
1139                 create_snapids(trans, 0, n_tree->k.p.offset,
1140                              new_snapids, snapshot_subvols, nr_snapids);
1141         if (ret)
1142                 return ret;
1143
1144         n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
1145         n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
1146         return 0;
1147 }
1148
1149 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1150                               u32 *new_snapids,
1151                               u32 *snapshot_subvols,
1152                               unsigned nr_snapids)
1153 {
1154         BUG_ON((parent == 0) != (nr_snapids == 1));
1155         BUG_ON((parent != 0) != (nr_snapids == 2));
1156
1157         return parent
1158                 ? bch2_snapshot_node_create_children(trans, parent,
1159                                 new_snapids, snapshot_subvols, nr_snapids)
1160                 : bch2_snapshot_node_create_tree(trans,
1161                                 new_snapids, snapshot_subvols, nr_snapids);
1162
1163 }
1164
1165 /*
1166  * If we have an unlinked inode in an internal snapshot node, and the inode
1167  * really has been deleted in all child snapshots, how does this get cleaned up?
1168  *
1169  * first there is the problem of how keys that have been overwritten in all
1170  * child snapshots get deleted (unimplemented?), but inodes may perhaps be
1171  * special?
1172  *
1173  * also: unlinked inode in internal snapshot appears to not be getting deleted
1174  * correctly if inode doesn't exist in leaf snapshots
1175  *
1176  * solution:
1177  *
 * for a key in an interior snapshot node that needs work to be done that
 * requires it to be mutated: iterate over all descendant leaf nodes and copy
 * that key to snapshot leaf nodes, where we can mutate it
 */
1182
1183 static int snapshot_delete_key(struct btree_trans *trans,
1184                                struct btree_iter *iter,
1185                                struct bkey_s_c k,
1186                                snapshot_id_list *deleted,
1187                                snapshot_id_list *equiv_seen,
1188                                struct bpos *last_pos)
1189 {
1190         struct bch_fs *c = trans->c;
1191         u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
1192
1193         if (!bkey_eq(k.k->p, *last_pos))
1194                 equiv_seen->nr = 0;
1195         *last_pos = k.k->p;
1196
1197         if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
1198             snapshot_list_has_id(equiv_seen, equiv)) {
1199                 return bch2_btree_delete_at(trans, iter,
1200                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1201         } else {
1202                 return snapshot_list_add(c, equiv_seen, equiv);
1203         }
1204 }
1205
1206 static int move_key_to_correct_snapshot(struct btree_trans *trans,
1207                                struct btree_iter *iter,
1208                                struct bkey_s_c k)
1209 {
1210         struct bch_fs *c = trans->c;
1211         u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
1212
1213         /*
1214          * When we have a linear chain of snapshot nodes, we consider
1215          * those to form an equivalence class: we're going to collapse
1216          * them all down to a single node, and keep the leaf-most node -
1217          * which has the same id as the equivalence class id.
1218          *
1219          * If there are multiple keys in different snapshots at the same
1220          * position, we're only going to keep the one in the newest
1221          * snapshot - the rest have been overwritten and are redundant,
1222          * and for the key we're going to keep we need to move it to the
1223          * equivalance class ID if it's not there already.
1224          */
1225         if (equiv != k.k->p.snapshot) {
1226                 struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
1227                 struct btree_iter new_iter;
1228                 int ret;
1229
1230                 ret = PTR_ERR_OR_ZERO(new);
1231                 if (ret)
1232                         return ret;
1233
1234                 new->k.p.snapshot = equiv;
1235
1236                 bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
1237                                      BTREE_ITER_ALL_SNAPSHOTS|
1238                                      BTREE_ITER_CACHED|
1239                                      BTREE_ITER_INTENT);
1240
1241                 ret =   bch2_btree_iter_traverse(&new_iter) ?:
1242                         bch2_trans_update(trans, &new_iter, new,
1243                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
1244                         bch2_btree_delete_at(trans, iter,
1245                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1246                 bch2_trans_iter_exit(trans, &new_iter);
1247                 if (ret)
1248                         return ret;
1249         }
1250
1251         return 0;
1252 }
1253
1254 /*
1255  * For a given snapshot, if it doesn't have a subvolume that points to it, and
1256  * it doesn't have child snapshot nodes - it's now redundant and we can mark it
1257  * as deleted.
1258  */
1259 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
1260                                           struct bkey_s_c k)
1261 {
1262         struct bkey_s_c_snapshot snap;
1263         u32 children[2];
1264         int ret;
1265
1266         if (k.k->type != KEY_TYPE_snapshot)
1267                 return 0;
1268
1269         snap = bkey_s_c_to_snapshot(k);
1270         if (BCH_SNAPSHOT_DELETED(snap.v) ||
1271             BCH_SNAPSHOT_SUBVOL(snap.v))
1272                 return 0;
1273
1274         children[0] = le32_to_cpu(snap.v->children[0]);
1275         children[1] = le32_to_cpu(snap.v->children[1]);
1276
1277         ret   = bch2_snapshot_live(trans, children[0]) ?:
1278                 bch2_snapshot_live(trans, children[1]);
1279         if (ret < 0)
1280                 return ret;
1281
1282         if (!ret)
1283                 return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
1284         return 0;
1285 }
1286
1287 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
1288                                                 snapshot_id_list *skip)
1289 {
1290         rcu_read_lock();
1291         while (n--) {
1292                 do {
1293                         id = __bch2_snapshot_parent(c, id);
1294                 } while (snapshot_list_has_id(skip, id));
1295         }
1296         rcu_read_unlock();
1297
1298         return id;
1299 }
1300
/*
 * For a snapshot node that survives while some of its ancestors are being
 * deleted: decrement its depth by the number of deleted ancestors, and
 * rewrite any skiplist entries pointing at deleted nodes so they point at
 * surviving ancestors instead.
 */
static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
					      struct btree_iter *iter, struct bkey_s_c k,
					      snapshot_id_list *deleted)
{
	struct bch_fs *c = trans->c;
	u32 nr_deleted_ancestors = 0;
	struct bkey_i_snapshot *s;
	u32 *i;
	int ret;

	if (k.k->type != KEY_TYPE_snapshot)
		return 0;

	/* nodes that are themselves being deleted don't need fixing: */
	if (snapshot_list_has_id(deleted, k.k->p.offset))
		return 0;

	s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		return ret;

	/* count how many of our ancestors are going away: */
	darray_for_each(*deleted, i)
		nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);

	if (!nr_deleted_ancestors)
		return 0;

	le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);

	if (!s->v.depth) {
		/* we became a root - roots have no skiplist: */
		s->v.skip[0] = 0;
		s->v.skip[1] = 0;
		s->v.skip[2] = 0;
	} else {
		u32 depth = le32_to_cpu(s->v.depth);
		u32 parent = bch2_snapshot_parent(c, s->k.p.offset);

		for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
			u32 id = le32_to_cpu(s->v.skip[j]);

			/*
			 * skiplist entry points at a deleted node: replace it
			 * with a randomly chosen surviving ancestor:
			 */
			if (snapshot_list_has_id(deleted, id)) {
				id = depth > 1
					? bch2_snapshot_nth_parent_skip(c,
							parent,
							get_random_u32_below(depth - 1),
							deleted)
					: parent;
				s->v.skip[j] = cpu_to_le32(id);
			}
		}

		bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
	}

	return bch2_trans_update(trans, iter, &s->k_i, 0);
}
1357
1358 int bch2_delete_dead_snapshots(struct bch_fs *c)
1359 {
1360         struct btree_trans *trans;
1361         struct btree_iter iter;
1362         struct bkey_s_c k;
1363         struct bkey_s_c_snapshot snap;
1364         snapshot_id_list deleted = { 0 };
1365         snapshot_id_list deleted_interior = { 0 };
1366         u32 *i, id;
1367         int ret = 0;
1368
1369         if (!test_bit(BCH_FS_STARTED, &c->flags)) {
1370                 ret = bch2_fs_read_write_early(c);
1371                 if (ret) {
1372                         bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
1373                         return ret;
1374                 }
1375         }
1376
1377         trans = bch2_trans_get(c);
1378
1379         /*
1380          * For every snapshot node: If we have no live children and it's not
1381          * pointed to by a subvolume, delete it:
1382          */
1383         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
1384                         POS_MIN, 0, k,
1385                         NULL, NULL, 0,
1386                 bch2_delete_redundant_snapshot(trans, &iter, k));
1387         if (ret) {
1388                 bch_err_msg(c, ret, "deleting redundant snapshots");
1389                 goto err;
1390         }
1391
1392         ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
1393                                   POS_MIN, 0, k,
1394                 bch2_snapshot_set_equiv(trans, k));
1395         if (ret) {
1396                 bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
1397                 goto err;
1398         }
1399
1400         for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1401                            POS_MIN, 0, k, ret) {
1402                 if (k.k->type != KEY_TYPE_snapshot)
1403                         continue;
1404
1405                 snap = bkey_s_c_to_snapshot(k);
1406                 if (BCH_SNAPSHOT_DELETED(snap.v)) {
1407                         ret = snapshot_list_add(c, &deleted, k.k->p.offset);
1408                         if (ret)
1409                                 break;
1410                 }
1411         }
1412         bch2_trans_iter_exit(trans, &iter);
1413
1414         if (ret) {
1415                 bch_err_msg(c, ret, "walking snapshots");
1416                 goto err;
1417         }
1418
1419         for (id = 0; id < BTREE_ID_NR; id++) {
1420                 struct bpos last_pos = POS_MIN;
1421                 snapshot_id_list equiv_seen = { 0 };
1422                 struct disk_reservation res = { 0 };
1423
1424                 if (!btree_type_has_snapshots(id))
1425                         continue;
1426
1427                 ret = for_each_btree_key_commit(trans, iter,
1428                                 id, POS_MIN,
1429                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1430                                 &res, NULL, BTREE_INSERT_NOFAIL,
1431                         snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
1432                       for_each_btree_key_commit(trans, iter,
1433                                 id, POS_MIN,
1434                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1435                                 &res, NULL, BTREE_INSERT_NOFAIL,
1436                         move_key_to_correct_snapshot(trans, &iter, k));
1437
1438                 bch2_disk_reservation_put(c, &res);
1439                 darray_exit(&equiv_seen);
1440
1441                 if (ret) {
1442                         bch_err_msg(c, ret, "deleting keys from dying snapshots");
1443                         goto err;
1444                 }
1445         }
1446
1447         for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1448                            POS_MIN, 0, k, ret) {
1449                 u32 snapshot = k.k->p.offset;
1450                 u32 equiv = bch2_snapshot_equiv(c, snapshot);
1451
1452                 if (equiv != snapshot)
1453                         snapshot_list_add(c, &deleted_interior, snapshot);
1454         }
1455         bch2_trans_iter_exit(trans, &iter);
1456
1457         /*
1458          * Fixing children of deleted snapshots can't be done completely
1459          * atomically, if we crash between here and when we delete the interior
1460          * nodes some depth fields will be off:
1461          */
1462         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
1463                                   BTREE_ITER_INTENT, k,
1464                                   NULL, NULL, BTREE_INSERT_NOFAIL,
1465                 bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
1466         if (ret)
1467                 goto err;
1468
1469         darray_for_each(deleted, i) {
1470                 ret = commit_do(trans, NULL, NULL, 0,
1471                         bch2_snapshot_node_delete(trans, *i));
1472                 if (ret) {
1473                         bch_err_msg(c, ret, "deleting snapshot %u", *i);
1474                         goto err;
1475                 }
1476         }
1477
1478         darray_for_each(deleted_interior, i) {
1479                 ret = commit_do(trans, NULL, NULL, 0,
1480                         bch2_snapshot_node_delete(trans, *i));
1481                 if (ret) {
1482                         bch_err_msg(c, ret, "deleting snapshot %u", *i);
1483                         goto err;
1484                 }
1485         }
1486
1487         clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
1488 err:
1489         darray_exit(&deleted_interior);
1490         darray_exit(&deleted);
1491         bch2_trans_put(trans);
1492         if (ret)
1493                 bch_err_fn(c, ret);
1494         return ret;
1495 }
1496
1497 void bch2_delete_dead_snapshots_work(struct work_struct *work)
1498 {
1499         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1500
1501         if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
1502                 bch2_delete_dead_snapshots(c);
1503         bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1504 }
1505
1506 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1507 {
1508         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
1509             !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
1510                 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1511 }
1512
1513 int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
1514                                     struct btree_trans_commit_hook *h)
1515 {
1516         struct bch_fs *c = trans->c;
1517
1518         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
1519
1520         if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
1521                 return 0;
1522
1523         bch2_delete_dead_snapshots_async(c);
1524         return 0;
1525 }
1526
1527 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
1528                                        enum btree_id id,
1529                                        struct bpos pos)
1530 {
1531         struct bch_fs *c = trans->c;
1532         struct btree_iter iter;
1533         struct bkey_s_c k;
1534         int ret;
1535
1536         bch2_trans_iter_init(trans, &iter, id, pos,
1537                              BTREE_ITER_NOT_EXTENTS|
1538                              BTREE_ITER_ALL_SNAPSHOTS);
1539         while (1) {
1540                 k = bch2_btree_iter_prev(&iter);
1541                 ret = bkey_err(k);
1542                 if (ret)
1543                         break;
1544
1545                 if (!k.k)
1546                         break;
1547
1548                 if (!bkey_eq(pos, k.k->p))
1549                         break;
1550
1551                 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
1552                         ret = 1;
1553                         break;
1554                 }
1555         }
1556         bch2_trans_iter_exit(trans, &iter);
1557
1558         return ret;
1559 }
1560
1561 static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
1562 {
1563         const struct snapshot_t *s = snapshot_t(c, id);
1564
1565         return s->children[1] ?: s->children[0];
1566 }
1567
1568 static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
1569 {
1570         u32 child;
1571
1572         while ((child = bch2_snapshot_smallest_child(c, id)))
1573                 id = child;
1574         return id;
1575 }
1576
1577 static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
1578                                                enum btree_id btree,
1579                                                struct bkey_s_c interior_k,
1580                                                u32 leaf_id, struct bpos *new_min_pos)
1581 {
1582         struct btree_iter iter;
1583         struct bpos pos = interior_k.k->p;
1584         struct bkey_s_c k;
1585         struct bkey_i *new;
1586         int ret;
1587
1588         pos.snapshot = leaf_id;
1589
1590         bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
1591         k = bch2_btree_iter_peek_slot(&iter);
1592         ret = bkey_err(k);
1593         if (ret)
1594                 goto out;
1595
1596         /* key already overwritten in this snapshot? */
1597         if (k.k->p.snapshot != interior_k.k->p.snapshot)
1598                 goto out;
1599
1600         if (bpos_eq(*new_min_pos, POS_MIN)) {
1601                 *new_min_pos = k.k->p;
1602                 new_min_pos->snapshot = leaf_id;
1603         }
1604
1605         new = bch2_bkey_make_mut_noupdate(trans, interior_k);
1606         ret = PTR_ERR_OR_ZERO(new);
1607         if (ret)
1608                 goto out;
1609
1610         new->k.p.snapshot = leaf_id;
1611         ret = bch2_trans_update(trans, &iter, new, 0);
1612 out:
1613         bch2_trans_iter_exit(trans, &iter);
1614         return ret;
1615 }
1616
/*
 * Propagate @k, a key in an interior snapshot node, to every descendent
 * leaf snapshot that doesn't already have its own version - after which
 * the key can safely be mutated per-leaf.
 *
 * *@new_min_pos is set to the smallest position written (POS_MIN if
 * nothing was); commits happen per leaf, with transaction restarts
 * retried locally.
 */
int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
					  enum btree_id btree,
					  struct bkey_s_c k,
					  struct bpos *new_min_pos)
{
	struct bch_fs *c = trans->c;
	struct bkey_buf sk;
	u32 restart_count = trans->restart_count;
	int ret = 0;

	/* stable copy of @k - iterators may be invalidated by restarts: */
	bch2_bkey_buf_init(&sk);
	bch2_bkey_buf_reassemble(&sk, c, k);
	k = bkey_i_to_s_c(sk.k);

	*new_min_pos = POS_MIN;

	/*
	 * Per the loop bound, descendents are assumed to have smaller IDs
	 * than their ancestors:
	 */
	for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
	     id < k.k->p.snapshot;
	     id++) {
		if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
		    !bch2_snapshot_is_leaf(c, id))
			continue;
again:
		ret =   btree_trans_too_many_iters(trans) ?:
			bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
			bch2_trans_commit(trans, NULL, NULL, 0);
		if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			bch2_trans_begin(trans);
			goto again;
		}

		if (ret)
			break;
	}

	bch2_bkey_buf_exit(&sk, c);

	return ret ?: trans_was_restarted(trans, restart_count);
}
1656
/*
 * Populate the in-memory snapshot table from the snapshots btree at
 * startup: the first pass fills in the table and equivalence classes, a
 * second pass builds the is_ancestor bitmaps (run separately, after every
 * node has been seen by the first pass).
 */
int bch2_snapshots_read(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k,
			bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
			bch2_snapshot_set_equiv(trans, k)) ?:
		for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
			   POS_MIN, 0, k,
			   (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}
1675
/*
 * Free the in-memory snapshot table at shutdown. No other threads can be
 * accessing it at this point, hence the unconditional
 * rcu_dereference_protected().
 */
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
	kfree(rcu_dereference_protected(c->snapshots, true));
}