]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/subvolume.c
Update bcachefs sources to 17a344f265 bcachefs: Improve fsck for subvols/snapshots
[bcachefs-tools-debian] / libbcachefs / subvolume.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_update.h"
6 #include "error.h"
7 #include "fs.h"
8 #include "subvolume.h"
9
10 /* Snapshot tree: */
11
12 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
13                            struct bkey_s_c k)
14 {
15         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
16
17         prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u",
18                BCH_SNAPSHOT_SUBVOL(s.v),
19                BCH_SNAPSHOT_DELETED(s.v),
20                le32_to_cpu(s.v->parent),
21                le32_to_cpu(s.v->children[0]),
22                le32_to_cpu(s.v->children[1]),
23                le32_to_cpu(s.v->subvol));
24 }
25
26 int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
27                           int rw, struct printbuf *err)
28 {
29         struct bkey_s_c_snapshot s;
30         u32 i, id;
31
32         if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
33             bkey_cmp(k.k->p, POS(0, 1)) < 0) {
34                 prt_printf(err, "bad pos");
35                 return -EINVAL;
36         }
37
38         if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) {
39                 prt_printf(err, "bad val size (%zu != %zu)",
40                        bkey_val_bytes(k.k), sizeof(struct bch_snapshot));
41                 return -EINVAL;
42         }
43
44         s = bkey_s_c_to_snapshot(k);
45
46         id = le32_to_cpu(s.v->parent);
47         if (id && id <= k.k->p.offset) {
48                 prt_printf(err, "bad parent node (%u <= %llu)",
49                        id, k.k->p.offset);
50                 return -EINVAL;
51         }
52
53         if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
54                 prt_printf(err, "children not normalized");
55                 return -EINVAL;
56         }
57
58         if (s.v->children[0] &&
59             s.v->children[0] == s.v->children[1]) {
60                 prt_printf(err, "duplicate child nodes");
61                 return -EINVAL;
62         }
63
64         for (i = 0; i < 2; i++) {
65                 id = le32_to_cpu(s.v->children[i]);
66
67                 if (id >= k.k->p.offset) {
68                         prt_printf(err, "bad child node (%u >= %llu)",
69                                id, k.k->p.offset);
70                         return -EINVAL;
71                 }
72         }
73
74         return 0;
75 }
76
77 int bch2_mark_snapshot(struct btree_trans *trans,
78                        struct bkey_s_c old, struct bkey_s_c new,
79                        unsigned flags)
80 {
81         struct bch_fs *c = trans->c;
82         struct snapshot_t *t;
83
84         t = genradix_ptr_alloc(&c->snapshots,
85                                U32_MAX - new.k->p.offset,
86                                GFP_KERNEL);
87         if (!t)
88                 return -ENOMEM;
89
90         if (new.k->type == KEY_TYPE_snapshot) {
91                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
92
93                 t->parent       = le32_to_cpu(s.v->parent);
94                 t->children[0]  = le32_to_cpu(s.v->children[0]);
95                 t->children[1]  = le32_to_cpu(s.v->children[1]);
96                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
97         } else {
98                 t->parent       = 0;
99                 t->children[0]  = 0;
100                 t->children[1]  = 0;
101                 t->subvol       = 0;
102         }
103
104         return 0;
105 }
106
107 static int snapshot_lookup(struct btree_trans *trans, u32 id,
108                            struct bch_snapshot *s)
109 {
110         struct btree_iter iter;
111         struct bkey_s_c k;
112         int ret;
113
114         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
115                              BTREE_ITER_WITH_UPDATES);
116         k = bch2_btree_iter_peek_slot(&iter);
117         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
118
119         if (!ret)
120                 *s = *bkey_s_c_to_snapshot(k).v;
121
122         bch2_trans_iter_exit(trans, &iter);
123         return ret;
124 }
125
126 static int snapshot_live(struct btree_trans *trans, u32 id)
127 {
128         struct bch_snapshot v;
129         int ret;
130
131         if (!id)
132                 return 0;
133
134         ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
135         if (ret == -ENOENT)
136                 bch_err(trans->c, "snapshot node %u not found", id);
137         if (ret)
138                 return ret;
139
140         return !BCH_SNAPSHOT_DELETED(&v);
141 }
142
143 static int bch2_snapshot_set_equiv(struct btree_trans *trans,
144                                    struct bkey_s_c_snapshot snap)
145 {
146         struct bch_fs *c = trans->c;
147         unsigned i, nr_live = 0, live_idx = 0;
148         u32 id = snap.k->p.offset, child[2] = {
149                 [0] = le32_to_cpu(snap.v->children[0]),
150                 [1] = le32_to_cpu(snap.v->children[1])
151         };
152
153         for (i = 0; i < 2; i++) {
154                 int ret = snapshot_live(trans, child[i]);
155                 if (ret < 0)
156                         return ret;
157
158                 if (ret)
159                         live_idx = i;
160                 nr_live += ret;
161         }
162
163         snapshot_t(c, id)->equiv = nr_live == 1
164                 ? snapshot_t(c, child[live_idx])->equiv
165                 : id;
166         return 0;
167 }
168
169 static int bch2_snapshots_set_equiv(struct btree_trans *trans)
170 {
171         struct btree_iter iter;
172         struct bkey_s_c k;
173         int ret;
174
175         for_each_btree_key(trans, iter, BTREE_ID_snapshots,
176                            POS_MIN, 0, k, ret) {
177                 if (k.k->type != KEY_TYPE_snapshot)
178                         continue;
179
180                 ret = bch2_snapshot_set_equiv(trans, bkey_s_c_to_snapshot(k));
181                 if (ret)
182                         break;
183         }
184         bch2_trans_iter_exit(trans, &iter);
185
186         if (ret)
187                 bch_err(trans->c, "error in bch2_snapshots_set_equiv: %i", ret);
188
189         return ret;
190 }
191
192 /* fsck: */
193 static int check_snapshot(struct btree_trans *trans,
194                           struct btree_iter *iter)
195 {
196         struct bch_fs *c = trans->c;
197         struct bkey_s_c_snapshot s;
198         struct bch_subvolume subvol;
199         struct bch_snapshot v;
200         struct bkey_s_c k;
201         struct printbuf buf = PRINTBUF;
202         bool should_have_subvol;
203         u32 i, id;
204         int ret = 0;
205
206         k = bch2_btree_iter_peek(iter);
207         if (!k.k)
208                 return 0;
209
210         ret = bkey_err(k);
211         if (ret)
212                 return ret;
213
214         if (k.k->type != KEY_TYPE_snapshot)
215                 return 0;
216
217         s = bkey_s_c_to_snapshot(k);
218         id = le32_to_cpu(s.v->parent);
219         if (id) {
220                 ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
221                 if (ret == -ENOENT)
222                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
223                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
224                 if (ret)
225                         goto err;
226
227                 if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
228                     le32_to_cpu(v.children[1]) != s.k->p.offset) {
229                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
230                                 id, s.k->p.offset);
231                         ret = -EINVAL;
232                         goto err;
233                 }
234         }
235
236         for (i = 0; i < 2 && s.v->children[i]; i++) {
237                 id = le32_to_cpu(s.v->children[i]);
238
239                 ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
240                 if (ret == -ENOENT)
241                         bch_err(c, "snapshot node %llu has nonexistent child %u",
242                                 s.k->p.offset, id);
243                 if (ret)
244                         goto err;
245
246                 if (le32_to_cpu(v.parent) != s.k->p.offset) {
247                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
248                                 id, le32_to_cpu(v.parent), s.k->p.offset);
249                         ret = -EINVAL;
250                         goto err;
251                 }
252         }
253
254         should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) &&
255                 !BCH_SNAPSHOT_DELETED(s.v);
256
257         if (should_have_subvol) {
258                 id = le32_to_cpu(s.v->subvol);
259                 ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol));
260                 if (ret == -ENOENT)
261                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
262                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
263                 if (ret)
264                         goto err;
265
266                 if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
267                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
268                                 s.k->p.offset);
269                         ret = -EINVAL;
270                         goto err;
271                 }
272         } else {
273                 if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n  %s",
274                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
275                         struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u));
276
277                         ret = PTR_ERR_OR_ZERO(u);
278                         if (ret)
279                                 goto err;
280
281                         bkey_reassemble(&u->k_i, s.s_c);
282                         u->v.subvol = 0;
283                         ret = bch2_trans_update(trans, iter, &u->k_i, 0);
284                         if (ret)
285                                 goto err;
286                 }
287         }
288
289         if (BCH_SNAPSHOT_DELETED(s.v))
290                 set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
291 err:
292 fsck_err:
293         printbuf_exit(&buf);
294         return ret;
295 }
296
297 int bch2_fs_check_snapshots(struct bch_fs *c)
298 {
299         struct btree_trans trans;
300         struct btree_iter iter;
301         int ret;
302
303         bch2_trans_init(&trans, c, 0, 0);
304
305         bch2_trans_iter_init(&trans, &iter, BTREE_ID_snapshots,
306                              POS_MIN, BTREE_ITER_PREFETCH);
307
308         do {
309                 ret = commit_do(&trans, NULL, NULL,
310                                 BTREE_INSERT_LAZY_RW|
311                                 BTREE_INSERT_NOFAIL,
312                                 check_snapshot(&trans, &iter));
313                 if (ret)
314                         break;
315         } while (bch2_btree_iter_advance(&iter));
316         bch2_trans_iter_exit(&trans, &iter);
317
318         if (ret)
319                 bch_err(c, "error %i checking snapshots", ret);
320
321         bch2_trans_exit(&trans);
322         return ret;
323 }
324
325 static int check_subvol(struct btree_trans *trans,
326                         struct btree_iter *iter)
327 {
328         struct bkey_s_c k;
329         struct bkey_s_c_subvolume subvol;
330         struct bch_snapshot snapshot;
331         unsigned snapid;
332         int ret;
333
334         k = bch2_btree_iter_peek(iter);
335         if (!k.k)
336                 return 0;
337
338         ret = bkey_err(k);
339         if (ret)
340                 return ret;
341
342         if (k.k->type != KEY_TYPE_subvolume)
343                 return 0;
344
345         subvol = bkey_s_c_to_subvolume(k);
346         snapid = le32_to_cpu(subvol.v->snapshot);
347         ret = snapshot_lookup(trans, snapid, &snapshot);
348
349         if (ret == -ENOENT)
350                 bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u",
351                         k.k->p.offset, snapid);
352         if (ret)
353                 return ret;
354
355         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
356                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
357                 if (ret && ret != -EINTR)
358                         bch_err(trans->c, "error deleting subvolume %llu: %i",
359                                 iter->pos.offset, ret);
360                 if (ret)
361                         return ret;
362         }
363
364         return 0;
365 }
366
367 int bch2_fs_check_subvols(struct bch_fs *c)
368 {
369         struct btree_trans trans;
370         struct btree_iter iter;
371         int ret;
372
373         bch2_trans_init(&trans, c, 0, 0);
374
375         bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
376                              POS_MIN, BTREE_ITER_PREFETCH);
377
378         do {
379                 ret = commit_do(&trans, NULL, NULL,
380                                       BTREE_INSERT_LAZY_RW|
381                                       BTREE_INSERT_NOFAIL,
382                                       check_subvol(&trans, &iter));
383                 if (ret)
384                         break;
385         } while (bch2_btree_iter_advance(&iter));
386         bch2_trans_iter_exit(&trans, &iter);
387
388         bch2_trans_exit(&trans);
389
390         return ret;
391 }
392
393 void bch2_fs_snapshots_exit(struct bch_fs *c)
394 {
395         genradix_free(&c->snapshots);
396 }
397
398 int bch2_fs_snapshots_start(struct bch_fs *c)
399 {
400         struct btree_trans trans;
401         struct btree_iter iter;
402         struct bkey_s_c k;
403         int ret = 0;
404
405         bch2_trans_init(&trans, c, 0, 0);
406
407         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
408                            POS_MIN, 0, k, ret) {
409                if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
410                        break;
411
412                 if (k.k->type != KEY_TYPE_snapshot)
413                         continue;
414
415                 ret =   bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?:
416                         bch2_snapshot_set_equiv(&trans, bkey_s_c_to_snapshot(k));
417                 if (ret)
418                         break;
419         }
420         bch2_trans_iter_exit(&trans, &iter);
421
422         bch2_trans_exit(&trans);
423
424         if (ret)
425                 bch_err(c, "error starting snapshots: %i", ret);
426         return ret;
427 }
428
429 /*
430  * Mark a snapshot as deleted, for future cleanup:
431  */
432 static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
433 {
434         struct btree_iter iter;
435         struct bkey_s_c k;
436         struct bkey_i_snapshot *s;
437         int ret = 0;
438
439         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
440                              BTREE_ITER_INTENT);
441         k = bch2_btree_iter_peek_slot(&iter);
442         ret = bkey_err(k);
443         if (ret)
444                 goto err;
445
446         if (k.k->type != KEY_TYPE_snapshot) {
447                 bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
448                 ret = -ENOENT;
449                 goto err;
450         }
451
452         /* already deleted? */
453         if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
454                 goto err;
455
456         s = bch2_trans_kmalloc(trans, sizeof(*s));
457         ret = PTR_ERR_OR_ZERO(s);
458         if (ret)
459                 goto err;
460
461         bkey_reassemble(&s->k_i, k);
462         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
463         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
464         s->v.subvol = 0;
465
466         ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
467         if (ret)
468                 goto err;
469 err:
470         bch2_trans_iter_exit(trans, &iter);
471         return ret;
472 }
473
474 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
475 {
476         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
477         struct bkey_s_c k;
478         struct bkey_s_c_snapshot s;
479         struct bkey_i_snapshot *parent;
480         u32 parent_id;
481         unsigned i;
482         int ret = 0;
483
484         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
485                              BTREE_ITER_INTENT);
486         k = bch2_btree_iter_peek_slot(&iter);
487         ret = bkey_err(k);
488         if (ret)
489                 goto err;
490
491         if (k.k->type != KEY_TYPE_snapshot) {
492                 bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
493                 ret = -ENOENT;
494                 goto err;
495         }
496
497         s = bkey_s_c_to_snapshot(k);
498
499         BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
500         parent_id = le32_to_cpu(s.v->parent);
501
502         if (parent_id) {
503                 bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
504                                      POS(0, parent_id),
505                                      BTREE_ITER_INTENT);
506                 k = bch2_btree_iter_peek_slot(&p_iter);
507                 ret = bkey_err(k);
508                 if (ret)
509                         goto err;
510
511                 if (k.k->type != KEY_TYPE_snapshot) {
512                         bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
513                         ret = -ENOENT;
514                         goto err;
515                 }
516
517                 parent = bch2_trans_kmalloc(trans, sizeof(*parent));
518                 ret = PTR_ERR_OR_ZERO(parent);
519                 if (ret)
520                         goto err;
521
522                 bkey_reassemble(&parent->k_i, k);
523
524                 for (i = 0; i < 2; i++)
525                         if (le32_to_cpu(parent->v.children[i]) == id)
526                                 break;
527
528                 if (i == 2)
529                         bch_err(trans->c, "snapshot %u missing child pointer to %u",
530                                 parent_id, id);
531                 else
532                         parent->v.children[i] = 0;
533
534                 if (le32_to_cpu(parent->v.children[0]) <
535                     le32_to_cpu(parent->v.children[1]))
536                         swap(parent->v.children[0],
537                              parent->v.children[1]);
538
539                 ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
540                 if (ret)
541                         goto err;
542         }
543
544         ret = bch2_btree_delete_at(trans, &iter, 0);
545 err:
546         bch2_trans_iter_exit(trans, &p_iter);
547         bch2_trans_iter_exit(trans, &iter);
548         return ret;
549 }
550
551 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
552                               u32 *new_snapids,
553                               u32 *snapshot_subvols,
554                               unsigned nr_snapids)
555 {
556         struct btree_iter iter;
557         struct bkey_i_snapshot *n;
558         struct bkey_s_c k;
559         unsigned i;
560         int ret = 0;
561
562         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
563                              POS_MIN, BTREE_ITER_INTENT);
564         k = bch2_btree_iter_peek(&iter);
565         ret = bkey_err(k);
566         if (ret)
567                 goto err;
568
569         for (i = 0; i < nr_snapids; i++) {
570                 k = bch2_btree_iter_prev_slot(&iter);
571                 ret = bkey_err(k);
572                 if (ret)
573                         goto err;
574
575                 if (!k.k || !k.k->p.offset) {
576                         ret = -ENOSPC;
577                         goto err;
578                 }
579
580                 n = bch2_trans_kmalloc(trans, sizeof(*n));
581                 ret = PTR_ERR_OR_ZERO(n);
582                 if (ret)
583                         goto err;
584
585                 bkey_snapshot_init(&n->k_i);
586                 n->k.p          = iter.pos;
587                 n->v.flags      = 0;
588                 n->v.parent     = cpu_to_le32(parent);
589                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
590                 n->v.pad        = 0;
591                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
592
593                 ret   = bch2_trans_update(trans, &iter, &n->k_i, 0) ?:
594                         bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
595                 if (ret)
596                         goto err;
597
598                 new_snapids[i]  = iter.pos.offset;
599         }
600
601         if (parent) {
602                 bch2_btree_iter_set_pos(&iter, POS(0, parent));
603                 k = bch2_btree_iter_peek(&iter);
604                 ret = bkey_err(k);
605                 if (ret)
606                         goto err;
607
608                 if (k.k->type != KEY_TYPE_snapshot) {
609                         bch_err(trans->c, "snapshot %u not found", parent);
610                         ret = -ENOENT;
611                         goto err;
612                 }
613
614                 n = bch2_trans_kmalloc(trans, sizeof(*n));
615                 ret = PTR_ERR_OR_ZERO(n);
616                 if (ret)
617                         goto err;
618
619                 bkey_reassemble(&n->k_i, k);
620
621                 if (n->v.children[0] || n->v.children[1]) {
622                         bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
623                         ret = -EINVAL;
624                         goto err;
625                 }
626
627                 n->v.children[0] = cpu_to_le32(new_snapids[0]);
628                 n->v.children[1] = cpu_to_le32(new_snapids[1]);
629                 n->v.subvol = 0;
630                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
631                 ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
632                 if (ret)
633                         goto err;
634         }
635 err:
636         bch2_trans_iter_exit(trans, &iter);
637         return ret;
638 }
639
640 static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
641                                            snapshot_id_list *deleted,
642                                            enum btree_id btree_id)
643 {
644         struct bch_fs *c = trans->c;
645         struct btree_iter iter;
646         struct bkey_s_c k;
647         snapshot_id_list equiv_seen = { 0 };
648         struct bpos last_pos = POS_MIN;
649         int ret = 0;
650
651         /*
652          * XXX: We should also delete whiteouts that no longer overwrite
653          * anything
654          */
655
656         bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
657                              BTREE_ITER_INTENT|
658                              BTREE_ITER_PREFETCH|
659                              BTREE_ITER_NOT_EXTENTS|
660                              BTREE_ITER_ALL_SNAPSHOTS);
661
662         while ((bch2_trans_begin(trans),
663                 (k = bch2_btree_iter_peek(&iter)).k) &&
664                !(ret = bkey_err(k))) {
665                 u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
666
667                 if (bkey_cmp(k.k->p, last_pos))
668                         equiv_seen.nr = 0;
669                 last_pos = k.k->p;
670
671                 if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
672                     snapshot_list_has_id(&equiv_seen, equiv)) {
673                         ret = commit_do(trans, NULL, NULL,
674                                               BTREE_INSERT_NOFAIL,
675                                 bch2_btree_iter_traverse(&iter) ?:
676                                 bch2_btree_delete_at(trans, &iter,
677                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
678                         if (ret)
679                                 break;
680                 } else {
681                         ret = snapshot_list_add(c, &equiv_seen, equiv);
682                         if (ret)
683                                 break;
684                 }
685
686                 bch2_btree_iter_advance(&iter);
687         }
688         bch2_trans_iter_exit(trans, &iter);
689
690         darray_exit(&equiv_seen);
691
692         return ret;
693 }
694
695 int bch2_delete_dead_snapshots(struct bch_fs *c)
696 {
697         struct btree_trans trans;
698         struct btree_iter iter;
699         struct bkey_s_c k;
700         struct bkey_s_c_snapshot snap;
701         snapshot_id_list deleted = { 0 };
702         u32 i, id, children[2];
703         int ret = 0;
704
705         if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
706                 return 0;
707
708         if (!test_bit(BCH_FS_STARTED, &c->flags)) {
709                 ret = bch2_fs_read_write_early(c);
710                 if (ret) {
711                         bch_err(c, "error deleleting dead snapshots: error going rw: %i", ret);
712                         return ret;
713                 }
714         }
715
716         bch2_trans_init(&trans, c, 0, 0);
717
718         /*
719          * For every snapshot node: If we have no live children and it's not
720          * pointed to by a subvolume, delete it:
721          */
722         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
723                            POS_MIN, 0, k, ret) {
724                 if (k.k->type != KEY_TYPE_snapshot)
725                         continue;
726
727                 snap = bkey_s_c_to_snapshot(k);
728                 if (BCH_SNAPSHOT_DELETED(snap.v) ||
729                     BCH_SNAPSHOT_SUBVOL(snap.v))
730                         continue;
731
732                 children[0] = le32_to_cpu(snap.v->children[0]);
733                 children[1] = le32_to_cpu(snap.v->children[1]);
734
735                 ret   = snapshot_live(&trans, children[0]) ?:
736                         snapshot_live(&trans, children[1]);
737                 if (ret < 0)
738                         break;
739                 if (ret)
740                         continue;
741
742                 ret = commit_do(&trans, NULL, NULL, 0,
743                         bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
744                 if (ret) {
745                         bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
746                         break;
747                 }
748         }
749         bch2_trans_iter_exit(&trans, &iter);
750
751         if (ret) {
752                 bch_err(c, "error walking snapshots: %i", ret);
753                 goto err;
754         }
755
756         ret = bch2_snapshots_set_equiv(&trans);
757         if (ret)
758                 goto err;
759
760         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
761                            POS_MIN, 0, k, ret) {
762                 if (k.k->type != KEY_TYPE_snapshot)
763                         continue;
764
765                 snap = bkey_s_c_to_snapshot(k);
766                 if (BCH_SNAPSHOT_DELETED(snap.v)) {
767                         ret = snapshot_list_add(c, &deleted, k.k->p.offset);
768                         if (ret)
769                                 break;
770                 }
771         }
772         bch2_trans_iter_exit(&trans, &iter);
773
774         if (ret) {
775                 bch_err(c, "error walking snapshots: %i", ret);
776                 goto err;
777         }
778
779         for (id = 0; id < BTREE_ID_NR; id++) {
780                 if (!btree_type_has_snapshots(id))
781                         continue;
782
783                 ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
784                 if (ret) {
785                         bch_err(c, "error deleting snapshot keys: %i", ret);
786                         goto err;
787                 }
788         }
789
790         for (i = 0; i < deleted.nr; i++) {
791                 ret = commit_do(&trans, NULL, NULL, 0,
792                         bch2_snapshot_node_delete(&trans, deleted.data[i]));
793                 if (ret) {
794                         bch_err(c, "error deleting snapshot %u: %i",
795                                 deleted.data[i], ret);
796                         goto err;
797                 }
798         }
799
800         clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
801 err:
802         darray_exit(&deleted);
803         bch2_trans_exit(&trans);
804         return ret;
805 }
806
807 static void bch2_delete_dead_snapshots_work(struct work_struct *work)
808 {
809         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
810
811         bch2_delete_dead_snapshots(c);
812         percpu_ref_put(&c->writes);
813 }
814
815 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
816 {
817         if (!percpu_ref_tryget_live(&c->writes))
818                 return;
819
820         if (!queue_work(system_long_wq, &c->snapshot_delete_work))
821                 percpu_ref_put(&c->writes);
822 }
823
824 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
825                                            struct btree_trans_commit_hook *h)
826 {
827         struct bch_fs *c = trans->c;
828
829         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
830
831         if (!test_bit(BCH_FS_FSCK_DONE, &c->flags))
832                 return 0;
833
834         bch2_delete_dead_snapshots_async(c);
835         return 0;
836 }
837
838 /* Subvolumes: */
839
840 int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
841                            int rw, struct printbuf *err)
842 {
843         if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 ||
844             bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) {
845                 prt_printf(err, "invalid pos");
846                 return -EINVAL;
847         }
848
849         if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) {
850                 prt_printf(err, "incorrect value size (%zu != %zu)",
851                        bkey_val_bytes(k.k), sizeof(struct bch_subvolume));
852                 return -EINVAL;
853         }
854
855         return 0;
856 }
857
858 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
859                             struct bkey_s_c k)
860 {
861         struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
862
863         prt_printf(out, "root %llu snapshot id %u",
864                le64_to_cpu(s.v->inode),
865                le32_to_cpu(s.v->snapshot));
866 }
867
868 int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
869                        bool inconsistent_if_not_found,
870                        int iter_flags,
871                        struct bch_subvolume *s)
872 {
873         struct btree_iter iter;
874         struct bkey_s_c k;
875         int ret;
876
877         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
878                              iter_flags);
879         k = bch2_btree_iter_peek_slot(&iter);
880         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT;
881
882         if (ret == -ENOENT && inconsistent_if_not_found)
883                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
884         if (!ret)
885                 *s = *bkey_s_c_to_subvolume(k).v;
886
887         bch2_trans_iter_exit(trans, &iter);
888         return ret;
889 }
890
891 int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
892                              struct bch_subvolume *subvol)
893 {
894         struct bch_snapshot snap;
895
896         return  snapshot_lookup(trans, snapshot, &snap) ?:
897                 bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
898 }
899
900 int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
901                                 u32 *snapid)
902 {
903         struct bch_subvolume s;
904         int ret;
905
906         ret = bch2_subvolume_get(trans, subvol, true,
907                                  BTREE_ITER_CACHED|
908                                  BTREE_ITER_WITH_UPDATES,
909                                  &s);
910
911         *snapid = le32_to_cpu(s.snapshot);
912         return ret;
913 }
914
915 /*
916  * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
917  * deletion/cleanup:
918  */
919 int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
920 {
921         struct btree_iter iter;
922         struct bkey_s_c k;
923         struct bkey_s_c_subvolume subvol;
924         struct btree_trans_commit_hook *h;
925         u32 snapid;
926         int ret = 0;
927
928         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
929                              POS(0, subvolid),
930                              BTREE_ITER_CACHED|
931                              BTREE_ITER_INTENT);
932         k = bch2_btree_iter_peek_slot(&iter);
933         ret = bkey_err(k);
934         if (ret)
935                 goto err;
936
937         if (k.k->type != KEY_TYPE_subvolume) {
938                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
939                 ret = -EIO;
940                 goto err;
941         }
942
943         subvol = bkey_s_c_to_subvolume(k);
944         snapid = le32_to_cpu(subvol.v->snapshot);
945
946         ret = bch2_btree_delete_at(trans, &iter, 0);
947         if (ret)
948                 goto err;
949
950         ret = bch2_snapshot_node_set_deleted(trans, snapid);
951
952         h = bch2_trans_kmalloc(trans, sizeof(*h));
953         ret = PTR_ERR_OR_ZERO(h);
954         if (ret)
955                 goto err;
956
957         h->fn = bch2_delete_dead_snapshots_hook;
958         bch2_trans_commit_hook(trans, h);
959 err:
960         bch2_trans_iter_exit(trans, &iter);
961         return ret;
962 }
963
964 void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
965 {
966         struct bch_fs *c = container_of(work, struct bch_fs,
967                                 snapshot_wait_for_pagecache_and_delete_work);
968         snapshot_id_list s;
969         u32 *id;
970         int ret = 0;
971
972         while (!ret) {
973                 mutex_lock(&c->snapshots_unlinked_lock);
974                 s = c->snapshots_unlinked;
975                 darray_init(&c->snapshots_unlinked);
976                 mutex_unlock(&c->snapshots_unlinked_lock);
977
978                 if (!s.nr)
979                         break;
980
981                 bch2_evict_subvolume_inodes(c, &s);
982
983                 for (id = s.data; id < s.data + s.nr; id++) {
984                         ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
985                                       bch2_subvolume_delete(&trans, *id));
986                         if (ret) {
987                                 bch_err(c, "error %i deleting subvolume %u", ret, *id);
988                                 break;
989                         }
990                 }
991
992                 darray_exit(&s);
993         }
994
995         percpu_ref_put(&c->writes);
996 }
997
998 struct subvolume_unlink_hook {
999         struct btree_trans_commit_hook  h;
1000         u32                             subvol;
1001 };
1002
1003 int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
1004                                                       struct btree_trans_commit_hook *_h)
1005 {
1006         struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
1007         struct bch_fs *c = trans->c;
1008         int ret = 0;
1009
1010         mutex_lock(&c->snapshots_unlinked_lock);
1011         if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
1012                 ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
1013         mutex_unlock(&c->snapshots_unlinked_lock);
1014
1015         if (ret)
1016                 return ret;
1017
1018         if (unlikely(!percpu_ref_tryget_live(&c->writes)))
1019                 return -EROFS;
1020
1021         if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
1022                 percpu_ref_put(&c->writes);
1023         return 0;
1024 }
1025
1026 int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
1027 {
1028         struct btree_iter iter;
1029         struct bkey_s_c k;
1030         struct bkey_i_subvolume *n;
1031         struct subvolume_unlink_hook *h;
1032         int ret = 0;
1033
1034         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
1035                              POS(0, subvolid),
1036                              BTREE_ITER_CACHED|
1037                              BTREE_ITER_INTENT);
1038         k = bch2_btree_iter_peek_slot(&iter);
1039         ret = bkey_err(k);
1040         if (ret)
1041                 goto err;
1042
1043         if (k.k->type != KEY_TYPE_subvolume) {
1044                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
1045                 ret = -EIO;
1046                 goto err;
1047         }
1048
1049         n = bch2_trans_kmalloc(trans, sizeof(*n));
1050         ret = PTR_ERR_OR_ZERO(n);
1051         if (ret)
1052                 goto err;
1053
1054         bkey_reassemble(&n->k_i, k);
1055         SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
1056
1057         ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
1058         if (ret)
1059                 goto err;
1060
1061         h = bch2_trans_kmalloc(trans, sizeof(*h));
1062         ret = PTR_ERR_OR_ZERO(h);
1063         if (ret)
1064                 goto err;
1065
1066         h->h.fn         = bch2_subvolume_wait_for_pagecache_and_delete_hook;
1067         h->subvol       = subvolid;
1068         bch2_trans_commit_hook(trans, &h->h);
1069 err:
1070         bch2_trans_iter_exit(trans, &iter);
1071         return ret;
1072 }
1073
1074 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
1075                           u32 src_subvolid,
1076                           u32 *new_subvolid,
1077                           u32 *new_snapshotid,
1078                           bool ro)
1079 {
1080         struct bch_fs *c = trans->c;
1081         struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
1082         struct bkey_i_subvolume *new_subvol = NULL;
1083         struct bkey_i_subvolume *src_subvol = NULL;
1084         struct bkey_s_c k;
1085         u32 parent = 0, new_nodes[2], snapshot_subvols[2];
1086         int ret = 0;
1087
1088         for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
1089                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
1090                 if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
1091                         break;
1092
1093                 /*
1094                  * bch2_subvolume_delete() doesn't flush the btree key cache -
1095                  * ideally it would but that's tricky
1096                  */
1097                 if (bkey_deleted(k.k) &&
1098                     !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos))
1099                         goto found_slot;
1100         }
1101
1102         if (!ret)
1103                 ret = -ENOSPC;
1104         goto err;
1105 found_slot:
1106         snapshot_subvols[0] = dst_iter.pos.offset;
1107         snapshot_subvols[1] = src_subvolid;
1108
1109         if (src_subvolid) {
1110                 /* Creating a snapshot: */
1111                 src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
1112                 ret = PTR_ERR_OR_ZERO(src_subvol);
1113                 if (ret)
1114                         goto err;
1115
1116                 bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
1117                                      POS(0, src_subvolid),
1118                                      BTREE_ITER_CACHED|
1119                                      BTREE_ITER_INTENT);
1120                 k = bch2_btree_iter_peek_slot(&src_iter);
1121                 ret = bkey_err(k);
1122                 if (ret)
1123                         goto err;
1124
1125                 if (k.k->type != KEY_TYPE_subvolume) {
1126                         bch_err(c, "subvolume %u not found", src_subvolid);
1127                         ret = -ENOENT;
1128                         goto err;
1129                 }
1130
1131                 bkey_reassemble(&src_subvol->k_i, k);
1132                 parent = le32_to_cpu(src_subvol->v.snapshot);
1133         }
1134
1135         ret = bch2_snapshot_node_create(trans, parent, new_nodes,
1136                                         snapshot_subvols,
1137                                         src_subvolid ? 2 : 1);
1138         if (ret)
1139                 goto err;
1140
1141         if (src_subvolid) {
1142                 src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
1143                 ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
1144                 if (ret)
1145                         goto err;
1146         }
1147
1148         new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
1149         ret = PTR_ERR_OR_ZERO(new_subvol);
1150         if (ret)
1151                 goto err;
1152
1153         bkey_subvolume_init(&new_subvol->k_i);
1154         new_subvol->v.flags     = 0;
1155         new_subvol->v.snapshot  = cpu_to_le32(new_nodes[0]);
1156         new_subvol->v.inode     = cpu_to_le64(inode);
1157         SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
1158         SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
1159         new_subvol->k.p         = dst_iter.pos;
1160         ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
1161         if (ret)
1162                 goto err;
1163
1164         *new_subvolid   = new_subvol->k.p.offset;
1165         *new_snapshotid = new_nodes[0];
1166 err:
1167         bch2_trans_iter_exit(trans, &src_iter);
1168         bch2_trans_iter_exit(trans, &dst_iter);
1169         return ret;
1170 }
1171
1172 int bch2_fs_subvolumes_init(struct bch_fs *c)
1173 {
1174         INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
1175         INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
1176                   bch2_subvolume_wait_for_pagecache_and_delete);
1177         mutex_init(&c->snapshots_unlinked_lock);
1178         return 0;
1179 }