]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/subvolume.c
bcc67c0f5dfc95992c05827ade2b759fd20e913a
[bcachefs-tools-debian] / libbcachefs / subvolume.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_update.h"
6 #include "errcode.h"
7 #include "error.h"
8 #include "fs.h"
9 #include "subvolume.h"
10
11 /* Snapshot tree: */
12
13 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
14                            struct bkey_s_c k)
15 {
16         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
17
18         prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u",
19                BCH_SNAPSHOT_SUBVOL(s.v),
20                BCH_SNAPSHOT_DELETED(s.v),
21                le32_to_cpu(s.v->parent),
22                le32_to_cpu(s.v->children[0]),
23                le32_to_cpu(s.v->children[1]),
24                le32_to_cpu(s.v->subvol));
25 }
26
27 int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
28                           unsigned flags, struct printbuf *err)
29 {
30         struct bkey_s_c_snapshot s;
31         u32 i, id;
32
33         if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
34             bkey_lt(k.k->p, POS(0, 1))) {
35                 prt_printf(err, "bad pos");
36                 return -BCH_ERR_invalid_bkey;
37         }
38
39         if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) {
40                 prt_printf(err, "bad val size (%zu != %zu)",
41                        bkey_val_bytes(k.k), sizeof(struct bch_snapshot));
42                 return -BCH_ERR_invalid_bkey;
43         }
44
45         s = bkey_s_c_to_snapshot(k);
46
47         id = le32_to_cpu(s.v->parent);
48         if (id && id <= k.k->p.offset) {
49                 prt_printf(err, "bad parent node (%u <= %llu)",
50                        id, k.k->p.offset);
51                 return -BCH_ERR_invalid_bkey;
52         }
53
54         if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
55                 prt_printf(err, "children not normalized");
56                 return -BCH_ERR_invalid_bkey;
57         }
58
59         if (s.v->children[0] &&
60             s.v->children[0] == s.v->children[1]) {
61                 prt_printf(err, "duplicate child nodes");
62                 return -BCH_ERR_invalid_bkey;
63         }
64
65         for (i = 0; i < 2; i++) {
66                 id = le32_to_cpu(s.v->children[i]);
67
68                 if (id >= k.k->p.offset) {
69                         prt_printf(err, "bad child node (%u >= %llu)",
70                                id, k.k->p.offset);
71                         return -BCH_ERR_invalid_bkey;
72                 }
73         }
74
75         return 0;
76 }
77
78 int bch2_mark_snapshot(struct btree_trans *trans,
79                        enum btree_id btree, unsigned level,
80                        struct bkey_s_c old, struct bkey_s_c new,
81                        unsigned flags)
82 {
83         struct bch_fs *c = trans->c;
84         struct snapshot_t *t;
85
86         t = genradix_ptr_alloc(&c->snapshots,
87                                U32_MAX - new.k->p.offset,
88                                GFP_KERNEL);
89         if (!t)
90                 return -ENOMEM;
91
92         if (new.k->type == KEY_TYPE_snapshot) {
93                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
94
95                 t->parent       = le32_to_cpu(s.v->parent);
96                 t->children[0]  = le32_to_cpu(s.v->children[0]);
97                 t->children[1]  = le32_to_cpu(s.v->children[1]);
98                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
99         } else {
100                 t->parent       = 0;
101                 t->children[0]  = 0;
102                 t->children[1]  = 0;
103                 t->subvol       = 0;
104         }
105
106         return 0;
107 }
108
109 static int snapshot_lookup(struct btree_trans *trans, u32 id,
110                            struct bch_snapshot *s)
111 {
112         struct btree_iter iter;
113         struct bkey_s_c k;
114         int ret;
115
116         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
117                              BTREE_ITER_WITH_UPDATES);
118         k = bch2_btree_iter_peek_slot(&iter);
119         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
120
121         if (!ret)
122                 *s = *bkey_s_c_to_snapshot(k).v;
123
124         bch2_trans_iter_exit(trans, &iter);
125         return ret;
126 }
127
128 static int snapshot_live(struct btree_trans *trans, u32 id)
129 {
130         struct bch_snapshot v;
131         int ret;
132
133         if (!id)
134                 return 0;
135
136         ret = snapshot_lookup(trans, id, &v);
137         if (ret == -ENOENT)
138                 bch_err(trans->c, "snapshot node %u not found", id);
139         if (ret)
140                 return ret;
141
142         return !BCH_SNAPSHOT_DELETED(&v);
143 }
144
145 static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
146 {
147         struct bch_fs *c = trans->c;
148         unsigned i, nr_live = 0, live_idx = 0;
149         struct bkey_s_c_snapshot snap;
150         u32 id = k.k->p.offset, child[2];
151
152         if (k.k->type != KEY_TYPE_snapshot)
153                 return 0;
154
155         snap = bkey_s_c_to_snapshot(k);
156
157         child[0] = le32_to_cpu(snap.v->children[0]);
158         child[1] = le32_to_cpu(snap.v->children[1]);
159
160         for (i = 0; i < 2; i++) {
161                 int ret = snapshot_live(trans, child[i]);
162
163                 if (ret < 0)
164                         return ret;
165
166                 if (ret)
167                         live_idx = i;
168                 nr_live += ret;
169         }
170
171         snapshot_t(c, id)->equiv = nr_live == 1
172                 ? snapshot_t(c, child[live_idx])->equiv
173                 : id;
174         return 0;
175 }
176
177 /* fsck: */
178 static int check_snapshot(struct btree_trans *trans,
179                           struct btree_iter *iter,
180                           struct bkey_s_c k)
181 {
182         struct bch_fs *c = trans->c;
183         struct bkey_s_c_snapshot s;
184         struct bch_subvolume subvol;
185         struct bch_snapshot v;
186         struct printbuf buf = PRINTBUF;
187         bool should_have_subvol;
188         u32 i, id;
189         int ret = 0;
190
191         if (k.k->type != KEY_TYPE_snapshot)
192                 return 0;
193
194         s = bkey_s_c_to_snapshot(k);
195         id = le32_to_cpu(s.v->parent);
196         if (id) {
197                 ret = snapshot_lookup(trans, id, &v);
198                 if (ret == -ENOENT)
199                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
200                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
201                 if (ret)
202                         goto err;
203
204                 if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
205                     le32_to_cpu(v.children[1]) != s.k->p.offset) {
206                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
207                                 id, s.k->p.offset);
208                         ret = -EINVAL;
209                         goto err;
210                 }
211         }
212
213         for (i = 0; i < 2 && s.v->children[i]; i++) {
214                 id = le32_to_cpu(s.v->children[i]);
215
216                 ret = snapshot_lookup(trans, id, &v);
217                 if (ret == -ENOENT)
218                         bch_err(c, "snapshot node %llu has nonexistent child %u",
219                                 s.k->p.offset, id);
220                 if (ret)
221                         goto err;
222
223                 if (le32_to_cpu(v.parent) != s.k->p.offset) {
224                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
225                                 id, le32_to_cpu(v.parent), s.k->p.offset);
226                         ret = -EINVAL;
227                         goto err;
228                 }
229         }
230
231         should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) &&
232                 !BCH_SNAPSHOT_DELETED(s.v);
233
234         if (should_have_subvol) {
235                 id = le32_to_cpu(s.v->subvol);
236                 ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
237                 if (ret == -ENOENT)
238                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
239                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
240                 if (ret)
241                         goto err;
242
243                 if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
244                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
245                                 s.k->p.offset);
246                         ret = -EINVAL;
247                         goto err;
248                 }
249         } else {
250                 if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n  %s",
251                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
252                         struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u));
253
254                         ret = PTR_ERR_OR_ZERO(u);
255                         if (ret)
256                                 goto err;
257
258                         bkey_reassemble(&u->k_i, s.s_c);
259                         u->v.subvol = 0;
260                         ret = bch2_trans_update(trans, iter, &u->k_i, 0);
261                         if (ret)
262                                 goto err;
263                 }
264         }
265
266         if (BCH_SNAPSHOT_DELETED(s.v))
267                 set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
268 err:
269 fsck_err:
270         printbuf_exit(&buf);
271         return ret;
272 }
273
274 int bch2_fs_check_snapshots(struct bch_fs *c)
275 {
276         struct btree_trans trans;
277         struct btree_iter iter;
278         struct bkey_s_c k;
279         int ret;
280
281         bch2_trans_init(&trans, c, 0, 0);
282
283         ret = for_each_btree_key_commit(&trans, iter,
284                         BTREE_ID_snapshots, POS_MIN,
285                         BTREE_ITER_PREFETCH, k,
286                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
287                 check_snapshot(&trans, &iter, k));
288
289         if (ret)
290                 bch_err(c, "error %i checking snapshots", ret);
291
292         bch2_trans_exit(&trans);
293         return ret;
294 }
295
296 static int check_subvol(struct btree_trans *trans,
297                         struct btree_iter *iter,
298                         struct bkey_s_c k)
299 {
300         struct bkey_s_c_subvolume subvol;
301         struct bch_snapshot snapshot;
302         unsigned snapid;
303         int ret;
304
305         if (k.k->type != KEY_TYPE_subvolume)
306                 return 0;
307
308         subvol = bkey_s_c_to_subvolume(k);
309         snapid = le32_to_cpu(subvol.v->snapshot);
310         ret = snapshot_lookup(trans, snapid, &snapshot);
311
312         if (ret == -ENOENT)
313                 bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u",
314                         k.k->p.offset, snapid);
315         if (ret)
316                 return ret;
317
318         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
319                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
320                 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
321                         bch_err(trans->c, "error deleting subvolume %llu: %s",
322                                 iter->pos.offset, bch2_err_str(ret));
323                 if (ret)
324                         return ret;
325         }
326
327         return 0;
328 }
329
330 int bch2_fs_check_subvols(struct bch_fs *c)
331 {
332         struct btree_trans trans;
333         struct btree_iter iter;
334         struct bkey_s_c k;
335         int ret;
336
337         bch2_trans_init(&trans, c, 0, 0);
338
339         ret = for_each_btree_key_commit(&trans, iter,
340                         BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
341                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
342                 check_subvol(&trans, &iter, k));
343
344         bch2_trans_exit(&trans);
345
346         return ret;
347 }
348
349 void bch2_fs_snapshots_exit(struct bch_fs *c)
350 {
351         genradix_free(&c->snapshots);
352 }
353
354 int bch2_fs_snapshots_start(struct bch_fs *c)
355 {
356         struct btree_trans trans;
357         struct btree_iter iter;
358         struct bkey_s_c k;
359         int ret = 0;
360
361         bch2_trans_init(&trans, c, 0, 0);
362
363         for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
364                            POS_MIN, 0, k,
365                 bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
366                 bch2_snapshot_set_equiv(&trans, k));
367
368         bch2_trans_exit(&trans);
369
370         if (ret)
371                 bch_err(c, "error starting snapshots: %s", bch2_err_str(ret));
372         return ret;
373 }
374
375 /*
376  * Mark a snapshot as deleted, for future cleanup:
377  */
378 static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
379 {
380         struct btree_iter iter;
381         struct bkey_i_snapshot *s;
382         int ret = 0;
383
384         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
385                              BTREE_ITER_INTENT);
386         s = bch2_bkey_get_mut_typed(trans, &iter, snapshot);
387         ret = PTR_ERR_OR_ZERO(s);
388         if (unlikely(ret)) {
389                 bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", id);
390                 goto err;
391         }
392
393         /* already deleted? */
394         if (BCH_SNAPSHOT_DELETED(&s->v))
395                 goto err;
396
397         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
398         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
399         s->v.subvol = 0;
400
401         ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
402         if (ret)
403                 goto err;
404 err:
405         bch2_trans_iter_exit(trans, &iter);
406         return ret;
407 }
408
409 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
410 {
411         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
412         struct bkey_s_c k;
413         struct bkey_s_c_snapshot s;
414         u32 parent_id;
415         unsigned i;
416         int ret = 0;
417
418         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
419                              BTREE_ITER_INTENT);
420         k = bch2_btree_iter_peek_slot(&iter);
421         ret = bkey_err(k);
422         if (ret)
423                 goto err;
424
425         if (k.k->type != KEY_TYPE_snapshot) {
426                 bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
427                 ret = -ENOENT;
428                 goto err;
429         }
430
431         s = bkey_s_c_to_snapshot(k);
432
433         BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
434         parent_id = le32_to_cpu(s.v->parent);
435
436         if (parent_id) {
437                 struct bkey_i_snapshot *parent;
438
439                 bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
440                                      POS(0, parent_id),
441                                      BTREE_ITER_INTENT);
442                 parent = bch2_bkey_get_mut_typed(trans, &p_iter, snapshot);
443                 ret = PTR_ERR_OR_ZERO(parent);
444                 if (unlikely(ret)) {
445                         bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing snapshot %u", parent_id);
446                         goto err;
447                 }
448
449                 for (i = 0; i < 2; i++)
450                         if (le32_to_cpu(parent->v.children[i]) == id)
451                                 break;
452
453                 if (i == 2)
454                         bch_err(trans->c, "snapshot %u missing child pointer to %u",
455                                 parent_id, id);
456                 else
457                         parent->v.children[i] = 0;
458
459                 if (le32_to_cpu(parent->v.children[0]) <
460                     le32_to_cpu(parent->v.children[1]))
461                         swap(parent->v.children[0],
462                              parent->v.children[1]);
463
464                 ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
465                 if (ret)
466                         goto err;
467         }
468
469         ret = bch2_btree_delete_at(trans, &iter, 0);
470 err:
471         bch2_trans_iter_exit(trans, &p_iter);
472         bch2_trans_iter_exit(trans, &iter);
473         return ret;
474 }
475
476 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
477                               u32 *new_snapids,
478                               u32 *snapshot_subvols,
479                               unsigned nr_snapids)
480 {
481         struct btree_iter iter;
482         struct bkey_i_snapshot *n;
483         struct bkey_s_c k;
484         unsigned i;
485         int ret = 0;
486
487         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
488                              POS_MIN, BTREE_ITER_INTENT);
489         k = bch2_btree_iter_peek(&iter);
490         ret = bkey_err(k);
491         if (ret)
492                 goto err;
493
494         for (i = 0; i < nr_snapids; i++) {
495                 k = bch2_btree_iter_prev_slot(&iter);
496                 ret = bkey_err(k);
497                 if (ret)
498                         goto err;
499
500                 if (!k.k || !k.k->p.offset) {
501                         ret = -BCH_ERR_ENOSPC_snapshot_create;
502                         goto err;
503                 }
504
505                 n = bch2_bkey_alloc(trans, &iter, snapshot);
506                 ret = PTR_ERR_OR_ZERO(n);
507                 if (ret)
508                         goto err;
509
510                 n->v.flags      = 0;
511                 n->v.parent     = cpu_to_le32(parent);
512                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
513                 n->v.pad        = 0;
514                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
515
516                 ret   = bch2_trans_update(trans, &iter, &n->k_i, 0) ?:
517                         bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
518                                            bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
519                 if (ret)
520                         goto err;
521
522                 new_snapids[i]  = iter.pos.offset;
523         }
524
525         if (parent) {
526                 bch2_btree_iter_set_pos(&iter, POS(0, parent));
527                 n = bch2_bkey_get_mut_typed(trans, &iter, snapshot);
528                 ret = PTR_ERR_OR_ZERO(n);
529                 if (unlikely(ret)) {
530                         if (ret == -ENOENT)
531                                 bch_err(trans->c, "snapshot %u not found", parent);
532                         goto err;
533                 }
534
535                 if (n->v.children[0] || n->v.children[1]) {
536                         bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
537                         ret = -EINVAL;
538                         goto err;
539                 }
540
541                 n->v.children[0] = cpu_to_le32(new_snapids[0]);
542                 n->v.children[1] = cpu_to_le32(new_snapids[1]);
543                 n->v.subvol = 0;
544                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
545                 ret   = bch2_trans_update(trans, &iter, &n->k_i, 0);
546                 if (ret)
547                         goto err;
548         }
549 err:
550         bch2_trans_iter_exit(trans, &iter);
551         return ret;
552 }
553
554 static int snapshot_delete_key(struct btree_trans *trans,
555                                struct btree_iter *iter,
556                                struct bkey_s_c k,
557                                snapshot_id_list *deleted,
558                                snapshot_id_list *equiv_seen,
559                                struct bpos *last_pos)
560 {
561         struct bch_fs *c = trans->c;
562         u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
563
564         if (!bkey_eq(k.k->p, *last_pos))
565                 equiv_seen->nr = 0;
566         *last_pos = k.k->p;
567
568         if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
569             snapshot_list_has_id(equiv_seen, equiv)) {
570                 return bch2_btree_delete_at(trans, iter,
571                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
572         } else {
573                 return snapshot_list_add(c, equiv_seen, equiv);
574         }
575 }
576
577 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
578                                           struct bkey_s_c k)
579 {
580         struct bkey_s_c_snapshot snap;
581         u32 children[2];
582         int ret;
583
584         if (k.k->type != KEY_TYPE_snapshot)
585                 return 0;
586
587         snap = bkey_s_c_to_snapshot(k);
588         if (BCH_SNAPSHOT_DELETED(snap.v) ||
589             BCH_SNAPSHOT_SUBVOL(snap.v))
590                 return 0;
591
592         children[0] = le32_to_cpu(snap.v->children[0]);
593         children[1] = le32_to_cpu(snap.v->children[1]);
594
595         ret   = snapshot_live(trans, children[0]) ?:
596                 snapshot_live(trans, children[1]);
597         if (ret < 0)
598                 return ret;
599
600         if (!ret)
601                 return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
602         return 0;
603 }
604
605 int bch2_delete_dead_snapshots(struct bch_fs *c)
606 {
607         struct btree_trans trans;
608         struct btree_iter iter;
609         struct bkey_s_c k;
610         struct bkey_s_c_snapshot snap;
611         snapshot_id_list deleted = { 0 };
612         u32 i, id;
613         int ret = 0;
614
615         if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
616                 return 0;
617
618         if (!test_bit(BCH_FS_STARTED, &c->flags)) {
619                 ret = bch2_fs_read_write_early(c);
620                 if (ret) {
621                         bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
622                         return ret;
623                 }
624         }
625
626         bch2_trans_init(&trans, c, 0, 0);
627
628         /*
629          * For every snapshot node: If we have no live children and it's not
630          * pointed to by a subvolume, delete it:
631          */
632         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
633                         POS_MIN, 0, k,
634                         NULL, NULL, 0,
635                 bch2_delete_redundant_snapshot(&trans, &iter, k));
636         if (ret) {
637                 bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
638                 goto err;
639         }
640
641         for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
642                            POS_MIN, 0, k,
643                 bch2_snapshot_set_equiv(&trans, k));
644         if (ret) {
645                 bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
646                 goto err;
647         }
648
649         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
650                            POS_MIN, 0, k, ret) {
651                 if (k.k->type != KEY_TYPE_snapshot)
652                         continue;
653
654                 snap = bkey_s_c_to_snapshot(k);
655                 if (BCH_SNAPSHOT_DELETED(snap.v)) {
656                         ret = snapshot_list_add(c, &deleted, k.k->p.offset);
657                         if (ret)
658                                 break;
659                 }
660         }
661         bch2_trans_iter_exit(&trans, &iter);
662
663         if (ret) {
664                 bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
665                 goto err;
666         }
667
668         for (id = 0; id < BTREE_ID_NR; id++) {
669                 struct bpos last_pos = POS_MIN;
670                 snapshot_id_list equiv_seen = { 0 };
671
672                 if (!btree_type_has_snapshots(id))
673                         continue;
674
675                 ret = for_each_btree_key_commit(&trans, iter,
676                                 id, POS_MIN,
677                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
678                                 NULL, NULL, BTREE_INSERT_NOFAIL,
679                         snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
680
681                 darray_exit(&equiv_seen);
682
683                 if (ret) {
684                         bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
685                         goto err;
686                 }
687         }
688
689         for (i = 0; i < deleted.nr; i++) {
690                 ret = commit_do(&trans, NULL, NULL, 0,
691                         bch2_snapshot_node_delete(&trans, deleted.data[i]));
692                 if (ret) {
693                         bch_err(c, "error deleting snapshot %u: %s",
694                                 deleted.data[i], bch2_err_str(ret));
695                         goto err;
696                 }
697         }
698
699         clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
700 err:
701         darray_exit(&deleted);
702         bch2_trans_exit(&trans);
703         return ret;
704 }
705
706 static void bch2_delete_dead_snapshots_work(struct work_struct *work)
707 {
708         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
709
710         bch2_delete_dead_snapshots(c);
711         bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
712 }
713
714 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
715 {
716         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
717             !queue_work(system_long_wq, &c->snapshot_delete_work))
718                 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
719 }
720
721 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
722                                            struct btree_trans_commit_hook *h)
723 {
724         struct bch_fs *c = trans->c;
725
726         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
727
728         if (!test_bit(BCH_FS_FSCK_DONE, &c->flags))
729                 return 0;
730
731         bch2_delete_dead_snapshots_async(c);
732         return 0;
733 }
734
735 /* Subvolumes: */
736
737 int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
738                            unsigned flags, struct printbuf *err)
739 {
740         if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
741             bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
742                 prt_printf(err, "invalid pos");
743                 return -BCH_ERR_invalid_bkey;
744         }
745
746         if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) {
747                 prt_printf(err, "incorrect value size (%zu != %zu)",
748                        bkey_val_bytes(k.k), sizeof(struct bch_subvolume));
749                 return -BCH_ERR_invalid_bkey;
750         }
751
752         return 0;
753 }
754
755 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
756                             struct bkey_s_c k)
757 {
758         struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
759
760         prt_printf(out, "root %llu snapshot id %u",
761                le64_to_cpu(s.v->inode),
762                le32_to_cpu(s.v->snapshot));
763 }
764
765 static __always_inline int
766 bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
767                            bool inconsistent_if_not_found,
768                            int iter_flags,
769                            struct bch_subvolume *s)
770 {
771         struct btree_iter iter;
772         struct bkey_s_c k;
773         int ret;
774
775         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
776                              iter_flags);
777         k = bch2_btree_iter_peek_slot(&iter);
778         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT;
779
780         if (ret == -ENOENT && inconsistent_if_not_found)
781                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
782         if (!ret)
783                 *s = *bkey_s_c_to_subvolume(k).v;
784
785         bch2_trans_iter_exit(trans, &iter);
786         return ret;
787 }
788
789 int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
790                        bool inconsistent_if_not_found,
791                        int iter_flags,
792                        struct bch_subvolume *s)
793 {
794         return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s);
795 }
796
797 int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
798                              struct bch_subvolume *subvol)
799 {
800         struct bch_snapshot snap;
801
802         return  snapshot_lookup(trans, snapshot, &snap) ?:
803                 bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
804 }
805
806 int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
807                                 u32 *snapid)
808 {
809         struct bch_subvolume s;
810         int ret;
811
812         ret = bch2_subvolume_get_inlined(trans, subvol, true,
813                                          BTREE_ITER_CACHED|
814                                          BTREE_ITER_WITH_UPDATES,
815                                          &s);
816         if (!ret)
817                 *snapid = le32_to_cpu(s.snapshot);
818         return ret;
819 }
820
821 /*
822  * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
823  * deletion/cleanup:
824  */
825 int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
826 {
827         struct btree_iter iter;
828         struct bkey_s_c k;
829         struct bkey_s_c_subvolume subvol;
830         struct btree_trans_commit_hook *h;
831         u32 snapid;
832         int ret = 0;
833
834         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
835                              POS(0, subvolid),
836                              BTREE_ITER_CACHED|
837                              BTREE_ITER_INTENT);
838         k = bch2_btree_iter_peek_slot(&iter);
839         ret = bkey_err(k);
840         if (ret)
841                 goto err;
842
843         if (k.k->type != KEY_TYPE_subvolume) {
844                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
845                 ret = -EIO;
846                 goto err;
847         }
848
849         subvol = bkey_s_c_to_subvolume(k);
850         snapid = le32_to_cpu(subvol.v->snapshot);
851
852         ret = bch2_btree_delete_at(trans, &iter, 0);
853         if (ret)
854                 goto err;
855
856         ret = bch2_snapshot_node_set_deleted(trans, snapid);
857         if (ret)
858                 goto err;
859
860         h = bch2_trans_kmalloc(trans, sizeof(*h));
861         ret = PTR_ERR_OR_ZERO(h);
862         if (ret)
863                 goto err;
864
865         h->fn = bch2_delete_dead_snapshots_hook;
866         bch2_trans_commit_hook(trans, h);
867 err:
868         bch2_trans_iter_exit(trans, &iter);
869         return ret;
870 }
871
872 void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
873 {
874         struct bch_fs *c = container_of(work, struct bch_fs,
875                                 snapshot_wait_for_pagecache_and_delete_work);
876         snapshot_id_list s;
877         u32 *id;
878         int ret = 0;
879
880         while (!ret) {
881                 mutex_lock(&c->snapshots_unlinked_lock);
882                 s = c->snapshots_unlinked;
883                 darray_init(&c->snapshots_unlinked);
884                 mutex_unlock(&c->snapshots_unlinked_lock);
885
886                 if (!s.nr)
887                         break;
888
889                 bch2_evict_subvolume_inodes(c, &s);
890
891                 for (id = s.data; id < s.data + s.nr; id++) {
892                         ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
893                                       bch2_subvolume_delete(&trans, *id));
894                         if (ret) {
895                                 bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
896                                 break;
897                         }
898                 }
899
900                 darray_exit(&s);
901         }
902
903         bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
904 }
905
906 struct subvolume_unlink_hook {
907         struct btree_trans_commit_hook  h;
908         u32                             subvol;
909 };
910
911 int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
912                                                       struct btree_trans_commit_hook *_h)
913 {
914         struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
915         struct bch_fs *c = trans->c;
916         int ret = 0;
917
918         mutex_lock(&c->snapshots_unlinked_lock);
919         if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
920                 ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
921         mutex_unlock(&c->snapshots_unlinked_lock);
922
923         if (ret)
924                 return ret;
925
926         if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
927                 return -EROFS;
928
929         if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
930                 bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
931         return 0;
932 }
933
934 int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
935 {
936         struct btree_iter iter;
937         struct bkey_i_subvolume *n;
938         struct subvolume_unlink_hook *h;
939         int ret = 0;
940
941         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
942                              POS(0, subvolid),
943                              BTREE_ITER_CACHED|
944                              BTREE_ITER_INTENT);
945         n = bch2_bkey_get_mut_typed(trans, &iter, subvolume);
946         ret = PTR_ERR_OR_ZERO(n);
947         if (unlikely(ret)) {
948                 bch2_fs_inconsistent_on(ret == -ENOENT, trans->c, "missing subvolume %u", subvolid);
949                 goto err;
950         }
951
952         SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
953
954         ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
955         if (ret)
956                 goto err;
957
958         h = bch2_trans_kmalloc(trans, sizeof(*h));
959         ret = PTR_ERR_OR_ZERO(h);
960         if (ret)
961                 goto err;
962
963         h->h.fn         = bch2_subvolume_wait_for_pagecache_and_delete_hook;
964         h->subvol       = subvolid;
965         bch2_trans_commit_hook(trans, &h->h);
966 err:
967         bch2_trans_iter_exit(trans, &iter);
968         return ret;
969 }
970
971 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
972                           u32 src_subvolid,
973                           u32 *new_subvolid,
974                           u32 *new_snapshotid,
975                           bool ro)
976 {
977         struct bch_fs *c = trans->c;
978         struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
979         struct bkey_i_subvolume *new_subvol = NULL;
980         struct bkey_i_subvolume *src_subvol = NULL;
981         struct bkey_s_c k;
982         u32 parent = 0, new_nodes[2], snapshot_subvols[2];
983         int ret = 0;
984
985         for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
986                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
987                 if (bkey_gt(k.k->p, SUBVOL_POS_MAX))
988                         break;
989
990                 /*
991                  * bch2_subvolume_delete() doesn't flush the btree key cache -
992                  * ideally it would but that's tricky
993                  */
994                 if (bkey_deleted(k.k) &&
995                     !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos))
996                         goto found_slot;
997         }
998
999         if (!ret)
1000                 ret = -BCH_ERR_ENOSPC_subvolume_create;
1001         goto err;
1002 found_slot:
1003         snapshot_subvols[0] = dst_iter.pos.offset;
1004         snapshot_subvols[1] = src_subvolid;
1005
1006         if (src_subvolid) {
1007                 /* Creating a snapshot: */
1008
1009                 bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
1010                                      POS(0, src_subvolid),
1011                                      BTREE_ITER_CACHED|
1012                                      BTREE_ITER_INTENT);
1013                 src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, subvolume);
1014                 ret = PTR_ERR_OR_ZERO(src_subvol);
1015                 if (unlikely(ret)) {
1016                         bch2_fs_inconsistent_on(ret == -ENOENT, trans->c,
1017                                                 "subvolume %u not found", src_subvolid);
1018                         goto err;
1019                 }
1020
1021                 parent = le32_to_cpu(src_subvol->v.snapshot);
1022         }
1023
1024         ret = bch2_snapshot_node_create(trans, parent, new_nodes,
1025                                         snapshot_subvols,
1026                                         src_subvolid ? 2 : 1);
1027         if (ret)
1028                 goto err;
1029
1030         if (src_subvolid) {
1031                 src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
1032                 ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
1033                 if (ret)
1034                         goto err;
1035         }
1036
1037         new_subvol = bch2_bkey_alloc(trans, &dst_iter, subvolume);
1038         ret = PTR_ERR_OR_ZERO(new_subvol);
1039         if (ret)
1040                 goto err;
1041
1042         new_subvol->v.flags     = 0;
1043         new_subvol->v.snapshot  = cpu_to_le32(new_nodes[0]);
1044         new_subvol->v.inode     = cpu_to_le64(inode);
1045         SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
1046         SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
1047         ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
1048         if (ret)
1049                 goto err;
1050
1051         *new_subvolid   = new_subvol->k.p.offset;
1052         *new_snapshotid = new_nodes[0];
1053 err:
1054         bch2_trans_iter_exit(trans, &src_iter);
1055         bch2_trans_iter_exit(trans, &dst_iter);
1056         return ret;
1057 }
1058
1059 int bch2_fs_subvolumes_init(struct bch_fs *c)
1060 {
1061         INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
1062         INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
1063                   bch2_subvolume_wait_for_pagecache_and_delete);
1064         mutex_init(&c->snapshots_unlinked_lock);
1065         return 0;
1066 }