]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/subvolume.c
Update bcachefs sources to 90a9c61e2b bcachefs: Switch bch2_btree_delete_range()...
[bcachefs-tools-debian] / libbcachefs / subvolume.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_update.h"
6 #include "errcode.h"
7 #include "error.h"
8 #include "fs.h"
9 #include "subvolume.h"
10
11 /* Snapshot tree: */
12
13 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
14                            struct bkey_s_c k)
15 {
16         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
17
18         prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u",
19                BCH_SNAPSHOT_SUBVOL(s.v),
20                BCH_SNAPSHOT_DELETED(s.v),
21                le32_to_cpu(s.v->parent),
22                le32_to_cpu(s.v->children[0]),
23                le32_to_cpu(s.v->children[1]),
24                le32_to_cpu(s.v->subvol));
25 }
26
27 int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
28                           int rw, struct printbuf *err)
29 {
30         struct bkey_s_c_snapshot s;
31         u32 i, id;
32
33         if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
34             bkey_cmp(k.k->p, POS(0, 1)) < 0) {
35                 prt_printf(err, "bad pos");
36                 return -EINVAL;
37         }
38
39         if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) {
40                 prt_printf(err, "bad val size (%zu != %zu)",
41                        bkey_val_bytes(k.k), sizeof(struct bch_snapshot));
42                 return -EINVAL;
43         }
44
45         s = bkey_s_c_to_snapshot(k);
46
47         id = le32_to_cpu(s.v->parent);
48         if (id && id <= k.k->p.offset) {
49                 prt_printf(err, "bad parent node (%u <= %llu)",
50                        id, k.k->p.offset);
51                 return -EINVAL;
52         }
53
54         if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
55                 prt_printf(err, "children not normalized");
56                 return -EINVAL;
57         }
58
59         if (s.v->children[0] &&
60             s.v->children[0] == s.v->children[1]) {
61                 prt_printf(err, "duplicate child nodes");
62                 return -EINVAL;
63         }
64
65         for (i = 0; i < 2; i++) {
66                 id = le32_to_cpu(s.v->children[i]);
67
68                 if (id >= k.k->p.offset) {
69                         prt_printf(err, "bad child node (%u >= %llu)",
70                                id, k.k->p.offset);
71                         return -EINVAL;
72                 }
73         }
74
75         return 0;
76 }
77
78 int bch2_mark_snapshot(struct btree_trans *trans,
79                        struct bkey_s_c old, struct bkey_s_c new,
80                        unsigned flags)
81 {
82         struct bch_fs *c = trans->c;
83         struct snapshot_t *t;
84
85         t = genradix_ptr_alloc(&c->snapshots,
86                                U32_MAX - new.k->p.offset,
87                                GFP_KERNEL);
88         if (!t)
89                 return -ENOMEM;
90
91         if (new.k->type == KEY_TYPE_snapshot) {
92                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
93
94                 t->parent       = le32_to_cpu(s.v->parent);
95                 t->children[0]  = le32_to_cpu(s.v->children[0]);
96                 t->children[1]  = le32_to_cpu(s.v->children[1]);
97                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
98         } else {
99                 t->parent       = 0;
100                 t->children[0]  = 0;
101                 t->children[1]  = 0;
102                 t->subvol       = 0;
103         }
104
105         return 0;
106 }
107
108 static int snapshot_lookup(struct btree_trans *trans, u32 id,
109                            struct bch_snapshot *s)
110 {
111         struct btree_iter iter;
112         struct bkey_s_c k;
113         int ret;
114
115         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
116                              BTREE_ITER_WITH_UPDATES);
117         k = bch2_btree_iter_peek_slot(&iter);
118         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
119
120         if (!ret)
121                 *s = *bkey_s_c_to_snapshot(k).v;
122
123         bch2_trans_iter_exit(trans, &iter);
124         return ret;
125 }
126
127 static int snapshot_live(struct btree_trans *trans, u32 id)
128 {
129         struct bch_snapshot v;
130         int ret;
131
132         if (!id)
133                 return 0;
134
135         ret = snapshot_lookup(trans, id, &v);
136         if (ret == -ENOENT)
137                 bch_err(trans->c, "snapshot node %u not found", id);
138         if (ret)
139                 return ret;
140
141         return !BCH_SNAPSHOT_DELETED(&v);
142 }
143
144 static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
145 {
146         struct bch_fs *c = trans->c;
147         unsigned i, nr_live = 0, live_idx = 0;
148         struct bkey_s_c_snapshot snap;
149         u32 id = k.k->p.offset, child[2];
150
151         if (k.k->type != KEY_TYPE_snapshot)
152                 return 0;
153
154         snap = bkey_s_c_to_snapshot(k);
155
156         child[0] = le32_to_cpu(snap.v->children[0]);
157         child[1] = le32_to_cpu(snap.v->children[1]);
158
159         for (i = 0; i < 2; i++) {
160                 int ret = snapshot_live(trans, child[i]);
161                 if (ret < 0)
162                         return ret;
163
164                 if (ret)
165                         live_idx = i;
166                 nr_live += ret;
167         }
168
169         snapshot_t(c, id)->equiv = nr_live == 1
170                 ? snapshot_t(c, child[live_idx])->equiv
171                 : id;
172         return 0;
173 }
174
175 /* fsck: */
176 static int check_snapshot(struct btree_trans *trans,
177                           struct btree_iter *iter,
178                           struct bkey_s_c k)
179 {
180         struct bch_fs *c = trans->c;
181         struct bkey_s_c_snapshot s;
182         struct bch_subvolume subvol;
183         struct bch_snapshot v;
184         struct printbuf buf = PRINTBUF;
185         bool should_have_subvol;
186         u32 i, id;
187         int ret = 0;
188
189         if (k.k->type != KEY_TYPE_snapshot)
190                 return 0;
191
192         s = bkey_s_c_to_snapshot(k);
193         id = le32_to_cpu(s.v->parent);
194         if (id) {
195                 ret = snapshot_lookup(trans, id, &v);
196                 if (ret == -ENOENT)
197                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
198                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
199                 if (ret)
200                         goto err;
201
202                 if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
203                     le32_to_cpu(v.children[1]) != s.k->p.offset) {
204                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
205                                 id, s.k->p.offset);
206                         ret = -EINVAL;
207                         goto err;
208                 }
209         }
210
211         for (i = 0; i < 2 && s.v->children[i]; i++) {
212                 id = le32_to_cpu(s.v->children[i]);
213
214                 ret = snapshot_lookup(trans, id, &v);
215                 if (ret == -ENOENT)
216                         bch_err(c, "snapshot node %llu has nonexistent child %u",
217                                 s.k->p.offset, id);
218                 if (ret)
219                         goto err;
220
221                 if (le32_to_cpu(v.parent) != s.k->p.offset) {
222                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
223                                 id, le32_to_cpu(v.parent), s.k->p.offset);
224                         ret = -EINVAL;
225                         goto err;
226                 }
227         }
228
229         should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) &&
230                 !BCH_SNAPSHOT_DELETED(s.v);
231
232         if (should_have_subvol) {
233                 id = le32_to_cpu(s.v->subvol);
234                 ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
235                 if (ret == -ENOENT)
236                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
237                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf));
238                 if (ret)
239                         goto err;
240
241                 if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
242                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
243                                 s.k->p.offset);
244                         ret = -EINVAL;
245                         goto err;
246                 }
247         } else {
248                 if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n  %s",
249                                 (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
250                         struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u));
251
252                         ret = PTR_ERR_OR_ZERO(u);
253                         if (ret)
254                                 goto err;
255
256                         bkey_reassemble(&u->k_i, s.s_c);
257                         u->v.subvol = 0;
258                         ret = bch2_trans_update(trans, iter, &u->k_i, 0);
259                         if (ret)
260                                 goto err;
261                 }
262         }
263
264         if (BCH_SNAPSHOT_DELETED(s.v))
265                 set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
266 err:
267 fsck_err:
268         printbuf_exit(&buf);
269         return ret;
270 }
271
272 int bch2_fs_check_snapshots(struct bch_fs *c)
273 {
274         struct btree_trans trans;
275         struct btree_iter iter;
276         struct bkey_s_c k;
277         int ret;
278
279         bch2_trans_init(&trans, c, 0, 0);
280
281         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
282                         POS(BCACHEFS_ROOT_INO, 0),
283                         BTREE_ITER_PREFETCH, k,
284                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
285                 check_snapshot(&trans, &iter, k));
286
287         if (ret)
288                 bch_err(c, "error %i checking snapshots", ret);
289
290         bch2_trans_exit(&trans);
291         return ret;
292 }
293
294 static int check_subvol(struct btree_trans *trans,
295                         struct btree_iter *iter,
296                         struct bkey_s_c k)
297 {
298         struct bkey_s_c_subvolume subvol;
299         struct bch_snapshot snapshot;
300         unsigned snapid;
301         int ret;
302
303         if (k.k->type != KEY_TYPE_subvolume)
304                 return 0;
305
306         subvol = bkey_s_c_to_subvolume(k);
307         snapid = le32_to_cpu(subvol.v->snapshot);
308         ret = snapshot_lookup(trans, snapid, &snapshot);
309
310         if (ret == -ENOENT)
311                 bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u",
312                         k.k->p.offset, snapid);
313         if (ret)
314                 return ret;
315
316         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
317                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
318                 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
319                         bch_err(trans->c, "error deleting subvolume %llu: %s",
320                                 iter->pos.offset, bch2_err_str(ret));
321                 if (ret)
322                         return ret;
323         }
324
325         return 0;
326 }
327
328 int bch2_fs_check_subvols(struct bch_fs *c)
329 {
330         struct btree_trans trans;
331         struct btree_iter iter;
332         struct bkey_s_c k;
333         int ret;
334
335         bch2_trans_init(&trans, c, 0, 0);
336
337         ret = for_each_btree_key_commit(&trans, iter,
338                         BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
339                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
340                 check_subvol(&trans, &iter, k));
341
342         bch2_trans_exit(&trans);
343
344         return ret;
345 }
346
347 void bch2_fs_snapshots_exit(struct bch_fs *c)
348 {
349         genradix_free(&c->snapshots);
350 }
351
352 int bch2_fs_snapshots_start(struct bch_fs *c)
353 {
354         struct btree_trans trans;
355         struct btree_iter iter;
356         struct bkey_s_c k;
357         int ret = 0;
358
359         bch2_trans_init(&trans, c, 0, 0);
360
361         for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
362                            POS_MIN, 0, k,
363                 bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?:
364                 bch2_snapshot_set_equiv(&trans, k));
365
366         bch2_trans_exit(&trans);
367
368         if (ret)
369                 bch_err(c, "error starting snapshots: %s", bch2_err_str(ret));
370         return ret;
371 }
372
373 /*
374  * Mark a snapshot as deleted, for future cleanup:
375  */
376 static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
377 {
378         struct btree_iter iter;
379         struct bkey_s_c k;
380         struct bkey_i_snapshot *s;
381         int ret = 0;
382
383         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
384                              BTREE_ITER_INTENT);
385         k = bch2_btree_iter_peek_slot(&iter);
386         ret = bkey_err(k);
387         if (ret)
388                 goto err;
389
390         if (k.k->type != KEY_TYPE_snapshot) {
391                 bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
392                 ret = -ENOENT;
393                 goto err;
394         }
395
396         /* already deleted? */
397         if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
398                 goto err;
399
400         s = bch2_trans_kmalloc(trans, sizeof(*s));
401         ret = PTR_ERR_OR_ZERO(s);
402         if (ret)
403                 goto err;
404
405         bkey_reassemble(&s->k_i, k);
406         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
407         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
408         s->v.subvol = 0;
409
410         ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
411         if (ret)
412                 goto err;
413 err:
414         bch2_trans_iter_exit(trans, &iter);
415         return ret;
416 }
417
418 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
419 {
420         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
421         struct bkey_s_c k;
422         struct bkey_s_c_snapshot s;
423         struct bkey_i_snapshot *parent;
424         u32 parent_id;
425         unsigned i;
426         int ret = 0;
427
428         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
429                              BTREE_ITER_INTENT);
430         k = bch2_btree_iter_peek_slot(&iter);
431         ret = bkey_err(k);
432         if (ret)
433                 goto err;
434
435         if (k.k->type != KEY_TYPE_snapshot) {
436                 bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
437                 ret = -ENOENT;
438                 goto err;
439         }
440
441         s = bkey_s_c_to_snapshot(k);
442
443         BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
444         parent_id = le32_to_cpu(s.v->parent);
445
446         if (parent_id) {
447                 bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
448                                      POS(0, parent_id),
449                                      BTREE_ITER_INTENT);
450                 k = bch2_btree_iter_peek_slot(&p_iter);
451                 ret = bkey_err(k);
452                 if (ret)
453                         goto err;
454
455                 if (k.k->type != KEY_TYPE_snapshot) {
456                         bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
457                         ret = -ENOENT;
458                         goto err;
459                 }
460
461                 parent = bch2_trans_kmalloc(trans, sizeof(*parent));
462                 ret = PTR_ERR_OR_ZERO(parent);
463                 if (ret)
464                         goto err;
465
466                 bkey_reassemble(&parent->k_i, k);
467
468                 for (i = 0; i < 2; i++)
469                         if (le32_to_cpu(parent->v.children[i]) == id)
470                                 break;
471
472                 if (i == 2)
473                         bch_err(trans->c, "snapshot %u missing child pointer to %u",
474                                 parent_id, id);
475                 else
476                         parent->v.children[i] = 0;
477
478                 if (le32_to_cpu(parent->v.children[0]) <
479                     le32_to_cpu(parent->v.children[1]))
480                         swap(parent->v.children[0],
481                              parent->v.children[1]);
482
483                 ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
484                 if (ret)
485                         goto err;
486         }
487
488         ret = bch2_btree_delete_at(trans, &iter, 0);
489 err:
490         bch2_trans_iter_exit(trans, &p_iter);
491         bch2_trans_iter_exit(trans, &iter);
492         return ret;
493 }
494
495 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
496                               u32 *new_snapids,
497                               u32 *snapshot_subvols,
498                               unsigned nr_snapids)
499 {
500         struct btree_iter iter;
501         struct bkey_i_snapshot *n;
502         struct bkey_s_c k;
503         unsigned i;
504         int ret = 0;
505
506         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
507                              POS_MIN, BTREE_ITER_INTENT);
508         k = bch2_btree_iter_peek(&iter);
509         ret = bkey_err(k);
510         if (ret)
511                 goto err;
512
513         for (i = 0; i < nr_snapids; i++) {
514                 k = bch2_btree_iter_prev_slot(&iter);
515                 ret = bkey_err(k);
516                 if (ret)
517                         goto err;
518
519                 if (!k.k || !k.k->p.offset) {
520                         ret = -ENOSPC;
521                         goto err;
522                 }
523
524                 n = bch2_trans_kmalloc(trans, sizeof(*n));
525                 ret = PTR_ERR_OR_ZERO(n);
526                 if (ret)
527                         goto err;
528
529                 bkey_snapshot_init(&n->k_i);
530                 n->k.p          = iter.pos;
531                 n->v.flags      = 0;
532                 n->v.parent     = cpu_to_le32(parent);
533                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
534                 n->v.pad        = 0;
535                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
536
537                 ret   = bch2_trans_update(trans, &iter, &n->k_i, 0) ?:
538                         bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
539                 if (ret)
540                         goto err;
541
542                 new_snapids[i]  = iter.pos.offset;
543         }
544
545         if (parent) {
546                 bch2_btree_iter_set_pos(&iter, POS(0, parent));
547                 k = bch2_btree_iter_peek(&iter);
548                 ret = bkey_err(k);
549                 if (ret)
550                         goto err;
551
552                 if (k.k->type != KEY_TYPE_snapshot) {
553                         bch_err(trans->c, "snapshot %u not found", parent);
554                         ret = -ENOENT;
555                         goto err;
556                 }
557
558                 n = bch2_trans_kmalloc(trans, sizeof(*n));
559                 ret = PTR_ERR_OR_ZERO(n);
560                 if (ret)
561                         goto err;
562
563                 bkey_reassemble(&n->k_i, k);
564
565                 if (n->v.children[0] || n->v.children[1]) {
566                         bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
567                         ret = -EINVAL;
568                         goto err;
569                 }
570
571                 n->v.children[0] = cpu_to_le32(new_snapids[0]);
572                 n->v.children[1] = cpu_to_le32(new_snapids[1]);
573                 n->v.subvol = 0;
574                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
575                 ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
576                 if (ret)
577                         goto err;
578         }
579 err:
580         bch2_trans_iter_exit(trans, &iter);
581         return ret;
582 }
583
584 static int snapshot_delete_key(struct btree_trans *trans,
585                                struct btree_iter *iter,
586                                struct bkey_s_c k,
587                                snapshot_id_list *deleted,
588                                snapshot_id_list *equiv_seen,
589                                struct bpos *last_pos)
590 {
591         struct bch_fs *c = trans->c;
592         u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
593
594         if (bkey_cmp(k.k->p, *last_pos))
595                 equiv_seen->nr = 0;
596         *last_pos = k.k->p;
597
598         if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
599             snapshot_list_has_id(equiv_seen, equiv)) {
600                 return bch2_btree_delete_at(trans, iter,
601                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
602         } else {
603                 return snapshot_list_add(c, equiv_seen, equiv);
604         }
605 }
606
607 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
608                                           struct bkey_s_c k)
609 {
610         struct bkey_s_c_snapshot snap;
611         u32 children[2];
612         int ret;
613
614         if (k.k->type != KEY_TYPE_snapshot)
615                 return 0;
616
617         snap = bkey_s_c_to_snapshot(k);
618         if (BCH_SNAPSHOT_DELETED(snap.v) ||
619             BCH_SNAPSHOT_SUBVOL(snap.v))
620                 return 0;
621
622         children[0] = le32_to_cpu(snap.v->children[0]);
623         children[1] = le32_to_cpu(snap.v->children[1]);
624
625         ret   = snapshot_live(trans, children[0]) ?:
626                 snapshot_live(trans, children[1]);
627         if (ret < 0)
628                 return ret;
629
630         if (!ret)
631                 return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
632         return 0;
633 }
634
635 int bch2_delete_dead_snapshots(struct bch_fs *c)
636 {
637         struct btree_trans trans;
638         struct btree_iter iter;
639         struct bkey_s_c k;
640         struct bkey_s_c_snapshot snap;
641         snapshot_id_list deleted = { 0 };
642         u32 i, id;
643         int ret = 0;
644
645         if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
646                 return 0;
647
648         if (!test_bit(BCH_FS_STARTED, &c->flags)) {
649                 ret = bch2_fs_read_write_early(c);
650                 if (ret) {
651                         bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret));
652                         return ret;
653                 }
654         }
655
656         bch2_trans_init(&trans, c, 0, 0);
657
658         /*
659          * For every snapshot node: If we have no live children and it's not
660          * pointed to by a subvolume, delete it:
661          */
662         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
663                         POS_MIN, 0, k,
664                         NULL, NULL, 0,
665                 bch2_delete_redundant_snapshot(&trans, &iter, k));
666         if (ret) {
667                 bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
668                 goto err;
669         }
670
671         for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
672                            POS_MIN, 0, k,
673                 bch2_snapshot_set_equiv(&trans, k));
674         if (ret) {
675                 bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
676                 goto err;
677         }
678
679         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
680                            POS_MIN, 0, k, ret) {
681                 if (k.k->type != KEY_TYPE_snapshot)
682                         continue;
683
684                 snap = bkey_s_c_to_snapshot(k);
685                 if (BCH_SNAPSHOT_DELETED(snap.v)) {
686                         ret = snapshot_list_add(c, &deleted, k.k->p.offset);
687                         if (ret)
688                                 break;
689                 }
690         }
691         bch2_trans_iter_exit(&trans, &iter);
692
693         if (ret) {
694                 bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
695                 goto err;
696         }
697
698         for (id = 0; id < BTREE_ID_NR; id++) {
699                 struct bpos last_pos = POS_MIN;
700                 snapshot_id_list equiv_seen = { 0 };
701
702                 if (!btree_type_has_snapshots(id))
703                         continue;
704
705                 ret = for_each_btree_key_commit(&trans, iter,
706                                 id, POS_MIN,
707                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
708                                 NULL, NULL, BTREE_INSERT_NOFAIL,
709                         snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
710
711                 darray_exit(&equiv_seen);
712
713                 if (ret) {
714                         bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
715                         goto err;
716                 }
717         }
718
719         for (i = 0; i < deleted.nr; i++) {
720                 ret = commit_do(&trans, NULL, NULL, 0,
721                         bch2_snapshot_node_delete(&trans, deleted.data[i]));
722                 if (ret) {
723                         bch_err(c, "error deleting snapshot %u: %s",
724                                 deleted.data[i], bch2_err_str(ret));
725                         goto err;
726                 }
727         }
728
729         clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
730 err:
731         darray_exit(&deleted);
732         bch2_trans_exit(&trans);
733         return ret;
734 }
735
736 static void bch2_delete_dead_snapshots_work(struct work_struct *work)
737 {
738         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
739
740         bch2_delete_dead_snapshots(c);
741         percpu_ref_put(&c->writes);
742 }
743
744 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
745 {
746         if (!percpu_ref_tryget_live(&c->writes))
747                 return;
748
749         if (!queue_work(system_long_wq, &c->snapshot_delete_work))
750                 percpu_ref_put(&c->writes);
751 }
752
753 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
754                                            struct btree_trans_commit_hook *h)
755 {
756         struct bch_fs *c = trans->c;
757
758         set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
759
760         if (!test_bit(BCH_FS_FSCK_DONE, &c->flags))
761                 return 0;
762
763         bch2_delete_dead_snapshots_async(c);
764         return 0;
765 }
766
767 /* Subvolumes: */
768
769 int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
770                            int rw, struct printbuf *err)
771 {
772         if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 ||
773             bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) {
774                 prt_printf(err, "invalid pos");
775                 return -EINVAL;
776         }
777
778         if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) {
779                 prt_printf(err, "incorrect value size (%zu != %zu)",
780                        bkey_val_bytes(k.k), sizeof(struct bch_subvolume));
781                 return -EINVAL;
782         }
783
784         return 0;
785 }
786
787 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
788                             struct bkey_s_c k)
789 {
790         struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
791
792         prt_printf(out, "root %llu snapshot id %u",
793                le64_to_cpu(s.v->inode),
794                le32_to_cpu(s.v->snapshot));
795 }
796
797 int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
798                        bool inconsistent_if_not_found,
799                        int iter_flags,
800                        struct bch_subvolume *s)
801 {
802         struct btree_iter iter;
803         struct bkey_s_c k;
804         int ret;
805
806         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
807                              iter_flags);
808         k = bch2_btree_iter_peek_slot(&iter);
809         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT;
810
811         if (ret == -ENOENT && inconsistent_if_not_found)
812                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
813         if (!ret)
814                 *s = *bkey_s_c_to_subvolume(k).v;
815
816         bch2_trans_iter_exit(trans, &iter);
817         return ret;
818 }
819
820 int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
821                              struct bch_subvolume *subvol)
822 {
823         struct bch_snapshot snap;
824
825         return  snapshot_lookup(trans, snapshot, &snap) ?:
826                 bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
827 }
828
829 int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
830                                 u32 *snapid)
831 {
832         struct bch_subvolume s;
833         int ret;
834
835         ret = bch2_subvolume_get(trans, subvol, true,
836                                  BTREE_ITER_CACHED|
837                                  BTREE_ITER_WITH_UPDATES,
838                                  &s);
839
840         *snapid = le32_to_cpu(s.snapshot);
841         return ret;
842 }
843
844 /*
845  * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
846  * deletion/cleanup:
847  */
848 int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
849 {
850         struct btree_iter iter;
851         struct bkey_s_c k;
852         struct bkey_s_c_subvolume subvol;
853         struct btree_trans_commit_hook *h;
854         u32 snapid;
855         int ret = 0;
856
857         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
858                              POS(0, subvolid),
859                              BTREE_ITER_CACHED|
860                              BTREE_ITER_INTENT);
861         k = bch2_btree_iter_peek_slot(&iter);
862         ret = bkey_err(k);
863         if (ret)
864                 goto err;
865
866         if (k.k->type != KEY_TYPE_subvolume) {
867                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
868                 ret = -EIO;
869                 goto err;
870         }
871
872         subvol = bkey_s_c_to_subvolume(k);
873         snapid = le32_to_cpu(subvol.v->snapshot);
874
875         ret = bch2_btree_delete_at(trans, &iter, 0);
876         if (ret)
877                 goto err;
878
879         ret = bch2_snapshot_node_set_deleted(trans, snapid);
880         if (ret)
881                 goto err;
882
883         h = bch2_trans_kmalloc(trans, sizeof(*h));
884         ret = PTR_ERR_OR_ZERO(h);
885         if (ret)
886                 goto err;
887
888         h->fn = bch2_delete_dead_snapshots_hook;
889         bch2_trans_commit_hook(trans, h);
890 err:
891         bch2_trans_iter_exit(trans, &iter);
892         return ret;
893 }
894
895 void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
896 {
897         struct bch_fs *c = container_of(work, struct bch_fs,
898                                 snapshot_wait_for_pagecache_and_delete_work);
899         snapshot_id_list s;
900         u32 *id;
901         int ret = 0;
902
903         while (!ret) {
904                 mutex_lock(&c->snapshots_unlinked_lock);
905                 s = c->snapshots_unlinked;
906                 darray_init(&c->snapshots_unlinked);
907                 mutex_unlock(&c->snapshots_unlinked_lock);
908
909                 if (!s.nr)
910                         break;
911
912                 bch2_evict_subvolume_inodes(c, &s);
913
914                 for (id = s.data; id < s.data + s.nr; id++) {
915                         ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
916                                       bch2_subvolume_delete(&trans, *id));
917                         if (ret) {
918                                 bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
919                                 break;
920                         }
921                 }
922
923                 darray_exit(&s);
924         }
925
926         percpu_ref_put(&c->writes);
927 }
928
929 struct subvolume_unlink_hook {
930         struct btree_trans_commit_hook  h;
931         u32                             subvol;
932 };
933
934 int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
935                                                       struct btree_trans_commit_hook *_h)
936 {
937         struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
938         struct bch_fs *c = trans->c;
939         int ret = 0;
940
941         mutex_lock(&c->snapshots_unlinked_lock);
942         if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
943                 ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol);
944         mutex_unlock(&c->snapshots_unlinked_lock);
945
946         if (ret)
947                 return ret;
948
949         if (unlikely(!percpu_ref_tryget_live(&c->writes)))
950                 return -EROFS;
951
952         if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
953                 percpu_ref_put(&c->writes);
954         return 0;
955 }
956
957 int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
958 {
959         struct btree_iter iter;
960         struct bkey_s_c k;
961         struct bkey_i_subvolume *n;
962         struct subvolume_unlink_hook *h;
963         int ret = 0;
964
965         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
966                              POS(0, subvolid),
967                              BTREE_ITER_CACHED|
968                              BTREE_ITER_INTENT);
969         k = bch2_btree_iter_peek_slot(&iter);
970         ret = bkey_err(k);
971         if (ret)
972                 goto err;
973
974         if (k.k->type != KEY_TYPE_subvolume) {
975                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
976                 ret = -EIO;
977                 goto err;
978         }
979
980         n = bch2_trans_kmalloc(trans, sizeof(*n));
981         ret = PTR_ERR_OR_ZERO(n);
982         if (ret)
983                 goto err;
984
985         bkey_reassemble(&n->k_i, k);
986         SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
987
988         ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
989         if (ret)
990                 goto err;
991
992         h = bch2_trans_kmalloc(trans, sizeof(*h));
993         ret = PTR_ERR_OR_ZERO(h);
994         if (ret)
995                 goto err;
996
997         h->h.fn         = bch2_subvolume_wait_for_pagecache_and_delete_hook;
998         h->subvol       = subvolid;
999         bch2_trans_commit_hook(trans, &h->h);
1000 err:
1001         bch2_trans_iter_exit(trans, &iter);
1002         return ret;
1003 }
1004
1005 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
1006                           u32 src_subvolid,
1007                           u32 *new_subvolid,
1008                           u32 *new_snapshotid,
1009                           bool ro)
1010 {
1011         struct bch_fs *c = trans->c;
1012         struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
1013         struct bkey_i_subvolume *new_subvol = NULL;
1014         struct bkey_i_subvolume *src_subvol = NULL;
1015         struct bkey_s_c k;
1016         u32 parent = 0, new_nodes[2], snapshot_subvols[2];
1017         int ret = 0;
1018
1019         for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
1020                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
1021                 if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
1022                         break;
1023
1024                 /*
1025                  * bch2_subvolume_delete() doesn't flush the btree key cache -
1026                  * ideally it would but that's tricky
1027                  */
1028                 if (bkey_deleted(k.k) &&
1029                     !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos))
1030                         goto found_slot;
1031         }
1032
1033         if (!ret)
1034                 ret = -ENOSPC;
1035         goto err;
1036 found_slot:
1037         snapshot_subvols[0] = dst_iter.pos.offset;
1038         snapshot_subvols[1] = src_subvolid;
1039
1040         if (src_subvolid) {
1041                 /* Creating a snapshot: */
1042                 src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
1043                 ret = PTR_ERR_OR_ZERO(src_subvol);
1044                 if (ret)
1045                         goto err;
1046
1047                 bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
1048                                      POS(0, src_subvolid),
1049                                      BTREE_ITER_CACHED|
1050                                      BTREE_ITER_INTENT);
1051                 k = bch2_btree_iter_peek_slot(&src_iter);
1052                 ret = bkey_err(k);
1053                 if (ret)
1054                         goto err;
1055
1056                 if (k.k->type != KEY_TYPE_subvolume) {
1057                         bch_err(c, "subvolume %u not found", src_subvolid);
1058                         ret = -ENOENT;
1059                         goto err;
1060                 }
1061
1062                 bkey_reassemble(&src_subvol->k_i, k);
1063                 parent = le32_to_cpu(src_subvol->v.snapshot);
1064         }
1065
1066         ret = bch2_snapshot_node_create(trans, parent, new_nodes,
1067                                         snapshot_subvols,
1068                                         src_subvolid ? 2 : 1);
1069         if (ret)
1070                 goto err;
1071
1072         if (src_subvolid) {
1073                 src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
1074                 ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
1075                 if (ret)
1076                         goto err;
1077         }
1078
1079         new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
1080         ret = PTR_ERR_OR_ZERO(new_subvol);
1081         if (ret)
1082                 goto err;
1083
1084         bkey_subvolume_init(&new_subvol->k_i);
1085         new_subvol->v.flags     = 0;
1086         new_subvol->v.snapshot  = cpu_to_le32(new_nodes[0]);
1087         new_subvol->v.inode     = cpu_to_le64(inode);
1088         SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
1089         SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
1090         new_subvol->k.p         = dst_iter.pos;
1091         ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
1092         if (ret)
1093                 goto err;
1094
1095         *new_subvolid   = new_subvol->k.p.offset;
1096         *new_snapshotid = new_nodes[0];
1097 err:
1098         bch2_trans_iter_exit(trans, &src_iter);
1099         bch2_trans_iter_exit(trans, &dst_iter);
1100         return ret;
1101 }
1102
1103 int bch2_fs_subvolumes_init(struct bch_fs *c)
1104 {
1105         INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
1106         INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
1107                   bch2_subvolume_wait_for_pagecache_and_delete);
1108         mutex_init(&c->snapshots_unlinked_lock);
1109         return 0;
1110 }