]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/subvolume.c
Update bcachefs sources to 720f644e63 bcachefs: Improve reflink repair code
[bcachefs-tools-debian] / libbcachefs / subvolume.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "btree_key_cache.h"
5 #include "btree_update.h"
6 #include "error.h"
7 #include "subvolume.h"
8
9 /* Snapshot tree: */
10
11 static void bch2_delete_dead_snapshots_work(struct work_struct *);
12 static void bch2_delete_dead_snapshots(struct bch_fs *);
13
14 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
15                            struct bkey_s_c k)
16 {
17         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
18
19         pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
20                BCH_SNAPSHOT_SUBVOL(s.v),
21                BCH_SNAPSHOT_DELETED(s.v),
22                le32_to_cpu(s.v->parent),
23                le32_to_cpu(s.v->children[0]),
24                le32_to_cpu(s.v->children[1]),
25                le32_to_cpu(s.v->subvol));
26 }
27
28 const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
29 {
30         struct bkey_s_c_snapshot s;
31         u32 i, id;
32
33         if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
34             bkey_cmp(k.k->p, POS(0, 1)) < 0)
35                 return "bad pos";
36
37         if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
38                 return "bad val size";
39
40         s = bkey_s_c_to_snapshot(k);
41
42         id = le32_to_cpu(s.v->parent);
43         if (id && id <= k.k->p.offset)
44                 return "bad parent node";
45
46         if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
47                 return "children not normalized";
48
49         if (s.v->children[0] &&
50             s.v->children[0] == s.v->children[1])
51                 return "duplicate child nodes";
52
53         for (i = 0; i < 2; i++) {
54                 id = le32_to_cpu(s.v->children[i]);
55
56                 if (id >= k.k->p.offset)
57                         return "bad child node";
58         }
59
60         return NULL;
61 }
62
63 int bch2_mark_snapshot(struct bch_fs *c,
64                        struct bkey_s_c old, struct bkey_s_c new,
65                        u64 journal_seq, unsigned flags)
66 {
67         struct snapshot_t *t;
68
69         t = genradix_ptr_alloc(&c->snapshots,
70                                U32_MAX - new.k->p.offset,
71                                GFP_KERNEL);
72         if (!t)
73                 return -ENOMEM;
74
75         if (new.k->type == KEY_TYPE_snapshot) {
76                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
77
78                 t->parent       = le32_to_cpu(s.v->parent);
79                 t->children[0]  = le32_to_cpu(s.v->children[0]);
80                 t->children[1]  = le32_to_cpu(s.v->children[1]);
81                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
82         } else {
83                 t->parent       = 0;
84                 t->children[0]  = 0;
85                 t->children[1]  = 0;
86                 t->subvol       = 0;
87         }
88
89         return 0;
90 }
91
92 static int snapshot_lookup(struct btree_trans *trans, u32 id,
93                            struct bch_snapshot *s)
94 {
95         struct btree_iter iter;
96         struct bkey_s_c k;
97         int ret;
98
99         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
100                              BTREE_ITER_WITH_UPDATES);
101         k = bch2_btree_iter_peek_slot(&iter);
102         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
103
104         if (!ret)
105                 *s = *bkey_s_c_to_snapshot(k).v;
106
107         bch2_trans_iter_exit(trans, &iter);
108         return ret;
109 }
110
111 static int snapshot_live(struct btree_trans *trans, u32 id)
112 {
113         struct bch_snapshot v;
114         int ret;
115
116         if (!id)
117                 return 0;
118
119         ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
120         if (ret == -ENOENT)
121                 bch_err(trans->c, "snapshot node %u not found", id);
122         if (ret)
123                 return ret;
124
125         return !BCH_SNAPSHOT_DELETED(&v);
126 }
127
128 static int bch2_snapshots_set_equiv(struct btree_trans *trans)
129 {
130         struct bch_fs *c = trans->c;
131         struct btree_iter iter;
132         struct bkey_s_c k;
133         struct bkey_s_c_snapshot snap;
134         unsigned i;
135         int ret;
136
137         for_each_btree_key(trans, iter, BTREE_ID_snapshots,
138                            POS_MIN, 0, k, ret) {
139                 u32 id = k.k->p.offset, child[2];
140                 unsigned nr_live = 0, live_idx;
141
142                 if (k.k->type != KEY_TYPE_snapshot)
143                         continue;
144
145                 snap = bkey_s_c_to_snapshot(k);
146                 child[0] = le32_to_cpu(snap.v->children[0]);
147                 child[1] = le32_to_cpu(snap.v->children[1]);
148
149                 for (i = 0; i < 2; i++) {
150                         ret = snapshot_live(trans, child[i]);
151                         if (ret < 0)
152                                 break;
153
154                         if (ret)
155                                 live_idx = i;
156                         nr_live += ret;
157                 }
158
159                 snapshot_t(c, id)->equiv = nr_live == 1
160                         ? snapshot_t(c, child[live_idx])->equiv
161                         : id;
162         }
163         bch2_trans_iter_exit(trans, &iter);
164
165         if (ret)
166                 bch_err(c, "error walking snapshots: %i", ret);
167
168         return ret;
169 }
170
171 /* fsck: */
172 static int bch2_snapshot_check(struct btree_trans *trans,
173                                struct bkey_s_c_snapshot s)
174 {
175         struct bch_subvolume subvol;
176         struct bch_snapshot v;
177         u32 i, id;
178         int ret;
179
180         id = le32_to_cpu(s.v->subvol);
181         ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol));
182         if (ret == -ENOENT)
183                 bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
184                         s.k->p.offset, id);
185         if (ret)
186                 return ret;
187
188         if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
189                 bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
190                         s.k->p.offset);
191                 return -EINVAL;
192         }
193
194         id = le32_to_cpu(s.v->parent);
195         if (id) {
196                 ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
197                 if (ret == -ENOENT)
198                         bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
199                                 s.k->p.offset, id);
200                 if (ret)
201                         return ret;
202
203                 if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
204                     le32_to_cpu(v.children[1]) != s.k->p.offset) {
205                         bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
206                                 id, s.k->p.offset);
207                         return -EINVAL;
208                 }
209         }
210
211         for (i = 0; i < 2 && s.v->children[i]; i++) {
212                 id = le32_to_cpu(s.v->children[i]);
213
214                 ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
215                 if (ret == -ENOENT)
216                         bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
217                                 s.k->p.offset, id);
218                 if (ret)
219                         return ret;
220
221                 if (le32_to_cpu(v.parent) != s.k->p.offset) {
222                         bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
223                                 id, le32_to_cpu(v.parent), s.k->p.offset);
224                         return -EINVAL;
225                 }
226         }
227
228         return 0;
229 }
230
231 int bch2_fs_snapshots_check(struct bch_fs *c)
232 {
233         struct btree_trans trans;
234         struct btree_iter iter;
235         struct bkey_s_c k;
236         struct bch_snapshot s;
237         unsigned id;
238         int ret;
239
240         bch2_trans_init(&trans, c, 0, 0);
241
242         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
243                            POS_MIN, 0, k, ret) {
244                 if (k.k->type != KEY_TYPE_snapshot)
245                         continue;
246
247                 ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
248                 if (ret)
249                         break;
250         }
251         bch2_trans_iter_exit(&trans, &iter);
252
253         if (ret) {
254                 bch_err(c, "error %i checking snapshots", ret);
255                 goto err;
256         }
257
258         for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
259                            POS_MIN, 0, k, ret) {
260                 if (k.k->type != KEY_TYPE_subvolume)
261                         continue;
262 again_2:
263                 id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
264                 ret = snapshot_lookup(&trans, id, &s);
265
266                 if (ret == -EINTR) {
267                         k = bch2_btree_iter_peek(&iter);
268                         goto again_2;
269                 } else if (ret == -ENOENT)
270                         bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
271                                 k.k->p.offset, id);
272                 else if (ret)
273                         break;
274         }
275         bch2_trans_iter_exit(&trans, &iter);
276 err:
277         bch2_trans_exit(&trans);
278         return ret;
279 }
280
281 void bch2_fs_snapshots_exit(struct bch_fs *c)
282 {
283         genradix_free(&c->snapshots);
284 }
285
286 int bch2_fs_snapshots_start(struct bch_fs *c)
287 {
288         struct btree_trans trans;
289         struct btree_iter iter;
290         struct bkey_s_c k;
291         bool have_deleted = false;
292         int ret = 0;
293
294         bch2_trans_init(&trans, c, 0, 0);
295
296         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
297                            POS_MIN, 0, k, ret) {
298                if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
299                        break;
300
301                 if (k.k->type != KEY_TYPE_snapshot) {
302                         bch_err(c, "found wrong key type %u in snapshot node table",
303                                 k.k->type);
304                         continue;
305                 }
306
307                 if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
308                         have_deleted = true;
309
310                 ret = bch2_mark_snapshot(c, bkey_s_c_null, k, 0, 0);
311                 if (ret)
312                         break;
313         }
314         bch2_trans_iter_exit(&trans, &iter);
315
316         if (ret)
317                 goto err;
318
319         ret = bch2_snapshots_set_equiv(&trans);
320         if (ret)
321                 goto err;
322 err:
323         bch2_trans_exit(&trans);
324
325         if (!ret && have_deleted) {
326                 bch_info(c, "restarting deletion of dead snapshots");
327                 if (c->opts.fsck) {
328                         bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
329                 } else {
330                         bch2_delete_dead_snapshots(c);
331                 }
332         }
333
334         return ret;
335 }
336
337 /*
338  * Mark a snapshot as deleted, for future cleanup:
339  */
340 static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
341 {
342         struct btree_iter iter;
343         struct bkey_s_c k;
344         struct bkey_i_snapshot *s;
345         int ret = 0;
346
347         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
348                              BTREE_ITER_INTENT);
349         k = bch2_btree_iter_peek_slot(&iter);
350         ret = bkey_err(k);
351         if (ret)
352                 goto err;
353
354         if (k.k->type != KEY_TYPE_snapshot) {
355                 bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
356                 ret = -ENOENT;
357                 goto err;
358         }
359
360         /* already deleted? */
361         if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
362                 goto err;
363
364         s = bch2_trans_kmalloc(trans, sizeof(*s));
365         ret = PTR_ERR_OR_ZERO(s);
366         if (ret)
367                 goto err;
368
369         bkey_reassemble(&s->k_i, k);
370
371         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
372         ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
373         if (ret)
374                 goto err;
375 err:
376         bch2_trans_iter_exit(trans, &iter);
377         return ret;
378 }
379
380 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
381 {
382         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
383         struct bkey_s_c k;
384         struct bkey_s_c_snapshot s;
385         struct bkey_i_snapshot *parent;
386         u32 parent_id;
387         unsigned i;
388         int ret = 0;
389
390         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
391                              BTREE_ITER_INTENT);
392         k = bch2_btree_iter_peek_slot(&iter);
393         ret = bkey_err(k);
394         if (ret)
395                 goto err;
396
397         if (k.k->type != KEY_TYPE_snapshot) {
398                 bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
399                 ret = -ENOENT;
400                 goto err;
401         }
402
403         s = bkey_s_c_to_snapshot(k);
404
405         BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
406         parent_id = le32_to_cpu(s.v->parent);
407
408         if (parent_id) {
409                 bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
410                                      POS(0, parent_id),
411                                      BTREE_ITER_INTENT);
412                 k = bch2_btree_iter_peek_slot(&p_iter);
413                 ret = bkey_err(k);
414                 if (ret)
415                         goto err;
416
417                 if (k.k->type != KEY_TYPE_snapshot) {
418                         bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
419                         ret = -ENOENT;
420                         goto err;
421                 }
422
423                 parent = bch2_trans_kmalloc(trans, sizeof(*parent));
424                 ret = PTR_ERR_OR_ZERO(parent);
425                 if (ret)
426                         goto err;
427
428                 bkey_reassemble(&parent->k_i, k);
429
430                 for (i = 0; i < 2; i++)
431                         if (le32_to_cpu(parent->v.children[i]) == id)
432                                 break;
433
434                 if (i == 2)
435                         bch_err(trans->c, "snapshot %u missing child pointer to %u",
436                                 parent_id, id);
437                 else
438                         parent->v.children[i] = 0;
439
440                 if (le32_to_cpu(parent->v.children[0]) <
441                     le32_to_cpu(parent->v.children[1]))
442                         swap(parent->v.children[0],
443                              parent->v.children[1]);
444
445                 ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
446                 if (ret)
447                         goto err;
448         }
449
450         ret = bch2_btree_delete_at(trans, &iter, 0);
451 err:
452         bch2_trans_iter_exit(trans, &p_iter);
453         bch2_trans_iter_exit(trans, &iter);
454         return ret;
455 }
456
457 static int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
458                                      u32 *new_snapids,
459                                      u32 *snapshot_subvols,
460                                      unsigned nr_snapids)
461 {
462         struct btree_iter iter;
463         struct bkey_i_snapshot *n;
464         struct bkey_s_c k;
465         unsigned i;
466         int ret = 0;
467
468         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
469                              POS_MIN, BTREE_ITER_INTENT);
470         k = bch2_btree_iter_peek(&iter);
471         ret = bkey_err(k);
472         if (ret)
473                 goto err;
474
475         for (i = 0; i < nr_snapids; i++) {
476                 k = bch2_btree_iter_prev_slot(&iter);
477                 ret = bkey_err(k);
478                 if (ret)
479                         goto err;
480
481                 if (!k.k || !k.k->p.offset) {
482                         ret = -ENOSPC;
483                         goto err;
484                 }
485
486                 n = bch2_trans_kmalloc(trans, sizeof(*n));
487                 ret = PTR_ERR_OR_ZERO(n);
488                 if (ret)
489                         return ret;
490
491                 bkey_snapshot_init(&n->k_i);
492                 n->k.p          = iter.pos;
493                 n->v.flags      = 0;
494                 n->v.parent     = cpu_to_le32(parent);
495                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
496                 n->v.pad        = 0;
497                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
498
499                 bch2_trans_update(trans, &iter, &n->k_i, 0);
500
501                 ret = bch2_mark_snapshot(trans->c, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0, 0);
502                 if (ret)
503                         break;
504
505                 new_snapids[i]  = iter.pos.offset;
506         }
507
508         if (parent) {
509                 bch2_btree_iter_set_pos(&iter, POS(0, parent));
510                 k = bch2_btree_iter_peek(&iter);
511                 ret = bkey_err(k);
512                 if (ret)
513                         goto err;
514
515                 if (k.k->type != KEY_TYPE_snapshot) {
516                         bch_err(trans->c, "snapshot %u not found", parent);
517                         ret = -ENOENT;
518                         goto err;
519                 }
520
521                 n = bch2_trans_kmalloc(trans, sizeof(*n));
522                 ret = PTR_ERR_OR_ZERO(n);
523                 if (ret)
524                         return ret;
525
526                 bkey_reassemble(&n->k_i, k);
527
528                 if (n->v.children[0] || n->v.children[1]) {
529                         bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
530                         ret = -EINVAL;
531                         goto err;
532                 }
533
534                 n->v.children[0] = cpu_to_le32(new_snapids[0]);
535                 n->v.children[1] = cpu_to_le32(new_snapids[1]);
536                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
537                 bch2_trans_update(trans, &iter, &n->k_i, 0);
538         }
539 err:
540         bch2_trans_iter_exit(trans, &iter);
541         return ret;
542 }
543
544 /* List of snapshot IDs that are being deleted: */
545 struct snapshot_id_list {
546         u32             nr;
547         u32             size;
548         u32             *d;
549 };
550
551 static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
552 {
553         unsigned i;
554
555         for (i = 0; i < s->nr; i++)
556                 if (id == s->d[i])
557                         return true;
558         return false;
559 }
560
561 static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
562 {
563         BUG_ON(snapshot_list_has_id(s, id));
564
565         if (s->nr == s->size) {
566                 size_t new_size = max(8U, s->size * 2);
567                 void *n = krealloc(s->d,
568                                    new_size * sizeof(s->d[0]),
569                                    GFP_KERNEL);
570                 if (!n) {
571                         pr_err("error allocating snapshot ID list");
572                         return -ENOMEM;
573                 }
574
575                 s->d    = n;
576                 s->size = new_size;
577         };
578
579         s->d[s->nr++] = id;
580         return 0;
581 }
582
583 static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
584                                            struct snapshot_id_list *deleted,
585                                            enum btree_id btree_id)
586 {
587         struct bch_fs *c = trans->c;
588         struct btree_iter iter;
589         struct bkey_s_c k;
590         struct snapshot_id_list equiv_seen = { 0 };
591         struct bpos last_pos = POS_MIN;
592         int ret = 0;
593
594         /*
595          * XXX: We should also delete whiteouts that no longer overwrite
596          * anything
597          */
598
599         bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
600                              BTREE_ITER_INTENT|
601                              BTREE_ITER_PREFETCH|
602                              BTREE_ITER_NOT_EXTENTS|
603                              BTREE_ITER_ALL_SNAPSHOTS);
604
605         while ((bch2_trans_begin(trans),
606                 (k = bch2_btree_iter_peek(&iter)).k) &&
607                !(ret = bkey_err(k))) {
608                 u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
609
610                 if (bkey_cmp(k.k->p, last_pos))
611                         equiv_seen.nr = 0;
612                 last_pos = k.k->p;
613
614                 if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
615                     snapshot_list_has_id(&equiv_seen, equiv)) {
616                         if (btree_id == BTREE_ID_inodes &&
617                             bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
618                                 continue;
619
620                         ret = __bch2_trans_do(trans, NULL, NULL,
621                                               BTREE_INSERT_NOFAIL,
622                                 bch2_btree_iter_traverse(&iter) ?:
623                                 bch2_btree_delete_at(trans, &iter,
624                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
625                         if (ret)
626                                 break;
627                 } else {
628                         ret = snapshot_id_add(&equiv_seen, equiv);
629                         if (ret)
630                                 break;
631                 }
632
633                 bch2_btree_iter_advance(&iter);
634         }
635         bch2_trans_iter_exit(trans, &iter);
636
637         kfree(equiv_seen.d);
638
639         return ret;
640 }
641
642 static void bch2_delete_dead_snapshots_work(struct work_struct *work)
643 {
644         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
645         struct btree_trans trans;
646         struct btree_iter iter;
647         struct bkey_s_c k;
648         struct bkey_s_c_snapshot snap;
649         struct snapshot_id_list deleted = { 0 };
650         u32 i, id, children[2];
651         int ret = 0;
652
653         bch2_trans_init(&trans, c, 0, 0);
654
655         /*
656          * For every snapshot node: If we have no live children and it's not
657          * pointed to by a subvolume, delete it:
658          */
659         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
660                            POS_MIN, 0, k, ret) {
661                 if (k.k->type != KEY_TYPE_snapshot)
662                         continue;
663
664                 snap = bkey_s_c_to_snapshot(k);
665                 if (BCH_SNAPSHOT_DELETED(snap.v) ||
666                     BCH_SNAPSHOT_SUBVOL(snap.v))
667                         continue;
668
669                 children[0] = le32_to_cpu(snap.v->children[0]);
670                 children[1] = le32_to_cpu(snap.v->children[1]);
671
672                 ret   = snapshot_live(&trans, children[0]) ?:
673                         snapshot_live(&trans, children[1]);
674                 if (ret < 0)
675                         break;
676                 if (ret)
677                         continue;
678
679                 ret = __bch2_trans_do(&trans, NULL, NULL, 0,
680                         bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
681                 if (ret) {
682                         bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
683                         break;
684                 }
685         }
686         bch2_trans_iter_exit(&trans, &iter);
687
688         if (ret) {
689                 bch_err(c, "error walking snapshots: %i", ret);
690                 goto err;
691         }
692
693         ret = bch2_snapshots_set_equiv(&trans);
694         if (ret)
695                 goto err;
696
697         for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
698                            POS_MIN, 0, k, ret) {
699                 if (k.k->type != KEY_TYPE_snapshot)
700                         continue;
701
702                 snap = bkey_s_c_to_snapshot(k);
703                 if (BCH_SNAPSHOT_DELETED(snap.v)) {
704                         ret = snapshot_id_add(&deleted, k.k->p.offset);
705                         if (ret)
706                                 break;
707                 }
708         }
709         bch2_trans_iter_exit(&trans, &iter);
710
711         if (ret) {
712                 bch_err(c, "error walking snapshots: %i", ret);
713                 goto err;
714         }
715
716         for (id = 0; id < BTREE_ID_NR; id++) {
717                 if (!btree_type_has_snapshots(id))
718                         continue;
719
720                 ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
721                 if (ret) {
722                         bch_err(c, "error deleting snapshot keys: %i", ret);
723                         goto err;
724                 }
725         }
726
727         for (i = 0; i < deleted.nr; i++) {
728                 ret = __bch2_trans_do(&trans, NULL, NULL, 0,
729                         bch2_snapshot_node_delete(&trans, deleted.d[i]));
730                 if (ret) {
731                         bch_err(c, "error deleting snapshot %u: %i",
732                                 deleted.d[i], ret);
733                         goto err;
734                 }
735         }
736 err:
737         kfree(deleted.d);
738         bch2_trans_exit(&trans);
739         percpu_ref_put(&c->writes);
740 }
741
742 static void bch2_delete_dead_snapshots(struct bch_fs *c)
743 {
744         if (unlikely(!percpu_ref_tryget(&c->writes)))
745                 return;
746
747         if (!queue_work(system_long_wq, &c->snapshot_delete_work))
748                 percpu_ref_put(&c->writes);
749 }
750
751 static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
752                                            struct btree_trans_commit_hook *h)
753 {
754         bch2_delete_dead_snapshots(trans->c);
755         return 0;
756 }
757
758 /* Subvolumes: */
759
760 const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
761 {
762         if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
763                 return "invalid pos";
764
765         if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
766                 return "invalid pos";
767
768         if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
769                 return "bad val size";
770
771         return NULL;
772 }
773
774 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
775                             struct bkey_s_c k)
776 {
777         struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
778
779         pr_buf(out, "root %llu snapshot id %u",
780                le64_to_cpu(s.v->inode),
781                le32_to_cpu(s.v->snapshot));
782 }
783
784 int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
785                        bool inconsistent_if_not_found,
786                        int iter_flags,
787                        struct bch_subvolume *s)
788 {
789         struct btree_iter iter;
790         struct bkey_s_c k;
791         int ret;
792
793         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
794                              iter_flags);
795         k = bch2_btree_iter_peek_slot(&iter);
796         ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT;
797
798         if (ret == -ENOENT && inconsistent_if_not_found)
799                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
800         if (!ret)
801                 *s = *bkey_s_c_to_subvolume(k).v;
802
803         bch2_trans_iter_exit(trans, &iter);
804         return ret;
805 }
806
807 int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
808                                 u32 *snapid)
809 {
810         struct bch_subvolume s;
811         int ret;
812
813         ret = bch2_subvolume_get(trans, subvol, true,
814                                  BTREE_ITER_CACHED|
815                                  BTREE_ITER_WITH_UPDATES,
816                                  &s);
817
818         *snapid = le32_to_cpu(s.snapshot);
819         return ret;
820 }
821
822 /* XXX: mark snapshot id for deletion, walk btree and delete: */
823 int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid,
824                           int deleting_snapshot)
825 {
826         struct btree_iter iter;
827         struct bkey_s_c k;
828         struct bkey_s_c_subvolume subvol;
829         struct btree_trans_commit_hook *h;
830         struct bkey_i *delete;
831         u32 snapid;
832         int ret = 0;
833
834         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
835                              POS(0, subvolid),
836                              BTREE_ITER_CACHED|
837                              BTREE_ITER_INTENT);
838         k = bch2_btree_iter_peek_slot(&iter);
839         ret = bkey_err(k);
840         if (ret)
841                 goto err;
842
843         if (k.k->type != KEY_TYPE_subvolume) {
844                 bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
845                 ret = -EIO;
846                 goto err;
847         }
848
849         subvol = bkey_s_c_to_subvolume(k);
850         snapid = le32_to_cpu(subvol.v->snapshot);
851
852         if (deleting_snapshot >= 0 &&
853             deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) {
854                 ret = -ENOENT;
855                 goto err;
856         }
857
858         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
859         ret = PTR_ERR_OR_ZERO(delete);
860         if (ret)
861                 goto err;
862
863         bkey_init(&delete->k);
864         delete->k.p = iter.pos;
865         ret = bch2_trans_update(trans, &iter, delete, 0);
866         if (ret)
867                 goto err;
868
869         ret = bch2_snapshot_node_set_deleted(trans, snapid);
870
871         h = bch2_trans_kmalloc(trans, sizeof(*h));
872         ret = PTR_ERR_OR_ZERO(h);
873         if (ret)
874                 goto err;
875
876         h->fn = bch2_delete_dead_snapshots_hook;
877         bch2_trans_commit_hook(trans, h);
878 err:
879         bch2_trans_iter_exit(trans, &iter);
880         return ret;
881 }
882
883 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
884                           u32 src_subvolid,
885                           u32 *new_subvolid,
886                           u32 *new_snapshotid,
887                           bool ro)
888 {
889         struct bch_fs *c = trans->c;
890         struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
891         struct bkey_i_subvolume *new_subvol = NULL;
892         struct bkey_i_subvolume *src_subvol = NULL;
893         struct bkey_s_c k;
894         u32 parent = 0, new_nodes[2], snapshot_subvols[2];
895         int ret = 0;
896
897         for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
898                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
899                 if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
900                         break;
901
902                 /*
903                  * bch2_subvolume_delete() doesn't flush the btree key cache -
904                  * ideally it would but that's tricky
905                  */
906                 if (bkey_deleted(k.k) &&
907                     !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos))
908                         goto found_slot;
909         }
910
911         if (!ret)
912                 ret = -ENOSPC;
913         goto err;
914 found_slot:
915         snapshot_subvols[0] = dst_iter.pos.offset;
916         snapshot_subvols[1] = src_subvolid;
917
918         if (src_subvolid) {
919                 /* Creating a snapshot: */
920                 src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
921                 ret = PTR_ERR_OR_ZERO(src_subvol);
922                 if (ret)
923                         goto err;
924
925                 bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
926                                      POS(0, src_subvolid),
927                                      BTREE_ITER_CACHED|
928                                      BTREE_ITER_INTENT);
929                 k = bch2_btree_iter_peek_slot(&src_iter);
930                 ret = bkey_err(k);
931                 if (ret)
932                         goto err;
933
934                 if (k.k->type != KEY_TYPE_subvolume) {
935                         bch_err(c, "subvolume %u not found", src_subvolid);
936                         ret = -ENOENT;
937                         goto err;
938                 }
939
940                 bkey_reassemble(&src_subvol->k_i, k);
941                 parent = le32_to_cpu(src_subvol->v.snapshot);
942         }
943
944         ret = bch2_snapshot_node_create(trans, parent, new_nodes,
945                                         snapshot_subvols,
946                                         src_subvolid ? 2 : 1);
947         if (ret)
948                 goto err;
949
950         if (src_subvolid) {
951                 src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
952                 bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
953         }
954
955         new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
956         ret = PTR_ERR_OR_ZERO(new_subvol);
957         if (ret)
958                 goto err;
959
960         bkey_subvolume_init(&new_subvol->k_i);
961         new_subvol->v.flags     = 0;
962         new_subvol->v.snapshot  = cpu_to_le32(new_nodes[0]);
963         new_subvol->v.inode     = cpu_to_le64(inode);
964         SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
965         SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
966         new_subvol->k.p         = dst_iter.pos;
967         bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
968
969         *new_subvolid   = new_subvol->k.p.offset;
970         *new_snapshotid = new_nodes[0];
971 err:
972         bch2_trans_iter_exit(trans, &src_iter);
973         bch2_trans_iter_exit(trans, &dst_iter);
974         return ret;
975 }
976
977 int bch2_fs_subvolumes_init(struct bch_fs *c)
978 {
979         INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
980         return 0;
981 }