]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
095453d2d7825a7c411d1e13ddaed7c96a7b0eef
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "darray.h"
9 #include "dirent.h"
10 #include "error.h"
11 #include "fs-common.h"
12 #include "fsck.h"
13 #include "inode.h"
14 #include "keylist.h"
15 #include "recovery.h"
16 #include "snapshot.h"
17 #include "super.h"
18 #include "xattr.h"
19
20 #include <linux/bsearch.h>
21 #include <linux/dcache.h> /* struct qstr */
22
23 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
24
25 /*
26  * XXX: this is handling transaction restarts without returning
27  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
28  */
29 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
30                                     u32 snapshot)
31 {
32         struct btree_iter iter;
33         struct bkey_s_c k;
34         u64 sectors = 0;
35         int ret;
36
37         for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
38                                 SPOS(inum, 0, snapshot),
39                                 POS(inum, U64_MAX),
40                                 0, k, ret)
41                 if (bkey_extent_is_allocation(k.k))
42                         sectors += k.k->size;
43
44         bch2_trans_iter_exit(trans, &iter);
45
46         return ret ?: sectors;
47 }
48
49 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
50                                     u32 snapshot)
51 {
52         struct btree_iter iter;
53         struct bkey_s_c k;
54         struct bkey_s_c_dirent d;
55         u64 subdirs = 0;
56         int ret;
57
58         for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
59                                 SPOS(inum, 0, snapshot),
60                                 POS(inum, U64_MAX),
61                                 0, k, ret) {
62                 if (k.k->type != KEY_TYPE_dirent)
63                         continue;
64
65                 d = bkey_s_c_to_dirent(k);
66                 if (d.v->d_type == DT_DIR)
67                         subdirs++;
68         }
69         bch2_trans_iter_exit(trans, &iter);
70
71         return ret ?: subdirs;
72 }
73
74 static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
75                                     u32 *subvol)
76 {
77         struct bch_snapshot s;
78         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots,
79                                           POS(0, snapshot), 0,
80                                           snapshot, &s);
81         if (!ret)
82                 *subvol = le32_to_cpu(s.subvol);
83         else if (bch2_err_matches(ret, ENOENT))
84                 bch_err(trans->c, "snapshot %u not found", snapshot);
85         return ret;
86
87 }
88
89 static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
90                            u32 *snapshot, u64 *inum)
91 {
92         struct bch_subvolume s;
93         int ret;
94
95         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
96
97         *snapshot = le32_to_cpu(s.snapshot);
98         *inum = le64_to_cpu(s.inode);
99         return ret;
100 }
101
102 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
103                          u32 *snapshot, u64 *inum)
104 {
105         return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
106 }
107
108 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
109                               struct bch_inode_unpacked *inode)
110 {
111         struct btree_iter iter;
112         struct bkey_s_c k;
113         int ret;
114
115         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
116                              POS(0, inode_nr),
117                              BTREE_ITER_ALL_SNAPSHOTS);
118         k = bch2_btree_iter_peek(&iter);
119         ret = bkey_err(k);
120         if (ret)
121                 goto err;
122
123         if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
124                 ret = -BCH_ERR_ENOENT_inode;
125                 goto err;
126         }
127
128         ret = bch2_inode_unpack(k, inode);
129 err:
130         bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
131         bch2_trans_iter_exit(trans, &iter);
132         return ret;
133 }
134
135 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
136                           struct bch_inode_unpacked *inode,
137                           u32 *snapshot)
138 {
139         struct btree_iter iter;
140         struct bkey_s_c k;
141         int ret;
142
143         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
144                                SPOS(0, inode_nr, *snapshot), 0);
145         ret = bkey_err(k);
146         if (ret)
147                 goto err;
148
149         ret = bkey_is_inode(k.k)
150                 ? bch2_inode_unpack(k, inode)
151                 : -BCH_ERR_ENOENT_inode;
152         if (!ret)
153                 *snapshot = iter.pos.snapshot;
154 err:
155         bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
156         bch2_trans_iter_exit(trans, &iter);
157         return ret;
158 }
159
160 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
161                         struct bch_inode_unpacked *inode,
162                         u32 *snapshot)
163 {
164         return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
165 }
166
167 static int __lookup_dirent(struct btree_trans *trans,
168                            struct bch_hash_info hash_info,
169                            subvol_inum dir, struct qstr *name,
170                            u64 *target, unsigned *type)
171 {
172         struct btree_iter iter;
173         struct bkey_s_c_dirent d;
174         int ret;
175
176         ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
177                                &hash_info, dir, name, 0);
178         if (ret)
179                 return ret;
180
181         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
182         *target = le64_to_cpu(d.v->d_inum);
183         *type = d.v->d_type;
184         bch2_trans_iter_exit(trans, &iter);
185         return 0;
186 }
187
188 static int __write_inode(struct btree_trans *trans,
189                          struct bch_inode_unpacked *inode,
190                          u32 snapshot)
191 {
192         struct bkey_inode_buf *inode_p =
193                 bch2_trans_kmalloc(trans, sizeof(*inode_p));
194
195         if (IS_ERR(inode_p))
196                 return PTR_ERR(inode_p);
197
198         bch2_inode_pack(inode_p, inode);
199         inode_p->inode.k.p.snapshot = snapshot;
200
201         return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
202                                 &inode_p->inode.k_i,
203                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
204 }
205
206 static int fsck_write_inode(struct btree_trans *trans,
207                             struct bch_inode_unpacked *inode,
208                             u32 snapshot)
209 {
210         int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
211                             __write_inode(trans, inode, snapshot));
212         if (ret)
213                 bch_err_fn(trans->c, ret);
214         return ret;
215 }
216
217 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
218 {
219         struct bch_fs *c = trans->c;
220         struct btree_iter iter;
221         struct bch_inode_unpacked dir_inode;
222         struct bch_hash_info dir_hash_info;
223         int ret;
224
225         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
226         if (ret)
227                 goto err;
228
229         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
230
231         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
232
233         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
234                                   &dir_hash_info, &iter,
235                                   BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
236         bch2_trans_iter_exit(trans, &iter);
237 err:
238         bch_err_fn(c, ret);
239         return ret;
240 }
241
242 /* Get lost+found, create if it doesn't exist: */
243 static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
244                             struct bch_inode_unpacked *lostfound)
245 {
246         struct bch_fs *c = trans->c;
247         struct bch_inode_unpacked root;
248         struct bch_hash_info root_hash_info;
249         struct qstr lostfound_str = QSTR("lost+found");
250         subvol_inum root_inum = { .subvol = subvol };
251         u64 inum = 0;
252         unsigned d_type = 0;
253         u32 snapshot;
254         int ret;
255
256         ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
257         if (ret)
258                 return ret;
259
260         ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
261         if (ret)
262                 return ret;
263
264         root_hash_info = bch2_hash_info_init(c, &root);
265
266         ret = __lookup_dirent(trans, root_hash_info, root_inum,
267                             &lostfound_str, &inum, &d_type);
268         if (bch2_err_matches(ret, ENOENT)) {
269                 bch_notice(c, "creating lost+found");
270                 goto create_lostfound;
271         }
272
273         bch_err_fn(c, ret);
274         if (ret)
275                 return ret;
276
277         if (d_type != DT_DIR) {
278                 bch_err(c, "error looking up lost+found: not a directory");
279                 return -BCH_ERR_ENOENT_not_directory;
280         }
281
282         /*
283          * The bch2_check_dirents pass has already run, dangling dirents
284          * shouldn't exist here:
285          */
286         return __lookup_inode(trans, inum, lostfound, &snapshot);
287
288 create_lostfound:
289         bch2_inode_init_early(c, lostfound);
290
291         ret = bch2_create_trans(trans, root_inum, &root,
292                                 lostfound, &lostfound_str,
293                                 0, 0, S_IFDIR|0700, 0, NULL, NULL,
294                                 (subvol_inum) { }, 0);
295         bch_err_msg(c, ret, "creating lost+found");
296         return ret;
297 }
298
299 static int __reattach_inode(struct btree_trans *trans,
300                           struct bch_inode_unpacked *inode,
301                           u32 inode_snapshot)
302 {
303         struct bch_hash_info dir_hash;
304         struct bch_inode_unpacked lostfound;
305         char name_buf[20];
306         struct qstr name;
307         u64 dir_offset = 0;
308         u32 subvol;
309         int ret;
310
311         ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
312         if (ret)
313                 return ret;
314
315         ret = lookup_lostfound(trans, subvol, &lostfound);
316         if (ret)
317                 return ret;
318
319         if (S_ISDIR(inode->bi_mode)) {
320                 lostfound.bi_nlink++;
321
322                 ret = __write_inode(trans, &lostfound, U32_MAX);
323                 if (ret)
324                         return ret;
325         }
326
327         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
328
329         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
330         name = (struct qstr) QSTR(name_buf);
331
332         ret = bch2_dirent_create(trans,
333                                  (subvol_inum) {
334                                         .subvol = subvol,
335                                         .inum = lostfound.bi_inum,
336                                  },
337                                  &dir_hash,
338                                  inode_d_type(inode),
339                                  &name, inode->bi_inum, &dir_offset,
340                                  BCH_HASH_SET_MUST_CREATE);
341         if (ret)
342                 return ret;
343
344         inode->bi_dir           = lostfound.bi_inum;
345         inode->bi_dir_offset    = dir_offset;
346
347         return __write_inode(trans, inode, inode_snapshot);
348 }
349
350 static int reattach_inode(struct btree_trans *trans,
351                           struct bch_inode_unpacked *inode,
352                           u32 inode_snapshot)
353 {
354         int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
355                         __reattach_inode(trans, inode, inode_snapshot));
356         bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
357         return ret;
358 }
359
360 static int remove_backpointer(struct btree_trans *trans,
361                               struct bch_inode_unpacked *inode)
362 {
363         struct btree_iter iter;
364         struct bkey_s_c_dirent d;
365         int ret;
366
367         d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
368                                      POS(inode->bi_dir, inode->bi_dir_offset), 0,
369                                      dirent);
370         ret =   bkey_err(d) ?:
371                 __remove_dirent(trans, d.k->p);
372         bch2_trans_iter_exit(trans, &iter);
373         return ret;
374 }
375
376 struct snapshots_seen_entry {
377         u32                             id;
378         u32                             equiv;
379 };
380
381 struct snapshots_seen {
382         struct bpos                     pos;
383         DARRAY(struct snapshots_seen_entry) ids;
384 };
385
386 static inline void snapshots_seen_exit(struct snapshots_seen *s)
387 {
388         darray_exit(&s->ids);
389 }
390
391 static inline void snapshots_seen_init(struct snapshots_seen *s)
392 {
393         memset(s, 0, sizeof(*s));
394 }
395
396 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
397 {
398         struct snapshots_seen_entry *i, n = {
399                 .id     = id,
400                 .equiv  = bch2_snapshot_equiv(c, id),
401         };
402         int ret = 0;
403
404         darray_for_each(s->ids, i) {
405                 if (i->id == id)
406                         return 0;
407                 if (i->id > id)
408                         break;
409         }
410
411         ret = darray_insert_item(&s->ids, i - s->ids.data, n);
412         if (ret)
413                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
414                         s->ids.size);
415         return ret;
416 }
417
418 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
419                                  enum btree_id btree_id, struct bpos pos)
420 {
421         struct snapshots_seen_entry *i, n = {
422                 .id     = pos.snapshot,
423                 .equiv  = bch2_snapshot_equiv(c, pos.snapshot),
424         };
425         int ret = 0;
426
427         if (!bkey_eq(s->pos, pos))
428                 s->ids.nr = 0;
429
430         s->pos = pos;
431         s->pos.snapshot = n.equiv;
432
433         darray_for_each(s->ids, i) {
434                 if (i->id == n.id)
435                         return 0;
436
437                 /*
438                  * We currently don't rigorously track for snapshot cleanup
439                  * needing to be run, so it shouldn't be a fsck error yet:
440                  */
441                 if (i->equiv == n.equiv) {
442                         bch_err(c, "snapshot deletion did not finish:\n"
443                                 "  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
444                                 bch2_btree_id_str(btree_id),
445                                 pos.inode, pos.offset,
446                                 i->id, n.id, n.equiv);
447                         set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
448                         return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
449                 }
450         }
451
452         ret = darray_push(&s->ids, n);
453         if (ret)
454                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
455                         s->ids.size);
456         return ret;
457 }
458
459 /**
460  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
461  * and @ancestor hasn't been overwritten in @seen
462  *
463  * @c:          filesystem handle
464  * @seen:       list of snapshot ids already seen at current position
465  * @id:         descendent snapshot id
466  * @ancestor:   ancestor snapshot id
467  *
468  * Returns:     whether key in @ancestor snapshot is visible in @id snapshot
469  */
470 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
471                                     u32 id, u32 ancestor)
472 {
473         ssize_t i;
474
475         EBUG_ON(id > ancestor);
476         EBUG_ON(!bch2_snapshot_is_equiv(c, id));
477         EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
478
479         /* @ancestor should be the snapshot most recently added to @seen */
480         EBUG_ON(ancestor != seen->pos.snapshot);
481         EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
482
483         if (id == ancestor)
484                 return true;
485
486         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
487                 return false;
488
489         /*
490          * We know that @id is a descendant of @ancestor, we're checking if
491          * we've seen a key that overwrote @ancestor - i.e. also a descendent of
492          * @ascestor and with @id as a descendent.
493          *
494          * But we already know that we're scanning IDs between @id and @ancestor
495          * numerically, since snapshot ID lists are kept sorted, so if we find
496          * an id that's an ancestor of @id we're done:
497          */
498
499         for (i = seen->ids.nr - 2;
500              i >= 0 && seen->ids.data[i].equiv >= id;
501              --i)
502                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
503                         return false;
504
505         return true;
506 }
507
508 /**
509  * ref_visible - given a key with snapshot id @src that points to a key with
510  * snapshot id @dst, test whether there is some snapshot in which @dst is
511  * visible.
512  *
513  * @c:          filesystem handle
514  * @s:          list of snapshot IDs already seen at @src
515  * @src:        snapshot ID of src key
516  * @dst:        snapshot ID of dst key
517  * Returns:     true if there is some snapshot in which @dst is visible
518  *
519  * Assumes we're visiting @src keys in natural key order
520  */
521 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
522                         u32 src, u32 dst)
523 {
524         return dst <= src
525                 ? key_visible_in_snapshot(c, s, dst, src)
526                 : bch2_snapshot_is_ancestor(c, src, dst);
527 }
528
529 static int ref_visible2(struct bch_fs *c,
530                         u32 src, struct snapshots_seen *src_seen,
531                         u32 dst, struct snapshots_seen *dst_seen)
532 {
533         src = bch2_snapshot_equiv(c, src);
534         dst = bch2_snapshot_equiv(c, dst);
535
536         if (dst > src) {
537                 swap(dst, src);
538                 swap(dst_seen, src_seen);
539         }
540         return key_visible_in_snapshot(c, src_seen, dst, src);
541 }
542
543 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                               \
544         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
545              (_i)->snapshot <= (_snapshot); _i++)                                       \
546                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
547
548 struct inode_walker_entry {
549         struct bch_inode_unpacked inode;
550         u32                     snapshot;
551         bool                    seen_this_pos;
552         u64                     count;
553 };
554
555 struct inode_walker {
556         bool                            first_this_inode;
557         bool                            recalculate_sums;
558         struct bpos                     last_pos;
559
560         DARRAY(struct inode_walker_entry) inodes;
561 };
562
563 static void inode_walker_exit(struct inode_walker *w)
564 {
565         darray_exit(&w->inodes);
566 }
567
568 static struct inode_walker inode_walker_init(void)
569 {
570         return (struct inode_walker) { 0, };
571 }
572
573 static int add_inode(struct bch_fs *c, struct inode_walker *w,
574                      struct bkey_s_c inode)
575 {
576         struct bch_inode_unpacked u;
577
578         BUG_ON(bch2_inode_unpack(inode, &u));
579
580         return darray_push(&w->inodes, ((struct inode_walker_entry) {
581                 .inode          = u,
582                 .snapshot       = bch2_snapshot_equiv(c, inode.k->p.snapshot),
583         }));
584 }
585
586 static int get_inodes_all_snapshots(struct btree_trans *trans,
587                                     struct inode_walker *w, u64 inum)
588 {
589         struct bch_fs *c = trans->c;
590         struct btree_iter iter;
591         struct bkey_s_c k;
592         int ret;
593
594         w->recalculate_sums = false;
595         w->inodes.nr = 0;
596
597         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
598                                      BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
599                 if (k.k->p.offset != inum)
600                         break;
601
602                 if (bkey_is_inode(k.k))
603                         add_inode(c, w, k);
604         }
605         bch2_trans_iter_exit(trans, &iter);
606
607         if (ret)
608                 return ret;
609
610         w->first_this_inode = true;
611         return 0;
612 }
613
614 static struct inode_walker_entry *
615 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
616                           u32 snapshot, bool is_whiteout)
617 {
618         struct inode_walker_entry *i;
619
620         snapshot = bch2_snapshot_equiv(c, snapshot);
621
622         darray_for_each(w->inodes, i)
623                 if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
624                         goto found;
625
626         return NULL;
627 found:
628         BUG_ON(snapshot > i->snapshot);
629
630         if (snapshot != i->snapshot && !is_whiteout) {
631                 struct inode_walker_entry new = *i;
632                 size_t pos;
633                 int ret;
634
635                 new.snapshot = snapshot;
636                 new.count = 0;
637
638                 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
639                          w->last_pos.inode, snapshot, i->snapshot);
640
641                 while (i > w->inodes.data && i[-1].snapshot > snapshot)
642                         --i;
643
644                 pos = i - w->inodes.data;
645                 ret = darray_insert_item(&w->inodes, pos, new);
646                 if (ret)
647                         return ERR_PTR(ret);
648
649                 i = w->inodes.data + pos;
650         }
651
652         return i;
653 }
654
655 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
656                                              struct inode_walker *w, struct bpos pos,
657                                              bool is_whiteout)
658 {
659         if (w->last_pos.inode != pos.inode) {
660                 int ret = get_inodes_all_snapshots(trans, w, pos.inode);
661                 if (ret)
662                         return ERR_PTR(ret);
663         } else if (bkey_cmp(w->last_pos, pos)) {
664                 struct inode_walker_entry *i;
665
666                 darray_for_each(w->inodes, i)
667                         i->seen_this_pos = false;
668
669         }
670
671         w->last_pos = pos;
672
673         return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
674 }
675
676 static int __get_visible_inodes(struct btree_trans *trans,
677                                 struct inode_walker *w,
678                                 struct snapshots_seen *s,
679                                 u64 inum)
680 {
681         struct bch_fs *c = trans->c;
682         struct btree_iter iter;
683         struct bkey_s_c k;
684         int ret;
685
686         w->inodes.nr = 0;
687
688         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
689                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
690                 u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
691
692                 if (k.k->p.offset != inum)
693                         break;
694
695                 if (!ref_visible(c, s, s->pos.snapshot, equiv))
696                         continue;
697
698                 if (bkey_is_inode(k.k))
699                         add_inode(c, w, k);
700
701                 if (equiv >= s->pos.snapshot)
702                         break;
703         }
704         bch2_trans_iter_exit(trans, &iter);
705
706         return ret;
707 }
708
709 static int check_key_has_snapshot(struct btree_trans *trans,
710                                   struct btree_iter *iter,
711                                   struct bkey_s_c k)
712 {
713         struct bch_fs *c = trans->c;
714         struct printbuf buf = PRINTBUF;
715         int ret = 0;
716
717         if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
718                                 bkey_in_missing_snapshot,
719                                 "key in missing snapshot: %s",
720                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
721                 ret = bch2_btree_delete_at(trans, iter,
722                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
723 fsck_err:
724         printbuf_exit(&buf);
725         return ret;
726 }
727
728 static int hash_redo_key(struct btree_trans *trans,
729                          const struct bch_hash_desc desc,
730                          struct bch_hash_info *hash_info,
731                          struct btree_iter *k_iter, struct bkey_s_c k)
732 {
733         struct bkey_i *delete;
734         struct bkey_i *tmp;
735
736         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
737         if (IS_ERR(delete))
738                 return PTR_ERR(delete);
739
740         tmp = bch2_bkey_make_mut_noupdate(trans, k);
741         if (IS_ERR(tmp))
742                 return PTR_ERR(tmp);
743
744         bkey_init(&delete->k);
745         delete->k.p = k_iter->pos;
746         return  bch2_btree_iter_traverse(k_iter) ?:
747                 bch2_trans_update(trans, k_iter, delete, 0) ?:
748                 bch2_hash_set_snapshot(trans, desc, hash_info,
749                                        (subvol_inum) { 0, k.k->p.inode },
750                                        k.k->p.snapshot, tmp,
751                                        BCH_HASH_SET_MUST_CREATE,
752                                        BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
753                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
754 }
755
756 static int hash_check_key(struct btree_trans *trans,
757                           const struct bch_hash_desc desc,
758                           struct bch_hash_info *hash_info,
759                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
760 {
761         struct bch_fs *c = trans->c;
762         struct btree_iter iter = { NULL };
763         struct printbuf buf = PRINTBUF;
764         struct bkey_s_c k;
765         u64 hash;
766         int ret = 0;
767
768         if (hash_k.k->type != desc.key_type)
769                 return 0;
770
771         hash = desc.hash_bkey(hash_info, hash_k);
772
773         if (likely(hash == hash_k.k->p.offset))
774                 return 0;
775
776         if (hash_k.k->p.offset < hash)
777                 goto bad_hash;
778
779         for_each_btree_key_norestart(trans, iter, desc.btree_id,
780                                      SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
781                                      BTREE_ITER_SLOTS, k, ret) {
782                 if (bkey_eq(k.k->p, hash_k.k->p))
783                         break;
784
785                 if (fsck_err_on(k.k->type == desc.key_type &&
786                                 !desc.cmp_bkey(k, hash_k), c,
787                                 hash_table_key_duplicate,
788                                 "duplicate hash table keys:\n%s",
789                                 (printbuf_reset(&buf),
790                                  bch2_bkey_val_to_text(&buf, c, hash_k),
791                                  buf.buf))) {
792                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
793                         break;
794                 }
795
796                 if (bkey_deleted(k.k)) {
797                         bch2_trans_iter_exit(trans, &iter);
798                         goto bad_hash;
799                 }
800         }
801 out:
802         bch2_trans_iter_exit(trans, &iter);
803         printbuf_exit(&buf);
804         return ret;
805 bad_hash:
806         if (fsck_err(c, hash_table_key_wrong_offset,
807                      "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
808                      bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
809                      (printbuf_reset(&buf),
810                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
811                 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
812                 bch_err_fn(c, ret);
813                 if (ret)
814                         return ret;
815                 ret = -BCH_ERR_transaction_restart_nested;
816         }
817 fsck_err:
818         goto out;
819 }
820
821 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
822 {
823         struct btree_iter iter;
824         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
825         int ret = bkey_err(k);
826         if (ret)
827                 return ret;
828
829         bch2_trans_iter_exit(trans, &iter);
830         return k.k->type == KEY_TYPE_set;
831 }
832
833 static int check_inode(struct btree_trans *trans,
834                        struct btree_iter *iter,
835                        struct bkey_s_c k,
836                        struct bch_inode_unpacked *prev,
837                        struct snapshots_seen *s,
838                        bool full)
839 {
840         struct bch_fs *c = trans->c;
841         struct bch_inode_unpacked u;
842         bool do_update = false;
843         int ret;
844
845         ret = check_key_has_snapshot(trans, iter, k);
846         if (ret < 0)
847                 goto err;
848         if (ret)
849                 return 0;
850
851         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
852         if (ret)
853                 goto err;
854
855         if (!bkey_is_inode(k.k))
856                 return 0;
857
858         BUG_ON(bch2_inode_unpack(k, &u));
859
860         if (!full &&
861             !(u.bi_flags & (BCH_INODE_i_size_dirty|
862                             BCH_INODE_i_sectors_dirty|
863                             BCH_INODE_unlinked)))
864                 return 0;
865
866         if (prev->bi_inum != u.bi_inum)
867                 *prev = u;
868
869         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
870                         inode_d_type(prev)      != inode_d_type(&u),
871                         c, inode_snapshot_mismatch,
872                         "inodes in different snapshots don't match")) {
873                 bch_err(c, "repair not implemented yet");
874                 return -BCH_ERR_fsck_repair_unimplemented;
875         }
876
877         if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
878             bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
879                 struct bpos new_min_pos;
880
881                 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
882                 if (ret)
883                         goto err;
884
885                 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
886
887                 ret = __write_inode(trans, &u, iter->pos.snapshot);
888                 bch_err_msg(c, ret, "in fsck updating inode");
889                 if (ret)
890                         return ret;
891
892                 if (!bpos_eq(new_min_pos, POS_MIN))
893                         bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
894                 return 0;
895         }
896
897         if (u.bi_flags & BCH_INODE_unlinked &&
898             c->sb.version >= bcachefs_metadata_version_deleted_inodes) {
899                 ret = check_inode_deleted_list(trans, k.k->p);
900                 if (ret < 0)
901                         return ret;
902
903                 fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
904                             "inode %llu:%u unlinked, but not on deleted list",
905                             u.bi_inum, k.k->p.snapshot);
906                 ret = 0;
907         }
908
909         if (u.bi_flags & BCH_INODE_unlinked &&
910             (!c->sb.clean ||
911              fsck_err(c, inode_unlinked_but_clean,
912                       "filesystem marked clean, but inode %llu unlinked",
913                       u.bi_inum))) {
914                 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
915                 bch_err_msg(c, ret, "in fsck deleting inode");
916                 return ret;
917         }
918
919         if (u.bi_flags & BCH_INODE_i_size_dirty &&
920             (!c->sb.clean ||
921              fsck_err(c, inode_i_size_dirty_but_clean,
922                       "filesystem marked clean, but inode %llu has i_size dirty",
923                       u.bi_inum))) {
924                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
925
926                 /*
927                  * XXX: need to truncate partial blocks too here - or ideally
928                  * just switch units to bytes and that issue goes away
929                  */
930                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
931                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
932                                      iter->pos.snapshot),
933                                 POS(u.bi_inum, U64_MAX),
934                                 0, NULL);
935                 bch_err_msg(c, ret, "in fsck truncating inode");
936                 if (ret)
937                         return ret;
938
939                 /*
940                  * We truncated without our normal sector accounting hook, just
941                  * make sure we recalculate it:
942                  */
943                 u.bi_flags |= BCH_INODE_i_sectors_dirty;
944
945                 u.bi_flags &= ~BCH_INODE_i_size_dirty;
946                 do_update = true;
947         }
948
949         if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
950             (!c->sb.clean ||
951              fsck_err(c, inode_i_sectors_dirty_but_clean,
952                       "filesystem marked clean, but inode %llu has i_sectors dirty",
953                       u.bi_inum))) {
954                 s64 sectors;
955
956                 bch_verbose(c, "recounting sectors for inode %llu",
957                             u.bi_inum);
958
959                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
960                 if (sectors < 0) {
961                         bch_err_msg(c, sectors, "in fsck recounting inode sectors");
962                         return sectors;
963                 }
964
965                 u.bi_sectors = sectors;
966                 u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
967                 do_update = true;
968         }
969
970         if (u.bi_flags & BCH_INODE_backptr_untrusted) {
971                 u.bi_dir = 0;
972                 u.bi_dir_offset = 0;
973                 u.bi_flags &= ~BCH_INODE_backptr_untrusted;
974                 do_update = true;
975         }
976
977         if (do_update) {
978                 ret = __write_inode(trans, &u, iter->pos.snapshot);
979                 bch_err_msg(c, ret, "in fsck updating inode");
980                 if (ret)
981                         return ret;
982         }
983 err:
984 fsck_err:
985         bch_err_fn(c, ret);
986         return ret;
987 }
988
989 int bch2_check_inodes(struct bch_fs *c)
990 {
991         bool full = c->opts.fsck;
992         struct btree_trans *trans = bch2_trans_get(c);
993         struct btree_iter iter;
994         struct bch_inode_unpacked prev = { 0 };
995         struct snapshots_seen s;
996         struct bkey_s_c k;
997         int ret;
998
999         snapshots_seen_init(&s);
1000
1001         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1002                         POS_MIN,
1003                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1004                         NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1005                 check_inode(trans, &iter, k, &prev, &s, full));
1006
1007         snapshots_seen_exit(&s);
1008         bch2_trans_put(trans);
1009         bch_err_fn(c, ret);
1010         return ret;
1011 }
1012
1013 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
1014                                                 struct btree_iter *iter,
1015                                                 struct bpos pos)
1016 {
1017         return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
1018 }
1019
1020 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
1021                                    struct bkey_s_c_dirent d)
1022 {
1023         return  inode->bi_dir           == d.k->p.inode &&
1024                 inode->bi_dir_offset    == d.k->p.offset;
1025 }
1026
1027 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
1028                                    struct bch_inode_unpacked *inode)
1029 {
1030         return d.v->d_type == DT_SUBVOL
1031                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
1032                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
1033 }
1034
1035 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1036 {
1037         struct bch_fs *c = trans->c;
1038         struct inode_walker_entry *i;
1039         u32 restart_count = trans->restart_count;
1040         int ret = 0;
1041         s64 count2;
1042
1043         darray_for_each(w->inodes, i) {
1044                 if (i->inode.bi_sectors == i->count)
1045                         continue;
1046
1047                 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1048
1049                 if (w->recalculate_sums)
1050                         i->count = count2;
1051
1052                 if (i->count != count2) {
1053                         bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1054                                 w->last_pos.inode, i->snapshot, i->count, count2);
1055                         return -BCH_ERR_internal_fsck_err;
1056                 }
1057
1058                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1059                                 c, inode_i_sectors_wrong,
1060                                 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1061                                 w->last_pos.inode, i->snapshot,
1062                                 i->inode.bi_sectors, i->count)) {
1063                         i->inode.bi_sectors = i->count;
1064                         ret = fsck_write_inode(trans, &i->inode, i->snapshot);
1065                         if (ret)
1066                                 break;
1067                 }
1068         }
1069 fsck_err:
1070         bch_err_fn(c, ret);
1071         return ret ?: trans_was_restarted(trans, restart_count);
1072 }
1073
1074 struct extent_end {
1075         u32                     snapshot;
1076         u64                     offset;
1077         struct snapshots_seen   seen;
1078 };
1079
1080 struct extent_ends {
1081         struct bpos                     last_pos;
1082         DARRAY(struct extent_end)       e;
1083 };
1084
1085 static void extent_ends_reset(struct extent_ends *extent_ends)
1086 {
1087         struct extent_end *i;
1088
1089         darray_for_each(extent_ends->e, i)
1090                 snapshots_seen_exit(&i->seen);
1091
1092         extent_ends->e.nr = 0;
1093 }
1094
1095 static void extent_ends_exit(struct extent_ends *extent_ends)
1096 {
1097         extent_ends_reset(extent_ends);
1098         darray_exit(&extent_ends->e);
1099 }
1100
1101 static void extent_ends_init(struct extent_ends *extent_ends)
1102 {
1103         memset(extent_ends, 0, sizeof(*extent_ends));
1104 }
1105
1106 static int extent_ends_at(struct bch_fs *c,
1107                           struct extent_ends *extent_ends,
1108                           struct snapshots_seen *seen,
1109                           struct bkey_s_c k)
1110 {
1111         struct extent_end *i, n = (struct extent_end) {
1112                 .offset         = k.k->p.offset,
1113                 .snapshot       = k.k->p.snapshot,
1114                 .seen           = *seen,
1115         };
1116
1117         n.seen.ids.data = kmemdup(seen->ids.data,
1118                               sizeof(seen->ids.data[0]) * seen->ids.size,
1119                               GFP_KERNEL);
1120         if (!n.seen.ids.data)
1121                 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1122
1123         darray_for_each(extent_ends->e, i) {
1124                 if (i->snapshot == k.k->p.snapshot) {
1125                         snapshots_seen_exit(&i->seen);
1126                         *i = n;
1127                         return 0;
1128                 }
1129
1130                 if (i->snapshot >= k.k->p.snapshot)
1131                         break;
1132         }
1133
1134         return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1135 }
1136
1137 static int overlapping_extents_found(struct btree_trans *trans,
1138                                      enum btree_id btree,
1139                                      struct bpos pos1, struct snapshots_seen *pos1_seen,
1140                                      struct bkey pos2,
1141                                      bool *fixed,
1142                                      struct extent_end *extent_end)
1143 {
1144         struct bch_fs *c = trans->c;
1145         struct printbuf buf = PRINTBUF;
1146         struct btree_iter iter1, iter2 = { NULL };
1147         struct bkey_s_c k1, k2;
1148         int ret;
1149
1150         BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1151
1152         bch2_trans_iter_init(trans, &iter1, btree, pos1,
1153                              BTREE_ITER_ALL_SNAPSHOTS|
1154                              BTREE_ITER_NOT_EXTENTS);
1155         k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1156         ret = bkey_err(k1);
1157         if (ret)
1158                 goto err;
1159
1160         prt_str(&buf, "\n  ");
1161         bch2_bkey_val_to_text(&buf, c, k1);
1162
1163         if (!bpos_eq(pos1, k1.k->p)) {
1164                 prt_str(&buf, "\n  wanted\n  ");
1165                 bch2_bpos_to_text(&buf, pos1);
1166                 prt_str(&buf, "\n  ");
1167                 bch2_bkey_to_text(&buf, &pos2);
1168
1169                 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1170                         __func__, buf.buf);
1171                 ret = -BCH_ERR_internal_fsck_err;
1172                 goto err;
1173         }
1174
1175         bch2_trans_copy_iter(&iter2, &iter1);
1176
1177         while (1) {
1178                 bch2_btree_iter_advance(&iter2);
1179
1180                 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1181                 ret = bkey_err(k2);
1182                 if (ret)
1183                         goto err;
1184
1185                 if (bpos_ge(k2.k->p, pos2.p))
1186                         break;
1187         }
1188
1189         prt_str(&buf, "\n  ");
1190         bch2_bkey_val_to_text(&buf, c, k2);
1191
1192         if (bpos_gt(k2.k->p, pos2.p) ||
1193             pos2.size != k2.k->size) {
1194                 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1195                         __func__, buf.buf);
1196                 ret = -BCH_ERR_internal_fsck_err;
1197                 goto err;
1198         }
1199
1200         prt_printf(&buf, "\n  overwriting %s extent",
1201                    pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1202
1203         if (fsck_err(c, extent_overlapping,
1204                      "overlapping extents%s", buf.buf)) {
1205                 struct btree_iter *old_iter = &iter1;
1206                 struct disk_reservation res = { 0 };
1207
1208                 if (pos1.snapshot < pos2.p.snapshot) {
1209                         old_iter = &iter2;
1210                         swap(k1, k2);
1211                 }
1212
1213                 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1214
1215                 ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1216                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
1217                                 k1, k2) ?:
1218                         bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1219                 bch2_disk_reservation_put(c, &res);
1220
1221                 if (ret)
1222                         goto err;
1223
1224                 *fixed = true;
1225
1226                 if (pos1.snapshot == pos2.p.snapshot) {
1227                         /*
1228                          * We overwrote the first extent, and did the overwrite
1229                          * in the same snapshot:
1230                          */
1231                         extent_end->offset = bkey_start_offset(&pos2);
1232                 } else if (pos1.snapshot > pos2.p.snapshot) {
1233                         /*
1234                          * We overwrote the first extent in pos2's snapshot:
1235                          */
1236                         ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1237                 } else {
1238                         /*
1239                          * We overwrote the second extent - restart
1240                          * check_extent() from the top:
1241                          */
1242                         ret = -BCH_ERR_transaction_restart_nested;
1243                 }
1244         }
1245 fsck_err:
1246 err:
1247         bch2_trans_iter_exit(trans, &iter2);
1248         bch2_trans_iter_exit(trans, &iter1);
1249         printbuf_exit(&buf);
1250         return ret;
1251 }
1252
1253 static int check_overlapping_extents(struct btree_trans *trans,
1254                               struct snapshots_seen *seen,
1255                               struct extent_ends *extent_ends,
1256                               struct bkey_s_c k,
1257                               u32 equiv,
1258                               struct btree_iter *iter,
1259                               bool *fixed)
1260 {
1261         struct bch_fs *c = trans->c;
1262         struct extent_end *i;
1263         int ret = 0;
1264
1265         /* transaction restart, running again */
1266         if (bpos_eq(extent_ends->last_pos, k.k->p))
1267                 return 0;
1268
1269         if (extent_ends->last_pos.inode != k.k->p.inode)
1270                 extent_ends_reset(extent_ends);
1271
1272         darray_for_each(extent_ends->e, i) {
1273                 if (i->offset <= bkey_start_offset(k.k))
1274                         continue;
1275
1276                 if (!ref_visible2(c,
1277                                   k.k->p.snapshot, seen,
1278                                   i->snapshot, &i->seen))
1279                         continue;
1280
1281                 ret = overlapping_extents_found(trans, iter->btree_id,
1282                                                 SPOS(iter->pos.inode,
1283                                                      i->offset,
1284                                                      i->snapshot),
1285                                                 &i->seen,
1286                                                 *k.k, fixed, i);
1287                 if (ret)
1288                         goto err;
1289         }
1290
1291         ret = extent_ends_at(c, extent_ends, seen, k);
1292         if (ret)
1293                 goto err;
1294
1295         extent_ends->last_pos = k.k->p;
1296 err:
1297         return ret;
1298 }
1299
1300 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1301                                 struct bkey_s_c k)
1302 {
1303         struct bch_fs *c = trans->c;
1304         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1305         struct bch_extent_crc_unpacked crc;
1306         const union bch_extent_entry *i;
1307         unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1308
1309         bkey_for_each_crc(k.k, ptrs, crc, i)
1310                 if (crc_is_encoded(crc) &&
1311                     crc.uncompressed_size > encoded_extent_max_sectors) {
1312                         struct printbuf buf = PRINTBUF;
1313
1314                         bch2_bkey_val_to_text(&buf, c, k);
1315                         bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1316                         printbuf_exit(&buf);
1317                 }
1318
1319         return 0;
1320 }
1321
1322 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1323                         struct bkey_s_c k,
1324                         struct inode_walker *inode,
1325                         struct snapshots_seen *s,
1326                         struct extent_ends *extent_ends)
1327 {
1328         struct bch_fs *c = trans->c;
1329         struct inode_walker_entry *i;
1330         struct printbuf buf = PRINTBUF;
1331         struct bpos equiv = k.k->p;
1332         int ret = 0;
1333
1334         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1335
1336         ret = check_key_has_snapshot(trans, iter, k);
1337         if (ret) {
1338                 ret = ret < 0 ? ret : 0;
1339                 goto out;
1340         }
1341
1342         if (inode->last_pos.inode != k.k->p.inode) {
1343                 ret = check_i_sectors(trans, inode);
1344                 if (ret)
1345                         goto err;
1346         }
1347
1348         i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
1349         ret = PTR_ERR_OR_ZERO(i);
1350         if (ret)
1351                 goto err;
1352
1353         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1354         if (ret)
1355                 goto err;
1356
1357         if (k.k->type != KEY_TYPE_whiteout) {
1358                 if (fsck_err_on(!i, c, extent_in_missing_inode,
1359                                 "extent in missing inode:\n  %s",
1360                                 (printbuf_reset(&buf),
1361                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1362                         goto delete;
1363
1364                 if (fsck_err_on(i &&
1365                                 !S_ISREG(i->inode.bi_mode) &&
1366                                 !S_ISLNK(i->inode.bi_mode),
1367                                 c, extent_in_non_reg_inode,
1368                                 "extent in non regular inode mode %o:\n  %s",
1369                                 i->inode.bi_mode,
1370                                 (printbuf_reset(&buf),
1371                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1372                         goto delete;
1373
1374                 ret = check_overlapping_extents(trans, s, extent_ends, k,
1375                                                 equiv.snapshot, iter,
1376                                                 &inode->recalculate_sums);
1377                 if (ret)
1378                         goto err;
1379         }
1380
1381         /*
1382          * Check inodes in reverse order, from oldest snapshots to newest,
1383          * starting from the inode that matches this extent's snapshot. If we
1384          * didn't have one, iterate over all inodes:
1385          */
1386         if (!i)
1387                 i = inode->inodes.data + inode->inodes.nr - 1;
1388
1389         for (;
1390              inode->inodes.data && i >= inode->inodes.data;
1391              --i) {
1392                 if (i->snapshot > equiv.snapshot ||
1393                     !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
1394                         continue;
1395
1396                 if (k.k->type != KEY_TYPE_whiteout) {
1397                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1398                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1399                                         !bkey_extent_is_reservation(k),
1400                                         c, extent_past_end_of_inode,
1401                                         "extent type past end of inode %llu:%u, i_size %llu\n  %s",
1402                                         i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1403                                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1404                                 struct btree_iter iter2;
1405
1406                                 bch2_trans_copy_iter(&iter2, iter);
1407                                 bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1408                                 ret =   bch2_btree_iter_traverse(&iter2) ?:
1409                                         bch2_btree_delete_at(trans, &iter2,
1410                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1411                                 bch2_trans_iter_exit(trans, &iter2);
1412                                 if (ret)
1413                                         goto err;
1414
1415                                 iter->k.type = KEY_TYPE_whiteout;
1416                         }
1417
1418                         if (bkey_extent_is_allocation(k.k))
1419                                 i->count += k.k->size;
1420                 }
1421
1422                 i->seen_this_pos = true;
1423         }
1424 out:
1425 err:
1426 fsck_err:
1427         printbuf_exit(&buf);
1428         bch_err_fn(c, ret);
1429         return ret;
1430 delete:
1431         ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1432         goto out;
1433 }
1434
1435 /*
1436  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1437  * that i_size an i_sectors are consistent
1438  */
1439 int bch2_check_extents(struct bch_fs *c)
1440 {
1441         struct inode_walker w = inode_walker_init();
1442         struct snapshots_seen s;
1443         struct btree_trans *trans = bch2_trans_get(c);
1444         struct btree_iter iter;
1445         struct bkey_s_c k;
1446         struct extent_ends extent_ends;
1447         struct disk_reservation res = { 0 };
1448         int ret = 0;
1449
1450         snapshots_seen_init(&s);
1451         extent_ends_init(&extent_ends);
1452
1453         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1454                         POS(BCACHEFS_ROOT_INO, 0),
1455                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1456                         &res, NULL,
1457                         BCH_TRANS_COMMIT_no_enospc, ({
1458                 bch2_disk_reservation_put(c, &res);
1459                 check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1460                 check_extent_overbig(trans, &iter, k);
1461         })) ?:
1462         check_i_sectors(trans, &w);
1463
1464         bch2_disk_reservation_put(c, &res);
1465         extent_ends_exit(&extent_ends);
1466         inode_walker_exit(&w);
1467         snapshots_seen_exit(&s);
1468         bch2_trans_put(trans);
1469
1470         bch_err_fn(c, ret);
1471         return ret;
1472 }
1473
1474 int bch2_check_indirect_extents(struct bch_fs *c)
1475 {
1476         struct btree_trans *trans = bch2_trans_get(c);
1477         struct btree_iter iter;
1478         struct bkey_s_c k;
1479         struct disk_reservation res = { 0 };
1480         int ret = 0;
1481
1482         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1483                         POS_MIN,
1484                         BTREE_ITER_PREFETCH, k,
1485                         &res, NULL,
1486                         BCH_TRANS_COMMIT_no_enospc, ({
1487                 bch2_disk_reservation_put(c, &res);
1488                 check_extent_overbig(trans, &iter, k);
1489         }));
1490
1491         bch2_disk_reservation_put(c, &res);
1492         bch2_trans_put(trans);
1493
1494         bch_err_fn(c, ret);
1495         return ret;
1496 }
1497
1498 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1499 {
1500         struct bch_fs *c = trans->c;
1501         struct inode_walker_entry *i;
1502         u32 restart_count = trans->restart_count;
1503         int ret = 0;
1504         s64 count2;
1505
1506         darray_for_each(w->inodes, i) {
1507                 if (i->inode.bi_nlink == i->count)
1508                         continue;
1509
1510                 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1511                 if (count2 < 0)
1512                         return count2;
1513
1514                 if (i->count != count2) {
1515                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1516                                 i->count, count2);
1517                         i->count = count2;
1518                         if (i->inode.bi_nlink == i->count)
1519                                 continue;
1520                 }
1521
1522                 if (fsck_err_on(i->inode.bi_nlink != i->count,
1523                                 c, inode_dir_wrong_nlink,
1524                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1525                                 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1526                         i->inode.bi_nlink = i->count;
1527                         ret = fsck_write_inode(trans, &i->inode, i->snapshot);
1528                         if (ret)
1529                                 break;
1530                 }
1531         }
1532 fsck_err:
1533         bch_err_fn(c, ret);
1534         return ret ?: trans_was_restarted(trans, restart_count);
1535 }
1536
1537 static int check_dirent_target(struct btree_trans *trans,
1538                                struct btree_iter *iter,
1539                                struct bkey_s_c_dirent d,
1540                                struct bch_inode_unpacked *target,
1541                                u32 target_snapshot)
1542 {
1543         struct bch_fs *c = trans->c;
1544         struct bkey_i_dirent *n;
1545         struct printbuf buf = PRINTBUF;
1546         struct btree_iter bp_iter = { NULL };
1547         int ret = 0;
1548
1549         if (!target->bi_dir &&
1550             !target->bi_dir_offset) {
1551                 target->bi_dir          = d.k->p.inode;
1552                 target->bi_dir_offset   = d.k->p.offset;
1553
1554                 ret = __write_inode(trans, target, target_snapshot);
1555                 if (ret)
1556                         goto err;
1557         }
1558
1559         if (!inode_points_to_dirent(target, d)) {
1560                 struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1561                                       SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1562                 ret = bkey_err(bp_dirent);
1563                 if (ret && !bch2_err_matches(ret, ENOENT))
1564                         goto err;
1565
1566                 bool backpointer_exists = !ret;
1567                 ret = 0;
1568
1569                 bch2_bkey_val_to_text(&buf, c, d.s_c);
1570                 prt_newline(&buf);
1571                 if (backpointer_exists)
1572                         bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1573
1574                 if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
1575                                 c, inode_dir_multiple_links,
1576                                 "directory %llu:%u with multiple links\n%s",
1577                                 target->bi_inum, target_snapshot, buf.buf)) {
1578                         ret = __remove_dirent(trans, d.k->p);
1579                         goto out;
1580                 }
1581
1582                 /*
1583                  * hardlinked file with nlink 0:
1584                  * We're just adjusting nlink here so check_nlinks() will pick
1585                  * it up, it ignores inodes with nlink 0
1586                  */
1587                 if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1588                                 c, inode_multiple_links_but_nlink_0,
1589                                 "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1590                                 target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1591                         target->bi_nlink++;
1592                         target->bi_flags &= ~BCH_INODE_unlinked;
1593
1594                         ret = __write_inode(trans, target, target_snapshot);
1595                         if (ret)
1596                                 goto err;
1597                 }
1598
1599                 if (fsck_err_on(!backpointer_exists,
1600                                 c, inode_wrong_backpointer,
1601                                 "inode %llu:%u has wrong backpointer:\n"
1602                                 "got       %llu:%llu\n"
1603                                 "should be %llu:%llu",
1604                                 target->bi_inum, target_snapshot,
1605                                 target->bi_dir,
1606                                 target->bi_dir_offset,
1607                                 d.k->p.inode,
1608                                 d.k->p.offset)) {
1609                         target->bi_dir          = d.k->p.inode;
1610                         target->bi_dir_offset   = d.k->p.offset;
1611
1612                         ret = __write_inode(trans, target, target_snapshot);
1613                         if (ret)
1614                                 goto err;
1615                 }
1616         }
1617
1618         if (fsck_err_on(d.v->d_type != inode_d_type(target),
1619                         c, dirent_d_type_wrong,
1620                         "incorrect d_type: got %s, should be %s:\n%s",
1621                         bch2_d_type_str(d.v->d_type),
1622                         bch2_d_type_str(inode_d_type(target)),
1623                         (printbuf_reset(&buf),
1624                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1625                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1626                 ret = PTR_ERR_OR_ZERO(n);
1627                 if (ret)
1628                         goto err;
1629
1630                 bkey_reassemble(&n->k_i, d.s_c);
1631                 n->v.d_type = inode_d_type(target);
1632
1633                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1634                 if (ret)
1635                         goto err;
1636
1637                 d = dirent_i_to_s_c(n);
1638         }
1639
1640         if (d.v->d_type == DT_SUBVOL &&
1641             target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
1642             (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
1643              fsck_err(c, dirent_d_parent_subvol_wrong,
1644                       "dirent has wrong d_parent_subvol field: got %u, should be %u",
1645                       le32_to_cpu(d.v->d_parent_subvol),
1646                       target->bi_parent_subvol))) {
1647                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1648                 ret = PTR_ERR_OR_ZERO(n);
1649                 if (ret)
1650                         goto err;
1651
1652                 bkey_reassemble(&n->k_i, d.s_c);
1653                 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1654
1655                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1656                 if (ret)
1657                         goto err;
1658
1659                 d = dirent_i_to_s_c(n);
1660         }
1661 out:
1662 err:
1663 fsck_err:
1664         bch2_trans_iter_exit(trans, &bp_iter);
1665         printbuf_exit(&buf);
1666         bch_err_fn(c, ret);
1667         return ret;
1668 }
1669
1670 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1671                         struct bkey_s_c k,
1672                         struct bch_hash_info *hash_info,
1673                         struct inode_walker *dir,
1674                         struct inode_walker *target,
1675                         struct snapshots_seen *s)
1676 {
1677         struct bch_fs *c = trans->c;
1678         struct bkey_s_c_dirent d;
1679         struct inode_walker_entry *i;
1680         struct printbuf buf = PRINTBUF;
1681         struct bpos equiv;
1682         int ret = 0;
1683
1684         ret = check_key_has_snapshot(trans, iter, k);
1685         if (ret) {
1686                 ret = ret < 0 ? ret : 0;
1687                 goto out;
1688         }
1689
1690         equiv = k.k->p;
1691         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1692
1693         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1694         if (ret)
1695                 goto err;
1696
1697         if (k.k->type == KEY_TYPE_whiteout)
1698                 goto out;
1699
1700         if (dir->last_pos.inode != k.k->p.inode) {
1701                 ret = check_subdir_count(trans, dir);
1702                 if (ret)
1703                         goto err;
1704         }
1705
1706         BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
1707
1708         i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
1709         ret = PTR_ERR_OR_ZERO(i);
1710         if (ret < 0)
1711                 goto err;
1712
1713         if (dir->first_this_inode && dir->inodes.nr)
1714                 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
1715         dir->first_this_inode = false;
1716
1717         if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
1718                         "dirent in nonexisting directory:\n%s",
1719                         (printbuf_reset(&buf),
1720                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1721                 ret = bch2_btree_delete_at(trans, iter,
1722                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1723                 goto out;
1724         }
1725
1726         if (!i)
1727                 goto out;
1728
1729         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
1730                         c, dirent_in_non_dir_inode,
1731                         "dirent in non directory inode type %s:\n%s",
1732                         bch2_d_type_str(inode_d_type(&i->inode)),
1733                         (printbuf_reset(&buf),
1734                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1735                 ret = bch2_btree_delete_at(trans, iter, 0);
1736                 goto out;
1737         }
1738
1739         ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
1740         if (ret < 0)
1741                 goto err;
1742         if (ret) {
1743                 /* dirent has been deleted */
1744                 ret = 0;
1745                 goto out;
1746         }
1747
1748         if (k.k->type != KEY_TYPE_dirent)
1749                 goto out;
1750
1751         d = bkey_s_c_to_dirent(k);
1752
1753         if (d.v->d_type == DT_SUBVOL) {
1754                 struct bch_inode_unpacked subvol_root;
1755                 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1756                 u32 target_snapshot;
1757                 u64 target_inum;
1758
1759                 ret = __subvol_lookup(trans, target_subvol,
1760                                       &target_snapshot, &target_inum);
1761                 if (ret && !bch2_err_matches(ret, ENOENT))
1762                         goto err;
1763
1764                 if (fsck_err_on(ret, c, dirent_to_missing_subvol,
1765                                 "dirent points to missing subvolume %u",
1766                                 le32_to_cpu(d.v->d_child_subvol))) {
1767                         ret = __remove_dirent(trans, d.k->p);
1768                         goto err;
1769                 }
1770
1771                 ret = __lookup_inode(trans, target_inum,
1772                                    &subvol_root, &target_snapshot);
1773                 if (ret && !bch2_err_matches(ret, ENOENT))
1774                         goto err;
1775
1776                 if (fsck_err_on(ret, c, subvol_to_missing_root,
1777                                 "subvolume %u points to missing subvolume root %llu",
1778                                 target_subvol,
1779                                 target_inum)) {
1780                         bch_err(c, "repair not implemented yet");
1781                         ret = -EINVAL;
1782                         goto err;
1783                 }
1784
1785                 if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
1786                                 c, subvol_root_wrong_bi_subvol,
1787                                 "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1788                                 target_inum,
1789                                 subvol_root.bi_subvol, target_subvol)) {
1790                         subvol_root.bi_subvol = target_subvol;
1791                         ret = __write_inode(trans, &subvol_root, target_snapshot);
1792                         if (ret)
1793                                 goto err;
1794                 }
1795
1796                 ret = check_dirent_target(trans, iter, d, &subvol_root,
1797                                           target_snapshot);
1798                 if (ret)
1799                         goto err;
1800         } else {
1801                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1802                 if (ret)
1803                         goto err;
1804
1805                 if (fsck_err_on(!target->inodes.nr,
1806                                 c, dirent_to_missing_inode,
1807                                 "dirent points to missing inode: (equiv %u)\n%s",
1808                                 equiv.snapshot,
1809                                 (printbuf_reset(&buf),
1810                                  bch2_bkey_val_to_text(&buf, c, k),
1811                                  buf.buf))) {
1812                         ret = __remove_dirent(trans, d.k->p);
1813                         if (ret)
1814                                 goto err;
1815                 }
1816
1817                 darray_for_each(target->inodes, i) {
1818                         ret = check_dirent_target(trans, iter, d,
1819                                                   &i->inode, i->snapshot);
1820                         if (ret)
1821                                 goto err;
1822                 }
1823         }
1824
1825         if (d.v->d_type == DT_DIR)
1826                 for_each_visible_inode(c, s, dir, equiv.snapshot, i)
1827                         i->count++;
1828
1829 out:
1830 err:
1831 fsck_err:
1832         printbuf_exit(&buf);
1833         bch_err_fn(c, ret);
1834         return ret;
1835 }
1836
1837 /*
1838  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1839  * validate d_type
1840  */
1841 int bch2_check_dirents(struct bch_fs *c)
1842 {
1843         struct inode_walker dir = inode_walker_init();
1844         struct inode_walker target = inode_walker_init();
1845         struct snapshots_seen s;
1846         struct bch_hash_info hash_info;
1847         struct btree_trans *trans = bch2_trans_get(c);
1848         struct btree_iter iter;
1849         struct bkey_s_c k;
1850         int ret = 0;
1851
1852         snapshots_seen_init(&s);
1853
1854         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
1855                         POS(BCACHEFS_ROOT_INO, 0),
1856                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1857                         k,
1858                         NULL, NULL,
1859                         BCH_TRANS_COMMIT_no_enospc,
1860                 check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
1861
1862         bch2_trans_put(trans);
1863         snapshots_seen_exit(&s);
1864         inode_walker_exit(&dir);
1865         inode_walker_exit(&target);
1866         bch_err_fn(c, ret);
1867         return ret;
1868 }
1869
1870 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
1871                        struct bkey_s_c k,
1872                        struct bch_hash_info *hash_info,
1873                        struct inode_walker *inode)
1874 {
1875         struct bch_fs *c = trans->c;
1876         struct inode_walker_entry *i;
1877         int ret;
1878
1879         ret = check_key_has_snapshot(trans, iter, k);
1880         if (ret)
1881                 return ret;
1882
1883         i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
1884         ret = PTR_ERR_OR_ZERO(i);
1885         if (ret)
1886                 return ret;
1887
1888         if (inode->first_this_inode && inode->inodes.nr)
1889                 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
1890         inode->first_this_inode = false;
1891
1892         if (fsck_err_on(!i, c, xattr_in_missing_inode,
1893                         "xattr for missing inode %llu",
1894                         k.k->p.inode))
1895                 return bch2_btree_delete_at(trans, iter, 0);
1896
1897         if (!i)
1898                 return 0;
1899
1900         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
1901 fsck_err:
1902         bch_err_fn(c, ret);
1903         return ret;
1904 }
1905
1906 /*
1907  * Walk xattrs: verify that they all have a corresponding inode
1908  */
1909 int bch2_check_xattrs(struct bch_fs *c)
1910 {
1911         struct inode_walker inode = inode_walker_init();
1912         struct bch_hash_info hash_info;
1913         struct btree_iter iter;
1914         struct bkey_s_c k;
1915         int ret = 0;
1916
1917         ret = bch2_trans_run(c,
1918                 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
1919                         POS(BCACHEFS_ROOT_INO, 0),
1920                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1921                         k,
1922                         NULL, NULL,
1923                         BCH_TRANS_COMMIT_no_enospc,
1924                 check_xattr(trans, &iter, k, &hash_info, &inode)));
1925         bch_err_fn(c, ret);
1926         return ret;
1927 }
1928
1929 static int check_root_trans(struct btree_trans *trans)
1930 {
1931         struct bch_fs *c = trans->c;
1932         struct bch_inode_unpacked root_inode;
1933         u32 snapshot;
1934         u64 inum;
1935         int ret;
1936
1937         ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1938         if (ret && !bch2_err_matches(ret, ENOENT))
1939                 return ret;
1940
1941         if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
1942                                 "root subvol missing")) {
1943                 struct bkey_i_subvolume root_subvol;
1944
1945                 snapshot        = U32_MAX;
1946                 inum            = BCACHEFS_ROOT_INO;
1947
1948                 bkey_subvolume_init(&root_subvol.k_i);
1949                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1950                 root_subvol.v.flags     = 0;
1951                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1952                 root_subvol.v.inode     = cpu_to_le64(inum);
1953                 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
1954                 bch_err_msg(c, ret, "writing root subvol");
1955                 if (ret)
1956                         goto err;
1957         }
1958
1959         ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1960         if (ret && !bch2_err_matches(ret, ENOENT))
1961                 return ret;
1962
1963         if (mustfix_fsck_err_on(ret, c, root_dir_missing,
1964                                 "root directory missing") ||
1965             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
1966                                 c, root_inode_not_dir,
1967                                 "root inode not a directory")) {
1968                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1969                                 0, NULL);
1970                 root_inode.bi_inum = inum;
1971
1972                 ret = __write_inode(trans, &root_inode, snapshot);
1973                 bch_err_msg(c, ret, "writing root inode");
1974         }
1975 err:
1976 fsck_err:
1977         return ret;
1978 }
1979
1980 /* Get root directory, create if it doesn't exist: */
1981 int bch2_check_root(struct bch_fs *c)
1982 {
1983         int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1984                 check_root_trans(trans));
1985         bch_err_fn(c, ret);
1986         return ret;
1987 }
1988
1989 struct pathbuf_entry {
1990         u64     inum;
1991         u32     snapshot;
1992 };
1993
1994 typedef DARRAY(struct pathbuf_entry) pathbuf;
1995
1996 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
1997 {
1998         struct pathbuf_entry *i;
1999
2000         darray_for_each(*p, i)
2001                 if (i->inum     == inum &&
2002                     i->snapshot == snapshot)
2003                         return true;
2004
2005         return false;
2006 }
2007
2008 static int path_down(struct bch_fs *c, pathbuf *p,
2009                      u64 inum, u32 snapshot)
2010 {
2011         int ret = darray_push(p, ((struct pathbuf_entry) {
2012                 .inum           = inum,
2013                 .snapshot       = snapshot,
2014         }));
2015
2016         if (ret)
2017                 bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
2018                         p->size);
2019         return ret;
2020 }
2021
2022 /*
2023  * Check that a given inode is reachable from the root:
2024  *
2025  * XXX: we should also be verifying that inodes are in the right subvolumes
2026  */
2027 static int check_path(struct btree_trans *trans,
2028                       pathbuf *p,
2029                       struct bch_inode_unpacked *inode,
2030                       u32 snapshot)
2031 {
2032         struct bch_fs *c = trans->c;
2033         int ret = 0;
2034
2035         snapshot = bch2_snapshot_equiv(c, snapshot);
2036         p->nr = 0;
2037
2038         while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
2039                  inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
2040                 struct btree_iter dirent_iter;
2041                 struct bkey_s_c_dirent d;
2042                 u32 parent_snapshot = snapshot;
2043
2044                 if (inode->bi_subvol) {
2045                         u64 inum;
2046
2047                         ret = subvol_lookup(trans, inode->bi_parent_subvol,
2048                                             &parent_snapshot, &inum);
2049                         if (ret)
2050                                 break;
2051                 }
2052
2053                 ret = lockrestart_do(trans,
2054                         PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
2055                                           SPOS(inode->bi_dir, inode->bi_dir_offset,
2056                                                parent_snapshot))).k));
2057                 if (ret && !bch2_err_matches(ret, ENOENT))
2058                         break;
2059
2060                 if (!ret && !dirent_points_to_inode(d, inode)) {
2061                         bch2_trans_iter_exit(trans, &dirent_iter);
2062                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2063                 }
2064
2065                 if (bch2_err_matches(ret, ENOENT)) {
2066                         if (fsck_err(c,  inode_unreachable,
2067                                      "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
2068                                      inode->bi_inum, snapshot,
2069                                      bch2_d_type_str(inode_d_type(inode)),
2070                                      inode->bi_nlink,
2071                                      inode->bi_dir,
2072                                      inode->bi_dir_offset))
2073                                 ret = reattach_inode(trans, inode, snapshot);
2074                         break;
2075                 }
2076
2077                 bch2_trans_iter_exit(trans, &dirent_iter);
2078
2079                 if (!S_ISDIR(inode->bi_mode))
2080                         break;
2081
2082                 ret = path_down(c, p, inode->bi_inum, snapshot);
2083                 if (ret) {
2084                         bch_err(c, "memory allocation failure");
2085                         return ret;
2086                 }
2087
2088                 snapshot = parent_snapshot;
2089
2090                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
2091                 if (ret) {
2092                         /* Should have been caught in dirents pass */
2093                         bch_err(c, "error looking up parent directory: %i", ret);
2094                         break;
2095                 }
2096
2097                 if (path_is_dup(p, inode->bi_inum, snapshot)) {
2098                         struct pathbuf_entry *i;
2099
2100                         /* XXX print path */
2101                         bch_err(c, "directory structure loop");
2102
2103                         darray_for_each(*p, i)
2104                                 pr_err("%llu:%u", i->inum, i->snapshot);
2105                         pr_err("%llu:%u", inode->bi_inum, snapshot);
2106
2107                         if (!fsck_err(c, dir_loop,
2108                                       "directory structure loop"))
2109                                 return 0;
2110
2111                         ret = commit_do(trans, NULL, NULL,
2112                                               BCH_TRANS_COMMIT_no_enospc,
2113                                         remove_backpointer(trans, inode));
2114                         if (ret) {
2115                                 bch_err(c, "error removing dirent: %i", ret);
2116                                 break;
2117                         }
2118
2119                         ret = reattach_inode(trans, inode, snapshot);
2120                 }
2121         }
2122 fsck_err:
2123         bch_err_fn(c, ret);
2124         return ret;
2125 }
2126
2127 /*
2128  * Check for unreachable inodes, as well as loops in the directory structure:
2129  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2130  * unreachable:
2131  */
2132 int bch2_check_directory_structure(struct bch_fs *c)
2133 {
2134         struct btree_trans *trans = bch2_trans_get(c);
2135         struct btree_iter iter;
2136         struct bkey_s_c k;
2137         struct bch_inode_unpacked u;
2138         pathbuf path = { 0, };
2139         int ret;
2140
2141         for_each_btree_key_old(trans, iter, BTREE_ID_inodes, POS_MIN,
2142                                BTREE_ITER_INTENT|
2143                                BTREE_ITER_PREFETCH|
2144                                BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2145                 if (!bkey_is_inode(k.k))
2146                         continue;
2147
2148                 BUG_ON(bch2_inode_unpack(k, &u));
2149
2150                 if (u.bi_flags & BCH_INODE_unlinked)
2151                         continue;
2152
2153                 ret = check_path(trans, &path, &u, iter.pos.snapshot);
2154                 if (ret)
2155                         break;
2156         }
2157         bch2_trans_iter_exit(trans, &iter);
2158         bch2_trans_put(trans);
2159         darray_exit(&path);
2160
2161         bch_err_fn(c, ret);
2162         return ret;
2163 }
2164
2165 struct nlink_table {
2166         size_t          nr;
2167         size_t          size;
2168
2169         struct nlink {
2170                 u64     inum;
2171                 u32     snapshot;
2172                 u32     count;
2173         }               *d;
2174 };
2175
2176 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2177                      u64 inum, u32 snapshot)
2178 {
2179         if (t->nr == t->size) {
2180                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2181                 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2182
2183                 if (!d) {
2184                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2185                                 new_size);
2186                         return -BCH_ERR_ENOMEM_fsck_add_nlink;
2187                 }
2188
2189                 if (t->d)
2190                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2191                 kvfree(t->d);
2192
2193                 t->d = d;
2194                 t->size = new_size;
2195         }
2196
2197
2198         t->d[t->nr++] = (struct nlink) {
2199                 .inum           = inum,
2200                 .snapshot       = snapshot,
2201         };
2202
2203         return 0;
2204 }
2205
2206 static int nlink_cmp(const void *_l, const void *_r)
2207 {
2208         const struct nlink *l = _l;
2209         const struct nlink *r = _r;
2210
2211         return cmp_int(l->inum, r->inum);
2212 }
2213
2214 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2215                      struct nlink_table *links,
2216                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2217 {
2218         struct nlink *link, key = {
2219                 .inum = inum, .snapshot = U32_MAX,
2220         };
2221
2222         if (inum < range_start || inum >= range_end)
2223                 return;
2224
2225         link = __inline_bsearch(&key, links->d, links->nr,
2226                                 sizeof(links->d[0]), nlink_cmp);
2227         if (!link)
2228                 return;
2229
2230         while (link > links->d && link[0].inum == link[-1].inum)
2231                 --link;
2232
2233         for (; link < links->d + links->nr && link->inum == inum; link++)
2234                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2235                         link->count++;
2236                         if (link->snapshot >= snapshot)
2237                                 break;
2238                 }
2239 }
2240
2241 noinline_for_stack
2242 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2243                                        struct nlink_table *t,
2244                                        u64 start, u64 *end)
2245 {
2246         struct btree_iter iter;
2247         struct bkey_s_c k;
2248         struct bch_inode_unpacked u;
2249
2250         int ret = bch2_trans_run(c,
2251                 for_each_btree_key(trans, iter, BTREE_ID_inodes,
2252                                    POS(0, start),
2253                                    BTREE_ITER_INTENT|
2254                                    BTREE_ITER_PREFETCH|
2255                                    BTREE_ITER_ALL_SNAPSHOTS, k, ({
2256                         if (!bkey_is_inode(k.k))
2257                                 continue;
2258
2259                         /* Should never fail, checked by bch2_inode_invalid: */
2260                         BUG_ON(bch2_inode_unpack(k, &u));
2261
2262                         /*
2263                          * Backpointer and directory structure checks are sufficient for
2264                          * directories, since they can't have hardlinks:
2265                          */
2266                         if (S_ISDIR(u.bi_mode))
2267                                 continue;
2268
2269                         if (!u.bi_nlink)
2270                                 continue;
2271
2272                         ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2273                         if (ret) {
2274                                 *end = k.k->p.offset;
2275                                 ret = 0;
2276                                 break;
2277                         }
2278                         0;
2279                 })));
2280
2281         bch_err_fn(c, ret);
2282         return ret;
2283 }
2284
2285 noinline_for_stack
2286 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2287                                      u64 range_start, u64 range_end)
2288 {
2289         struct snapshots_seen s;
2290         struct btree_iter iter;
2291         struct bkey_s_c k;
2292         struct bkey_s_c_dirent d;
2293
2294         snapshots_seen_init(&s);
2295
2296         int ret = bch2_trans_run(c,
2297                 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2298                                    BTREE_ITER_INTENT|
2299                                    BTREE_ITER_PREFETCH|
2300                                    BTREE_ITER_ALL_SNAPSHOTS, k, ({
2301                         ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2302                         if (ret)
2303                                 break;
2304
2305                         switch (k.k->type) {
2306                         case KEY_TYPE_dirent:
2307                                 d = bkey_s_c_to_dirent(k);
2308
2309                                 if (d.v->d_type != DT_DIR &&
2310                                     d.v->d_type != DT_SUBVOL)
2311                                         inc_link(c, &s, links, range_start, range_end,
2312                                                  le64_to_cpu(d.v->d_inum),
2313                                                  bch2_snapshot_equiv(c, d.k->p.snapshot));
2314                                 break;
2315                         }
2316                         0;
2317                 })));
2318
2319         snapshots_seen_exit(&s);
2320
2321         bch_err_fn(c, ret);
2322         return ret;
2323 }
2324
2325 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2326                                      struct bkey_s_c k,
2327                                      struct nlink_table *links,
2328                                      size_t *idx, u64 range_end)
2329 {
2330         struct bch_fs *c = trans->c;
2331         struct bch_inode_unpacked u;
2332         struct nlink *link = &links->d[*idx];
2333         int ret = 0;
2334
2335         if (k.k->p.offset >= range_end)
2336                 return 1;
2337
2338         if (!bkey_is_inode(k.k))
2339                 return 0;
2340
2341         BUG_ON(bch2_inode_unpack(k, &u));
2342
2343         if (S_ISDIR(u.bi_mode))
2344                 return 0;
2345
2346         if (!u.bi_nlink)
2347                 return 0;
2348
2349         while ((cmp_int(link->inum, k.k->p.offset) ?:
2350                 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2351                 BUG_ON(*idx == links->nr);
2352                 link = &links->d[++*idx];
2353         }
2354
2355         if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2356                         c, inode_wrong_nlink,
2357                         "inode %llu type %s has wrong i_nlink (%u, should be %u)",
2358                         u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2359                         bch2_inode_nlink_get(&u), link->count)) {
2360                 bch2_inode_nlink_set(&u, link->count);
2361                 ret = __write_inode(trans, &u, k.k->p.snapshot);
2362         }
2363 fsck_err:
2364         return ret;
2365 }
2366
2367 noinline_for_stack
2368 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2369                                struct nlink_table *links,
2370                                u64 range_start, u64 range_end)
2371 {
2372         struct btree_iter iter;
2373         struct bkey_s_c k;
2374         size_t idx = 0;
2375         int ret = 0;
2376
2377         ret = bch2_trans_run(c,
2378                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2379                                 POS(0, range_start),
2380                                 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
2381                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2382                         check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2383         if (ret < 0) {
2384                 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2385                 return ret;
2386         }
2387
2388         return 0;
2389 }
2390
2391 int bch2_check_nlinks(struct bch_fs *c)
2392 {
2393         struct nlink_table links = { 0 };
2394         u64 this_iter_range_start, next_iter_range_start = 0;
2395         int ret = 0;
2396
2397         do {
2398                 this_iter_range_start = next_iter_range_start;
2399                 next_iter_range_start = U64_MAX;
2400
2401                 ret = check_nlinks_find_hardlinks(c, &links,
2402                                                   this_iter_range_start,
2403                                                   &next_iter_range_start);
2404
2405                 ret = check_nlinks_walk_dirents(c, &links,
2406                                           this_iter_range_start,
2407                                           next_iter_range_start);
2408                 if (ret)
2409                         break;
2410
2411                 ret = check_nlinks_update_hardlinks(c, &links,
2412                                          this_iter_range_start,
2413                                          next_iter_range_start);
2414                 if (ret)
2415                         break;
2416
2417                 links.nr = 0;
2418         } while (next_iter_range_start != U64_MAX);
2419
2420         kvfree(links.d);
2421         bch_err_fn(c, ret);
2422         return ret;
2423 }
2424
2425 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2426                              struct bkey_s_c k)
2427 {
2428         struct bkey_s_c_reflink_p p;
2429         struct bkey_i_reflink_p *u;
2430         int ret;
2431
2432         if (k.k->type != KEY_TYPE_reflink_p)
2433                 return 0;
2434
2435         p = bkey_s_c_to_reflink_p(k);
2436
2437         if (!p.v->front_pad && !p.v->back_pad)
2438                 return 0;
2439
2440         u = bch2_trans_kmalloc(trans, sizeof(*u));
2441         ret = PTR_ERR_OR_ZERO(u);
2442         if (ret)
2443                 return ret;
2444
2445         bkey_reassemble(&u->k_i, k);
2446         u->v.front_pad  = 0;
2447         u->v.back_pad   = 0;
2448
2449         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2450 }
2451
2452 int bch2_fix_reflink_p(struct bch_fs *c)
2453 {
2454         struct btree_iter iter;
2455         struct bkey_s_c k;
2456         int ret;
2457
2458         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2459                 return 0;
2460
2461         ret = bch2_trans_run(c,
2462                 for_each_btree_key_commit(trans, iter,
2463                                 BTREE_ID_extents, POS_MIN,
2464                                 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
2465                                 BTREE_ITER_ALL_SNAPSHOTS, k,
2466                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2467                         fix_reflink_p_key(trans, &iter, k)));
2468         bch_err_fn(c, ret);
2469         return ret;
2470 }