]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
e4a8a14c46bc922983e91edcdc9ece6fe717d3eb
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "dirent.h"
9 #include "error.h"
10 #include "fs-common.h"
11 #include "fsck.h"
12 #include "inode.h"
13 #include "keylist.h"
14 #include "recovery.h"
15 #include "snapshot.h"
16 #include "super.h"
17 #include "xattr.h"
18
19 #include <linux/bsearch.h>
20 #include <linux/darray.h>
21 #include <linux/dcache.h> /* struct qstr */
22
23 /*
24  * XXX: this is handling transaction restarts without returning
25  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
26  */
27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28                                     u32 snapshot)
29 {
30         u64 sectors = 0;
31
32         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33                                 SPOS(inum, 0, snapshot),
34                                 POS(inum, U64_MAX),
35                                 0, k, ({
36                 if (bkey_extent_is_allocation(k.k))
37                         sectors += k.k->size;
38                 0;
39         }));
40
41         return ret ?: sectors;
42 }
43
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45                                     u32 snapshot)
46 {
47         u64 subdirs = 0;
48
49         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50                                     SPOS(inum, 0, snapshot),
51                                     POS(inum, U64_MAX),
52                                     0, k, ({
53                 if (k.k->type == KEY_TYPE_dirent &&
54                     bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55                         subdirs++;
56                 0;
57         }));
58
59         return ret ?: subdirs;
60 }
61
62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63                          u32 *snapshot, u64 *inum)
64 {
65         struct bch_subvolume s;
66         int ret;
67
68         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
69
70         *snapshot = le32_to_cpu(s.snapshot);
71         *inum = le64_to_cpu(s.inode);
72         return ret;
73 }
74
75 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
76                               struct bch_inode_unpacked *inode)
77 {
78         struct btree_iter iter;
79         struct bkey_s_c k;
80         int ret;
81
82         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
83                              POS(0, inode_nr),
84                              BTREE_ITER_ALL_SNAPSHOTS);
85         k = bch2_btree_iter_peek(&iter);
86         ret = bkey_err(k);
87         if (ret)
88                 goto err;
89
90         if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
91                 ret = -BCH_ERR_ENOENT_inode;
92                 goto err;
93         }
94
95         ret = bch2_inode_unpack(k, inode);
96 err:
97         bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
98         bch2_trans_iter_exit(trans, &iter);
99         return ret;
100 }
101
102 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
103                         struct bch_inode_unpacked *inode,
104                         u32 *snapshot)
105 {
106         struct btree_iter iter;
107         struct bkey_s_c k;
108         int ret;
109
110         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
111                                SPOS(0, inode_nr, *snapshot), 0);
112         ret = bkey_err(k);
113         if (ret)
114                 goto err;
115
116         ret = bkey_is_inode(k.k)
117                 ? bch2_inode_unpack(k, inode)
118                 : -BCH_ERR_ENOENT_inode;
119         if (!ret)
120                 *snapshot = iter.pos.snapshot;
121 err:
122         bch2_trans_iter_exit(trans, &iter);
123         return ret;
124 }
125
126 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
127                            struct bch_hash_info hash_info,
128                            subvol_inum dir, struct qstr *name,
129                            u64 *target, unsigned *type, u32 snapshot)
130 {
131         struct btree_iter iter;
132         struct bkey_s_c_dirent d;
133         int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
134                                &hash_info, dir, name, 0, snapshot);
135         if (ret)
136                 return ret;
137
138         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
139         *target = le64_to_cpu(d.v->d_inum);
140         *type = d.v->d_type;
141         bch2_trans_iter_exit(trans, &iter);
142         return 0;
143 }
144
145 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
146 {
147         struct bch_fs *c = trans->c;
148         struct btree_iter iter;
149         struct bch_inode_unpacked dir_inode;
150         struct bch_hash_info dir_hash_info;
151         int ret;
152
153         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
154         if (ret)
155                 goto err;
156
157         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
158
159         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
160
161         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
162                                   &dir_hash_info, &iter,
163                                   BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
164         bch2_trans_iter_exit(trans, &iter);
165 err:
166         bch_err_fn(c, ret);
167         return ret;
168 }
169
170 /* Get lost+found, create if it doesn't exist: */
171 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
172                             struct bch_inode_unpacked *lostfound)
173 {
174         struct bch_fs *c = trans->c;
175         struct qstr lostfound_str = QSTR("lost+found");
176         u64 inum = 0;
177         unsigned d_type = 0;
178         int ret;
179
180         struct bch_snapshot_tree st;
181         ret = bch2_snapshot_tree_lookup(trans,
182                         bch2_snapshot_tree(c, snapshot), &st);
183         if (ret)
184                 return ret;
185
186         subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
187         u32 subvol_snapshot;
188
189         ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
190                             &subvol_snapshot, &root_inum.inum);
191         bch_err_msg(c, ret, "looking up root subvol");
192         if (ret)
193                 return ret;
194
195         struct bch_inode_unpacked root_inode;
196         struct bch_hash_info root_hash_info;
197         u32 root_inode_snapshot = snapshot;
198         ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
199         bch_err_msg(c, ret, "looking up root inode");
200         if (ret)
201                 return ret;
202
203         root_hash_info = bch2_hash_info_init(c, &root_inode);
204
205         ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
206                               &lostfound_str, &inum, &d_type, snapshot);
207         if (bch2_err_matches(ret, ENOENT))
208                 goto create_lostfound;
209
210         bch_err_fn(c, ret);
211         if (ret)
212                 return ret;
213
214         if (d_type != DT_DIR) {
215                 bch_err(c, "error looking up lost+found: not a directory");
216                 return -BCH_ERR_ENOENT_not_directory;
217         }
218
219         /*
220          * The bch2_check_dirents pass has already run, dangling dirents
221          * shouldn't exist here:
222          */
223         ret = lookup_inode(trans, inum, lostfound, &snapshot);
224         bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
225                     inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
226         return ret;
227
228 create_lostfound:
229         /*
230          * XXX: we could have a nicer log message here  if we had a nice way to
231          * walk backpointers to print a path
232          */
233         bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
234
235         u64 now = bch2_current_time(c);
236         struct btree_iter lostfound_iter = { NULL };
237         u64 cpu = raw_smp_processor_id();
238
239         bch2_inode_init_early(c, lostfound);
240         bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
241         lostfound->bi_dir = root_inode.bi_inum;
242
243         root_inode.bi_nlink++;
244
245         ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
246         if (ret)
247                 goto err;
248
249         bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
250         ret = bch2_btree_iter_traverse(&lostfound_iter);
251         if (ret)
252                 goto err;
253
254         ret =   bch2_dirent_create_snapshot(trans,
255                                 root_inode.bi_inum, snapshot, &root_hash_info,
256                                 mode_to_type(lostfound->bi_mode),
257                                 &lostfound_str,
258                                 lostfound->bi_inum,
259                                 &lostfound->bi_dir_offset,
260                                 BCH_HASH_SET_MUST_CREATE) ?:
261                 bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
262                                        BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
263 err:
264         bch_err_msg(c, ret, "creating lost+found");
265         bch2_trans_iter_exit(trans, &lostfound_iter);
266         return ret;
267 }
268
269 static int reattach_inode(struct btree_trans *trans,
270                           struct bch_inode_unpacked *inode,
271                           u32 inode_snapshot)
272 {
273         struct bch_hash_info dir_hash;
274         struct bch_inode_unpacked lostfound;
275         char name_buf[20];
276         struct qstr name;
277         u64 dir_offset = 0;
278         int ret;
279
280         ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
281         if (ret)
282                 return ret;
283
284         if (S_ISDIR(inode->bi_mode)) {
285                 lostfound.bi_nlink++;
286
287                 ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
288                 if (ret)
289                         return ret;
290         }
291
292         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
293
294         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
295         name = (struct qstr) QSTR(name_buf);
296
297         ret = bch2_dirent_create_snapshot(trans,
298                                 lostfound.bi_inum, inode_snapshot,
299                                 &dir_hash,
300                                 inode_d_type(inode),
301                                 &name, inode->bi_inum, &dir_offset,
302                                 BCH_HASH_SET_MUST_CREATE);
303         if (ret)
304                 return ret;
305
306         inode->bi_dir           = lostfound.bi_inum;
307         inode->bi_dir_offset    = dir_offset;
308
309         return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
310 }
311
312 static int remove_backpointer(struct btree_trans *trans,
313                               struct bch_inode_unpacked *inode)
314 {
315         struct btree_iter iter;
316         struct bkey_s_c_dirent d;
317         int ret;
318
319         d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
320                                      POS(inode->bi_dir, inode->bi_dir_offset), 0,
321                                      dirent);
322         ret =   bkey_err(d) ?:
323                 __remove_dirent(trans, d.k->p);
324         bch2_trans_iter_exit(trans, &iter);
325         return ret;
326 }
327
328 struct snapshots_seen_entry {
329         u32                             id;
330         u32                             equiv;
331 };
332
333 struct snapshots_seen {
334         struct bpos                     pos;
335         DARRAY(struct snapshots_seen_entry) ids;
336 };
337
338 static inline void snapshots_seen_exit(struct snapshots_seen *s)
339 {
340         darray_exit(&s->ids);
341 }
342
343 static inline void snapshots_seen_init(struct snapshots_seen *s)
344 {
345         memset(s, 0, sizeof(*s));
346 }
347
348 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
349 {
350         struct snapshots_seen_entry *i, n = {
351                 .id     = id,
352                 .equiv  = bch2_snapshot_equiv(c, id),
353         };
354         int ret = 0;
355
356         __darray_for_each(s->ids, i) {
357                 if (i->id == id)
358                         return 0;
359                 if (i->id > id)
360                         break;
361         }
362
363         ret = darray_insert_item(&s->ids, i - s->ids.data, n);
364         if (ret)
365                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
366                         s->ids.size);
367         return ret;
368 }
369
370 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
371                                  enum btree_id btree_id, struct bpos pos)
372 {
373         struct snapshots_seen_entry n = {
374                 .id     = pos.snapshot,
375                 .equiv  = bch2_snapshot_equiv(c, pos.snapshot),
376         };
377         int ret = 0;
378
379         if (!bkey_eq(s->pos, pos))
380                 s->ids.nr = 0;
381
382         s->pos = pos;
383         s->pos.snapshot = n.equiv;
384
385         darray_for_each(s->ids, i) {
386                 if (i->id == n.id)
387                         return 0;
388
389                 /*
390                  * We currently don't rigorously track for snapshot cleanup
391                  * needing to be run, so it shouldn't be a fsck error yet:
392                  */
393                 if (i->equiv == n.equiv) {
394                         bch_err(c, "snapshot deletion did not finish:\n"
395                                 "  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
396                                 bch2_btree_id_str(btree_id),
397                                 pos.inode, pos.offset,
398                                 i->id, n.id, n.equiv);
399                         set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
400                         return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
401                 }
402         }
403
404         ret = darray_push(&s->ids, n);
405         if (ret)
406                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
407                         s->ids.size);
408         return ret;
409 }
410
411 /**
412  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
413  * and @ancestor hasn't been overwritten in @seen
414  *
415  * @c:          filesystem handle
416  * @seen:       list of snapshot ids already seen at current position
417  * @id:         descendent snapshot id
418  * @ancestor:   ancestor snapshot id
419  *
420  * Returns:     whether key in @ancestor snapshot is visible in @id snapshot
421  */
422 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
423                                     u32 id, u32 ancestor)
424 {
425         ssize_t i;
426
427         EBUG_ON(id > ancestor);
428         EBUG_ON(!bch2_snapshot_is_equiv(c, id));
429         EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
430
431         /* @ancestor should be the snapshot most recently added to @seen */
432         EBUG_ON(ancestor != seen->pos.snapshot);
433         EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
434
435         if (id == ancestor)
436                 return true;
437
438         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
439                 return false;
440
441         /*
442          * We know that @id is a descendant of @ancestor, we're checking if
443          * we've seen a key that overwrote @ancestor - i.e. also a descendent of
444          * @ascestor and with @id as a descendent.
445          *
446          * But we already know that we're scanning IDs between @id and @ancestor
447          * numerically, since snapshot ID lists are kept sorted, so if we find
448          * an id that's an ancestor of @id we're done:
449          */
450
451         for (i = seen->ids.nr - 2;
452              i >= 0 && seen->ids.data[i].equiv >= id;
453              --i)
454                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
455                         return false;
456
457         return true;
458 }
459
460 /**
461  * ref_visible - given a key with snapshot id @src that points to a key with
462  * snapshot id @dst, test whether there is some snapshot in which @dst is
463  * visible.
464  *
465  * @c:          filesystem handle
466  * @s:          list of snapshot IDs already seen at @src
467  * @src:        snapshot ID of src key
468  * @dst:        snapshot ID of dst key
469  * Returns:     true if there is some snapshot in which @dst is visible
470  *
471  * Assumes we're visiting @src keys in natural key order
472  */
473 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
474                         u32 src, u32 dst)
475 {
476         return dst <= src
477                 ? key_visible_in_snapshot(c, s, dst, src)
478                 : bch2_snapshot_is_ancestor(c, src, dst);
479 }
480
481 static int ref_visible2(struct bch_fs *c,
482                         u32 src, struct snapshots_seen *src_seen,
483                         u32 dst, struct snapshots_seen *dst_seen)
484 {
485         src = bch2_snapshot_equiv(c, src);
486         dst = bch2_snapshot_equiv(c, dst);
487
488         if (dst > src) {
489                 swap(dst, src);
490                 swap(dst_seen, src_seen);
491         }
492         return key_visible_in_snapshot(c, src_seen, dst, src);
493 }
494
495 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                               \
496         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
497              (_i)->snapshot <= (_snapshot); _i++)                                       \
498                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
499
500 struct inode_walker_entry {
501         struct bch_inode_unpacked inode;
502         u32                     snapshot;
503         bool                    seen_this_pos;
504         u64                     count;
505 };
506
507 struct inode_walker {
508         bool                            first_this_inode;
509         bool                            recalculate_sums;
510         struct bpos                     last_pos;
511
512         DARRAY(struct inode_walker_entry) inodes;
513 };
514
515 static void inode_walker_exit(struct inode_walker *w)
516 {
517         darray_exit(&w->inodes);
518 }
519
520 static struct inode_walker inode_walker_init(void)
521 {
522         return (struct inode_walker) { 0, };
523 }
524
525 static int add_inode(struct bch_fs *c, struct inode_walker *w,
526                      struct bkey_s_c inode)
527 {
528         struct bch_inode_unpacked u;
529
530         BUG_ON(bch2_inode_unpack(inode, &u));
531
532         return darray_push(&w->inodes, ((struct inode_walker_entry) {
533                 .inode          = u,
534                 .snapshot       = bch2_snapshot_equiv(c, inode.k->p.snapshot),
535         }));
536 }
537
538 static int get_inodes_all_snapshots(struct btree_trans *trans,
539                                     struct inode_walker *w, u64 inum)
540 {
541         struct bch_fs *c = trans->c;
542         struct btree_iter iter;
543         struct bkey_s_c k;
544         int ret;
545
546         w->recalculate_sums = false;
547         w->inodes.nr = 0;
548
549         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
550                                      BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
551                 if (k.k->p.offset != inum)
552                         break;
553
554                 if (bkey_is_inode(k.k))
555                         add_inode(c, w, k);
556         }
557         bch2_trans_iter_exit(trans, &iter);
558
559         if (ret)
560                 return ret;
561
562         w->first_this_inode = true;
563         return 0;
564 }
565
566 static struct inode_walker_entry *
567 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
568                           u32 snapshot, bool is_whiteout)
569 {
570         struct inode_walker_entry *i;
571
572         snapshot = bch2_snapshot_equiv(c, snapshot);
573
574         __darray_for_each(w->inodes, i)
575                 if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
576                         goto found;
577
578         return NULL;
579 found:
580         BUG_ON(snapshot > i->snapshot);
581
582         if (snapshot != i->snapshot && !is_whiteout) {
583                 struct inode_walker_entry new = *i;
584                 size_t pos;
585                 int ret;
586
587                 new.snapshot = snapshot;
588                 new.count = 0;
589
590                 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
591                          w->last_pos.inode, snapshot, i->snapshot);
592
593                 while (i > w->inodes.data && i[-1].snapshot > snapshot)
594                         --i;
595
596                 pos = i - w->inodes.data;
597                 ret = darray_insert_item(&w->inodes, pos, new);
598                 if (ret)
599                         return ERR_PTR(ret);
600
601                 i = w->inodes.data + pos;
602         }
603
604         return i;
605 }
606
607 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
608                                              struct inode_walker *w, struct bpos pos,
609                                              bool is_whiteout)
610 {
611         if (w->last_pos.inode != pos.inode) {
612                 int ret = get_inodes_all_snapshots(trans, w, pos.inode);
613                 if (ret)
614                         return ERR_PTR(ret);
615         } else if (bkey_cmp(w->last_pos, pos)) {
616                 darray_for_each(w->inodes, i)
617                         i->seen_this_pos = false;
618         }
619
620         w->last_pos = pos;
621
622         return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
623 }
624
625 static int __get_visible_inodes(struct btree_trans *trans,
626                                 struct inode_walker *w,
627                                 struct snapshots_seen *s,
628                                 u64 inum)
629 {
630         struct bch_fs *c = trans->c;
631         struct btree_iter iter;
632         struct bkey_s_c k;
633         int ret;
634
635         w->inodes.nr = 0;
636
637         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
638                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
639                 u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
640
641                 if (k.k->p.offset != inum)
642                         break;
643
644                 if (!ref_visible(c, s, s->pos.snapshot, equiv))
645                         continue;
646
647                 if (bkey_is_inode(k.k))
648                         add_inode(c, w, k);
649
650                 if (equiv >= s->pos.snapshot)
651                         break;
652         }
653         bch2_trans_iter_exit(trans, &iter);
654
655         return ret;
656 }
657
658 static int check_key_has_snapshot(struct btree_trans *trans,
659                                   struct btree_iter *iter,
660                                   struct bkey_s_c k)
661 {
662         struct bch_fs *c = trans->c;
663         struct printbuf buf = PRINTBUF;
664         int ret = 0;
665
666         if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
667                                 bkey_in_missing_snapshot,
668                                 "key in missing snapshot: %s",
669                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
670                 ret = bch2_btree_delete_at(trans, iter,
671                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
672 fsck_err:
673         printbuf_exit(&buf);
674         return ret;
675 }
676
677 static int hash_redo_key(struct btree_trans *trans,
678                          const struct bch_hash_desc desc,
679                          struct bch_hash_info *hash_info,
680                          struct btree_iter *k_iter, struct bkey_s_c k)
681 {
682         struct bkey_i *delete;
683         struct bkey_i *tmp;
684
685         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
686         if (IS_ERR(delete))
687                 return PTR_ERR(delete);
688
689         tmp = bch2_bkey_make_mut_noupdate(trans, k);
690         if (IS_ERR(tmp))
691                 return PTR_ERR(tmp);
692
693         bkey_init(&delete->k);
694         delete->k.p = k_iter->pos;
695         return  bch2_btree_iter_traverse(k_iter) ?:
696                 bch2_trans_update(trans, k_iter, delete, 0) ?:
697                 bch2_hash_set_in_snapshot(trans, desc, hash_info,
698                                        (subvol_inum) { 0, k.k->p.inode },
699                                        k.k->p.snapshot, tmp,
700                                        BCH_HASH_SET_MUST_CREATE,
701                                        BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
702                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
703 }
704
705 static int hash_check_key(struct btree_trans *trans,
706                           const struct bch_hash_desc desc,
707                           struct bch_hash_info *hash_info,
708                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
709 {
710         struct bch_fs *c = trans->c;
711         struct btree_iter iter = { NULL };
712         struct printbuf buf = PRINTBUF;
713         struct bkey_s_c k;
714         u64 hash;
715         int ret = 0;
716
717         if (hash_k.k->type != desc.key_type)
718                 return 0;
719
720         hash = desc.hash_bkey(hash_info, hash_k);
721
722         if (likely(hash == hash_k.k->p.offset))
723                 return 0;
724
725         if (hash_k.k->p.offset < hash)
726                 goto bad_hash;
727
728         for_each_btree_key_norestart(trans, iter, desc.btree_id,
729                                      SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
730                                      BTREE_ITER_SLOTS, k, ret) {
731                 if (bkey_eq(k.k->p, hash_k.k->p))
732                         break;
733
734                 if (fsck_err_on(k.k->type == desc.key_type &&
735                                 !desc.cmp_bkey(k, hash_k), c,
736                                 hash_table_key_duplicate,
737                                 "duplicate hash table keys:\n%s",
738                                 (printbuf_reset(&buf),
739                                  bch2_bkey_val_to_text(&buf, c, hash_k),
740                                  buf.buf))) {
741                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
742                         break;
743                 }
744
745                 if (bkey_deleted(k.k)) {
746                         bch2_trans_iter_exit(trans, &iter);
747                         goto bad_hash;
748                 }
749         }
750 out:
751         bch2_trans_iter_exit(trans, &iter);
752         printbuf_exit(&buf);
753         return ret;
754 bad_hash:
755         if (fsck_err(c, hash_table_key_wrong_offset,
756                      "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
757                      bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
758                      (printbuf_reset(&buf),
759                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
760                 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
761                 bch_err_fn(c, ret);
762                 if (ret)
763                         return ret;
764                 ret = -BCH_ERR_transaction_restart_nested;
765         }
766 fsck_err:
767         goto out;
768 }
769
770 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
771 {
772         struct btree_iter iter;
773         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
774         int ret = bkey_err(k);
775         if (ret)
776                 return ret;
777
778         bch2_trans_iter_exit(trans, &iter);
779         return k.k->type == KEY_TYPE_set;
780 }
781
782 static int check_inode(struct btree_trans *trans,
783                        struct btree_iter *iter,
784                        struct bkey_s_c k,
785                        struct bch_inode_unpacked *prev,
786                        struct snapshots_seen *s,
787                        bool full)
788 {
789         struct bch_fs *c = trans->c;
790         struct bch_inode_unpacked u;
791         bool do_update = false;
792         int ret;
793
794         ret = check_key_has_snapshot(trans, iter, k);
795         if (ret < 0)
796                 goto err;
797         if (ret)
798                 return 0;
799
800         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
801         if (ret)
802                 goto err;
803
804         if (!bkey_is_inode(k.k))
805                 return 0;
806
807         BUG_ON(bch2_inode_unpack(k, &u));
808
809         if (!full &&
810             !(u.bi_flags & (BCH_INODE_i_size_dirty|
811                             BCH_INODE_i_sectors_dirty|
812                             BCH_INODE_unlinked)))
813                 return 0;
814
815         if (prev->bi_inum != u.bi_inum)
816                 *prev = u;
817
818         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
819                         inode_d_type(prev)      != inode_d_type(&u),
820                         c, inode_snapshot_mismatch,
821                         "inodes in different snapshots don't match")) {
822                 bch_err(c, "repair not implemented yet");
823                 return -BCH_ERR_fsck_repair_unimplemented;
824         }
825
826         if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
827             bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
828                 struct bpos new_min_pos;
829
830                 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
831                 if (ret)
832                         goto err;
833
834                 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
835
836                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
837
838                 bch_err_msg(c, ret, "in fsck updating inode");
839                 if (ret)
840                         return ret;
841
842                 if (!bpos_eq(new_min_pos, POS_MIN))
843                         bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
844                 return 0;
845         }
846
847         if (u.bi_flags & BCH_INODE_unlinked) {
848                 ret = check_inode_deleted_list(trans, k.k->p);
849                 if (ret < 0)
850                         return ret;
851
852                 fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
853                             "inode %llu:%u unlinked, but not on deleted list",
854                             u.bi_inum, k.k->p.snapshot);
855                 ret = 0;
856         }
857
858         if (u.bi_flags & BCH_INODE_unlinked &&
859             (!c->sb.clean ||
860              fsck_err(c, inode_unlinked_but_clean,
861                       "filesystem marked clean, but inode %llu unlinked",
862                       u.bi_inum))) {
863                 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
864                 bch_err_msg(c, ret, "in fsck deleting inode");
865                 return ret;
866         }
867
868         if (u.bi_flags & BCH_INODE_i_size_dirty &&
869             (!c->sb.clean ||
870              fsck_err(c, inode_i_size_dirty_but_clean,
871                       "filesystem marked clean, but inode %llu has i_size dirty",
872                       u.bi_inum))) {
873                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
874
875                 /*
876                  * XXX: need to truncate partial blocks too here - or ideally
877                  * just switch units to bytes and that issue goes away
878                  */
879                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
880                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
881                                      iter->pos.snapshot),
882                                 POS(u.bi_inum, U64_MAX),
883                                 0, NULL);
884                 bch_err_msg(c, ret, "in fsck truncating inode");
885                 if (ret)
886                         return ret;
887
888                 /*
889                  * We truncated without our normal sector accounting hook, just
890                  * make sure we recalculate it:
891                  */
892                 u.bi_flags |= BCH_INODE_i_sectors_dirty;
893
894                 u.bi_flags &= ~BCH_INODE_i_size_dirty;
895                 do_update = true;
896         }
897
898         if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
899             (!c->sb.clean ||
900              fsck_err(c, inode_i_sectors_dirty_but_clean,
901                       "filesystem marked clean, but inode %llu has i_sectors dirty",
902                       u.bi_inum))) {
903                 s64 sectors;
904
905                 bch_verbose(c, "recounting sectors for inode %llu",
906                             u.bi_inum);
907
908                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
909                 if (sectors < 0) {
910                         bch_err_msg(c, sectors, "in fsck recounting inode sectors");
911                         return sectors;
912                 }
913
914                 u.bi_sectors = sectors;
915                 u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
916                 do_update = true;
917         }
918
919         if (u.bi_flags & BCH_INODE_backptr_untrusted) {
920                 u.bi_dir = 0;
921                 u.bi_dir_offset = 0;
922                 u.bi_flags &= ~BCH_INODE_backptr_untrusted;
923                 do_update = true;
924         }
925
926         if (u.bi_subvol) {
927                 struct bch_subvolume s;
928
929                 ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
930                 if (ret && !bch2_err_matches(ret, ENOENT))
931                         goto err;
932
933                 if (fsck_err_on(ret,
934                                 c, inode_bi_subvol_missing,
935                                 "inode %llu:%u bi_subvol points to missing subvolume %u",
936                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
937                     fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
938                                 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
939                                                            k.k->p.snapshot),
940                                 c, inode_bi_subvol_wrong,
941                                 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
942                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol,
943                                 le64_to_cpu(s.inode),
944                                 le32_to_cpu(s.snapshot))) {
945                         u.bi_subvol = 0;
946                         u.bi_parent_subvol = 0;
947                         do_update = true;
948                 }
949         }
950
951         if (do_update) {
952                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
953                 bch_err_msg(c, ret, "in fsck updating inode");
954                 if (ret)
955                         return ret;
956         }
957 err:
958 fsck_err:
959         bch_err_fn(c, ret);
960         return ret;
961 }
962
963 int bch2_check_inodes(struct bch_fs *c)
964 {
965         bool full = c->opts.fsck;
966         struct bch_inode_unpacked prev = { 0 };
967         struct snapshots_seen s;
968
969         snapshots_seen_init(&s);
970
971         int ret = bch2_trans_run(c,
972                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
973                                 POS_MIN,
974                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
975                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
976                         check_inode(trans, &iter, k, &prev, &s, full)));
977
978         snapshots_seen_exit(&s);
979         bch_err_fn(c, ret);
980         return ret;
981 }
982
983 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
984                                                 struct btree_iter *iter,
985                                                 struct bpos pos)
986 {
987         return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
988 }
989
990 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
991                                    struct bkey_s_c_dirent d)
992 {
993         return  inode->bi_dir           == d.k->p.inode &&
994                 inode->bi_dir_offset    == d.k->p.offset;
995 }
996
997 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
998                                    struct bch_inode_unpacked *inode)
999 {
1000         return d.v->d_type == DT_SUBVOL
1001                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
1002                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
1003 }
1004
1005 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1006 {
1007         struct bch_fs *c = trans->c;
1008         u32 restart_count = trans->restart_count;
1009         int ret = 0;
1010         s64 count2;
1011
1012         darray_for_each(w->inodes, i) {
1013                 if (i->inode.bi_sectors == i->count)
1014                         continue;
1015
1016                 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1017
1018                 if (w->recalculate_sums)
1019                         i->count = count2;
1020
1021                 if (i->count != count2) {
1022                         bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1023                                 w->last_pos.inode, i->snapshot, i->count, count2);
1024                         return -BCH_ERR_internal_fsck_err;
1025                 }
1026
1027                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1028                                 c, inode_i_sectors_wrong,
1029                                 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1030                                 w->last_pos.inode, i->snapshot,
1031                                 i->inode.bi_sectors, i->count)) {
1032                         i->inode.bi_sectors = i->count;
1033                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1034                         if (ret)
1035                                 break;
1036                 }
1037         }
1038 fsck_err:
1039         bch_err_fn(c, ret);
1040         return ret ?: trans_was_restarted(trans, restart_count);
1041 }
1042
1043 struct extent_end {
1044         u32                     snapshot;
1045         u64                     offset;
1046         struct snapshots_seen   seen;
1047 };
1048
1049 struct extent_ends {
1050         struct bpos                     last_pos;
1051         DARRAY(struct extent_end)       e;
1052 };
1053
1054 static void extent_ends_reset(struct extent_ends *extent_ends)
1055 {
1056         darray_for_each(extent_ends->e, i)
1057                 snapshots_seen_exit(&i->seen);
1058         extent_ends->e.nr = 0;
1059 }
1060
1061 static void extent_ends_exit(struct extent_ends *extent_ends)
1062 {
1063         extent_ends_reset(extent_ends);
1064         darray_exit(&extent_ends->e);
1065 }
1066
1067 static void extent_ends_init(struct extent_ends *extent_ends)
1068 {
1069         memset(extent_ends, 0, sizeof(*extent_ends));
1070 }
1071
1072 static int extent_ends_at(struct bch_fs *c,
1073                           struct extent_ends *extent_ends,
1074                           struct snapshots_seen *seen,
1075                           struct bkey_s_c k)
1076 {
1077         struct extent_end *i, n = (struct extent_end) {
1078                 .offset         = k.k->p.offset,
1079                 .snapshot       = k.k->p.snapshot,
1080                 .seen           = *seen,
1081         };
1082
1083         n.seen.ids.data = kmemdup(seen->ids.data,
1084                               sizeof(seen->ids.data[0]) * seen->ids.size,
1085                               GFP_KERNEL);
1086         if (!n.seen.ids.data)
1087                 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1088
1089         __darray_for_each(extent_ends->e, i) {
1090                 if (i->snapshot == k.k->p.snapshot) {
1091                         snapshots_seen_exit(&i->seen);
1092                         *i = n;
1093                         return 0;
1094                 }
1095
1096                 if (i->snapshot >= k.k->p.snapshot)
1097                         break;
1098         }
1099
1100         return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1101 }
1102
1103 static int overlapping_extents_found(struct btree_trans *trans,
1104                                      enum btree_id btree,
1105                                      struct bpos pos1, struct snapshots_seen *pos1_seen,
1106                                      struct bkey pos2,
1107                                      bool *fixed,
1108                                      struct extent_end *extent_end)
1109 {
1110         struct bch_fs *c = trans->c;
1111         struct printbuf buf = PRINTBUF;
1112         struct btree_iter iter1, iter2 = { NULL };
1113         struct bkey_s_c k1, k2;
1114         int ret;
1115
1116         BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1117
1118         bch2_trans_iter_init(trans, &iter1, btree, pos1,
1119                              BTREE_ITER_ALL_SNAPSHOTS|
1120                              BTREE_ITER_NOT_EXTENTS);
1121         k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1122         ret = bkey_err(k1);
1123         if (ret)
1124                 goto err;
1125
1126         prt_str(&buf, "\n  ");
1127         bch2_bkey_val_to_text(&buf, c, k1);
1128
1129         if (!bpos_eq(pos1, k1.k->p)) {
1130                 prt_str(&buf, "\n  wanted\n  ");
1131                 bch2_bpos_to_text(&buf, pos1);
1132                 prt_str(&buf, "\n  ");
1133                 bch2_bkey_to_text(&buf, &pos2);
1134
1135                 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1136                         __func__, buf.buf);
1137                 ret = -BCH_ERR_internal_fsck_err;
1138                 goto err;
1139         }
1140
1141         bch2_trans_copy_iter(&iter2, &iter1);
1142
1143         while (1) {
1144                 bch2_btree_iter_advance(&iter2);
1145
1146                 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1147                 ret = bkey_err(k2);
1148                 if (ret)
1149                         goto err;
1150
1151                 if (bpos_ge(k2.k->p, pos2.p))
1152                         break;
1153         }
1154
1155         prt_str(&buf, "\n  ");
1156         bch2_bkey_val_to_text(&buf, c, k2);
1157
1158         if (bpos_gt(k2.k->p, pos2.p) ||
1159             pos2.size != k2.k->size) {
1160                 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1161                         __func__, buf.buf);
1162                 ret = -BCH_ERR_internal_fsck_err;
1163                 goto err;
1164         }
1165
1166         prt_printf(&buf, "\n  overwriting %s extent",
1167                    pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1168
1169         if (fsck_err(c, extent_overlapping,
1170                      "overlapping extents%s", buf.buf)) {
1171                 struct btree_iter *old_iter = &iter1;
1172                 struct disk_reservation res = { 0 };
1173
1174                 if (pos1.snapshot < pos2.p.snapshot) {
1175                         old_iter = &iter2;
1176                         swap(k1, k2);
1177                 }
1178
1179                 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1180
1181                 ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1182                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
1183                                 k1, k2) ?:
1184                         bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1185                 bch2_disk_reservation_put(c, &res);
1186
1187                 if (ret)
1188                         goto err;
1189
1190                 *fixed = true;
1191
1192                 if (pos1.snapshot == pos2.p.snapshot) {
1193                         /*
1194                          * We overwrote the first extent, and did the overwrite
1195                          * in the same snapshot:
1196                          */
1197                         extent_end->offset = bkey_start_offset(&pos2);
1198                 } else if (pos1.snapshot > pos2.p.snapshot) {
1199                         /*
1200                          * We overwrote the first extent in pos2's snapshot:
1201                          */
1202                         ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1203                 } else {
1204                         /*
1205                          * We overwrote the second extent - restart
1206                          * check_extent() from the top:
1207                          */
1208                         ret = -BCH_ERR_transaction_restart_nested;
1209                 }
1210         }
1211 fsck_err:
1212 err:
1213         bch2_trans_iter_exit(trans, &iter2);
1214         bch2_trans_iter_exit(trans, &iter1);
1215         printbuf_exit(&buf);
1216         return ret;
1217 }
1218
1219 static int check_overlapping_extents(struct btree_trans *trans,
1220                               struct snapshots_seen *seen,
1221                               struct extent_ends *extent_ends,
1222                               struct bkey_s_c k,
1223                               u32 equiv,
1224                               struct btree_iter *iter,
1225                               bool *fixed)
1226 {
1227         struct bch_fs *c = trans->c;
1228         int ret = 0;
1229
1230         /* transaction restart, running again */
1231         if (bpos_eq(extent_ends->last_pos, k.k->p))
1232                 return 0;
1233
1234         if (extent_ends->last_pos.inode != k.k->p.inode)
1235                 extent_ends_reset(extent_ends);
1236
1237         darray_for_each(extent_ends->e, i) {
1238                 if (i->offset <= bkey_start_offset(k.k))
1239                         continue;
1240
1241                 if (!ref_visible2(c,
1242                                   k.k->p.snapshot, seen,
1243                                   i->snapshot, &i->seen))
1244                         continue;
1245
1246                 ret = overlapping_extents_found(trans, iter->btree_id,
1247                                                 SPOS(iter->pos.inode,
1248                                                      i->offset,
1249                                                      i->snapshot),
1250                                                 &i->seen,
1251                                                 *k.k, fixed, i);
1252                 if (ret)
1253                         goto err;
1254         }
1255
1256         ret = extent_ends_at(c, extent_ends, seen, k);
1257         if (ret)
1258                 goto err;
1259
1260         extent_ends->last_pos = k.k->p;
1261 err:
1262         return ret;
1263 }
1264
1265 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1266                                 struct bkey_s_c k)
1267 {
1268         struct bch_fs *c = trans->c;
1269         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1270         struct bch_extent_crc_unpacked crc;
1271         const union bch_extent_entry *i;
1272         unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1273
1274         bkey_for_each_crc(k.k, ptrs, crc, i)
1275                 if (crc_is_encoded(crc) &&
1276                     crc.uncompressed_size > encoded_extent_max_sectors) {
1277                         struct printbuf buf = PRINTBUF;
1278
1279                         bch2_bkey_val_to_text(&buf, c, k);
1280                         bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1281                         printbuf_exit(&buf);
1282                 }
1283
1284         return 0;
1285 }
1286
1287 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1288                         struct bkey_s_c k,
1289                         struct inode_walker *inode,
1290                         struct snapshots_seen *s,
1291                         struct extent_ends *extent_ends)
1292 {
1293         struct bch_fs *c = trans->c;
1294         struct inode_walker_entry *i;
1295         struct printbuf buf = PRINTBUF;
1296         struct bpos equiv = k.k->p;
1297         int ret = 0;
1298
1299         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1300
1301         ret = check_key_has_snapshot(trans, iter, k);
1302         if (ret) {
1303                 ret = ret < 0 ? ret : 0;
1304                 goto out;
1305         }
1306
1307         if (inode->last_pos.inode != k.k->p.inode) {
1308                 ret = check_i_sectors(trans, inode);
1309                 if (ret)
1310                         goto err;
1311         }
1312
1313         i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
1314         ret = PTR_ERR_OR_ZERO(i);
1315         if (ret)
1316                 goto err;
1317
1318         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1319         if (ret)
1320                 goto err;
1321
1322         if (k.k->type != KEY_TYPE_whiteout) {
1323                 if (fsck_err_on(!i, c, extent_in_missing_inode,
1324                                 "extent in missing inode:\n  %s",
1325                                 (printbuf_reset(&buf),
1326                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1327                         goto delete;
1328
1329                 if (fsck_err_on(i &&
1330                                 !S_ISREG(i->inode.bi_mode) &&
1331                                 !S_ISLNK(i->inode.bi_mode),
1332                                 c, extent_in_non_reg_inode,
1333                                 "extent in non regular inode mode %o:\n  %s",
1334                                 i->inode.bi_mode,
1335                                 (printbuf_reset(&buf),
1336                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1337                         goto delete;
1338
1339                 ret = check_overlapping_extents(trans, s, extent_ends, k,
1340                                                 equiv.snapshot, iter,
1341                                                 &inode->recalculate_sums);
1342                 if (ret)
1343                         goto err;
1344         }
1345
1346         /*
1347          * Check inodes in reverse order, from oldest snapshots to newest,
1348          * starting from the inode that matches this extent's snapshot. If we
1349          * didn't have one, iterate over all inodes:
1350          */
1351         if (!i)
1352                 i = inode->inodes.data + inode->inodes.nr - 1;
1353
1354         for (;
1355              inode->inodes.data && i >= inode->inodes.data;
1356              --i) {
1357                 if (i->snapshot > equiv.snapshot ||
1358                     !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
1359                         continue;
1360
1361                 if (k.k->type != KEY_TYPE_whiteout) {
1362                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1363                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1364                                         !bkey_extent_is_reservation(k),
1365                                         c, extent_past_end_of_inode,
1366                                         "extent type past end of inode %llu:%u, i_size %llu\n  %s",
1367                                         i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1368                                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1369                                 struct btree_iter iter2;
1370
1371                                 bch2_trans_copy_iter(&iter2, iter);
1372                                 bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1373                                 ret =   bch2_btree_iter_traverse(&iter2) ?:
1374                                         bch2_btree_delete_at(trans, &iter2,
1375                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1376                                 bch2_trans_iter_exit(trans, &iter2);
1377                                 if (ret)
1378                                         goto err;
1379
1380                                 iter->k.type = KEY_TYPE_whiteout;
1381                         }
1382
1383                         if (bkey_extent_is_allocation(k.k))
1384                                 i->count += k.k->size;
1385                 }
1386
1387                 i->seen_this_pos = true;
1388         }
1389 out:
1390 err:
1391 fsck_err:
1392         printbuf_exit(&buf);
1393         bch_err_fn(c, ret);
1394         return ret;
1395 delete:
1396         ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1397         goto out;
1398 }
1399
1400 /*
1401  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1402  * that i_size an i_sectors are consistent
1403  */
1404 int bch2_check_extents(struct bch_fs *c)
1405 {
1406         struct inode_walker w = inode_walker_init();
1407         struct snapshots_seen s;
1408         struct extent_ends extent_ends;
1409         struct disk_reservation res = { 0 };
1410
1411         snapshots_seen_init(&s);
1412         extent_ends_init(&extent_ends);
1413
1414         int ret = bch2_trans_run(c,
1415                 for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1416                                 POS(BCACHEFS_ROOT_INO, 0),
1417                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1418                                 &res, NULL,
1419                                 BCH_TRANS_COMMIT_no_enospc, ({
1420                         bch2_disk_reservation_put(c, &res);
1421                         check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1422                         check_extent_overbig(trans, &iter, k);
1423                 })) ?:
1424                 check_i_sectors(trans, &w));
1425
1426         bch2_disk_reservation_put(c, &res);
1427         extent_ends_exit(&extent_ends);
1428         inode_walker_exit(&w);
1429         snapshots_seen_exit(&s);
1430
1431         bch_err_fn(c, ret);
1432         return ret;
1433 }
1434
1435 int bch2_check_indirect_extents(struct bch_fs *c)
1436 {
1437         struct disk_reservation res = { 0 };
1438
1439         int ret = bch2_trans_run(c,
1440                 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1441                                 POS_MIN,
1442                                 BTREE_ITER_PREFETCH, k,
1443                                 &res, NULL,
1444                                 BCH_TRANS_COMMIT_no_enospc, ({
1445                         bch2_disk_reservation_put(c, &res);
1446                         check_extent_overbig(trans, &iter, k);
1447                 })));
1448
1449         bch2_disk_reservation_put(c, &res);
1450         bch_err_fn(c, ret);
1451         return ret;
1452 }
1453
1454 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1455 {
1456         struct bch_fs *c = trans->c;
1457         u32 restart_count = trans->restart_count;
1458         int ret = 0;
1459         s64 count2;
1460
1461         darray_for_each(w->inodes, i) {
1462                 if (i->inode.bi_nlink == i->count)
1463                         continue;
1464
1465                 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1466                 if (count2 < 0)
1467                         return count2;
1468
1469                 if (i->count != count2) {
1470                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1471                                 i->count, count2);
1472                         i->count = count2;
1473                         if (i->inode.bi_nlink == i->count)
1474                                 continue;
1475                 }
1476
1477                 if (fsck_err_on(i->inode.bi_nlink != i->count,
1478                                 c, inode_dir_wrong_nlink,
1479                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1480                                 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1481                         i->inode.bi_nlink = i->count;
1482                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1483                         if (ret)
1484                                 break;
1485                 }
1486         }
1487 fsck_err:
1488         bch_err_fn(c, ret);
1489         return ret ?: trans_was_restarted(trans, restart_count);
1490 }
1491
1492 static int check_inode_backpointer(struct btree_trans *trans,
1493                                    struct btree_iter *iter,
1494                                    struct bkey_s_c_dirent d,
1495                                    struct bch_inode_unpacked *target,
1496                                    u32 target_snapshot)
1497 {
1498         struct bch_fs *c = trans->c;
1499         struct btree_iter bp_iter = { NULL };
1500         struct printbuf buf = PRINTBUF;
1501         int ret = 0;
1502
1503         if (!target->bi_dir &&
1504             !target->bi_dir_offset) {
1505                 target->bi_dir          = d.k->p.inode;
1506                 target->bi_dir_offset   = d.k->p.offset;
1507
1508                 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1509                 if (ret)
1510                         goto err;
1511         }
1512
1513         if (!inode_points_to_dirent(target, d)) {
1514                 struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1515                                       SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1516                 ret = bkey_err(bp_dirent);
1517                 if (ret && !bch2_err_matches(ret, ENOENT))
1518                         goto err;
1519
1520                 bool backpointer_exists = !ret;
1521                 ret = 0;
1522
1523                 bch2_bkey_val_to_text(&buf, c, d.s_c);
1524                 prt_newline(&buf);
1525                 if (backpointer_exists)
1526                         bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1527
1528                 if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
1529                                 c, inode_dir_multiple_links,
1530                                 "directory %llu:%u with multiple links\n%s",
1531                                 target->bi_inum, target_snapshot, buf.buf)) {
1532                         ret = __remove_dirent(trans, d.k->p);
1533                         goto out;
1534                 }
1535
1536                 /*
1537                  * hardlinked file with nlink 0:
1538                  * We're just adjusting nlink here so check_nlinks() will pick
1539                  * it up, it ignores inodes with nlink 0
1540                  */
1541                 if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1542                                 c, inode_multiple_links_but_nlink_0,
1543                                 "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1544                                 target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1545                         target->bi_nlink++;
1546                         target->bi_flags &= ~BCH_INODE_unlinked;
1547
1548                         ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1549                         if (ret)
1550                                 goto err;
1551                 }
1552
1553                 if (fsck_err_on(!backpointer_exists,
1554                                 c, inode_wrong_backpointer,
1555                                 "inode %llu:%u has wrong backpointer:\n"
1556                                 "got       %llu:%llu\n"
1557                                 "should be %llu:%llu",
1558                                 target->bi_inum, target_snapshot,
1559                                 target->bi_dir,
1560                                 target->bi_dir_offset,
1561                                 d.k->p.inode,
1562                                 d.k->p.offset)) {
1563                         target->bi_dir          = d.k->p.inode;
1564                         target->bi_dir_offset   = d.k->p.offset;
1565
1566                         ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1567                         if (ret)
1568                                 goto err;
1569                 }
1570         }
1571 out:
1572 err:
1573 fsck_err:
1574         bch2_trans_iter_exit(trans, &bp_iter);
1575         printbuf_exit(&buf);
1576         bch_err_fn(c, ret);
1577         return ret;
1578 }
1579
1580 static int check_dirent_target(struct btree_trans *trans,
1581                                struct btree_iter *iter,
1582                                struct bkey_s_c_dirent d,
1583                                struct bch_inode_unpacked *target,
1584                                u32 target_snapshot)
1585 {
1586         struct bch_fs *c = trans->c;
1587         struct bkey_i_dirent *n;
1588         struct printbuf buf = PRINTBUF;
1589         int ret = 0;
1590
1591         ret = check_inode_backpointer(trans, iter, d, target, target_snapshot);
1592         if (ret)
1593                 goto err;
1594
1595         if (fsck_err_on(d.v->d_type != inode_d_type(target),
1596                         c, dirent_d_type_wrong,
1597                         "incorrect d_type: got %s, should be %s:\n%s",
1598                         bch2_d_type_str(d.v->d_type),
1599                         bch2_d_type_str(inode_d_type(target)),
1600                         (printbuf_reset(&buf),
1601                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1602                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1603                 ret = PTR_ERR_OR_ZERO(n);
1604                 if (ret)
1605                         goto err;
1606
1607                 bkey_reassemble(&n->k_i, d.s_c);
1608                 n->v.d_type = inode_d_type(target);
1609
1610                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1611                 if (ret)
1612                         goto err;
1613
1614                 d = dirent_i_to_s_c(n);
1615         }
1616
1617         if (fsck_err_on(d.v->d_type == DT_SUBVOL &&
1618                         target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol),
1619                         c, dirent_d_parent_subvol_wrong,
1620                         "dirent has wrong d_parent_subvol field: got %u, should be %u",
1621                         le32_to_cpu(d.v->d_parent_subvol),
1622                         target->bi_parent_subvol)) {
1623                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1624                 ret = PTR_ERR_OR_ZERO(n);
1625                 if (ret)
1626                         goto err;
1627
1628                 bkey_reassemble(&n->k_i, d.s_c);
1629                 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1630
1631                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1632                 if (ret)
1633                         goto err;
1634
1635                 d = dirent_i_to_s_c(n);
1636         }
1637 err:
1638 fsck_err:
1639         printbuf_exit(&buf);
1640         bch_err_fn(c, ret);
1641         return ret;
1642 }
1643
1644 static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter,
1645                                struct bkey_s_c_dirent d)
1646 {
1647         struct bch_fs *c = trans->c;
1648         struct bch_inode_unpacked subvol_root;
1649         u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1650         u32 target_snapshot;
1651         u64 target_inum;
1652         int ret = 0;
1653
1654         ret = subvol_lookup(trans, target_subvol,
1655                               &target_snapshot, &target_inum);
1656         if (ret && !bch2_err_matches(ret, ENOENT))
1657                 return ret;
1658
1659         if (fsck_err_on(ret, c, dirent_to_missing_subvol,
1660                         "dirent points to missing subvolume %u",
1661                         le32_to_cpu(d.v->d_child_subvol)))
1662                 return __remove_dirent(trans, d.k->p);
1663
1664         ret = lookup_inode(trans, target_inum,
1665                            &subvol_root, &target_snapshot);
1666         if (ret && !bch2_err_matches(ret, ENOENT))
1667                 return ret;
1668
1669         if (fsck_err_on(ret, c, subvol_to_missing_root,
1670                         "subvolume %u points to missing subvolume root %llu",
1671                         target_subvol,
1672                         target_inum)) {
1673                 bch_err(c, "repair not implemented yet");
1674                 return -EINVAL;
1675         }
1676
1677         if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
1678                         c, subvol_root_wrong_bi_subvol,
1679                         "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1680                         target_inum,
1681                         subvol_root.bi_subvol, target_subvol)) {
1682                 subvol_root.bi_subvol = target_subvol;
1683                 ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
1684                 if (ret)
1685                         return ret;
1686         }
1687
1688         ret = check_dirent_target(trans, iter, d, &subvol_root,
1689                                   target_snapshot);
1690         if (ret)
1691                 return ret;
1692 fsck_err:
1693         return ret;
1694 }
1695
1696 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1697                         struct bkey_s_c k,
1698                         struct bch_hash_info *hash_info,
1699                         struct inode_walker *dir,
1700                         struct inode_walker *target,
1701                         struct snapshots_seen *s)
1702 {
1703         struct bch_fs *c = trans->c;
1704         struct bkey_s_c_dirent d;
1705         struct inode_walker_entry *i;
1706         struct printbuf buf = PRINTBUF;
1707         struct bpos equiv;
1708         int ret = 0;
1709
1710         ret = check_key_has_snapshot(trans, iter, k);
1711         if (ret) {
1712                 ret = ret < 0 ? ret : 0;
1713                 goto out;
1714         }
1715
1716         equiv = k.k->p;
1717         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1718
1719         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1720         if (ret)
1721                 goto err;
1722
1723         if (k.k->type == KEY_TYPE_whiteout)
1724                 goto out;
1725
1726         if (dir->last_pos.inode != k.k->p.inode) {
1727                 ret = check_subdir_count(trans, dir);
1728                 if (ret)
1729                         goto err;
1730         }
1731
1732         BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
1733
1734         i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
1735         ret = PTR_ERR_OR_ZERO(i);
1736         if (ret < 0)
1737                 goto err;
1738
1739         if (dir->first_this_inode && dir->inodes.nr)
1740                 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
1741         dir->first_this_inode = false;
1742
1743         if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
1744                         "dirent in nonexisting directory:\n%s",
1745                         (printbuf_reset(&buf),
1746                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1747                 ret = bch2_btree_delete_at(trans, iter,
1748                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1749                 goto out;
1750         }
1751
1752         if (!i)
1753                 goto out;
1754
1755         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
1756                         c, dirent_in_non_dir_inode,
1757                         "dirent in non directory inode type %s:\n%s",
1758                         bch2_d_type_str(inode_d_type(&i->inode)),
1759                         (printbuf_reset(&buf),
1760                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1761                 ret = bch2_btree_delete_at(trans, iter, 0);
1762                 goto out;
1763         }
1764
1765         ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
1766         if (ret < 0)
1767                 goto err;
1768         if (ret) {
1769                 /* dirent has been deleted */
1770                 ret = 0;
1771                 goto out;
1772         }
1773
1774         if (k.k->type != KEY_TYPE_dirent)
1775                 goto out;
1776
1777         d = bkey_s_c_to_dirent(k);
1778
1779         if (d.v->d_type == DT_SUBVOL) {
1780                 ret = check_subvol_dirent(trans, iter, d);
1781                 if (ret)
1782                         goto err;
1783         } else {
1784                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1785                 if (ret)
1786                         goto err;
1787
1788                 if (fsck_err_on(!target->inodes.nr,
1789                                 c, dirent_to_missing_inode,
1790                                 "dirent points to missing inode: (equiv %u)\n%s",
1791                                 equiv.snapshot,
1792                                 (printbuf_reset(&buf),
1793                                  bch2_bkey_val_to_text(&buf, c, k),
1794                                  buf.buf))) {
1795                         ret = __remove_dirent(trans, d.k->p);
1796                         if (ret)
1797                                 goto err;
1798                 }
1799
1800                 darray_for_each(target->inodes, i) {
1801                         ret = check_dirent_target(trans, iter, d,
1802                                                   &i->inode, i->snapshot);
1803                         if (ret)
1804                                 goto err;
1805                 }
1806
1807                 if (d.v->d_type == DT_DIR)
1808                         for_each_visible_inode(c, s, dir, equiv.snapshot, i)
1809                                 i->count++;
1810         }
1811 out:
1812 err:
1813 fsck_err:
1814         printbuf_exit(&buf);
1815         bch_err_fn(c, ret);
1816         return ret;
1817 }
1818
1819 /*
1820  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1821  * validate d_type
1822  */
1823 int bch2_check_dirents(struct bch_fs *c)
1824 {
1825         struct inode_walker dir = inode_walker_init();
1826         struct inode_walker target = inode_walker_init();
1827         struct snapshots_seen s;
1828         struct bch_hash_info hash_info;
1829
1830         snapshots_seen_init(&s);
1831
1832         int ret = bch2_trans_run(c,
1833                 for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
1834                                 POS(BCACHEFS_ROOT_INO, 0),
1835                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1836                                 k,
1837                                 NULL, NULL,
1838                                 BCH_TRANS_COMMIT_no_enospc,
1839                         check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
1840
1841         snapshots_seen_exit(&s);
1842         inode_walker_exit(&dir);
1843         inode_walker_exit(&target);
1844         bch_err_fn(c, ret);
1845         return ret;
1846 }
1847
1848 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
1849                        struct bkey_s_c k,
1850                        struct bch_hash_info *hash_info,
1851                        struct inode_walker *inode)
1852 {
1853         struct bch_fs *c = trans->c;
1854         struct inode_walker_entry *i;
1855         int ret;
1856
1857         ret = check_key_has_snapshot(trans, iter, k);
1858         if (ret)
1859                 return ret;
1860
1861         i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
1862         ret = PTR_ERR_OR_ZERO(i);
1863         if (ret)
1864                 return ret;
1865
1866         if (inode->first_this_inode && inode->inodes.nr)
1867                 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
1868         inode->first_this_inode = false;
1869
1870         if (fsck_err_on(!i, c, xattr_in_missing_inode,
1871                         "xattr for missing inode %llu",
1872                         k.k->p.inode))
1873                 return bch2_btree_delete_at(trans, iter, 0);
1874
1875         if (!i)
1876                 return 0;
1877
1878         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
1879 fsck_err:
1880         bch_err_fn(c, ret);
1881         return ret;
1882 }
1883
1884 /*
1885  * Walk xattrs: verify that they all have a corresponding inode
1886  */
1887 int bch2_check_xattrs(struct bch_fs *c)
1888 {
1889         struct inode_walker inode = inode_walker_init();
1890         struct bch_hash_info hash_info;
1891         int ret = 0;
1892
1893         ret = bch2_trans_run(c,
1894                 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
1895                         POS(BCACHEFS_ROOT_INO, 0),
1896                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1897                         k,
1898                         NULL, NULL,
1899                         BCH_TRANS_COMMIT_no_enospc,
1900                 check_xattr(trans, &iter, k, &hash_info, &inode)));
1901         bch_err_fn(c, ret);
1902         return ret;
1903 }
1904
1905 static int check_root_trans(struct btree_trans *trans)
1906 {
1907         struct bch_fs *c = trans->c;
1908         struct bch_inode_unpacked root_inode;
1909         u32 snapshot;
1910         u64 inum;
1911         int ret;
1912
1913         ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1914         if (ret && !bch2_err_matches(ret, ENOENT))
1915                 return ret;
1916
1917         if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
1918                                 "root subvol missing")) {
1919                 struct bkey_i_subvolume root_subvol;
1920
1921                 snapshot        = U32_MAX;
1922                 inum            = BCACHEFS_ROOT_INO;
1923
1924                 bkey_subvolume_init(&root_subvol.k_i);
1925                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1926                 root_subvol.v.flags     = 0;
1927                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1928                 root_subvol.v.inode     = cpu_to_le64(inum);
1929                 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
1930                 bch_err_msg(c, ret, "writing root subvol");
1931                 if (ret)
1932                         goto err;
1933         }
1934
1935         ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1936         if (ret && !bch2_err_matches(ret, ENOENT))
1937                 return ret;
1938
1939         if (mustfix_fsck_err_on(ret, c, root_dir_missing,
1940                                 "root directory missing") ||
1941             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
1942                                 c, root_inode_not_dir,
1943                                 "root inode not a directory")) {
1944                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1945                                 0, NULL);
1946                 root_inode.bi_inum = inum;
1947
1948                 ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
1949                 bch_err_msg(c, ret, "writing root inode");
1950         }
1951 err:
1952 fsck_err:
1953         return ret;
1954 }
1955
1956 /* Get root directory, create if it doesn't exist: */
1957 int bch2_check_root(struct bch_fs *c)
1958 {
1959         int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1960                 check_root_trans(trans));
1961         bch_err_fn(c, ret);
1962         return ret;
1963 }
1964
1965 struct pathbuf_entry {
1966         u64     inum;
1967         u32     snapshot;
1968 };
1969
1970 typedef DARRAY(struct pathbuf_entry) pathbuf;
1971
1972 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
1973 {
1974         darray_for_each(*p, i)
1975                 if (i->inum     == inum &&
1976                     i->snapshot == snapshot)
1977                         return true;
1978         return false;
1979 }
1980
1981 static int path_down(struct bch_fs *c, pathbuf *p,
1982                      u64 inum, u32 snapshot)
1983 {
1984         int ret = darray_push(p, ((struct pathbuf_entry) {
1985                 .inum           = inum,
1986                 .snapshot       = snapshot,
1987         }));
1988
1989         if (ret)
1990                 bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
1991                         p->size);
1992         return ret;
1993 }
1994
1995 /*
1996  * Check that a given inode is reachable from the root:
1997  *
1998  * XXX: we should also be verifying that inodes are in the right subvolumes
1999  */
2000 static int check_path(struct btree_trans *trans,
2001                       pathbuf *p,
2002                       struct bch_inode_unpacked *inode,
2003                       u32 snapshot)
2004 {
2005         struct bch_fs *c = trans->c;
2006         int ret = 0;
2007
2008         snapshot = bch2_snapshot_equiv(c, snapshot);
2009         p->nr = 0;
2010
2011         while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
2012                  inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
2013                 struct btree_iter dirent_iter;
2014                 struct bkey_s_c_dirent d;
2015                 u32 parent_snapshot = snapshot;
2016
2017                 if (inode->bi_subvol) {
2018                         u64 inum;
2019
2020                         ret = subvol_lookup(trans, inode->bi_parent_subvol,
2021                                             &parent_snapshot, &inum);
2022                         if (ret)
2023                                 break;
2024                 }
2025
2026                 d = dirent_get_by_pos(trans, &dirent_iter,
2027                                       SPOS(inode->bi_dir, inode->bi_dir_offset,
2028                                            parent_snapshot));
2029                 ret = bkey_err(d.s_c);
2030                 if (ret && !bch2_err_matches(ret, ENOENT))
2031                         break;
2032
2033                 if (!ret && !dirent_points_to_inode(d, inode)) {
2034                         bch2_trans_iter_exit(trans, &dirent_iter);
2035                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2036                 }
2037
2038                 if (bch2_err_matches(ret, ENOENT)) {
2039                         if (fsck_err(c,  inode_unreachable,
2040                                      "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
2041                                      inode->bi_inum, snapshot,
2042                                      bch2_d_type_str(inode_d_type(inode)),
2043                                      inode->bi_nlink,
2044                                      inode->bi_dir,
2045                                      inode->bi_dir_offset))
2046                                 ret = reattach_inode(trans, inode, snapshot);
2047                         break;
2048                 }
2049
2050                 bch2_trans_iter_exit(trans, &dirent_iter);
2051
2052                 if (!S_ISDIR(inode->bi_mode))
2053                         break;
2054
2055                 ret = path_down(c, p, inode->bi_inum, snapshot);
2056                 if (ret) {
2057                         bch_err(c, "memory allocation failure");
2058                         return ret;
2059                 }
2060
2061                 snapshot = parent_snapshot;
2062
2063                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
2064                 if (ret) {
2065                         /* Should have been caught in dirents pass */
2066                         if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2067                                 bch_err(c, "error looking up parent directory: %i", ret);
2068                         break;
2069                 }
2070
2071                 if (path_is_dup(p, inode->bi_inum, snapshot)) {
2072                         /* XXX print path */
2073                         bch_err(c, "directory structure loop");
2074
2075                         darray_for_each(*p, i)
2076                                 pr_err("%llu:%u", i->inum, i->snapshot);
2077                         pr_err("%llu:%u", inode->bi_inum, snapshot);
2078
2079                         if (!fsck_err(c, dir_loop, "directory structure loop"))
2080                                 return 0;
2081
2082                         ret = remove_backpointer(trans, inode);
2083                         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
2084                                 bch_err_msg(c, ret, "removing dirent");
2085                         if (ret)
2086                                 break;
2087
2088                         ret = reattach_inode(trans, inode, snapshot);
2089                         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
2090                                 bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum);
2091                         break;
2092                 }
2093         }
2094 fsck_err:
2095         bch_err_fn(c, ret);
2096         return ret;
2097 }
2098
2099 /*
2100  * Check for unreachable inodes, as well as loops in the directory structure:
2101  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2102  * unreachable:
2103  */
2104 int bch2_check_directory_structure(struct bch_fs *c)
2105 {
2106         struct bch_inode_unpacked u;
2107         pathbuf path = { 0, };
2108         int ret;
2109
2110         ret = bch2_trans_run(c,
2111                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2112                                           BTREE_ITER_INTENT|
2113                                           BTREE_ITER_PREFETCH|
2114                                           BTREE_ITER_ALL_SNAPSHOTS, k,
2115                                           NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2116                         if (!bkey_is_inode(k.k))
2117                                 continue;
2118
2119                         BUG_ON(bch2_inode_unpack(k, &u));
2120
2121                         if (u.bi_flags & BCH_INODE_unlinked)
2122                                 continue;
2123
2124                         check_path(trans, &path, &u, iter.pos.snapshot);
2125                 })));
2126         darray_exit(&path);
2127
2128         bch_err_fn(c, ret);
2129         return ret;
2130 }
2131
2132 struct nlink_table {
2133         size_t          nr;
2134         size_t          size;
2135
2136         struct nlink {
2137                 u64     inum;
2138                 u32     snapshot;
2139                 u32     count;
2140         }               *d;
2141 };
2142
2143 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2144                      u64 inum, u32 snapshot)
2145 {
2146         if (t->nr == t->size) {
2147                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2148                 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2149
2150                 if (!d) {
2151                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2152                                 new_size);
2153                         return -BCH_ERR_ENOMEM_fsck_add_nlink;
2154                 }
2155
2156                 if (t->d)
2157                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2158                 kvfree(t->d);
2159
2160                 t->d = d;
2161                 t->size = new_size;
2162         }
2163
2164
2165         t->d[t->nr++] = (struct nlink) {
2166                 .inum           = inum,
2167                 .snapshot       = snapshot,
2168         };
2169
2170         return 0;
2171 }
2172
2173 static int nlink_cmp(const void *_l, const void *_r)
2174 {
2175         const struct nlink *l = _l;
2176         const struct nlink *r = _r;
2177
2178         return cmp_int(l->inum, r->inum);
2179 }
2180
2181 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2182                      struct nlink_table *links,
2183                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2184 {
2185         struct nlink *link, key = {
2186                 .inum = inum, .snapshot = U32_MAX,
2187         };
2188
2189         if (inum < range_start || inum >= range_end)
2190                 return;
2191
2192         link = __inline_bsearch(&key, links->d, links->nr,
2193                                 sizeof(links->d[0]), nlink_cmp);
2194         if (!link)
2195                 return;
2196
2197         while (link > links->d && link[0].inum == link[-1].inum)
2198                 --link;
2199
2200         for (; link < links->d + links->nr && link->inum == inum; link++)
2201                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2202                         link->count++;
2203                         if (link->snapshot >= snapshot)
2204                                 break;
2205                 }
2206 }
2207
2208 noinline_for_stack
2209 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2210                                        struct nlink_table *t,
2211                                        u64 start, u64 *end)
2212 {
2213         int ret = bch2_trans_run(c,
2214                 for_each_btree_key(trans, iter, BTREE_ID_inodes,
2215                                    POS(0, start),
2216                                    BTREE_ITER_INTENT|
2217                                    BTREE_ITER_PREFETCH|
2218                                    BTREE_ITER_ALL_SNAPSHOTS, k, ({
2219                         if (!bkey_is_inode(k.k))
2220                                 continue;
2221
2222                         /* Should never fail, checked by bch2_inode_invalid: */
2223                         struct bch_inode_unpacked u;
2224                         BUG_ON(bch2_inode_unpack(k, &u));
2225
2226                         /*
2227                          * Backpointer and directory structure checks are sufficient for
2228                          * directories, since they can't have hardlinks:
2229                          */
2230                         if (S_ISDIR(u.bi_mode))
2231                                 continue;
2232
2233                         if (!u.bi_nlink)
2234                                 continue;
2235
2236                         ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2237                         if (ret) {
2238                                 *end = k.k->p.offset;
2239                                 ret = 0;
2240                                 break;
2241                         }
2242                         0;
2243                 })));
2244
2245         bch_err_fn(c, ret);
2246         return ret;
2247 }
2248
2249 noinline_for_stack
2250 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2251                                      u64 range_start, u64 range_end)
2252 {
2253         struct snapshots_seen s;
2254
2255         snapshots_seen_init(&s);
2256
2257         int ret = bch2_trans_run(c,
2258                 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2259                                    BTREE_ITER_INTENT|
2260                                    BTREE_ITER_PREFETCH|
2261                                    BTREE_ITER_ALL_SNAPSHOTS, k, ({
2262                         ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2263                         if (ret)
2264                                 break;
2265
2266                         if (k.k->type == KEY_TYPE_dirent) {
2267                                 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2268
2269                                 if (d.v->d_type != DT_DIR &&
2270                                     d.v->d_type != DT_SUBVOL)
2271                                         inc_link(c, &s, links, range_start, range_end,
2272                                                  le64_to_cpu(d.v->d_inum),
2273                                                  bch2_snapshot_equiv(c, d.k->p.snapshot));
2274                         }
2275                         0;
2276                 })));
2277
2278         snapshots_seen_exit(&s);
2279
2280         bch_err_fn(c, ret);
2281         return ret;
2282 }
2283
2284 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2285                                      struct bkey_s_c k,
2286                                      struct nlink_table *links,
2287                                      size_t *idx, u64 range_end)
2288 {
2289         struct bch_fs *c = trans->c;
2290         struct bch_inode_unpacked u;
2291         struct nlink *link = &links->d[*idx];
2292         int ret = 0;
2293
2294         if (k.k->p.offset >= range_end)
2295                 return 1;
2296
2297         if (!bkey_is_inode(k.k))
2298                 return 0;
2299
2300         BUG_ON(bch2_inode_unpack(k, &u));
2301
2302         if (S_ISDIR(u.bi_mode))
2303                 return 0;
2304
2305         if (!u.bi_nlink)
2306                 return 0;
2307
2308         while ((cmp_int(link->inum, k.k->p.offset) ?:
2309                 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2310                 BUG_ON(*idx == links->nr);
2311                 link = &links->d[++*idx];
2312         }
2313
2314         if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2315                         c, inode_wrong_nlink,
2316                         "inode %llu type %s has wrong i_nlink (%u, should be %u)",
2317                         u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2318                         bch2_inode_nlink_get(&u), link->count)) {
2319                 bch2_inode_nlink_set(&u, link->count);
2320                 ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2321         }
2322 fsck_err:
2323         return ret;
2324 }
2325
2326 noinline_for_stack
2327 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2328                                struct nlink_table *links,
2329                                u64 range_start, u64 range_end)
2330 {
2331         size_t idx = 0;
2332
2333         int ret = bch2_trans_run(c,
2334                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2335                                 POS(0, range_start),
2336                                 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
2337                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2338                         check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2339         if (ret < 0) {
2340                 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2341                 return ret;
2342         }
2343
2344         return 0;
2345 }
2346
2347 int bch2_check_nlinks(struct bch_fs *c)
2348 {
2349         struct nlink_table links = { 0 };
2350         u64 this_iter_range_start, next_iter_range_start = 0;
2351         int ret = 0;
2352
2353         do {
2354                 this_iter_range_start = next_iter_range_start;
2355                 next_iter_range_start = U64_MAX;
2356
2357                 ret = check_nlinks_find_hardlinks(c, &links,
2358                                                   this_iter_range_start,
2359                                                   &next_iter_range_start);
2360
2361                 ret = check_nlinks_walk_dirents(c, &links,
2362                                           this_iter_range_start,
2363                                           next_iter_range_start);
2364                 if (ret)
2365                         break;
2366
2367                 ret = check_nlinks_update_hardlinks(c, &links,
2368                                          this_iter_range_start,
2369                                          next_iter_range_start);
2370                 if (ret)
2371                         break;
2372
2373                 links.nr = 0;
2374         } while (next_iter_range_start != U64_MAX);
2375
2376         kvfree(links.d);
2377         bch_err_fn(c, ret);
2378         return ret;
2379 }
2380
2381 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2382                              struct bkey_s_c k)
2383 {
2384         struct bkey_s_c_reflink_p p;
2385         struct bkey_i_reflink_p *u;
2386
2387         if (k.k->type != KEY_TYPE_reflink_p)
2388                 return 0;
2389
2390         p = bkey_s_c_to_reflink_p(k);
2391
2392         if (!p.v->front_pad && !p.v->back_pad)
2393                 return 0;
2394
2395         u = bch2_trans_kmalloc(trans, sizeof(*u));
2396         int ret = PTR_ERR_OR_ZERO(u);
2397         if (ret)
2398                 return ret;
2399
2400         bkey_reassemble(&u->k_i, k);
2401         u->v.front_pad  = 0;
2402         u->v.back_pad   = 0;
2403
2404         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2405 }
2406
2407 int bch2_fix_reflink_p(struct bch_fs *c)
2408 {
2409         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2410                 return 0;
2411
2412         int ret = bch2_trans_run(c,
2413                 for_each_btree_key_commit(trans, iter,
2414                                 BTREE_ID_extents, POS_MIN,
2415                                 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
2416                                 BTREE_ITER_ALL_SNAPSHOTS, k,
2417                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2418                         fix_reflink_p_key(trans, &iter, k)));
2419         bch_err_fn(c, ret);
2420         return ret;
2421 }