1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "buckets.h"
7 #include "darray.h"
8 #include "dirent.h"
9 #include "error.h"
10 #include "fs-common.h"
11 #include "fsck.h"
12 #include "inode.h"
13 #include "keylist.h"
14 #include "recovery.h"
15 #include "subvolume.h"
16 #include "super.h"
17 #include "xattr.h"
18
19 #include <linux/bsearch.h>
20 #include <linux/dcache.h> /* struct qstr */
21
22 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
23
24 /*
25  * XXX: this is handling transaction restarts without returning
26  * -BCH_ERR_transaction_restart_nested; this is not how we do things anymore:
27  */
28 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
29                                     u32 snapshot)
30 {
31         struct btree_iter iter;
32         struct bkey_s_c k;
33         u64 sectors = 0;
34         int ret;
35
36         for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
37                                 SPOS(inum, 0, snapshot),
38                                 POS(inum, U64_MAX),
39                                 0, k, ret)
40                 if (bkey_extent_is_allocation(k.k))
41                         sectors += k.k->size;
42
43         bch2_trans_iter_exit(trans, &iter);
44
45         return ret ?: sectors;
46 }
47
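/*
 * Count dirents in directory @inum, snapshot @snapshot, that point to
 * subdirectories (d_type == DT_DIR) - used when validating i_nlink:
 */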
48 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
49                                     u32 snapshot)
50 {
51         struct btree_iter iter;
52         struct bkey_s_c k;
53         struct bkey_s_c_dirent d;
54         u64 subdirs = 0;
55         int ret;
56
57         for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
58                                 SPOS(inum, 0, snapshot),
59                                 POS(inum, U64_MAX),
60                                 0, k, ret) {
61                 if (k.k->type != KEY_TYPE_dirent)
62                         continue;
63
64                 d = bkey_s_c_to_dirent(k);
65                 if (d.v->d_type == DT_DIR)
66                         subdirs++;
67         }
68         bch2_trans_iter_exit(trans, &iter);
69
70         return ret ?: subdirs;
71 }
72
73 static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
74                                     u32 *subvol)
75 {
76         struct bch_snapshot s;
77         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots,
78                                           POS(0, snapshot), 0,
79                                           snapshot, &s);
80         if (!ret)
81                 *subvol = le32_to_cpu(s.subvol);
82         else if (bch2_err_matches(ret, ENOENT))
83                 bch_err(trans->c, "snapshot %u not found", snapshot);
84         return ret;
85
86 }
87
88 static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
89                            u32 *snapshot, u64 *inum)
90 {
91         struct bch_subvolume s;
92         int ret;
93
94         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
95
96         *snapshot = le32_to_cpu(s.snapshot);
97         *inum = le64_to_cpu(s.inode);
98         return ret;
99 }
100
101 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
102                          u32 *snapshot, u64 *inum)
103 {
104         return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
105 }
106
107 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
108                               struct bch_inode_unpacked *inode)
109 {
110         struct btree_iter iter;
111         struct bkey_s_c k;
112         int ret;
113
114         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
115                              POS(0, inode_nr),
116                              BTREE_ITER_ALL_SNAPSHOTS);
117         k = bch2_btree_iter_peek(&iter);
118         ret = bkey_err(k);
119         if (ret)
120                 goto err;
121
122         if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
123                 ret = -BCH_ERR_ENOENT_inode;
124                 goto err;
125         }
126
127         ret = bch2_inode_unpack(k, inode);
128 err:
129         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
130                 bch_err(trans->c, "error fetching inode %llu: %s",
131                         inode_nr, bch2_err_str(ret));
132         bch2_trans_iter_exit(trans, &iter);
133         return ret;
134 }
135
136 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
137                           struct bch_inode_unpacked *inode,
138                           u32 *snapshot)
139 {
140         struct btree_iter iter;
141         struct bkey_s_c k;
142         int ret;
143
144         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
145                                SPOS(0, inode_nr, *snapshot), 0);
146         ret = bkey_err(k);
147         if (ret)
148                 goto err;
149
150         ret = bkey_is_inode(k.k)
151                 ? bch2_inode_unpack(k, inode)
152                 : -BCH_ERR_ENOENT_inode;
153         if (!ret)
154                 *snapshot = iter.pos.snapshot;
155 err:
156         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
157                 bch_err(trans->c, "error fetching inode %llu:%u: %s",
158                         inode_nr, *snapshot, bch2_err_str(ret));
159         bch2_trans_iter_exit(trans, &iter);
160         return ret;
161 }
162
163 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
164                         struct bch_inode_unpacked *inode,
165                         u32 *snapshot)
166 {
167         return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
168 }
169
170 static int __lookup_dirent(struct btree_trans *trans,
171                            struct bch_hash_info hash_info,
172                            subvol_inum dir, struct qstr *name,
173                            u64 *target, unsigned *type)
174 {
175         struct btree_iter iter;
176         struct bkey_s_c_dirent d;
177         int ret;
178
179         ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
180                                &hash_info, dir, name, 0);
181         if (ret)
182                 return ret;
183
184         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
185         *target = le64_to_cpu(d.v->d_inum);
186         *type = d.v->d_type;
187         bch2_trans_iter_exit(trans, &iter);
188         return 0;
189 }
190
191 static int __write_inode(struct btree_trans *trans,
192                          struct bch_inode_unpacked *inode,
193                          u32 snapshot)
194 {
195         struct bkey_inode_buf *inode_p =
196                 bch2_trans_kmalloc(trans, sizeof(*inode_p));
197
198         if (IS_ERR(inode_p))
199                 return PTR_ERR(inode_p);
200
201         bch2_inode_pack(inode_p, inode);
202         inode_p->inode.k.p.snapshot = snapshot;
203
204         return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
205                                 &inode_p->inode.k_i,
206                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
207 }
208
209 static int write_inode(struct btree_trans *trans,
210                        struct bch_inode_unpacked *inode,
211                        u32 snapshot)
212 {
213         int ret = commit_do(trans, NULL, NULL,
214                                   BTREE_INSERT_NOFAIL|
215                                   BTREE_INSERT_LAZY_RW,
216                                   __write_inode(trans, inode, snapshot));
217         if (ret)
218                 bch_err(trans->c, "error in fsck: error updating inode: %s",
219                         bch2_err_str(ret));
220         return ret;
221 }
222
223 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
224 {
225         struct bch_fs *c = trans->c;
226         struct btree_iter iter;
227         struct bch_inode_unpacked dir_inode;
228         struct bch_hash_info dir_hash_info;
229         int ret;
230
231         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
232         if (ret)
233                 goto err;
234
235         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
236
237         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
238
239         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
240                                   &dir_hash_info, &iter,
241                                   BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
242         bch2_trans_iter_exit(trans, &iter);
243 err:
244         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
245                 bch_err_fn(c, ret);
246         return ret;
247 }
248
249 /* Get lost+found, create if it doesn't exist: */
250 static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
251                             struct bch_inode_unpacked *lostfound)
252 {
253         struct bch_fs *c = trans->c;
254         struct bch_inode_unpacked root;
255         struct bch_hash_info root_hash_info;
256         struct qstr lostfound_str = QSTR("lost+found");
257         subvol_inum root_inum = { .subvol = subvol };
258         u64 inum = 0;
259         unsigned d_type = 0;
260         u32 snapshot;
261         int ret;
262
263         ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
264         if (ret)
265                 return ret;
266
267         ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
268         if (ret)
269                 return ret;
270
271         root_hash_info = bch2_hash_info_init(c, &root);
272
273         ret = __lookup_dirent(trans, root_hash_info, root_inum,
274                             &lostfound_str, &inum, &d_type);
275         if (bch2_err_matches(ret, ENOENT)) {
276                 bch_notice(c, "creating lost+found");
277                 goto create_lostfound;
278         }
279
280         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
281                 bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret));
282         if (ret)
283                 return ret;
284
285         if (d_type != DT_DIR) {
286                 bch_err(c, "error looking up lost+found: not a directory");
287                 return ret;
288         }
289
290         /*
291          * The bch2_check_dirents pass has already run; dangling dirents
292          * shouldn't exist here:
293          */
294         return __lookup_inode(trans, inum, lostfound, &snapshot);
295
296 create_lostfound:
297         bch2_inode_init_early(c, lostfound);
298
299         ret = bch2_create_trans(trans, root_inum, &root,
300                                 lostfound, &lostfound_str,
301                                 0, 0, S_IFDIR|0700, 0, NULL, NULL,
302                                 (subvol_inum) { }, 0);
303         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
304                 bch_err(c, "error creating lost+found: %s", bch2_err_str(ret));
305         return ret;
306 }
307
308 static int __reattach_inode(struct btree_trans *trans,
309                           struct bch_inode_unpacked *inode,
310                           u32 inode_snapshot)
311 {
312         struct bch_hash_info dir_hash;
313         struct bch_inode_unpacked lostfound;
314         char name_buf[20];
315         struct qstr name;
316         u64 dir_offset = 0;
317         u32 subvol;
318         int ret;
319
320         ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
321         if (ret)
322                 return ret;
323
324         ret = lookup_lostfound(trans, subvol, &lostfound);
325         if (ret)
326                 return ret;
327
328         if (S_ISDIR(inode->bi_mode)) {
329                 lostfound.bi_nlink++;
330
331                 ret = __write_inode(trans, &lostfound, U32_MAX);
332                 if (ret)
333                         return ret;
334         }
335
336         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
337
338         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
339         name = (struct qstr) QSTR(name_buf);
340
341         ret = bch2_dirent_create(trans,
342                                  (subvol_inum) {
343                                         .subvol = subvol,
344                                         .inum = lostfound.bi_inum,
345                                  },
346                                  &dir_hash,
347                                  inode_d_type(inode),
348                                  &name, inode->bi_inum, &dir_offset,
349                                  BCH_HASH_SET_MUST_CREATE);
350         if (ret)
351                 return ret;
352
353         inode->bi_dir           = lostfound.bi_inum;
354         inode->bi_dir_offset    = dir_offset;
355
356         return __write_inode(trans, inode, inode_snapshot);
357 }
358
359 static int reattach_inode(struct btree_trans *trans,
360                           struct bch_inode_unpacked *inode,
361                           u32 inode_snapshot)
362 {
363         int ret = commit_do(trans, NULL, NULL,
364                                   BTREE_INSERT_LAZY_RW|
365                                   BTREE_INSERT_NOFAIL,
366                         __reattach_inode(trans, inode, inode_snapshot));
367         if (ret) {
368                 bch_err(trans->c, "error reattaching inode %llu: %s",
369                         inode->bi_inum, bch2_err_str(ret));
370                 return ret;
371         }
372
373         return ret;
374 }
375
376 static int remove_backpointer(struct btree_trans *trans,
377                               struct bch_inode_unpacked *inode)
378 {
379         struct btree_iter iter;
380         struct bkey_s_c_dirent d;
381         int ret;
382
383         d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
384                                      POS(inode->bi_dir, inode->bi_dir_offset), 0,
385                                      dirent);
386         ret =   bkey_err(d) ?:
387                 __remove_dirent(trans, d.k->p);
388         bch2_trans_iter_exit(trans, &iter);
389         return ret;
390 }
391
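/*
 * snapshots_seen: the snapshot IDs already seen for keys at the current
 * position, used to decide in which snapshots a given key is visible:
 */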
392 struct snapshots_seen_entry {
393         u32                             id;
394         u32                             equiv;
395 };
396
397 struct snapshots_seen {
398         struct bpos                     pos;
399         DARRAY(struct snapshots_seen_entry) ids;
400 };
401
402 static inline void snapshots_seen_exit(struct snapshots_seen *s)
403 {
404         darray_exit(&s->ids);
405 }
406
407 static inline void snapshots_seen_init(struct snapshots_seen *s)
408 {
409         memset(s, 0, sizeof(*s));
410 }
411
412 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
413                                  enum btree_id btree_id, struct bpos pos)
414 {
415         struct snapshots_seen_entry *i, n = {
416                 .id     = pos.snapshot,
417                 .equiv  = bch2_snapshot_equiv(c, pos.snapshot),
418         };
419         int ret = 0;
420
421         if (!bkey_eq(s->pos, pos))
422                 s->ids.nr = 0;
423
424         s->pos = pos;
425         s->pos.snapshot = n.equiv;
426
427         darray_for_each(s->ids, i) {
428                 if (i->id == n.id)
429                         return 0;
430
431                 /*
432                  * We don't currently track rigorously whether snapshot cleanup needs
433                  * to be run, so it shouldn't be a fsck error yet:
434                  */
435                 if (i->equiv == n.equiv) {
436                         bch_err(c, "snapshot deletion did not finish:\n"
437                                 "  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
438                                 bch2_btree_ids[btree_id],
439                                 pos.inode, pos.offset,
440                                 i->id, n.id, n.equiv);
441                         return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
442                 }
443         }
444
445         ret = darray_push(&s->ids, n);
446         if (ret)
447                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
448                         s->ids.size);
449         return ret;
450 }
451
452 /**
453  * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
454  * and @ancestor hasn't been overwritten in @seen
455  *
456  * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
457  */
458 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
459                                     u32 id, u32 ancestor)
460 {
461         ssize_t i;
462
463         EBUG_ON(id > ancestor);
464         EBUG_ON(!bch2_snapshot_is_equiv(c, id));
465         EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
466
467         /* @ancestor should be the snapshot most recently added to @seen */
468         EBUG_ON(ancestor != seen->pos.snapshot);
469         EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
470
471         if (id == ancestor)
472                 return true;
473
474         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
475                 return false;
476
477         /*
478          * We know that @id is a descendant of @ancestor; we're checking if
479          * we've seen a key that overwrote @ancestor - i.e. also a descendant of
480          * @ancestor and with @id as a descendant.
481          *
482          * But we already know that we're scanning IDs between @id and @ancestor
483          * numerically, since snapshot ID lists are kept sorted, so if we find
484          * an id that's an ancestor of @id we're done:
485          */
486
487         for (i = seen->ids.nr - 2;
488              i >= 0 && seen->ids.data[i].equiv >= id;
489              --i)
490                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
491                         return false;
492
493         return true;
494 }
495
496 /**
497  * ref_visible - given a key with snapshot id @src that points to a key with
498  * snapshot id @dst, test whether there is some snapshot in which @dst is
499  * visible.
500  *
501  * This assumes we're visiting @src keys in natural key order.
502  *
503  * @s   - list of snapshot IDs already seen at @src
504  * @src - snapshot ID of src key
505  * @dst - snapshot ID of dst key
506  */
507 static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
508                        u32 src, u32 dst)
509 {
510         return dst <= src
511                 ? key_visible_in_snapshot(c, s, dst, src)
512                 : bch2_snapshot_is_ancestor(c, src, dst);
513 }
514
515 static int ref_visible2(struct bch_fs *c,
516                         u32 src, struct snapshots_seen *src_seen,
517                         u32 dst, struct snapshots_seen *dst_seen)
518 {
519         src = bch2_snapshot_equiv(c, src);
520         dst = bch2_snapshot_equiv(c, dst);
521
522         if (dst > src) {
523                 swap(dst, src);
524                 swap(dst_seen, src_seen);
525         }
526         return key_visible_in_snapshot(c, src_seen, dst, src);
527 }
528
529 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                               \
530         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
531              (_i)->snapshot <= (_snapshot); _i++)                                       \
532                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
533
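/*
 * inode_walker: collects every snapshot version of an inode while walking its
 * extents or dirents, so per-snapshot counts can be accumulated and later
 * compared against the inode fields:
 */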
534 struct inode_walker_entry {
535         struct bch_inode_unpacked inode;
536         u32                     snapshot;
537         bool                    seen_this_pos;
538         u64                     count;
539 };
540
541 struct inode_walker {
542         bool                            first_this_inode;
543         bool                            recalculate_sums;
544         struct bpos                     last_pos;
545
546         DARRAY(struct inode_walker_entry) inodes;
547 };
548
549 static void inode_walker_exit(struct inode_walker *w)
550 {
551         darray_exit(&w->inodes);
552 }
553
554 static struct inode_walker inode_walker_init(void)
555 {
556         return (struct inode_walker) { 0, };
557 }
558
559 static int add_inode(struct bch_fs *c, struct inode_walker *w,
560                      struct bkey_s_c inode)
561 {
562         struct bch_inode_unpacked u;
563
564         BUG_ON(bch2_inode_unpack(inode, &u));
565
566         return darray_push(&w->inodes, ((struct inode_walker_entry) {
567                 .inode          = u,
568                 .snapshot       = bch2_snapshot_equiv(c, inode.k->p.snapshot),
569         }));
570 }
571
572 static int get_inodes_all_snapshots(struct btree_trans *trans,
573                                     struct inode_walker *w, u64 inum)
574 {
575         struct bch_fs *c = trans->c;
576         struct btree_iter iter;
577         struct bkey_s_c k;
578         u32 restart_count = trans->restart_count;
579         int ret;
580
581         w->recalculate_sums = false;
582         w->inodes.nr = 0;
583
584         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
585                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
586                 if (k.k->p.offset != inum)
587                         break;
588
589                 if (bkey_is_inode(k.k))
590                         add_inode(c, w, k);
591         }
592         bch2_trans_iter_exit(trans, &iter);
593
594         if (ret)
595                 return ret;
596
597         w->first_this_inode = true;
598
599         if (trans_was_restarted(trans, restart_count))
600                 return -BCH_ERR_transaction_restart_nested;
601
602         return 0;
603 }
604
605 static struct inode_walker_entry *
606 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
607                           u32 snapshot, bool is_whiteout)
608 {
609         struct inode_walker_entry *i;
610
611         snapshot = bch2_snapshot_equiv(c, snapshot);
612
613         darray_for_each(w->inodes, i)
614                 if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
615                         goto found;
616
617         return NULL;
618 found:
619         BUG_ON(snapshot > i->snapshot);
620
621         if (snapshot != i->snapshot && !is_whiteout) {
622                 struct inode_walker_entry new = *i;
623                 size_t pos;
624                 int ret;
625
626                 new.snapshot = snapshot;
627                 new.count = 0;
628
629                 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
630                          w->last_pos.inode, snapshot, i->snapshot);
631
632                 while (i > w->inodes.data && i[-1].snapshot > snapshot)
633                         --i;
634
635                 pos = i - w->inodes.data;
636                 ret = darray_insert_item(&w->inodes, pos, new);
637                 if (ret)
638                         return ERR_PTR(ret);
639
640                 i = w->inodes.data + pos;
641         }
642
643         return i;
644 }
645
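/*
 * Find the inode_walker entry for @pos, refreshing the list of inode versions
 * whenever we move to a new inode number:
 */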
646 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
647                                              struct inode_walker *w, struct bpos pos,
648                                              bool is_whiteout)
649 {
650         if (w->last_pos.inode != pos.inode) {
651                 int ret = get_inodes_all_snapshots(trans, w, pos.inode);
652                 if (ret)
653                         return ERR_PTR(ret);
654         } else if (bkey_cmp(w->last_pos, pos)) {
655                 struct inode_walker_entry *i;
656
657                 darray_for_each(w->inodes, i)
658                         i->seen_this_pos = false;
659
660         }
661
662         w->last_pos = pos;
663
664         return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
665 }
666
667 static int __get_visible_inodes(struct btree_trans *trans,
668                                 struct inode_walker *w,
669                                 struct snapshots_seen *s,
670                                 u64 inum)
671 {
672         struct bch_fs *c = trans->c;
673         struct btree_iter iter;
674         struct bkey_s_c k;
675         int ret;
676
677         w->inodes.nr = 0;
678
679         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
680                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
681                 u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
682
683                 if (k.k->p.offset != inum)
684                         break;
685
686                 if (!ref_visible(c, s, s->pos.snapshot, equiv))
687                         continue;
688
689                 if (bkey_is_inode(k.k))
690                         add_inode(c, w, k);
691
692                 if (equiv >= s->pos.snapshot)
693                         break;
694         }
695         bch2_trans_iter_exit(trans, &iter);
696
697         return ret;
698 }
699
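/* Delete keys whose snapshot ID no longer exists; returns 1 if deleted: */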
700 static int check_key_has_snapshot(struct btree_trans *trans,
701                                   struct btree_iter *iter,
702                                   struct bkey_s_c k)
703 {
704         struct bch_fs *c = trans->c;
705         struct printbuf buf = PRINTBUF;
706         int ret = 0;
707
708         if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
709                         "key in missing snapshot: %s",
710                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
711                 ret = bch2_btree_delete_at(trans, iter,
712                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
713 fsck_err:
714         printbuf_exit(&buf);
715         return ret;
716 }
717
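/*
 * A hashed key (dirent/xattr) is at the wrong offset: delete it, then
 * re-insert it via the hash code so it lands in the correct slot:
 */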
718 static int hash_redo_key(struct btree_trans *trans,
719                          const struct bch_hash_desc desc,
720                          struct bch_hash_info *hash_info,
721                          struct btree_iter *k_iter, struct bkey_s_c k)
722 {
723         struct bkey_i *delete;
724         struct bkey_i *tmp;
725
726         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
727         if (IS_ERR(delete))
728                 return PTR_ERR(delete);
729
730         tmp = bch2_bkey_make_mut_noupdate(trans, k);
731         if (IS_ERR(tmp))
732                 return PTR_ERR(tmp);
733
734         bkey_init(&delete->k);
735         delete->k.p = k_iter->pos;
736         return  bch2_btree_iter_traverse(k_iter) ?:
737                 bch2_trans_update(trans, k_iter, delete, 0) ?:
738                 bch2_hash_set_snapshot(trans, desc, hash_info,
739                                        (subvol_inum) { 0, k.k->p.inode },
740                                        k.k->p.snapshot, tmp,
741                                        BCH_HASH_SET_MUST_CREATE,
742                                        BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
743                 bch2_trans_commit(trans, NULL, NULL,
744                                   BTREE_INSERT_NOFAIL|
745                                   BTREE_INSERT_LAZY_RW);
746 }
747
748 static int hash_check_key(struct btree_trans *trans,
749                           const struct bch_hash_desc desc,
750                           struct bch_hash_info *hash_info,
751                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
752 {
753         struct bch_fs *c = trans->c;
754         struct btree_iter iter = { NULL };
755         struct printbuf buf = PRINTBUF;
756         struct bkey_s_c k;
757         u64 hash;
758         int ret = 0;
759
760         if (hash_k.k->type != desc.key_type)
761                 return 0;
762
763         hash = desc.hash_bkey(hash_info, hash_k);
764
765         if (likely(hash == hash_k.k->p.offset))
766                 return 0;
767
768         if (hash_k.k->p.offset < hash)
769                 goto bad_hash;
770
771         for_each_btree_key_norestart(trans, iter, desc.btree_id,
772                                      SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
773                                      BTREE_ITER_SLOTS, k, ret) {
774                 if (bkey_eq(k.k->p, hash_k.k->p))
775                         break;
776
777                 if (fsck_err_on(k.k->type == desc.key_type &&
778                                 !desc.cmp_bkey(k, hash_k), c,
779                                 "duplicate hash table keys:\n%s",
780                                 (printbuf_reset(&buf),
781                                  bch2_bkey_val_to_text(&buf, c, hash_k),
782                                  buf.buf))) {
783                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
784                         break;
785                 }
786
787                 if (bkey_deleted(k.k)) {
788                         bch2_trans_iter_exit(trans, &iter);
789                         goto bad_hash;
790                 }
791         }
792 out:
793         bch2_trans_iter_exit(trans, &iter);
794         printbuf_exit(&buf);
795         return ret;
796 bad_hash:
797         if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
798                      bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
799                      (printbuf_reset(&buf),
800                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
801                 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
802                 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
803                         bch_err(c, "hash_redo_key err %s", bch2_err_str(ret));
804                 if (ret)
805                         return ret;
806                 ret = -BCH_ERR_transaction_restart_nested;
807         }
808 fsck_err:
809         goto out;
810 }
811
812 static int check_inode(struct btree_trans *trans,
813                        struct btree_iter *iter,
814                        struct bkey_s_c k,
815                        struct bch_inode_unpacked *prev,
816                        struct snapshots_seen *s,
817                        bool full)
818 {
819         struct bch_fs *c = trans->c;
820         struct bch_inode_unpacked u;
821         bool do_update = false;
822         int ret;
823
824         ret = check_key_has_snapshot(trans, iter, k);
825         if (ret < 0)
826                 goto err;
827         if (ret)
828                 return 0;
829
830         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
831         if (ret)
832                 goto err;
833
834         /*
835          * if snapshot id isn't a leaf node, skip it - deletion in
836          * particular is not atomic, so on the internal snapshot nodes
837          * we can see inodes marked for deletion after a clean shutdown
838          */
839         if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot))
840                 return 0;
841
842         if (!bkey_is_inode(k.k))
843                 return 0;
844
845         BUG_ON(bch2_inode_unpack(k, &u));
846
847         if (!full &&
848             !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
849                             BCH_INODE_I_SECTORS_DIRTY|
850                             BCH_INODE_UNLINKED)))
851                 return 0;
852
853         if (prev->bi_inum != u.bi_inum)
854                 *prev = u;
855
856         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
857                         inode_d_type(prev)      != inode_d_type(&u), c,
858                         "inodes in different snapshots don't match")) {
859                 bch_err(c, "repair not implemented yet");
860                 return -EINVAL;
861         }
862
863         if (u.bi_flags & BCH_INODE_UNLINKED &&
864             (!c->sb.clean ||
865              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
866                       u.bi_inum))) {
867                 bch2_trans_unlock(trans);
868                 bch2_fs_lazy_rw(c);
869
870                 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
871                 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
872                         bch_err(c, "error in fsck: error while deleting inode: %s",
873                                 bch2_err_str(ret));
874                 return ret;
875         }
876
877         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
878             (!c->sb.clean ||
879              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
880                       u.bi_inum))) {
881                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
882
883                 bch2_trans_unlock(trans);
884                 bch2_fs_lazy_rw(c);
885
886                 /*
887                  * XXX: need to truncate partial blocks too here - or ideally
888                  * just switch units to bytes and that issue goes away
889                  */
890                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
891                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
892                                      iter->pos.snapshot),
893                                 POS(u.bi_inum, U64_MAX),
894                                 0, NULL);
895                 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
896                         bch_err(c, "error in fsck: error truncating inode: %s",
897                                 bch2_err_str(ret));
898                 if (ret)
899                         return ret;
900
901                 /*
902                  * We truncated without our normal sector accounting hook, just
903                  * make sure we recalculate it:
904                  */
905                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
906
907                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
908                 do_update = true;
909         }
910
911         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
912             (!c->sb.clean ||
913              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
914                       u.bi_inum))) {
915                 s64 sectors;
916
917                 bch_verbose(c, "recounting sectors for inode %llu",
918                             u.bi_inum);
919
920                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
921                 if (sectors < 0) {
922                         bch_err(c, "error in fsck: error recounting inode sectors: %s",
923                                 bch2_err_str(sectors));
924                         return sectors;
925                 }
926
927                 u.bi_sectors = sectors;
928                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
929                 do_update = true;
930         }
931
932         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
933                 u.bi_dir = 0;
934                 u.bi_dir_offset = 0;
935                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
936                 do_update = true;
937         }
938
939         if (do_update) {
940                 ret = __write_inode(trans, &u, iter->pos.snapshot);
941                 if (ret)
942                         bch_err(c, "error in fsck: error updating inode: %s",
943                                 bch2_err_str(ret));
944         }
945 err:
946 fsck_err:
947         if (ret)
948                 bch_err_fn(c, ret);
949         return ret;
950 }
951
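/*
 * Walk the inodes btree across all snapshots, checking and repairing each
 * inode (unlinked, i_size dirty, i_sectors dirty, untrusted backpointer):
 */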
952 noinline_for_stack
953 int bch2_check_inodes(struct bch_fs *c)
954 {
955         bool full = c->opts.fsck;
956         struct btree_trans trans;
957         struct btree_iter iter;
958         struct bch_inode_unpacked prev = { 0 };
959         struct snapshots_seen s;
960         struct bkey_s_c k;
961         int ret;
962
963         snapshots_seen_init(&s);
964         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
965
966         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
967                         POS_MIN,
968                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
969                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
970                 check_inode(&trans, &iter, k, &prev, &s, full));
971
972         bch2_trans_exit(&trans);
973         snapshots_seen_exit(&s);
974         if (ret)
975                 bch_err_fn(c, ret);
976         return ret;
977 }
978
979 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
980                                                 struct btree_iter *iter,
981                                                 struct bpos pos)
982 {
983         return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
984 }
985
986 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
987                                    struct bkey_s_c_dirent d)
988 {
989         return  inode->bi_dir           == d.k->p.inode &&
990                 inode->bi_dir_offset    == d.k->p.offset;
991 }
992
993 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
994                                    struct bch_inode_unpacked *inode)
995 {
996         return d.v->d_type == DT_SUBVOL
997                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
998                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
999 }
1000
1001 static int inode_backpointer_exists(struct btree_trans *trans,
1002                                     struct bch_inode_unpacked *inode,
1003                                     u32 snapshot)
1004 {
1005         struct btree_iter iter;
1006         struct bkey_s_c_dirent d;
1007         int ret;
1008
1009         d = dirent_get_by_pos(trans, &iter,
1010                         SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
1011         ret = bkey_err(d);
1012         if (ret)
1013                 return bch2_err_matches(ret, ENOENT) ? 0 : ret;
1014
1015         ret = dirent_points_to_inode(d, inode);
1016         bch2_trans_iter_exit(trans, &iter);
1017         return ret;
1018 }
1019
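/*
 * Compare the sector counts accumulated while walking extents against each
 * inode version's i_sectors, repairing inodes that disagree:
 */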
1020 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1021 {
1022         struct bch_fs *c = trans->c;
1023         struct inode_walker_entry *i;
1024         u32 restart_count = trans->restart_count;
1025         int ret = 0;
1026         s64 count2;
1027
1028         darray_for_each(w->inodes, i) {
1029                 if (i->inode.bi_sectors == i->count)
1030                         continue;
1031
1032                 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1033
1034                 if (w->recalculate_sums)
1035                         i->count = count2;
1036
1037                 if (i->count != count2) {
1038                         bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1039                                 w->last_pos.inode, i->snapshot, i->count, count2);
1040                         return -BCH_ERR_internal_fsck_err;
1041                 }
1042
1043                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
1044                             "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1045                             w->last_pos.inode, i->snapshot,
1046                             i->inode.bi_sectors, i->count)) {
1047                         i->inode.bi_sectors = i->count;
1048                         ret = write_inode(trans, &i->inode, i->snapshot);
1049                         if (ret)
1050                                 break;
1051                 }
1052         }
1053 fsck_err:
1054         if (ret)
1055                 bch_err_fn(c, ret);
1056         if (!ret && trans_was_restarted(trans, restart_count))
1057                 ret = -BCH_ERR_transaction_restart_nested;
1058         return ret;
1059 }
1060
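/*
 * extent_ends: for the inode currently being walked, the end offset of the
 * most recent extent seen in each snapshot - used to detect overlapping
 * extents:
 */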
1061 struct extent_end {
1062         u32                     snapshot;
1063         u64                     offset;
1064         struct snapshots_seen   seen;
1065 };
1066
1067 struct extent_ends {
1068         struct bpos                     last_pos;
1069         DARRAY(struct extent_end)       e;
1070 };
1071
1072 static void extent_ends_reset(struct extent_ends *extent_ends)
1073 {
1074         struct extent_end *i;
1075
1076         darray_for_each(extent_ends->e, i)
1077                 snapshots_seen_exit(&i->seen);
1078
1079         extent_ends->e.nr = 0;
1080 }
1081
1082 static void extent_ends_exit(struct extent_ends *extent_ends)
1083 {
1084         extent_ends_reset(extent_ends);
1085         darray_exit(&extent_ends->e);
1086 }
1087
1088 static void extent_ends_init(struct extent_ends *extent_ends)
1089 {
1090         memset(extent_ends, 0, sizeof(*extent_ends));
1091 }
1092
1093 static int extent_ends_at(struct bch_fs *c,
1094                           struct extent_ends *extent_ends,
1095                           struct snapshots_seen *seen,
1096                           struct bkey_s_c k)
1097 {
1098         struct extent_end *i, n = (struct extent_end) {
1099                 .offset         = k.k->p.offset,
1100                 .snapshot       = k.k->p.snapshot,
1101                 .seen           = *seen,
1102         };
1103
1104         n.seen.ids.data = kmemdup(seen->ids.data,
1105                               sizeof(seen->ids.data[0]) * seen->ids.size,
1106                               GFP_KERNEL);
1107         if (!n.seen.ids.data)
1108                 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1109
1110         darray_for_each(extent_ends->e, i) {
1111                 if (i->snapshot == k.k->p.snapshot) {
1112                         snapshots_seen_exit(&i->seen);
1113                         *i = n;
1114                         return 0;
1115                 }
1116
1117                 if (i->snapshot >= k.k->p.snapshot)
1118                         break;
1119         }
1120
1121         return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1122 }
1123
1124 static int overlapping_extents_found(struct btree_trans *trans,
1125                                      enum btree_id btree,
1126                                      struct bpos pos1, struct bkey pos2,
1127                                      bool *fixed,
1128                                      struct extent_end *extent_end)
1129 {
1130         struct bch_fs *c = trans->c;
1131         struct printbuf buf = PRINTBUF;
1132         struct btree_iter iter1, iter2 = { NULL };
1133         struct bkey_s_c k1, k2;
1134         int ret;
1135
1136         BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1137
1138         bch2_trans_iter_init(trans, &iter1, btree, pos1,
1139                              BTREE_ITER_ALL_SNAPSHOTS|
1140                              BTREE_ITER_NOT_EXTENTS);
1141         k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1142         ret = bkey_err(k1);
1143         if (ret)
1144                 goto err;
1145
1146         prt_str(&buf, "\n  ");
1147         bch2_bkey_val_to_text(&buf, c, k1);
1148
1149         if (!bpos_eq(pos1, k1.k->p)) {
1150                 prt_str(&buf, "\n  wanted\n  ");
1151                 bch2_bpos_to_text(&buf, pos1);
1152                 prt_str(&buf, "\n  ");
1153                 bch2_bkey_to_text(&buf, &pos2);
1154
1155                 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1156                         __func__, buf.buf);
1157                 ret = -BCH_ERR_internal_fsck_err;
1158                 goto err;
1159         }
1160
1161         bch2_trans_copy_iter(&iter2, &iter1);
1162
1163         while (1) {
1164                 bch2_btree_iter_advance(&iter2);
1165
1166                 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1167                 ret = bkey_err(k2);
1168                 if (ret)
1169                         goto err;
1170
1171                 if (bpos_ge(k2.k->p, pos2.p))
1172                         break;
1173         }
1174
1175         prt_str(&buf, "\n  ");
1176         bch2_bkey_val_to_text(&buf, c, k2);
1177
1178         if (bpos_gt(k2.k->p, pos2.p) ||
1179             pos2.size != k2.k->size) {
1180                 bch_err(c, "%s: error finding second overlapping extent when repairing%s",
1181                         __func__, buf.buf);
1182                 ret = -BCH_ERR_internal_fsck_err;
1183                 goto err;
1184         }
1185
1186         prt_printf(&buf, "\n  overwriting %s extent",
1187                    pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1188
1189         if (fsck_err(c, "overlapping extents%s", buf.buf)) {
1190                 struct btree_iter *old_iter = &iter1;
1191                 struct disk_reservation res = { 0 };
1192
1193                 if (pos1.snapshot < pos2.p.snapshot) {
1194                         old_iter = &iter2;
1195                         swap(k1, k2);
1196                 }
1197
1198                 trans->extra_journal_res += bch2_bkey_sectors_compressed(k2);
1199
1200                 ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1201                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
1202                                 k1, k2) ?:
1203                         bch2_trans_commit(trans, &res, NULL,
1204                                 BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
1205                 bch2_disk_reservation_put(c, &res);
1206
1207                 if (ret)
1208                         goto err;
1209
1210                 *fixed = true;
1211
1212                 if (pos1.snapshot == pos2.p.snapshot)
1213                         extent_end->offset = bkey_start_offset(&pos2);
1214                 else
1215                         ret = -BCH_ERR_transaction_restart_nested;
1216         }
1217 fsck_err:
1218 err:
1219         bch2_trans_iter_exit(trans, &iter2);
1220         bch2_trans_iter_exit(trans, &iter1);
1221         printbuf_exit(&buf);
1222         return ret;
1223 }
1224
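/*
 * Check @k for overlap against the recorded extent ends in snapshots where
 * both keys are visible; overlaps are handed to overlapping_extents_found():
 */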
1225 static int check_overlapping_extents(struct btree_trans *trans,
1226                               struct snapshots_seen *seen,
1227                               struct extent_ends *extent_ends,
1228                               struct bkey_s_c k,
1229                               u32 equiv,
1230                               struct btree_iter *iter,
1231                               bool *fixed)
1232 {
1233         struct bch_fs *c = trans->c;
1234         struct extent_end *i;
1235         int ret = 0;
1236
1237         /* transaction restart, running again */
1238         if (bpos_eq(extent_ends->last_pos, k.k->p))
1239                 return 0;
1240
1241         if (extent_ends->last_pos.inode != k.k->p.inode)
1242                 extent_ends_reset(extent_ends);
1243
1244         darray_for_each(extent_ends->e, i) {
1245                 if (i->offset <= bkey_start_offset(k.k))
1246                         continue;
1247
1248                 if (!ref_visible2(c,
1249                                   k.k->p.snapshot, seen,
1250                                   i->snapshot, &i->seen))
1251                         continue;
1252
1253                 ret = overlapping_extents_found(trans, iter->btree_id,
1254                                                 SPOS(iter->pos.inode,
1255                                                      i->offset,
1256                                                      i->snapshot),
1257                                                 *k.k, fixed, i);
1258                 if (ret)
1259                         goto err;
1260         }
1261
1262         ret = extent_ends_at(c, extent_ends, seen, k);
1263         if (ret)
1264                 goto err;
1265
1266         extent_ends->last_pos = k.k->p;
1267 err:
1268         return ret;
1269 }
1270
1271 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1272                         struct bkey_s_c k,
1273                         struct inode_walker *inode,
1274                         struct snapshots_seen *s,
1275                         struct extent_ends *extent_ends)
1276 {
1277         struct bch_fs *c = trans->c;
1278         struct inode_walker_entry *i;
1279         struct printbuf buf = PRINTBUF;
1280         struct bpos equiv = k.k->p;
1281         int ret = 0;
1282
1283         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1284
1285         ret = check_key_has_snapshot(trans, iter, k);
1286         if (ret) {
1287                 ret = ret < 0 ? ret : 0;
1288                 goto out;
1289         }
1290
1291         if (inode->last_pos.inode != k.k->p.inode) {
1292                 ret = check_i_sectors(trans, inode);
1293                 if (ret)
1294                         goto err;
1295         }
1296
1297         i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
1298         ret = PTR_ERR_OR_ZERO(i);
1299         if (ret)
1300                 goto err;
1301
1302         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1303         if (ret)
1304                 goto err;
1305
1306         if (k.k->type != KEY_TYPE_whiteout) {
1307                 if (fsck_err_on(!i, c,
1308                                 "extent in missing inode:\n  %s",
1309                                 (printbuf_reset(&buf),
1310                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1311                         goto delete;
1312
1313                 if (fsck_err_on(i &&
1314                                 !S_ISREG(i->inode.bi_mode) &&
1315                                 !S_ISLNK(i->inode.bi_mode), c,
1316                                 "extent in non regular inode mode %o:\n  %s",
1317                                 i->inode.bi_mode,
1318                                 (printbuf_reset(&buf),
1319                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1320                         goto delete;
1321
1322                 ret = check_overlapping_extents(trans, s, extent_ends, k,
1323                                                 equiv.snapshot, iter,
1324                                                 &inode->recalculate_sums);
1325                 if (ret)
1326                         goto err;
1327         }
1328
1329         /*
1330          * Check inodes in reverse order, from oldest snapshots to newest,
1331          * starting from the inode that matches this extent's snapshot. If we
1332          * didn't have one, iterate over all inodes:
1333          */
1334         if (!i)
1335                 i = inode->inodes.data + inode->inodes.nr - 1;
1336
1337         for (;
1338              inode->inodes.data && i >= inode->inodes.data;
1339              --i) {
1340                 if (i->snapshot > equiv.snapshot ||
1341                     !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
1342                         continue;
1343
1344                 if (k.k->type != KEY_TYPE_whiteout) {
1345                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
1346                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1347                                         !bkey_extent_is_reservation(k), c,
1348                                         "extent type past end of inode %llu:%u, i_size %llu\n  %s",
1349                                         i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1350                                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1351                                 struct btree_iter iter2;
1352
1353                                 bch2_trans_copy_iter(&iter2, iter);
1354                                 bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1355                                 ret =   bch2_btree_iter_traverse(&iter2) ?:
1356                                         bch2_btree_delete_at(trans, &iter2,
1357                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1358                                 bch2_trans_iter_exit(trans, &iter2);
1359                                 if (ret)
1360                                         goto err;
1361
1362                                 iter->k.type = KEY_TYPE_whiteout;
1363                         }
1364
1365                         if (bkey_extent_is_allocation(k.k))
1366                                 i->count += k.k->size;
1367                 }
1368
1369                 i->seen_this_pos = true;
1370         }
1371 out:
1372 err:
1373 fsck_err:
1374         printbuf_exit(&buf);
1375
1376         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1377                 bch_err_fn(c, ret);
1378         return ret;
1379 delete:
1380         ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1381         goto out;
1382 }
1383
1384 /*
1385  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1386  * that i_size and i_sectors are consistent
1387  */
1388 int bch2_check_extents(struct bch_fs *c)
1389 {
1390         struct inode_walker w = inode_walker_init();
1391         struct snapshots_seen s;
1392         struct btree_trans trans;
1393         struct btree_iter iter;
1394         struct bkey_s_c k;
1395         struct extent_ends extent_ends;
1396         struct disk_reservation res = { 0 };
1397         int ret = 0;
1398
1399         snapshots_seen_init(&s);
1400         extent_ends_init(&extent_ends);
1401         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
1402
1403         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
1404                         POS(BCACHEFS_ROOT_INO, 0),
1405                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1406                         &res, NULL,
1407                         BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
1408                 bch2_disk_reservation_put(c, &res);
1409                 check_extent(&trans, &iter, k, &w, &s, &extent_ends);
1410         })) ?:
1411         check_i_sectors(&trans, &w);
1412
1413         bch2_disk_reservation_put(c, &res);
1414         extent_ends_exit(&extent_ends);
1415         inode_walker_exit(&w);
1416         bch2_trans_exit(&trans);
1417         snapshots_seen_exit(&s);
1418
1419         if (ret)
1420                 bch_err_fn(c, ret);
1421         return ret;
1422 }
1423
1424 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1425 {
1426         struct bch_fs *c = trans->c;
1427         struct inode_walker_entry *i;
1428         u32 restart_count = trans->restart_count;
1429         int ret = 0;
1430         s64 count2;
1431
1432         darray_for_each(w->inodes, i) {
1433                 if (i->inode.bi_nlink == i->count)
1434                         continue;
1435
1436                 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1437                 if (count2 < 0)
1438                         return count2;
1439
1440                 if (i->count != count2) {
1441                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1442                                 i->count, count2);
1443                         i->count = count2;
1444                         if (i->inode.bi_nlink == i->count)
1445                                 continue;
1446                 }
1447
1448                 if (fsck_err_on(i->inode.bi_nlink != i->count, c,
1449                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1450                                 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1451                         i->inode.bi_nlink = i->count;
1452                         ret = write_inode(trans, &i->inode, i->snapshot);
1453                         if (ret)
1454                                 break;
1455                 }
1456         }
1457 fsck_err:
1458         if (ret)
1459                 bch_err_fn(c, ret);
1460         if (!ret && trans_was_restarted(trans, restart_count))
1461                 ret = -BCH_ERR_transaction_restart_nested;
1462         return ret;
1463 }
1464
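/*
 * Verify that a dirent and its target inode point at each other (the inode's
 * bi_dir/bi_dir_offset backpointer, and the dirent's d_type), repairing
 * whichever side is inconsistent:
 */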
1465 static int check_dirent_target(struct btree_trans *trans,
1466                                struct btree_iter *iter,
1467                                struct bkey_s_c_dirent d,
1468                                struct bch_inode_unpacked *target,
1469                                u32 target_snapshot)
1470 {
1471         struct bch_fs *c = trans->c;
1472         struct bkey_i_dirent *n;
1473         bool backpointer_exists = true;
1474         struct printbuf buf = PRINTBUF;
1475         int ret = 0;
1476
1477         if (!target->bi_dir &&
1478             !target->bi_dir_offset) {
1479                 target->bi_dir          = d.k->p.inode;
1480                 target->bi_dir_offset   = d.k->p.offset;
1481
1482                 ret = __write_inode(trans, target, target_snapshot);
1483                 if (ret)
1484                         goto err;
1485         }
1486
1487         if (!inode_points_to_dirent(target, d)) {
1488                 ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
1489                 if (ret < 0)
1490                         goto err;
1491
1492                 backpointer_exists = ret;
1493                 ret = 0;
1494
1495                 if (fsck_err_on(S_ISDIR(target->bi_mode) &&
1496                                 backpointer_exists, c,
1497                                 "directory %llu with multiple links",
1498                                 target->bi_inum)) {
1499                         ret = __remove_dirent(trans, d.k->p);
1500                         goto out;
1501                 }
1502
1503                 if (fsck_err_on(backpointer_exists &&
1504                                 !target->bi_nlink, c,
1505                                 "inode %llu type %s has multiple links but i_nlink 0",
1506                                 target->bi_inum, bch2_d_types[d.v->d_type])) {
1507                         target->bi_nlink++;
1508                         target->bi_flags &= ~BCH_INODE_UNLINKED;
1509
1510                         ret = __write_inode(trans, target, target_snapshot);
1511                         if (ret)
1512                                 goto err;
1513                 }
1514
1515                 if (fsck_err_on(!backpointer_exists, c,
1516                                 "inode %llu:%u has wrong backpointer:\n"
1517                                 "got       %llu:%llu\n"
1518                                 "should be %llu:%llu",
1519                                 target->bi_inum, target_snapshot,
1520                                 target->bi_dir,
1521                                 target->bi_dir_offset,
1522                                 d.k->p.inode,
1523                                 d.k->p.offset)) {
1524                         target->bi_dir          = d.k->p.inode;
1525                         target->bi_dir_offset   = d.k->p.offset;
1526
1527                         ret = __write_inode(trans, target, target_snapshot);
1528                         if (ret)
1529                                 goto err;
1530                 }
1531         }
1532
1533         if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
1534                         "incorrect d_type: got %s, should be %s:\n%s",
1535                         bch2_d_type_str(d.v->d_type),
1536                         bch2_d_type_str(inode_d_type(target)),
1537                         (printbuf_reset(&buf),
1538                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1539                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1540                 ret = PTR_ERR_OR_ZERO(n);
1541                 if (ret)
1542                         goto err;
1543
1544                 bkey_reassemble(&n->k_i, d.s_c);
1545                 n->v.d_type = inode_d_type(target);
1546
1547                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1548                 if (ret)
1549                         goto err;
1550
1551                 d = dirent_i_to_s_c(n);
1552         }
1553
1554         if (d.v->d_type == DT_SUBVOL &&
1555             target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
1556             (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
1557              fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
1558                       le32_to_cpu(d.v->d_parent_subvol),
1559                       target->bi_parent_subvol))) {
1560                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1561                 ret = PTR_ERR_OR_ZERO(n);
1562                 if (ret)
1563                         goto err;
1564
1565                 bkey_reassemble(&n->k_i, d.s_c);
1566                 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1567
1568                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1569                 if (ret)
1570                         goto err;
1571
1572                 d = dirent_i_to_s_c(n);
1573         }
1574 out:
1575 err:
1576 fsck_err:
1577         printbuf_exit(&buf);
1578
1579         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1580                 bch_err_fn(c, ret);
1581         return ret;
1582 }
1583
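/*
 * Check a single dirent: it must have a valid snapshot, live in an existing
 * directory inode, hash to the correct slot, and point either at a valid
 * subvolume (DT_SUBVOL) or at least one visible target inode. Directory
 * dirents also bump the per-inode subdirectory count consumed by
 * check_subdir_count().
 */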
1584 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1585                         struct bkey_s_c k,
1586                         struct bch_hash_info *hash_info,
1587                         struct inode_walker *dir,
1588                         struct inode_walker *target,
1589                         struct snapshots_seen *s)
1590 {
1591         struct bch_fs *c = trans->c;
1592         struct bkey_s_c_dirent d;
1593         struct inode_walker_entry *i;
1594         struct printbuf buf = PRINTBUF;
1595         struct bpos equiv;
1596         int ret = 0;
1597
1598         ret = check_key_has_snapshot(trans, iter, k);
1599         if (ret) {
1600                 ret = ret < 0 ? ret : 0;
1601                 goto out;
1602         }
1603
1604         equiv = k.k->p;
1605         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1606
1607         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1608         if (ret)
1609                 goto err;
1610
1611         if (k.k->type == KEY_TYPE_whiteout)
1612                 goto out;
1613
1614         if (dir->last_pos.inode != k.k->p.inode) {
1615                 ret = check_subdir_count(trans, dir);
1616                 if (ret)
1617                         goto err;
1618         }
1619
1620         BUG_ON(!iter->path->should_be_locked);
1621
1622         i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
1623         ret = PTR_ERR_OR_ZERO(i);
1624         if (ret < 0)
1625                 goto err;
1626
1627         if (dir->first_this_inode && dir->inodes.nr)
1628                 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
1629         dir->first_this_inode = false;
1630
1631         if (fsck_err_on(!i, c,
1632                         "dirent in nonexistent directory:\n%s",
1633                         (printbuf_reset(&buf),
1634                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1635                 ret = bch2_btree_delete_at(trans, iter,
1636                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1637                 goto out;
1638         }
1639
1640         if (!i)
1641                 goto out;
1642
1643         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
1644                         "dirent in non-directory inode type %s:\n%s",
1645                         bch2_d_type_str(inode_d_type(&i->inode)),
1646                         (printbuf_reset(&buf),
1647                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1648                 ret = bch2_btree_delete_at(trans, iter, 0);
1649                 goto out;
1650         }
1651
1652         ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
1653         if (ret < 0)
1654                 goto err;
1655         if (ret) {
1656                 /* dirent has been deleted */
1657                 ret = 0;
1658                 goto out;
1659         }
1660
1661         if (k.k->type != KEY_TYPE_dirent)
1662                 goto out;
1663
1664         d = bkey_s_c_to_dirent(k);
1665
1666         if (d.v->d_type == DT_SUBVOL) {
1667                 struct bch_inode_unpacked subvol_root;
1668                 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1669                 u32 target_snapshot;
1670                 u64 target_inum;
1671
1672                 ret = __subvol_lookup(trans, target_subvol,
1673                                       &target_snapshot, &target_inum);
1674                 if (ret && !bch2_err_matches(ret, ENOENT))
1675                         goto err;
1676
1677                 if (fsck_err_on(ret, c,
1678                                 "dirent points to missing subvolume %u",
1679                                 le32_to_cpu(d.v->d_child_subvol))) {
1680                         ret = __remove_dirent(trans, d.k->p);
1681                         goto err;
1682                 }
1683
1684                 ret = __lookup_inode(trans, target_inum,
1685                                    &subvol_root, &target_snapshot);
1686                 if (ret && !bch2_err_matches(ret, ENOENT))
1687                         goto err;
1688
1689                 if (fsck_err_on(ret, c,
1690                                 "subvolume %u points to missing subvolume root %llu",
1691                                 target_subvol,
1692                                 target_inum)) {
1693                         bch_err(c, "repair not implemented yet");
1694                         ret = -EINVAL;
1695                         goto err;
1696                 }
1697
1698                 if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
1699                                 "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1700                                 target_inum,
1701                                 subvol_root.bi_subvol, target_subvol)) {
1702                         subvol_root.bi_subvol = target_subvol;
1703                         ret = __write_inode(trans, &subvol_root, target_snapshot);
1704                         if (ret)
1705                                 goto err;
1706                 }
1707
1708                 ret = check_dirent_target(trans, iter, d, &subvol_root,
1709                                           target_snapshot);
1710                 if (ret)
1711                         goto err;
1712         } else {
1713                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1714                 if (ret)
1715                         goto err;
1716
1717                 if (fsck_err_on(!target->inodes.nr, c,
1718                                 "dirent points to missing inode (equiv %u):\n%s",
1719                                 equiv.snapshot,
1720                                 (printbuf_reset(&buf),
1721                                  bch2_bkey_val_to_text(&buf, c, k),
1722                                  buf.buf))) {
1723                         ret = __remove_dirent(trans, d.k->p);
1724                         if (ret)
1725                                 goto err;
1726                 }
1727
1728                 darray_for_each(target->inodes, i) {
1729                         ret = check_dirent_target(trans, iter, d,
1730                                                   &i->inode, i->snapshot);
1731                         if (ret)
1732                                 goto err;
1733                 }
1734         }
1735
1736         if (d.v->d_type == DT_DIR)
1737                 for_each_visible_inode(c, s, dir, equiv.snapshot, i)
1738                         i->count++;
1739
1740 out:
1741 err:
1742 fsck_err:
1743         printbuf_exit(&buf);
1744
1745         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1746                 bch_err_fn(c, ret);
1747         return ret;
1748 }
1749
1750 /*
1751  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1752  * validate d_type
1753  */
1754 int bch2_check_dirents(struct bch_fs *c)
1755 {
1756         struct inode_walker dir = inode_walker_init();
1757         struct inode_walker target = inode_walker_init();
1758         struct snapshots_seen s;
1759         struct bch_hash_info hash_info;
1760         struct btree_trans trans;
1761         struct btree_iter iter;
1762         struct bkey_s_c k;
1763         int ret = 0;
1764
1765         snapshots_seen_init(&s);
1766         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1767
1768         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents,
1769                         POS(BCACHEFS_ROOT_INO, 0),
1770                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1771                         k,
1772                         NULL, NULL,
1773                         BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
1774                 check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s));
1775
1776         bch2_trans_exit(&trans);
1777         snapshots_seen_exit(&s);
1778         inode_walker_exit(&dir);
1779         inode_walker_exit(&target);
1780
1781         if (ret)
1782                 bch_err_fn(c, ret);
1783         return ret;
1784 }
1785
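/*
 * Check a single xattr: it must have a valid snapshot, belong to an existing
 * inode, and hash to the correct slot; xattrs for missing inodes are deleted.
 */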
1786 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
1787                        struct bkey_s_c k,
1788                        struct bch_hash_info *hash_info,
1789                        struct inode_walker *inode)
1790 {
1791         struct bch_fs *c = trans->c;
1792         struct inode_walker_entry *i;
1793         int ret;
1794
1795         ret = check_key_has_snapshot(trans, iter, k);
1796         if (ret)
1797                 return ret;
1798
1799         i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
1800         ret = PTR_ERR_OR_ZERO(i);
1801         if (ret)
1802                 return ret;
1803
1804         if (inode->first_this_inode && inode->inodes.nr)
1805                 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
1806         inode->first_this_inode = false;
1807
1808         if (fsck_err_on(!i, c,
1809                         "xattr for missing inode %llu",
1810                         k.k->p.inode))
1811                 return bch2_btree_delete_at(trans, iter, 0);
1812
1813         if (!i)
1814                 return 0;
1815
1816         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
1817 fsck_err:
1818         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1819                 bch_err_fn(c, ret);
1820         return ret;
1821 }
1822
1823 /*
1824  * Walk xattrs: verify that they all have a corresponding inode
1825  */
1826 int bch2_check_xattrs(struct bch_fs *c)
1827 {
1828         struct inode_walker inode = inode_walker_init();
1829         struct bch_hash_info hash_info;
1830         struct btree_trans trans;
1831         struct btree_iter iter;
1832         struct bkey_s_c k;
1833         int ret = 0;
1834
1835         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1836
1837         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
1838                         POS(BCACHEFS_ROOT_INO, 0),
1839                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1840                         k,
1841                         NULL, NULL,
1842                         BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
1843                 check_xattr(&trans, &iter, k, &hash_info, &inode));
1844
1845         bch2_trans_exit(&trans);
1846
1847         if (ret)
1848                 bch_err_fn(c, ret);
1849         return ret;
1850 }
1851
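/*
 * Ensure the root subvolume and root directory inode exist, recreating them
 * with default values if either is missing or the root inode is not a
 * directory.
 */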
1852 static int check_root_trans(struct btree_trans *trans)
1853 {
1854         struct bch_fs *c = trans->c;
1855         struct bch_inode_unpacked root_inode;
1856         u32 snapshot;
1857         u64 inum;
1858         int ret;
1859
1860         ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1861         if (ret && !bch2_err_matches(ret, ENOENT))
1862                 return ret;
1863
1864         if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
1865                 struct bkey_i_subvolume root_subvol;
1866
1867                 snapshot        = U32_MAX;
1868                 inum            = BCACHEFS_ROOT_INO;
1869
1870                 bkey_subvolume_init(&root_subvol.k_i);
1871                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1872                 root_subvol.v.flags     = 0;
1873                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1874                 root_subvol.v.inode     = cpu_to_le64(inum);
1875                 ret = commit_do(trans, NULL, NULL,
1876                                       BTREE_INSERT_NOFAIL|
1877                                       BTREE_INSERT_LAZY_RW,
1878                         __bch2_btree_insert(trans, BTREE_ID_subvolumes,
1879                                             &root_subvol.k_i, 0));
1880                 if (ret) {
1881                         bch_err(c, "error writing root subvol: %s", bch2_err_str(ret));
1882                         goto err;
1883                 }
1885         }
1886
1887         ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1888         if (ret && !bch2_err_matches(ret, ENOENT))
1889                 return ret;
1890
1891         if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
1892             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
1893                                 "root inode not a directory")) {
1894                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1895                                 0, NULL);
1896                 root_inode.bi_inum = inum;
1897
1898                 ret = __write_inode(trans, &root_inode, snapshot);
1899                 if (ret)
1900                         bch_err(c, "error writing root inode: %s", bch2_err_str(ret));
1901         }
1902 err:
1903 fsck_err:
1904         return ret;
1905 }
1906
1907 /* Get root directory, create if it doesn't exist: */
1908 int bch2_check_root(struct bch_fs *c)
1909 {
1910         int ret;
1911
1912         ret = bch2_trans_do(c, NULL, NULL,
1913                              BTREE_INSERT_NOFAIL|
1914                              BTREE_INSERT_LAZY_RW,
1915                 check_root_trans(&trans));
1916
1917         if (ret)
1918                 bch_err_fn(c, ret);
1919         return ret;
1920 }
1921
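/*
 * The directory structure pass walks from each inode up towards the root via
 * dirent backpointers; pathbuf records the (inum, snapshot) pairs already
 * visited so loops can be detected.
 */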
1922 struct pathbuf_entry {
1923         u64     inum;
1924         u32     snapshot;
1925 };
1926
1927 typedef DARRAY(struct pathbuf_entry) pathbuf;
1928
1929 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
1930 {
1931         struct pathbuf_entry *i;
1932
1933         darray_for_each(*p, i)
1934                 if (i->inum     == inum &&
1935                     i->snapshot == snapshot)
1936                         return true;
1937
1938         return false;
1939 }
1940
1941 static int path_down(struct bch_fs *c, pathbuf *p,
1942                      u64 inum, u32 snapshot)
1943 {
1944         int ret = darray_push(p, ((struct pathbuf_entry) {
1945                 .inum           = inum,
1946                 .snapshot       = snapshot,
1947         }));
1948
1949         if (ret)
1950                 bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
1951                         p->size);
1952         return ret;
1953 }
1954
1955 /*
1956  * Check that a given inode is reachable from the root:
1957  *
1958  * XXX: we should also be verifying that inodes are in the right subvolumes
1959  */
1960 static int check_path(struct btree_trans *trans,
1961                       pathbuf *p,
1962                       struct bch_inode_unpacked *inode,
1963                       u32 snapshot)
1964 {
1965         struct bch_fs *c = trans->c;
1966         int ret = 0;
1967
1968         snapshot = bch2_snapshot_equiv(c, snapshot);
1969         p->nr = 0;
1970
1971         while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
1972                  inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
1973                 struct btree_iter dirent_iter;
1974                 struct bkey_s_c_dirent d;
1975                 u32 parent_snapshot = snapshot;
1976
1977                 if (inode->bi_subvol) {
1978                         u64 inum;
1979
1980                         ret = subvol_lookup(trans, inode->bi_parent_subvol,
1981                                             &parent_snapshot, &inum);
1982                         if (ret)
1983                                 break;
1984                 }
1985
1986                 ret = lockrestart_do(trans,
1987                         PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
1988                                           SPOS(inode->bi_dir, inode->bi_dir_offset,
1989                                                parent_snapshot))).k));
1990                 if (ret && !bch2_err_matches(ret, ENOENT))
1991                         break;
1992
1993                 if (!ret && !dirent_points_to_inode(d, inode)) {
1994                         bch2_trans_iter_exit(trans, &dirent_iter);
1995                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
1996                 }
1997
1998                 if (bch2_err_matches(ret, ENOENT)) {
1999                         if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
2000                                      inode->bi_inum, snapshot,
2001                                      bch2_d_type_str(inode_d_type(inode)),
2002                                      inode->bi_nlink,
2003                                      inode->bi_dir,
2004                                      inode->bi_dir_offset))
2005                                 ret = reattach_inode(trans, inode, snapshot);
2006                         break;
2007                 }
2008
2009                 bch2_trans_iter_exit(trans, &dirent_iter);
2010
2011                 if (!S_ISDIR(inode->bi_mode))
2012                         break;
2013
2014                 ret = path_down(c, p, inode->bi_inum, snapshot);
2015                 if (ret) {
2016                         bch_err(c, "memory allocation failure");
2017                         return ret;
2018                 }
2019
2020                 snapshot = parent_snapshot;
2021
2022                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
2023                 if (ret) {
2024                         /* Should have been caught in dirents pass */
2025                         bch_err(c, "error looking up parent directory: %i", ret);
2026                         break;
2027                 }
2028
2029                 if (path_is_dup(p, inode->bi_inum, snapshot)) {
2030                         struct pathbuf_entry *i;
2031
2032                         /* XXX print path */
2033                         bch_err(c, "directory structure loop");
2034
2035                         darray_for_each(*p, i)
2036                                 pr_err("%llu:%u", i->inum, i->snapshot);
2037                         pr_err("%llu:%u", inode->bi_inum, snapshot);
2038
2039                         if (!fsck_err(c, "directory structure loop"))
2040                                 return 0;
2041
2042                         ret = commit_do(trans, NULL, NULL,
2043                                               BTREE_INSERT_NOFAIL|
2044                                               BTREE_INSERT_LAZY_RW,
2045                                         remove_backpointer(trans, inode));
2046                         if (ret) {
2047                                 bch_err(c, "error removing dirent: %i", ret);
2048                                 break;
2049                         }
2050
2051                         ret = reattach_inode(trans, inode, snapshot);
2052                 }
2053         }
2054 fsck_err:
2055         if (ret)
2056                 bch_err_fn(c, ret);
2057         return ret;
2058 }
2059
2060 /*
2061  * Check for unreachable inodes, as well as loops in the directory structure:
2062  * after bch2_check_dirents(), an inode whose backpointer doesn't resolve to a
2063  * matching dirent is unreachable.
2064  */
2065 int bch2_check_directory_structure(struct bch_fs *c)
2066 {
2067         struct btree_trans trans;
2068         struct btree_iter iter;
2069         struct bkey_s_c k;
2070         struct bch_inode_unpacked u;
2071         pathbuf path = { 0, };
2072         int ret;
2073
2074         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2075
2076         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
2077                            BTREE_ITER_INTENT|
2078                            BTREE_ITER_PREFETCH|
2079                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2080                 if (!bkey_is_inode(k.k))
2081                         continue;
2082
2083                 ret = bch2_inode_unpack(k, &u);
2084                 if (ret) {
2085                         /* Should have been caught earlier in fsck: */
2086                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
2087                         break;
2088                 }
2089
2090                 if (u.bi_flags & BCH_INODE_UNLINKED)
2091                         continue;
2092
2093                 ret = check_path(&trans, &path, &u, iter.pos.snapshot);
2094                 if (ret)
2095                         break;
2096         }
2097         bch2_trans_iter_exit(&trans, &iter);
2098         bch2_trans_exit(&trans);
2099         darray_exit(&path);
2100
2101         if (ret)
2102                 bch_err_fn(c, ret);
2103         return ret;
2104 }
2105
2106 /* check_nlink pass: */
2107
2108 struct nlink_table {
2109         size_t          nr;
2110         size_t          size;
2111
2112         struct nlink {
2113                 u64     inum;
2114                 u32     snapshot;
2115                 u32     count;
2116         }               *d;
2117 };
2118
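/*
 * Append an inode to the link count table, growing it as needed. Inodes are
 * visited in key order, so the table stays sorted by (inum, snapshot) and can
 * be binary searched by inc_link().
 */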
2119 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2120                      u64 inum, u32 snapshot)
2121 {
2122         if (t->nr == t->size) {
2123                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2124                 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2125
2126                 if (!d) {
2127                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2128                                 new_size);
2129                         return -BCH_ERR_ENOMEM_fsck_add_nlink;
2130                 }
2131
2132                 if (t->d)
2133                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2134                 kvfree(t->d);
2135
2136                 t->d = d;
2137                 t->size = new_size;
2138         }
2139
2141         t->d[t->nr++] = (struct nlink) {
2142                 .inum           = inum,
2143                 .snapshot       = snapshot,
2144         };
2145
2146         return 0;
2147 }
2148
2149 static int nlink_cmp(const void *_l, const void *_r)
2150 {
2151         const struct nlink *l = _l;
2152         const struct nlink *r = _r;
2153
2154         return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
2155 }
2156
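/*
 * Credit one dirent reference to the link count table: find the entries for
 * this inum, rewind to the first of them, and bump the count on each entry
 * the reference is visible in (per ref_visible()).
 */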
2157 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2158                      struct nlink_table *links,
2159                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2160 {
2161         struct nlink *link, key = {
2162                 .inum = inum, .snapshot = U32_MAX,
2163         };
2164
2165         if (inum < range_start || inum >= range_end)
2166                 return;
2167
2168         link = __inline_bsearch(&key, links->d, links->nr,
2169                                 sizeof(links->d[0]), nlink_cmp);
2170         if (!link)
2171                 return;
2172
2173         while (link > links->d && link[0].inum == link[-1].inum)
2174                 --link;
2175
2176         for (; link < links->d + links->nr && link->inum == inum; link++)
2177                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2178                         link->count++;
2179                         if (link->snapshot >= snapshot)
2180                                 break;
2181                 }
2182 }
2183
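/*
 * check_nlinks pass 1: collect every inode that can be hardlinked
 * (non-directories with a nonzero link count) into the table. If the table
 * can't be grown any further, *end records where this chunk stopped so the
 * caller can resume from there on the next iteration.
 */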
2184 noinline_for_stack
2185 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2186                                        struct nlink_table *t,
2187                                        u64 start, u64 *end)
2188 {
2189         struct btree_trans trans;
2190         struct btree_iter iter;
2191         struct bkey_s_c k;
2192         struct bch_inode_unpacked u;
2193         int ret = 0;
2194
2195         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2196
2197         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2198                            POS(0, start),
2199                            BTREE_ITER_INTENT|
2200                            BTREE_ITER_PREFETCH|
2201                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2202                 if (!bkey_is_inode(k.k))
2203                         continue;
2204
2205                 /* Should never fail, checked by bch2_inode_invalid: */
2206                 BUG_ON(bch2_inode_unpack(k, &u));
2207
2208                 /*
2209                  * Backpointer and directory structure checks are sufficient for
2210                  * directories, since they can't have hardlinks:
2211                  */
2212                 if (S_ISDIR(u.bi_mode))
2213                         continue;
2214
2215                 if (!u.bi_nlink)
2216                         continue;
2217
2218                 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2219                 if (ret) {
2220                         *end = k.k->p.offset;
2221                         ret = 0;
2222                         break;
2223                 }
2224
2225         }
2226         bch2_trans_iter_exit(&trans, &iter);
2227         bch2_trans_exit(&trans);
2228
2229         if (ret)
2230                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2231
2232         return ret;
2233 }
2234
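/*
 * check_nlinks pass 2: walk all dirents and, for each one that isn't a
 * directory or subvolume, credit a link to the matching inode in the current
 * [range_start, range_end) chunk.
 */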
2235 noinline_for_stack
2236 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2237                                      u64 range_start, u64 range_end)
2238 {
2239         struct btree_trans trans;
2240         struct snapshots_seen s;
2241         struct btree_iter iter;
2242         struct bkey_s_c k;
2243         struct bkey_s_c_dirent d;
2244         int ret;
2245
2246         snapshots_seen_init(&s);
2247
2248         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2249
2250         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
2251                            BTREE_ITER_INTENT|
2252                            BTREE_ITER_PREFETCH|
2253                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2254                 ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2255                 if (ret)
2256                         break;
2257
2258                 switch (k.k->type) {
2259                 case KEY_TYPE_dirent:
2260                         d = bkey_s_c_to_dirent(k);
2261
2262                         if (d.v->d_type != DT_DIR &&
2263                             d.v->d_type != DT_SUBVOL)
2264                                 inc_link(c, &s, links, range_start, range_end,
2265                                          le64_to_cpu(d.v->d_inum),
2266                                          bch2_snapshot_equiv(c, d.k->p.snapshot));
2267                         break;
2268                 }
2269         }
2270         bch2_trans_iter_exit(&trans, &iter);
2271
2272         if (ret)
2273                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
2274
2275         bch2_trans_exit(&trans);
2276         snapshots_seen_exit(&s);
2277         return ret;
2278 }
2279
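/*
 * check_nlinks pass 3, per-inode step: compare the link count we observed
 * against the inode's on-disk i_nlink and rewrite the inode if they disagree.
 * Returns 1 once we've walked past range_end, ending the current chunk.
 */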
2280 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2281                                      struct bkey_s_c k,
2282                                      struct nlink_table *links,
2283                                      size_t *idx, u64 range_end)
2284 {
2285         struct bch_fs *c = trans->c;
2286         struct bch_inode_unpacked u;
2287         struct nlink *link = &links->d[*idx];
2288         int ret = 0;
2289
2290         if (k.k->p.offset >= range_end)
2291                 return 1;
2292
2293         if (!bkey_is_inode(k.k))
2294                 return 0;
2295
2296         BUG_ON(bch2_inode_unpack(k, &u));
2297
2298         if (S_ISDIR(u.bi_mode))
2299                 return 0;
2300
2301         if (!u.bi_nlink)
2302                 return 0;
2303
2304         while ((cmp_int(link->inum, k.k->p.offset) ?:
2305                 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2306                 BUG_ON(*idx == links->nr);
2307                 link = &links->d[++*idx];
2308         }
2309
2310         if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
2311                         "inode %llu type %s has wrong i_nlink (%u, should be %u)",
2312                         u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2313                         bch2_inode_nlink_get(&u), link->count)) {
2314                 bch2_inode_nlink_set(&u, link->count);
2315                 ret = __write_inode(trans, &u, k.k->p.snapshot);
2316         }
2317 fsck_err:
2318         return ret;
2319 }
2320
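/*
 * check_nlinks pass 3 driver: iterate the inodes in the current chunk and
 * commit any fixes from check_nlinks_update_inode().
 */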
2321 noinline_for_stack
2322 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2323                                struct nlink_table *links,
2324                                u64 range_start, u64 range_end)
2325 {
2326         struct btree_trans trans;
2327         struct btree_iter iter;
2328         struct bkey_s_c k;
2329         size_t idx = 0;
2330         int ret = 0;
2331
2332         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2333
2334         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
2335                         POS(0, range_start),
2336                         BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
2337                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
2338                 check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end));
2339
2340         bch2_trans_exit(&trans);
2341
2342         if (ret < 0) {
2343                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2344                 return ret;
2345         }
2346
2347         return 0;
2348 }
2349
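/*
 * Top level of the nlink pass: run the three phases over successive inode
 * ranges so the table never has to hold every inode at once. For example, if
 * the table fills up at inode 1000, the first iteration covers [0, 1000) and
 * the next one resumes at 1000; when everything fits, the range end stays at
 * U64_MAX and the loop terminates after one pass.
 */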
2350 int bch2_check_nlinks(struct bch_fs *c)
2351 {
2352         struct nlink_table links = { 0 };
2353         u64 this_iter_range_start, next_iter_range_start = 0;
2354         int ret = 0;
2355
2356         do {
2357                 this_iter_range_start = next_iter_range_start;
2358                 next_iter_range_start = U64_MAX;
2359
2360                 ret = check_nlinks_find_hardlinks(c, &links,
2361                                                   this_iter_range_start,
2362                                                   &next_iter_range_start);
                if (ret)
                        break;
2363
2364                 ret = check_nlinks_walk_dirents(c, &links,
2365                                           this_iter_range_start,
2366                                           next_iter_range_start);
2367                 if (ret)
2368                         break;
2369
2370                 ret = check_nlinks_update_hardlinks(c, &links,
2371                                          this_iter_range_start,
2372                                          next_iter_range_start);
2373                 if (ret)
2374                         break;
2375
2376                 links.nr = 0;
2377         } while (next_iter_range_start != U64_MAX);
2378
2379         kvfree(links.d);
2380
2381         if (ret)
2382                 bch_err_fn(c, ret);
2383         return ret;
2384 }
2385
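/*
 * For filesystems from before bcachefs_metadata_version_reflink_p_fix:
 * reflink pointers may have nonzero front_pad/back_pad fields that need to
 * be cleared.
 */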
2386 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2387                              struct bkey_s_c k)
2388 {
2389         struct bkey_s_c_reflink_p p;
2390         struct bkey_i_reflink_p *u;
2391         int ret;
2392
2393         if (k.k->type != KEY_TYPE_reflink_p)
2394                 return 0;
2395
2396         p = bkey_s_c_to_reflink_p(k);
2397
2398         if (!p.v->front_pad && !p.v->back_pad)
2399                 return 0;
2400
2401         u = bch2_trans_kmalloc(trans, sizeof(*u));
2402         ret = PTR_ERR_OR_ZERO(u);
2403         if (ret)
2404                 return ret;
2405
2406         bkey_reassemble(&u->k_i, k);
2407         u->v.front_pad  = 0;
2408         u->v.back_pad   = 0;
2409
2410         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2411 }
2412
2413 int bch2_fix_reflink_p(struct bch_fs *c)
2414 {
2415         struct btree_iter iter;
2416         struct bkey_s_c k;
2417         int ret;
2418
2419         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2420                 return 0;
2421
2422         ret = bch2_trans_run(c,
2423                 for_each_btree_key_commit(&trans, iter,
2424                                 BTREE_ID_extents, POS_MIN,
2425                                 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
2426                                 BTREE_ITER_ALL_SNAPSHOTS, k,
2427                                 NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
2428                         fix_reflink_p_key(&trans, &iter, k)));
2429
2430         if (ret)
2431                 bch_err_fn(c, ret);
2432         return ret;
2433 }