]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
Update bcachefs sources to 1b14994029 bcachefs: Fragmentation LRU
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "darray.h"
7 #include "dirent.h"
8 #include "error.h"
9 #include "fs-common.h"
10 #include "fsck.h"
11 #include "inode.h"
12 #include "keylist.h"
13 #include "subvolume.h"
14 #include "super.h"
15 #include "xattr.h"
16
17 #include <linux/bsearch.h>
18 #include <linux/dcache.h> /* struct qstr */
19
20 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
21
22 /*
23  * XXX: this is handling transaction restarts without returning
24  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
25  */
26 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
27                                     u32 snapshot)
28 {
29         struct btree_iter iter;
30         struct bkey_s_c k;
31         u64 sectors = 0;
32         int ret;
33
34         for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
35                                 SPOS(inum, 0, snapshot),
36                                 POS(inum, U64_MAX),
37                                 0, k, ret)
38                 if (bkey_extent_is_allocation(k.k))
39                         sectors += k.k->size;
40
41         bch2_trans_iter_exit(trans, &iter);
42
43         return ret ?: sectors;
44 }
45
46 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
47                                     u32 snapshot)
48 {
49         struct btree_iter iter;
50         struct bkey_s_c k;
51         struct bkey_s_c_dirent d;
52         u64 subdirs = 0;
53         int ret;
54
55         for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
56                                 SPOS(inum, 0, snapshot),
57                                 POS(inum, U64_MAX),
58                                 0, k, ret) {
59                 if (k.k->type != KEY_TYPE_dirent)
60                         continue;
61
62                 d = bkey_s_c_to_dirent(k);
63                 if (d.v->d_type == DT_DIR)
64                         subdirs++;
65         }
66         bch2_trans_iter_exit(trans, &iter);
67
68         return ret ?: subdirs;
69 }
70
71 static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
72                                     u32 *subvol)
73 {
74         struct btree_iter iter;
75         struct bkey_s_c k;
76         int ret;
77
78         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
79                              POS(0, snapshot), 0);
80         k = bch2_btree_iter_peek_slot(&iter);
81         ret = bkey_err(k);
82         if (ret)
83                 goto err;
84
85         if (k.k->type != KEY_TYPE_snapshot) {
86                 bch_err(trans->c, "snapshot %u not fonud", snapshot);
87                 ret = -ENOENT;
88                 goto err;
89         }
90
91         *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
92 err:
93         bch2_trans_iter_exit(trans, &iter);
94         return ret;
95
96 }
97
98 static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
99                            u32 *snapshot, u64 *inum)
100 {
101         struct bch_subvolume s;
102         int ret;
103
104         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
105
106         *snapshot = le32_to_cpu(s.snapshot);
107         *inum = le64_to_cpu(s.inode);
108         return ret;
109 }
110
111 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
112                          u32 *snapshot, u64 *inum)
113 {
114         return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
115 }
116
117 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
118                               struct bch_inode_unpacked *inode)
119 {
120         struct btree_iter iter;
121         struct bkey_s_c k;
122         int ret;
123
124         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
125                              POS(0, inode_nr),
126                              BTREE_ITER_ALL_SNAPSHOTS);
127         k = bch2_btree_iter_peek(&iter);
128         ret = bkey_err(k);
129         if (ret)
130                 goto err;
131
132         if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
133                 ret = -ENOENT;
134                 goto err;
135         }
136
137         ret = bch2_inode_unpack(k, inode);
138 err:
139         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
140                 bch_err(trans->c, "error fetching inode %llu: %s",
141                         inode_nr, bch2_err_str(ret));
142         bch2_trans_iter_exit(trans, &iter);
143         return ret;
144 }
145
146 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
147                           struct bch_inode_unpacked *inode,
148                           u32 *snapshot)
149 {
150         struct btree_iter iter;
151         struct bkey_s_c k;
152         int ret;
153
154         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
155                              SPOS(0, inode_nr, *snapshot), 0);
156         k = bch2_btree_iter_peek_slot(&iter);
157         ret = bkey_err(k);
158         if (ret)
159                 goto err;
160
161         ret = bkey_is_inode(k.k)
162                 ? bch2_inode_unpack(k, inode)
163                 : -ENOENT;
164         if (!ret)
165                 *snapshot = iter.pos.snapshot;
166 err:
167         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
168                 bch_err(trans->c, "error fetching inode %llu:%u: %s",
169                         inode_nr, *snapshot, bch2_err_str(ret));
170         bch2_trans_iter_exit(trans, &iter);
171         return ret;
172 }
173
174 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
175                         struct bch_inode_unpacked *inode,
176                         u32 *snapshot)
177 {
178         return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
179 }
180
181 static int __lookup_dirent(struct btree_trans *trans,
182                            struct bch_hash_info hash_info,
183                            subvol_inum dir, struct qstr *name,
184                            u64 *target, unsigned *type)
185 {
186         struct btree_iter iter;
187         struct bkey_s_c_dirent d;
188         int ret;
189
190         ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
191                                &hash_info, dir, name, 0);
192         if (ret)
193                 return ret;
194
195         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
196         *target = le64_to_cpu(d.v->d_inum);
197         *type = d.v->d_type;
198         bch2_trans_iter_exit(trans, &iter);
199         return 0;
200 }
201
202 static int __write_inode(struct btree_trans *trans,
203                          struct bch_inode_unpacked *inode,
204                          u32 snapshot)
205 {
206         struct btree_iter iter;
207         int ret;
208
209         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
210                             SPOS(0, inode->bi_inum, snapshot),
211                             BTREE_ITER_INTENT);
212
213         ret   = bch2_btree_iter_traverse(&iter) ?:
214                 bch2_inode_write(trans, &iter, inode);
215         bch2_trans_iter_exit(trans, &iter);
216         return ret;
217 }
218
219 static int write_inode(struct btree_trans *trans,
220                        struct bch_inode_unpacked *inode,
221                        u32 snapshot)
222 {
223         int ret = commit_do(trans, NULL, NULL,
224                                   BTREE_INSERT_NOFAIL|
225                                   BTREE_INSERT_LAZY_RW,
226                                   __write_inode(trans, inode, snapshot));
227         if (ret)
228                 bch_err(trans->c, "error in fsck: error updating inode: %s",
229                         bch2_err_str(ret));
230         return ret;
231 }
232
233 static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
234 {
235         struct bch_fs *c = trans->c;
236         struct btree_iter iter = { NULL };
237         struct bkey_i_inode_generation delete;
238         struct bch_inode_unpacked inode_u;
239         struct bkey_s_c k;
240         int ret;
241
242         do {
243                 ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
244                                                       SPOS(inum, 0, snapshot),
245                                                       SPOS(inum, U64_MAX, snapshot),
246                                                       0, NULL) ?:
247                         bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
248                                                       SPOS(inum, 0, snapshot),
249                                                       SPOS(inum, U64_MAX, snapshot),
250                                                       0, NULL) ?:
251                         bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
252                                                       SPOS(inum, 0, snapshot),
253                                                       SPOS(inum, U64_MAX, snapshot),
254                                                       0, NULL);
255         } while (ret == -BCH_ERR_transaction_restart_nested);
256         if (ret)
257                 goto err;
258 retry:
259         bch2_trans_begin(trans);
260
261         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
262                              SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
263         k = bch2_btree_iter_peek_slot(&iter);
264
265         ret = bkey_err(k);
266         if (ret)
267                 goto err;
268
269         if (!bkey_is_inode(k.k)) {
270                 bch2_fs_inconsistent(c,
271                                      "inode %llu:%u not found when deleting",
272                                      inum, snapshot);
273                 ret = -EIO;
274                 goto err;
275         }
276
277         bch2_inode_unpack(k, &inode_u);
278
279         /* Subvolume root? */
280         if (inode_u.bi_subvol)
281                 bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
282
283         bkey_inode_generation_init(&delete.k_i);
284         delete.k.p = iter.pos;
285         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
286
287         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
288                 bch2_trans_commit(trans, NULL, NULL,
289                                 BTREE_INSERT_NOFAIL);
290 err:
291         bch2_trans_iter_exit(trans, &iter);
292         if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
293                 goto retry;
294
295         return ret ?: -BCH_ERR_transaction_restart_nested;
296 }
297
298 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
299 {
300         struct bch_fs *c = trans->c;
301         struct btree_iter iter;
302         struct bch_inode_unpacked dir_inode;
303         struct bch_hash_info dir_hash_info;
304         int ret;
305
306         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
307         if (ret)
308                 goto err;
309
310         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
311
312         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
313
314         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
315                                   &dir_hash_info, &iter,
316                                   BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
317         bch2_trans_iter_exit(trans, &iter);
318 err:
319         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
320                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
321         return ret;
322 }
323
324 /* Get lost+found, create if it doesn't exist: */
325 static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
326                             struct bch_inode_unpacked *lostfound)
327 {
328         struct bch_fs *c = trans->c;
329         struct bch_inode_unpacked root;
330         struct bch_hash_info root_hash_info;
331         struct qstr lostfound_str = QSTR("lost+found");
332         subvol_inum root_inum = { .subvol = subvol };
333         u64 inum = 0;
334         unsigned d_type = 0;
335         u32 snapshot;
336         int ret;
337
338         ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
339         if (ret)
340                 return ret;
341
342         ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
343         if (ret)
344                 return ret;
345
346         root_hash_info = bch2_hash_info_init(c, &root);
347
348         ret = __lookup_dirent(trans, root_hash_info, root_inum,
349                             &lostfound_str, &inum, &d_type);
350         if (ret == -ENOENT) {
351                 bch_notice(c, "creating lost+found");
352                 goto create_lostfound;
353         }
354
355         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
356                 bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret));
357         if (ret)
358                 return ret;
359
360         if (d_type != DT_DIR) {
361                 bch_err(c, "error looking up lost+found: not a directory");
362                 return ret;
363         }
364
365         /*
366          * The check_dirents pass has already run, dangling dirents
367          * shouldn't exist here:
368          */
369         return __lookup_inode(trans, inum, lostfound, &snapshot);
370
371 create_lostfound:
372         bch2_inode_init_early(c, lostfound);
373
374         ret = bch2_create_trans(trans, root_inum, &root,
375                                 lostfound, &lostfound_str,
376                                 0, 0, S_IFDIR|0700, 0, NULL, NULL,
377                                 (subvol_inum) { }, 0);
378         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
379                 bch_err(c, "error creating lost+found: %s", bch2_err_str(ret));
380         return ret;
381 }
382
383 static int __reattach_inode(struct btree_trans *trans,
384                           struct bch_inode_unpacked *inode,
385                           u32 inode_snapshot)
386 {
387         struct bch_hash_info dir_hash;
388         struct bch_inode_unpacked lostfound;
389         char name_buf[20];
390         struct qstr name;
391         u64 dir_offset = 0;
392         u32 subvol;
393         int ret;
394
395         ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
396         if (ret)
397                 return ret;
398
399         ret = lookup_lostfound(trans, subvol, &lostfound);
400         if (ret)
401                 return ret;
402
403         if (S_ISDIR(inode->bi_mode)) {
404                 lostfound.bi_nlink++;
405
406                 ret = __write_inode(trans, &lostfound, U32_MAX);
407                 if (ret)
408                         return ret;
409         }
410
411         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
412
413         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
414         name = (struct qstr) QSTR(name_buf);
415
416         ret = bch2_dirent_create(trans,
417                                  (subvol_inum) {
418                                         .subvol = subvol,
419                                         .inum = lostfound.bi_inum,
420                                  },
421                                  &dir_hash,
422                                  inode_d_type(inode),
423                                  &name, inode->bi_inum, &dir_offset,
424                                  BCH_HASH_SET_MUST_CREATE);
425         if (ret)
426                 return ret;
427
428         inode->bi_dir           = lostfound.bi_inum;
429         inode->bi_dir_offset    = dir_offset;
430
431         return __write_inode(trans, inode, inode_snapshot);
432 }
433
434 static int reattach_inode(struct btree_trans *trans,
435                           struct bch_inode_unpacked *inode,
436                           u32 inode_snapshot)
437 {
438         int ret = commit_do(trans, NULL, NULL,
439                                   BTREE_INSERT_LAZY_RW|
440                                   BTREE_INSERT_NOFAIL,
441                         __reattach_inode(trans, inode, inode_snapshot));
442         if (ret) {
443                 bch_err(trans->c, "error reattaching inode %llu: %s",
444                         inode->bi_inum, bch2_err_str(ret));
445                 return ret;
446         }
447
448         return ret;
449 }
450
451 static int remove_backpointer(struct btree_trans *trans,
452                               struct bch_inode_unpacked *inode)
453 {
454         struct btree_iter iter;
455         struct bkey_s_c k;
456         int ret;
457
458         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
459                              POS(inode->bi_dir, inode->bi_dir_offset), 0);
460         k = bch2_btree_iter_peek_slot(&iter);
461         ret = bkey_err(k);
462         if (ret)
463                 goto out;
464         if (k.k->type != KEY_TYPE_dirent) {
465                 ret = -ENOENT;
466                 goto out;
467         }
468
469         ret = __remove_dirent(trans, k.k->p);
470 out:
471         bch2_trans_iter_exit(trans, &iter);
472         return ret;
473 }
474
475 struct snapshots_seen_entry {
476         u32                             id;
477         u32                             equiv;
478 };
479
480 struct snapshots_seen {
481         struct bpos                     pos;
482         DARRAY(struct snapshots_seen_entry) ids;
483 };
484
485 static inline void snapshots_seen_exit(struct snapshots_seen *s)
486 {
487         darray_exit(&s->ids);
488 }
489
490 static inline void snapshots_seen_init(struct snapshots_seen *s)
491 {
492         memset(s, 0, sizeof(*s));
493 }
494
495 static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
496 {
497         struct snapshots_seen_entry *i, n = { id, id };
498         int ret;
499
500         darray_for_each(s->ids, i) {
501                 if (n.equiv < i->equiv)
502                         break;
503
504                 if (i->equiv == n.equiv) {
505                         bch_err(c, "%s(): adding duplicate snapshot", __func__);
506                         return -EINVAL;
507                 }
508         }
509
510         ret = darray_insert_item(&s->ids, i - s->ids.data, n);
511         if (ret)
512                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
513                         s->ids.size);
514         return ret;
515 }
516
517 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
518                                  enum btree_id btree_id, struct bpos pos)
519 {
520         struct snapshots_seen_entry *i, n = {
521                 .id     = pos.snapshot,
522                 .equiv  = bch2_snapshot_equiv(c, pos.snapshot),
523         };
524         int ret = 0;
525
526         if (!bkey_eq(s->pos, pos))
527                 s->ids.nr = 0;
528
529         pos.snapshot = n.equiv;
530         s->pos = pos;
531
532         darray_for_each(s->ids, i)
533                 if (i->equiv == n.equiv) {
534                         if (fsck_err_on(i->id != n.id, c,
535                                         "snapshot deletion did not run correctly:\n"
536                                         "  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
537                                         bch2_btree_ids[btree_id],
538                                         pos.inode, pos.offset,
539                                         i->id, n.id, n.equiv))
540                                 return -BCH_ERR_need_snapshot_cleanup;
541
542                         return 0;
543                 }
544
545         ret = darray_push(&s->ids, n);
546         if (ret)
547                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
548                         s->ids.size);
549 fsck_err:
550         return ret;
551 }
552
553 /**
554  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
555  * and @ancestor hasn't been overwritten in @seen
556  *
557  * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
558  */
559 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
560                                     u32 id, u32 ancestor)
561 {
562         ssize_t i;
563         u32 top = seen->ids.nr ? seen->ids.data[seen->ids.nr - 1].equiv : 0;
564
565         BUG_ON(id > ancestor);
566         BUG_ON(!bch2_snapshot_is_equiv(c, id));
567         BUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
568
569         /* @ancestor should be the snapshot most recently added to @seen */
570         BUG_ON(ancestor != seen->pos.snapshot);
571         BUG_ON(ancestor != top);
572
573         if (id == ancestor)
574                 return true;
575
576         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
577                 return false;
578
579         for (i = seen->ids.nr - 2;
580              i >= 0 && seen->ids.data[i].equiv >= id;
581              --i)
582                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) &&
583                     bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor))
584                         return false;
585
586         return true;
587 }
588
589 /**
590  * ref_visible - given a key with snapshot id @src that points to a key with
591  * snapshot id @dst, test whether there is some snapshot in which @dst is
592  * visible.
593  *
594  * This assumes we're visiting @src keys in natural key order.
595  *
596  * @s   - list of snapshot IDs already seen at @src
597  * @src - snapshot ID of src key
598  * @dst - snapshot ID of dst key
599  */
600 static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
601                        u32 src, u32 dst)
602 {
603         return dst <= src
604                 ? key_visible_in_snapshot(c, s, dst, src)
605                 : bch2_snapshot_is_ancestor(c, src, dst);
606 }
607
608 static int ref_visible2(struct bch_fs *c,
609                        u32 src, struct snapshots_seen *src_seen,
610                        u32 dst, struct snapshots_seen *dst_seen)
611 {
612         if (dst > src) {
613                 swap(dst, src);
614                 swap(dst_seen, src_seen);
615         }
616         return key_visible_in_snapshot(c, src_seen, dst, src);
617 }
618
619 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                               \
620         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
621              (_i)->snapshot <= (_snapshot); _i++)                                       \
622                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
623
624 struct inode_walker_entry {
625         struct bch_inode_unpacked inode;
626         u32                     snapshot;
627         u64                     count;
628 };
629
630 struct inode_walker {
631         bool                            first_this_inode;
632         u64                             cur_inum;
633
634         DARRAY(struct inode_walker_entry) inodes;
635 };
636
637 static void inode_walker_exit(struct inode_walker *w)
638 {
639         darray_exit(&w->inodes);
640 }
641
642 static struct inode_walker inode_walker_init(void)
643 {
644         return (struct inode_walker) { 0, };
645 }
646
647 static int add_inode(struct bch_fs *c, struct inode_walker *w,
648                      struct bkey_s_c inode)
649 {
650         struct bch_inode_unpacked u;
651
652         BUG_ON(bch2_inode_unpack(inode, &u));
653
654         return darray_push(&w->inodes, ((struct inode_walker_entry) {
655                 .inode          = u,
656                 .snapshot       = bch2_snapshot_equiv(c, inode.k->p.snapshot),
657         }));
658 }
659
660 static int __walk_inode(struct btree_trans *trans,
661                         struct inode_walker *w, struct bpos pos)
662 {
663         struct bch_fs *c = trans->c;
664         struct btree_iter iter;
665         struct bkey_s_c k;
666         u32 restart_count = trans->restart_count;
667         unsigned i;
668         int ret;
669
670         pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot);
671
672         if (pos.inode == w->cur_inum) {
673                 w->first_this_inode = false;
674                 goto lookup_snapshot;
675         }
676
677         w->inodes.nr = 0;
678
679         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
680                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
681                 if (k.k->p.offset != pos.inode)
682                         break;
683
684                 if (bkey_is_inode(k.k))
685                         add_inode(c, w, k);
686         }
687         bch2_trans_iter_exit(trans, &iter);
688
689         if (ret)
690                 return ret;
691
692         w->cur_inum             = pos.inode;
693         w->first_this_inode     = true;
694
695         if (trans_was_restarted(trans, restart_count))
696                 return -BCH_ERR_transaction_restart_nested;
697
698 lookup_snapshot:
699         for (i = 0; i < w->inodes.nr; i++)
700                 if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot))
701                         goto found;
702         return INT_MAX;
703 found:
704         BUG_ON(pos.snapshot > w->inodes.data[i].snapshot);
705
706         if (pos.snapshot != w->inodes.data[i].snapshot) {
707                 struct inode_walker_entry e = w->inodes.data[i];
708
709                 e.snapshot = pos.snapshot;
710                 e.count = 0;
711
712                 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
713                          pos.inode, pos.snapshot, w->inodes.data[i].snapshot);
714
715                 while (i && w->inodes.data[i - 1].snapshot > pos.snapshot)
716                         --i;
717
718                 ret = darray_insert_item(&w->inodes, i, e);
719                 if (ret)
720                         return ret;
721         }
722
723         return i;
724 }
725
726 static int __get_visible_inodes(struct btree_trans *trans,
727                                 struct inode_walker *w,
728                                 struct snapshots_seen *s,
729                                 u64 inum)
730 {
731         struct bch_fs *c = trans->c;
732         struct btree_iter iter;
733         struct bkey_s_c k;
734         int ret;
735
736         w->inodes.nr = 0;
737
738         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
739                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
740                 u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
741
742                 if (k.k->p.offset != inum)
743                         break;
744
745                 if (!ref_visible(c, s, s->pos.snapshot, equiv))
746                         continue;
747
748                 if (bkey_is_inode(k.k))
749                         add_inode(c, w, k);
750
751                 if (equiv >= s->pos.snapshot)
752                         break;
753         }
754         bch2_trans_iter_exit(trans, &iter);
755
756         return ret;
757 }
758
759 static int check_key_has_snapshot(struct btree_trans *trans,
760                                   struct btree_iter *iter,
761                                   struct bkey_s_c k)
762 {
763         struct bch_fs *c = trans->c;
764         struct printbuf buf = PRINTBUF;
765         int ret = 0;
766
767         if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
768                         "key in missing snapshot: %s",
769                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
770                 ret = bch2_btree_delete_at(trans, iter,
771                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
772 fsck_err:
773         printbuf_exit(&buf);
774         return ret;
775 }
776
777 static int hash_redo_key(struct btree_trans *trans,
778                          const struct bch_hash_desc desc,
779                          struct bch_hash_info *hash_info,
780                          struct btree_iter *k_iter, struct bkey_s_c k)
781 {
782         struct bkey_i *delete;
783         struct bkey_i *tmp;
784
785         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
786         if (IS_ERR(delete))
787                 return PTR_ERR(delete);
788
789         tmp = bch2_bkey_make_mut(trans, k);
790         if (IS_ERR(tmp))
791                 return PTR_ERR(tmp);
792
793         bkey_init(&delete->k);
794         delete->k.p = k_iter->pos;
795         return  bch2_btree_iter_traverse(k_iter) ?:
796                 bch2_trans_update(trans, k_iter, delete, 0) ?:
797                 bch2_hash_set_snapshot(trans, desc, hash_info,
798                                        (subvol_inum) { 0, k.k->p.inode },
799                                        k.k->p.snapshot, tmp,
800                                        BCH_HASH_SET_MUST_CREATE,
801                                        BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
802                 bch2_trans_commit(trans, NULL, NULL,
803                                   BTREE_INSERT_NOFAIL|
804                                   BTREE_INSERT_LAZY_RW);
805 }
806
807 static int hash_check_key(struct btree_trans *trans,
808                           const struct bch_hash_desc desc,
809                           struct bch_hash_info *hash_info,
810                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
811 {
812         struct bch_fs *c = trans->c;
813         struct btree_iter iter = { NULL };
814         struct printbuf buf = PRINTBUF;
815         struct bkey_s_c k;
816         u64 hash;
817         int ret = 0;
818
819         if (hash_k.k->type != desc.key_type)
820                 return 0;
821
822         hash = desc.hash_bkey(hash_info, hash_k);
823
824         if (likely(hash == hash_k.k->p.offset))
825                 return 0;
826
827         if (hash_k.k->p.offset < hash)
828                 goto bad_hash;
829
830         for_each_btree_key_norestart(trans, iter, desc.btree_id,
831                                      SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
832                                      BTREE_ITER_SLOTS, k, ret) {
833                 if (bkey_eq(k.k->p, hash_k.k->p))
834                         break;
835
836                 if (fsck_err_on(k.k->type == desc.key_type &&
837                                 !desc.cmp_bkey(k, hash_k), c,
838                                 "duplicate hash table keys:\n%s",
839                                 (printbuf_reset(&buf),
840                                  bch2_bkey_val_to_text(&buf, c, hash_k),
841                                  buf.buf))) {
842                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
843                         break;
844                 }
845
846                 if (bkey_deleted(k.k)) {
847                         bch2_trans_iter_exit(trans, &iter);
848                         goto bad_hash;
849                 }
850         }
851 out:
852         bch2_trans_iter_exit(trans, &iter);
853         printbuf_exit(&buf);
854         return ret;
855 bad_hash:
856         if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
857                      bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
858                      (printbuf_reset(&buf),
859                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
860                 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
861                 if (ret) {
862                         bch_err(c, "hash_redo_key err %s", bch2_err_str(ret));
863                         return ret;
864                 }
865                 ret = -BCH_ERR_transaction_restart_nested;
866         }
867 fsck_err:
868         goto out;
869 }
870
871 static int check_inode(struct btree_trans *trans,
872                        struct btree_iter *iter,
873                        struct bkey_s_c k,
874                        struct bch_inode_unpacked *prev,
875                        struct snapshots_seen *s,
876                        bool full)
877 {
878         struct bch_fs *c = trans->c;
879         struct bch_inode_unpacked u;
880         bool do_update = false;
881         int ret;
882
883         ret = check_key_has_snapshot(trans, iter, k);
884         if (ret < 0)
885                 goto err;
886         if (ret)
887                 return 0;
888
889         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
890         if (ret)
891                 goto err;
892
893         /*
894          * if snapshot id isn't a leaf node, skip it - deletion in
895          * particular is not atomic, so on the internal snapshot nodes
896          * we can see inodes marked for deletion after a clean shutdown
897          */
898         if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
899                 return 0;
900
901         if (!bkey_is_inode(k.k))
902                 return 0;
903
904         BUG_ON(bch2_inode_unpack(k, &u));
905
906         if (!full &&
907             !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
908                             BCH_INODE_I_SECTORS_DIRTY|
909                             BCH_INODE_UNLINKED)))
910                 return 0;
911
912         if (prev->bi_inum != u.bi_inum)
913                 *prev = u;
914
915         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
916                         inode_d_type(prev)      != inode_d_type(&u), c,
917                         "inodes in different snapshots don't match")) {
918                 bch_err(c, "repair not implemented yet");
919                 return -EINVAL;
920         }
921
922         if (u.bi_flags & BCH_INODE_UNLINKED &&
923             (!c->sb.clean ||
924              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
925                       u.bi_inum))) {
926                 bch2_trans_unlock(trans);
927                 bch2_fs_lazy_rw(c);
928
929                 ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
930                 if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
931                         bch_err(c, "error in fsck: error while deleting inode: %s",
932                                 bch2_err_str(ret));
933                 return ret;
934         }
935
936         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
937             (!c->sb.clean ||
938              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
939                       u.bi_inum))) {
940                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
941
942                 bch2_trans_unlock(trans);
943                 bch2_fs_lazy_rw(c);
944
945                 /*
946                  * XXX: need to truncate partial blocks too here - or ideally
947                  * just switch units to bytes and that issue goes away
948                  */
949                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
950                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
951                                      iter->pos.snapshot),
952                                 POS(u.bi_inum, U64_MAX),
953                                 0, NULL);
954                 if (ret) {
955                         bch_err(c, "error in fsck: error truncating inode: %s",
956                                 bch2_err_str(ret));
957                         return ret;
958                 }
959
960                 /*
961                  * We truncated without our normal sector accounting hook, just
962                  * make sure we recalculate it:
963                  */
964                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
965
966                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
967                 do_update = true;
968         }
969
970         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
971             (!c->sb.clean ||
972              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
973                       u.bi_inum))) {
974                 s64 sectors;
975
976                 bch_verbose(c, "recounting sectors for inode %llu",
977                             u.bi_inum);
978
979                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
980                 if (sectors < 0) {
981                         bch_err(c, "error in fsck: error recounting inode sectors: %s",
982                                 bch2_err_str(sectors));
983                         return sectors;
984                 }
985
986                 u.bi_sectors = sectors;
987                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
988                 do_update = true;
989         }
990
991         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
992                 u.bi_dir = 0;
993                 u.bi_dir_offset = 0;
994                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
995                 do_update = true;
996         }
997
998         if (do_update) {
999                 ret = __write_inode(trans, &u, iter->pos.snapshot);
1000                 if (ret)
1001                         bch_err(c, "error in fsck: error updating inode: %s",
1002                                 bch2_err_str(ret));
1003         }
1004 err:
1005 fsck_err:
1006         if (ret)
1007                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1008         return ret;
1009 }
1010
1011 noinline_for_stack
1012 static int check_inodes(struct bch_fs *c, bool full)
1013 {
1014         struct btree_trans trans;
1015         struct btree_iter iter;
1016         struct bch_inode_unpacked prev = { 0 };
1017         struct snapshots_seen s;
1018         struct bkey_s_c k;
1019         int ret;
1020
1021         snapshots_seen_init(&s);
1022         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1023
1024         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
1025                         POS_MIN,
1026                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1027                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
1028                 check_inode(&trans, &iter, k, &prev, &s, full));
1029
1030         bch2_trans_exit(&trans);
1031         snapshots_seen_exit(&s);
1032         if (ret)
1033                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1034         return ret;
1035 }
1036
1037 /*
1038  * Checking for overlapping extents needs to be reimplemented
1039  */
1040 #if 0
1041 static int fix_overlapping_extent(struct btree_trans *trans,
1042                                        struct bkey_s_c k, struct bpos cut_at)
1043 {
1044         struct btree_iter iter;
1045         struct bkey_i *u;
1046         int ret;
1047
1048         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1049         ret = PTR_ERR_OR_ZERO(u);
1050         if (ret)
1051                 return ret;
1052
1053         bkey_reassemble(u, k);
1054         bch2_cut_front(cut_at, u);
1055
1056
1057         /*
1058          * We don't want to go through the extent_handle_overwrites path:
1059          *
1060          * XXX: this is going to screw up disk accounting, extent triggers
1061          * assume things about extent overwrites - we should be running the
1062          * triggers manually here
1063          */
1064         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
1065                              BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
1066
1067         BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
1068         ret   = bch2_btree_iter_traverse(&iter) ?:
1069                 bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
1070                 bch2_trans_commit(trans, NULL, NULL,
1071                                   BTREE_INSERT_NOFAIL|
1072                                   BTREE_INSERT_LAZY_RW);
1073         bch2_trans_iter_exit(trans, &iter);
1074         return ret;
1075 }
1076 #endif
1077
1078 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
1079                                                 struct btree_iter *iter,
1080                                                 struct bpos pos)
1081 {
1082         struct bkey_s_c k;
1083         int ret;
1084
1085         bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
1086         k = bch2_btree_iter_peek_slot(iter);
1087         ret = bkey_err(k);
1088         if (!ret && k.k->type != KEY_TYPE_dirent)
1089                 ret = -ENOENT;
1090         if (ret) {
1091                 bch2_trans_iter_exit(trans, iter);
1092                 return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
1093         }
1094
1095         return bkey_s_c_to_dirent(k);
1096 }
1097
1098 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
1099                                    struct bkey_s_c_dirent d)
1100 {
1101         return  inode->bi_dir           == d.k->p.inode &&
1102                 inode->bi_dir_offset    == d.k->p.offset;
1103 }
1104
1105 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
1106                                    struct bch_inode_unpacked *inode)
1107 {
1108         return d.v->d_type == DT_SUBVOL
1109                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
1110                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
1111 }
1112
1113 static int inode_backpointer_exists(struct btree_trans *trans,
1114                                     struct bch_inode_unpacked *inode,
1115                                     u32 snapshot)
1116 {
1117         struct btree_iter iter;
1118         struct bkey_s_c_dirent d;
1119         int ret;
1120
1121         d = dirent_get_by_pos(trans, &iter,
1122                         SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
1123         ret = bkey_err(d.s_c);
1124         if (ret)
1125                 return ret == -ENOENT ? 0 : ret;
1126
1127         ret = dirent_points_to_inode(d, inode);
1128         bch2_trans_iter_exit(trans, &iter);
1129         return ret;
1130 }
1131
1132 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1133 {
1134         struct bch_fs *c = trans->c;
1135         struct inode_walker_entry *i;
1136         u32 restart_count = trans->restart_count;
1137         int ret = 0;
1138         s64 count2;
1139
1140         darray_for_each(w->inodes, i) {
1141                 if (i->inode.bi_sectors == i->count)
1142                         continue;
1143
1144                 count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot);
1145
1146                 if (i->count != count2) {
1147                         bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
1148                                 i->count, count2);
1149                         i->count = count2;
1150                         if (i->inode.bi_sectors == i->count)
1151                                 continue;
1152                 }
1153
1154                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
1155                             "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1156                             w->cur_inum, i->snapshot,
1157                             i->inode.bi_sectors, i->count)) {
1158                         i->inode.bi_sectors = i->count;
1159                         ret = write_inode(trans, &i->inode, i->snapshot);
1160                         if (ret)
1161                                 break;
1162                 }
1163         }
1164 fsck_err:
1165         if (ret)
1166                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1167         if (!ret && trans_was_restarted(trans, restart_count))
1168                 ret = -BCH_ERR_transaction_restart_nested;
1169         return ret;
1170 }
1171
1172 struct extent_end {
1173         u32                     snapshot;
1174         u64                     offset;
1175         struct snapshots_seen   seen;
1176 };
1177
1178 typedef DARRAY(struct extent_end) extent_ends;
1179
1180 static int check_overlapping_extents(struct btree_trans *trans,
1181                               struct snapshots_seen *seen,
1182                               extent_ends *extent_ends,
1183                               struct bkey_s_c k,
1184                               struct btree_iter *iter)
1185 {
1186         struct bch_fs *c = trans->c;
1187         struct extent_end *i;
1188         struct printbuf buf = PRINTBUF;
1189         int ret = 0;
1190
1191         darray_for_each(*extent_ends, i) {
1192                 /* duplicate, due to transaction restart: */
1193                 if (i->offset   == k.k->p.offset &&
1194                     i->snapshot == k.k->p.snapshot)
1195                         continue;
1196
1197                 if (!ref_visible2(c,
1198                                   k.k->p.snapshot, seen,
1199                                   i->snapshot, &i->seen))
1200                         continue;
1201
1202                 if (fsck_err_on(i->offset > bkey_start_offset(k.k), c,
1203                                 "overlapping extents: extent in snapshot %u ends at %llu overlaps with\n%s",
1204                                 i->snapshot,
1205                                 i->offset,
1206                                 (printbuf_reset(&buf),
1207                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1208                         struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1209                         if ((ret = PTR_ERR_OR_ZERO(update)))
1210                                 goto err;
1211                         bkey_reassemble(update, k);
1212                         ret = bch2_trans_update_extent(trans, iter, update, 0);
1213                         if (!ret)
1214                                 goto err;
1215                 }
1216         }
1217 err:
1218 fsck_err:
1219         printbuf_exit(&buf);
1220         return ret;
1221 }
1222
1223 static int extent_ends_at(extent_ends *extent_ends,
1224                           struct snapshots_seen *seen,
1225                           struct bkey_s_c k)
1226 {
1227         struct extent_end *i, n = (struct extent_end) {
1228                 .snapshot       = k.k->p.snapshot,
1229                 .offset         = k.k->p.offset,
1230                 .seen           = *seen,
1231         };
1232
1233         n.seen.ids.data = kmemdup(seen->ids.data,
1234                               sizeof(seen->ids.data[0]) * seen->ids.size,
1235                               GFP_KERNEL);
1236         if (!n.seen.ids.data)
1237                 return -ENOMEM;
1238
1239         darray_for_each(*extent_ends, i) {
1240                 if (i->snapshot == k.k->p.snapshot) {
1241                         snapshots_seen_exit(&i->seen);
1242                         *i = n;
1243                         return 0;
1244                 }
1245
1246                 if (i->snapshot >= k.k->p.snapshot)
1247                         break;
1248         }
1249
1250         return darray_insert_item(extent_ends, i - extent_ends->data, n);
1251 }
1252
1253 static void extent_ends_reset(extent_ends *extent_ends)
1254 {
1255         struct extent_end *i;
1256
1257         darray_for_each(*extent_ends, i)
1258                 snapshots_seen_exit(&i->seen);
1259
1260         extent_ends->nr = 0;
1261 }
1262
1263 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1264                         struct bkey_s_c k,
1265                         struct inode_walker *inode,
1266                         struct snapshots_seen *s,
1267                         extent_ends *extent_ends)
1268 {
1269         struct bch_fs *c = trans->c;
1270         struct inode_walker_entry *i;
1271         struct printbuf buf = PRINTBUF;
1272         struct bpos equiv;
1273         int ret = 0;
1274
1275         ret = check_key_has_snapshot(trans, iter, k);
1276         if (ret) {
1277                 ret = ret < 0 ? ret : 0;
1278                 goto out;
1279         }
1280
1281         equiv = k.k->p;
1282         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1283
1284         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1285         if (ret)
1286                 goto err;
1287
1288         if (k.k->type == KEY_TYPE_whiteout)
1289                 goto out;
1290
1291         if (inode->cur_inum != k.k->p.inode) {
1292                 ret = check_i_sectors(trans, inode);
1293                 if (ret)
1294                         goto err;
1295
1296                 extent_ends_reset(extent_ends);
1297         }
1298
1299         BUG_ON(!iter->path->should_be_locked);
1300
1301         ret = check_overlapping_extents(trans, s, extent_ends, k, iter);
1302         if (ret)
1303                 goto err;
1304
1305         ret = extent_ends_at(extent_ends, s, k);
1306         if (ret)
1307                 goto err;
1308
1309         ret = __walk_inode(trans, inode, equiv);
1310         if (ret < 0)
1311                 goto err;
1312
1313         if (fsck_err_on(ret == INT_MAX, c,
1314                         "extent in missing inode:\n  %s",
1315                         (printbuf_reset(&buf),
1316                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1317                 ret = bch2_btree_delete_at(trans, iter,
1318                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1319                 goto out;
1320         }
1321
1322         if (ret == INT_MAX) {
1323                 ret = 0;
1324                 goto out;
1325         }
1326
1327         i = inode->inodes.data + ret;
1328         ret = 0;
1329
1330         if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
1331                         !S_ISLNK(i->inode.bi_mode), c,
1332                         "extent in non regular inode mode %o:\n  %s",
1333                         i->inode.bi_mode,
1334                         (printbuf_reset(&buf),
1335                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1336                 ret = bch2_btree_delete_at(trans, iter,
1337                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1338                 goto out;
1339         }
1340
1341         /*
1342          * Check inodes in reverse order, from oldest snapshots to newest, so
1343          * that we emit the fewest number of whiteouts necessary:
1344          */
1345         for (i = inode->inodes.data + inode->inodes.nr - 1;
1346              i >= inode->inodes.data;
1347              --i) {
1348                 if (i->snapshot > equiv.snapshot ||
1349                     !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
1350                         continue;
1351
1352                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
1353                                 k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1354                                 !bkey_extent_is_reservation(k), c,
1355                                 "extent type past end of inode %llu:%u, i_size %llu\n  %s",
1356                                 i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1357                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1358                         struct btree_iter iter2;
1359
1360                         bch2_trans_copy_iter(&iter2, iter);
1361                         bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1362                         ret =   bch2_btree_iter_traverse(&iter2) ?:
1363                                 bch2_btree_delete_at(trans, &iter2,
1364                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1365                         bch2_trans_iter_exit(trans, &iter2);
1366                         if (ret)
1367                                 goto err;
1368
1369                         if (i->snapshot != equiv.snapshot) {
1370                                 ret = snapshots_seen_add(c, s, i->snapshot);
1371                                 if (ret)
1372                                         goto err;
1373                         }
1374                 }
1375         }
1376
1377         if (bkey_extent_is_allocation(k.k))
1378                 for_each_visible_inode(c, s, inode, equiv.snapshot, i)
1379                         i->count += k.k->size;
1380 #if 0
1381         bch2_bkey_buf_reassemble(&prev, c, k);
1382 #endif
1383
1384 out:
1385 err:
1386 fsck_err:
1387         printbuf_exit(&buf);
1388
1389         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1390                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1391         return ret;
1392 }
1393
1394 /*
1395  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1396  * that i_size an i_sectors are consistent
1397  */
1398 noinline_for_stack
1399 static int check_extents(struct bch_fs *c)
1400 {
1401         struct inode_walker w = inode_walker_init();
1402         struct snapshots_seen s;
1403         struct btree_trans trans;
1404         struct btree_iter iter;
1405         struct bkey_s_c k;
1406         extent_ends extent_ends = { 0 };
1407         int ret = 0;
1408
1409         snapshots_seen_init(&s);
1410         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1411
1412         bch_verbose(c, "checking extents");
1413
1414         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
1415                         POS(BCACHEFS_ROOT_INO, 0),
1416                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1417                         NULL, NULL,
1418                         BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
1419                 check_extent(&trans, &iter, k, &w, &s, &extent_ends));
1420
1421         extent_ends_reset(&extent_ends);
1422         darray_exit(&extent_ends);
1423         inode_walker_exit(&w);
1424         bch2_trans_exit(&trans);
1425         snapshots_seen_exit(&s);
1426
1427         if (ret)
1428                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1429         return ret;
1430 }
1431
1432 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1433 {
1434         struct bch_fs *c = trans->c;
1435         struct inode_walker_entry *i;
1436         u32 restart_count = trans->restart_count;
1437         int ret = 0;
1438         s64 count2;
1439
1440         darray_for_each(w->inodes, i) {
1441                 if (i->inode.bi_nlink == i->count)
1442                         continue;
1443
1444                 count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
1445                 if (count2 < 0)
1446                         return count2;
1447
1448                 if (i->count != count2) {
1449                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1450                                 i->count, count2);
1451                         i->count = count2;
1452                         if (i->inode.bi_nlink == i->count)
1453                                 continue;
1454                 }
1455
1456                 if (fsck_err_on(i->inode.bi_nlink != i->count, c,
1457                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1458                                 w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
1459                         i->inode.bi_nlink = i->count;
1460                         ret = write_inode(trans, &i->inode, i->snapshot);
1461                         if (ret)
1462                                 break;
1463                 }
1464         }
1465 fsck_err:
1466         if (ret)
1467                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1468         if (!ret && trans_was_restarted(trans, restart_count))
1469                 ret = -BCH_ERR_transaction_restart_nested;
1470         return ret;
1471 }
1472
1473 static int check_dirent_target(struct btree_trans *trans,
1474                                struct btree_iter *iter,
1475                                struct bkey_s_c_dirent d,
1476                                struct bch_inode_unpacked *target,
1477                                u32 target_snapshot)
1478 {
1479         struct bch_fs *c = trans->c;
1480         struct bkey_i_dirent *n;
1481         bool backpointer_exists = true;
1482         struct printbuf buf = PRINTBUF;
1483         int ret = 0;
1484
1485         if (!target->bi_dir &&
1486             !target->bi_dir_offset) {
1487                 target->bi_dir          = d.k->p.inode;
1488                 target->bi_dir_offset   = d.k->p.offset;
1489
1490                 ret = __write_inode(trans, target, target_snapshot);
1491                 if (ret)
1492                         goto err;
1493         }
1494
1495         if (!inode_points_to_dirent(target, d)) {
1496                 ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
1497                 if (ret < 0)
1498                         goto err;
1499
1500                 backpointer_exists = ret;
1501                 ret = 0;
1502
1503                 if (fsck_err_on(S_ISDIR(target->bi_mode) &&
1504                                 backpointer_exists, c,
1505                                 "directory %llu with multiple links",
1506                                 target->bi_inum)) {
1507                         ret = __remove_dirent(trans, d.k->p);
1508                         goto out;
1509                 }
1510
1511                 if (fsck_err_on(backpointer_exists &&
1512                                 !target->bi_nlink, c,
1513                                 "inode %llu type %s has multiple links but i_nlink 0",
1514                                 target->bi_inum, bch2_d_types[d.v->d_type])) {
1515                         target->bi_nlink++;
1516                         target->bi_flags &= ~BCH_INODE_UNLINKED;
1517
1518                         ret = __write_inode(trans, target, target_snapshot);
1519                         if (ret)
1520                                 goto err;
1521                 }
1522
1523                 if (fsck_err_on(!backpointer_exists, c,
1524                                 "inode %llu:%u has wrong backpointer:\n"
1525                                 "got       %llu:%llu\n"
1526                                 "should be %llu:%llu",
1527                                 target->bi_inum, target_snapshot,
1528                                 target->bi_dir,
1529                                 target->bi_dir_offset,
1530                                 d.k->p.inode,
1531                                 d.k->p.offset)) {
1532                         target->bi_dir          = d.k->p.inode;
1533                         target->bi_dir_offset   = d.k->p.offset;
1534
1535                         ret = __write_inode(trans, target, target_snapshot);
1536                         if (ret)
1537                                 goto err;
1538                 }
1539         }
1540
1541         if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
1542                         "incorrect d_type: got %s, should be %s:\n%s",
1543                         bch2_d_type_str(d.v->d_type),
1544                         bch2_d_type_str(inode_d_type(target)),
1545                         (printbuf_reset(&buf),
1546                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1547                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1548                 ret = PTR_ERR_OR_ZERO(n);
1549                 if (ret)
1550                         goto err;
1551
1552                 bkey_reassemble(&n->k_i, d.s_c);
1553                 n->v.d_type = inode_d_type(target);
1554
1555                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1556                 if (ret)
1557                         goto err;
1558
1559                 d = dirent_i_to_s_c(n);
1560         }
1561
1562         if (d.v->d_type == DT_SUBVOL &&
1563             target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
1564             (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
1565              fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
1566                       le32_to_cpu(d.v->d_parent_subvol),
1567                       target->bi_parent_subvol))) {
1568                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1569                 ret = PTR_ERR_OR_ZERO(n);
1570                 if (ret)
1571                         goto err;
1572
1573                 bkey_reassemble(&n->k_i, d.s_c);
1574                 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1575
1576                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1577                 if (ret)
1578                         goto err;
1579
1580                 d = dirent_i_to_s_c(n);
1581         }
1582 out:
1583 err:
1584 fsck_err:
1585         printbuf_exit(&buf);
1586
1587         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1588                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1589         return ret;
1590 }
1591
1592 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1593                         struct bkey_s_c k,
1594                         struct bch_hash_info *hash_info,
1595                         struct inode_walker *dir,
1596                         struct inode_walker *target,
1597                         struct snapshots_seen *s)
1598 {
1599         struct bch_fs *c = trans->c;
1600         struct bkey_s_c_dirent d;
1601         struct inode_walker_entry *i;
1602         struct printbuf buf = PRINTBUF;
1603         struct bpos equiv;
1604         int ret = 0;
1605
1606         ret = check_key_has_snapshot(trans, iter, k);
1607         if (ret) {
1608                 ret = ret < 0 ? ret : 0;
1609                 goto out;
1610         }
1611
1612         equiv = k.k->p;
1613         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1614
1615         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1616         if (ret)
1617                 goto err;
1618
1619         if (k.k->type == KEY_TYPE_whiteout)
1620                 goto out;
1621
1622         if (dir->cur_inum != k.k->p.inode) {
1623                 ret = check_subdir_count(trans, dir);
1624                 if (ret)
1625                         goto err;
1626         }
1627
1628         BUG_ON(!iter->path->should_be_locked);
1629
1630         ret = __walk_inode(trans, dir, equiv);
1631         if (ret < 0)
1632                 goto err;
1633
1634         if (fsck_err_on(ret == INT_MAX, c,
1635                         "dirent in nonexisting directory:\n%s",
1636                         (printbuf_reset(&buf),
1637                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1638                 ret = bch2_btree_delete_at(trans, iter,
1639                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1640                 goto out;
1641         }
1642
1643         if (ret == INT_MAX) {
1644                 ret = 0;
1645                 goto out;
1646         }
1647
1648         i = dir->inodes.data + ret;
1649         ret = 0;
1650
1651         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
1652                         "dirent in non directory inode type %s:\n%s",
1653                         bch2_d_type_str(inode_d_type(&i->inode)),
1654                         (printbuf_reset(&buf),
1655                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1656                 ret = bch2_btree_delete_at(trans, iter, 0);
1657                 goto out;
1658         }
1659
1660         if (dir->first_this_inode)
1661                 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
1662
1663         ret = hash_check_key(trans, bch2_dirent_hash_desc,
1664                              hash_info, iter, k);
1665         if (ret < 0)
1666                 goto err;
1667         if (ret) {
1668                 /* dirent has been deleted */
1669                 ret = 0;
1670                 goto out;
1671         }
1672
1673         if (k.k->type != KEY_TYPE_dirent)
1674                 goto out;
1675
1676         d = bkey_s_c_to_dirent(k);
1677
1678         if (d.v->d_type == DT_SUBVOL) {
1679                 struct bch_inode_unpacked subvol_root;
1680                 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1681                 u32 target_snapshot;
1682                 u64 target_inum;
1683
1684                 ret = __subvol_lookup(trans, target_subvol,
1685                                       &target_snapshot, &target_inum);
1686                 if (ret && ret != -ENOENT)
1687                         goto err;
1688
1689                 if (fsck_err_on(ret, c,
1690                                 "dirent points to missing subvolume %llu",
1691                                 le64_to_cpu(d.v->d_child_subvol))) {
1692                         ret = __remove_dirent(trans, d.k->p);
1693                         goto err;
1694                 }
1695
1696                 ret = __lookup_inode(trans, target_inum,
1697                                    &subvol_root, &target_snapshot);
1698                 if (ret && ret != -ENOENT)
1699                         goto err;
1700
1701                 if (fsck_err_on(ret, c,
1702                                 "subvolume %u points to missing subvolume root %llu",
1703                                 target_subvol,
1704                                 target_inum)) {
1705                         bch_err(c, "repair not implemented yet");
1706                         ret = -EINVAL;
1707                         goto err;
1708                 }
1709
1710                 if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
1711                                 "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1712                                 target_inum,
1713                                 subvol_root.bi_subvol, target_subvol)) {
1714                         subvol_root.bi_subvol = target_subvol;
1715                         ret = __write_inode(trans, &subvol_root, target_snapshot);
1716                         if (ret)
1717                                 goto err;
1718                 }
1719
1720                 ret = check_dirent_target(trans, iter, d, &subvol_root,
1721                                           target_snapshot);
1722                 if (ret)
1723                         goto err;
1724         } else {
1725                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1726                 if (ret)
1727                         goto err;
1728
1729                 if (fsck_err_on(!target->inodes.nr, c,
1730                                 "dirent points to missing inode: (equiv %u)\n%s",
1731                                 equiv.snapshot,
1732                                 (printbuf_reset(&buf),
1733                                  bch2_bkey_val_to_text(&buf, c, k),
1734                                  buf.buf))) {
1735                         ret = __remove_dirent(trans, d.k->p);
1736                         if (ret)
1737                                 goto err;
1738                 }
1739
1740                 darray_for_each(target->inodes, i) {
1741                         ret = check_dirent_target(trans, iter, d,
1742                                                   &i->inode, i->snapshot);
1743                         if (ret)
1744                                 goto err;
1745                 }
1746         }
1747
1748         if (d.v->d_type == DT_DIR)
1749                 for_each_visible_inode(c, s, dir, equiv.snapshot, i)
1750                         i->count++;
1751
1752 out:
1753 err:
1754 fsck_err:
1755         printbuf_exit(&buf);
1756
1757         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1758                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1759         return ret;
1760 }
1761
1762 /*
1763  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1764  * validate d_type
1765  */
1766 noinline_for_stack
1767 static int check_dirents(struct bch_fs *c)
1768 {
1769         struct inode_walker dir = inode_walker_init();
1770         struct inode_walker target = inode_walker_init();
1771         struct snapshots_seen s;
1772         struct bch_hash_info hash_info;
1773         struct btree_trans trans;
1774         struct btree_iter iter;
1775         struct bkey_s_c k;
1776         int ret = 0;
1777
1778         bch_verbose(c, "checking dirents");
1779
1780         snapshots_seen_init(&s);
1781         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1782
1783         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents,
1784                         POS(BCACHEFS_ROOT_INO, 0),
1785                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1786                         k,
1787                         NULL, NULL,
1788                         BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
1789                 check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s));
1790
1791         bch2_trans_exit(&trans);
1792         snapshots_seen_exit(&s);
1793         inode_walker_exit(&dir);
1794         inode_walker_exit(&target);
1795
1796         if (ret)
1797                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1798         return ret;
1799 }
1800
1801 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
1802                        struct bkey_s_c k,
1803                        struct bch_hash_info *hash_info,
1804                        struct inode_walker *inode)
1805 {
1806         struct bch_fs *c = trans->c;
1807         int ret;
1808
1809         ret = check_key_has_snapshot(trans, iter, k);
1810         if (ret)
1811                 return ret;
1812
1813         ret = __walk_inode(trans, inode, k.k->p);
1814         if (ret < 0)
1815                 return ret;
1816
1817         if (fsck_err_on(ret == INT_MAX, c,
1818                         "xattr for missing inode %llu",
1819                         k.k->p.inode))
1820                 return bch2_btree_delete_at(trans, iter, 0);
1821
1822         if (ret == INT_MAX)
1823                 return 0;
1824
1825         ret = 0;
1826
1827         if (inode->first_this_inode)
1828                 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
1829
1830         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
1831 fsck_err:
1832         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
1833                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1834         return ret;
1835 }
1836
1837 /*
1838  * Walk xattrs: verify that they all have a corresponding inode
1839  */
1840 noinline_for_stack
1841 static int check_xattrs(struct bch_fs *c)
1842 {
1843         struct inode_walker inode = inode_walker_init();
1844         struct bch_hash_info hash_info;
1845         struct btree_trans trans;
1846         struct btree_iter iter;
1847         struct bkey_s_c k;
1848         int ret = 0;
1849
1850         bch_verbose(c, "checking xattrs");
1851
1852         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1853
1854         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
1855                         POS(BCACHEFS_ROOT_INO, 0),
1856                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1857                         k,
1858                         NULL, NULL,
1859                         BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
1860                 check_xattr(&trans, &iter, k, &hash_info, &inode));
1861
1862         bch2_trans_exit(&trans);
1863
1864         if (ret)
1865                 bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret));
1866         return ret;
1867 }
1868
1869 static int check_root_trans(struct btree_trans *trans)
1870 {
1871         struct bch_fs *c = trans->c;
1872         struct bch_inode_unpacked root_inode;
1873         u32 snapshot;
1874         u64 inum;
1875         int ret;
1876
1877         ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1878         if (ret && ret != -ENOENT)
1879                 return ret;
1880
1881         if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
1882                 struct bkey_i_subvolume root_subvol;
1883
1884                 snapshot        = U32_MAX;
1885                 inum            = BCACHEFS_ROOT_INO;
1886
1887                 bkey_subvolume_init(&root_subvol.k_i);
1888                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1889                 root_subvol.v.flags     = 0;
1890                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1891                 root_subvol.v.inode     = cpu_to_le64(inum);
1892                 ret = commit_do(trans, NULL, NULL,
1893                                       BTREE_INSERT_NOFAIL|
1894                                       BTREE_INSERT_LAZY_RW,
1895                         __bch2_btree_insert(trans, BTREE_ID_subvolumes,
1896                                             &root_subvol.k_i, 0));
1897                 if (ret) {
1898                         bch_err(c, "error writing root subvol: %s", bch2_err_str(ret));
1899                         goto err;
1900                 }
1901
1902         }
1903
1904         ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1905         if (ret && ret != -ENOENT)
1906                 return ret;
1907
1908         if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
1909             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
1910                                 "root inode not a directory")) {
1911                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1912                                 0, NULL);
1913                 root_inode.bi_inum = inum;
1914
1915                 ret = __write_inode(trans, &root_inode, snapshot);
1916                 if (ret)
1917                         bch_err(c, "error writing root inode: %s", bch2_err_str(ret));
1918         }
1919 err:
1920 fsck_err:
1921         return ret;
1922 }
1923
1924 /* Get root directory, create if it doesn't exist: */
1925 noinline_for_stack
1926 static int check_root(struct bch_fs *c)
1927 {
1928         bch_verbose(c, "checking root directory");
1929
1930         return bch2_trans_do(c, NULL, NULL,
1931                              BTREE_INSERT_NOFAIL|
1932                              BTREE_INSERT_LAZY_RW,
1933                 check_root_trans(&trans));
1934 }
1935
1936 struct pathbuf_entry {
1937         u64     inum;
1938         u32     snapshot;
1939 };
1940
1941 typedef DARRAY(struct pathbuf_entry) pathbuf;
1942
1943 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
1944 {
1945         struct pathbuf_entry *i;
1946
1947         darray_for_each(*p, i)
1948                 if (i->inum     == inum &&
1949                     i->snapshot == snapshot)
1950                         return true;
1951
1952         return false;
1953 }
1954
1955 static int path_down(struct bch_fs *c, pathbuf *p,
1956                      u64 inum, u32 snapshot)
1957 {
1958         int ret = darray_push(p, ((struct pathbuf_entry) {
1959                 .inum           = inum,
1960                 .snapshot       = snapshot,
1961         }));
1962
1963         if (ret)
1964                 bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
1965                         p->size);
1966         return ret;
1967 }
1968
1969 /*
1970  * Check that a given inode is reachable from the root:
1971  *
1972  * XXX: we should also be verifying that inodes are in the right subvolumes
1973  */
1974 static int check_path(struct btree_trans *trans,
1975                       pathbuf *p,
1976                       struct bch_inode_unpacked *inode,
1977                       u32 snapshot)
1978 {
1979         struct bch_fs *c = trans->c;
1980         int ret = 0;
1981
1982         snapshot = bch2_snapshot_equiv(c, snapshot);
1983         p->nr = 0;
1984
1985         while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
1986                  inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
1987                 struct btree_iter dirent_iter;
1988                 struct bkey_s_c_dirent d;
1989                 u32 parent_snapshot = snapshot;
1990
1991                 if (inode->bi_subvol) {
1992                         u64 inum;
1993
1994                         ret = subvol_lookup(trans, inode->bi_parent_subvol,
1995                                             &parent_snapshot, &inum);
1996                         if (ret)
1997                                 break;
1998                 }
1999
2000                 ret = lockrestart_do(trans,
2001                         PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
2002                                           SPOS(inode->bi_dir, inode->bi_dir_offset,
2003                                                parent_snapshot))).k));
2004                 if (ret && ret != -ENOENT)
2005                         break;
2006
2007                 if (!ret && !dirent_points_to_inode(d, inode)) {
2008                         bch2_trans_iter_exit(trans, &dirent_iter);
2009                         ret = -ENOENT;
2010                 }
2011
2012                 if (ret == -ENOENT) {
2013                         if (fsck_err(c,  "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
2014                                      inode->bi_inum, snapshot,
2015                                      bch2_d_type_str(inode_d_type(inode)),
2016                                      inode->bi_nlink,
2017                                      inode->bi_dir,
2018                                      inode->bi_dir_offset))
2019                                 ret = reattach_inode(trans, inode, snapshot);
2020                         break;
2021                 }
2022
2023                 bch2_trans_iter_exit(trans, &dirent_iter);
2024
2025                 if (!S_ISDIR(inode->bi_mode))
2026                         break;
2027
2028                 ret = path_down(c, p, inode->bi_inum, snapshot);
2029                 if (ret) {
2030                         bch_err(c, "memory allocation failure");
2031                         return ret;
2032                 }
2033
2034                 snapshot = parent_snapshot;
2035
2036                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
2037                 if (ret) {
2038                         /* Should have been caught in dirents pass */
2039                         bch_err(c, "error looking up parent directory: %i", ret);
2040                         break;
2041                 }
2042
2043                 if (path_is_dup(p, inode->bi_inum, snapshot)) {
2044                         struct pathbuf_entry *i;
2045
2046                         /* XXX print path */
2047                         bch_err(c, "directory structure loop");
2048
2049                         darray_for_each(*p, i)
2050                                 pr_err("%llu:%u", i->inum, i->snapshot);
2051                         pr_err("%llu:%u", inode->bi_inum, snapshot);
2052
2053                         if (!fsck_err(c, "directory structure loop"))
2054                                 return 0;
2055
2056                         ret = commit_do(trans, NULL, NULL,
2057                                               BTREE_INSERT_NOFAIL|
2058                                               BTREE_INSERT_LAZY_RW,
2059                                         remove_backpointer(trans, inode));
2060                         if (ret) {
2061                                 bch_err(c, "error removing dirent: %i", ret);
2062                                 break;
2063                         }
2064
2065                         ret = reattach_inode(trans, inode, snapshot);
2066                 }
2067         }
2068 fsck_err:
2069         if (ret)
2070                 bch_err(c, "%s: err %s", __func__, bch2_err_str(ret));
2071         return ret;
2072 }
2073
2074 /*
2075  * Check for unreachable inodes, as well as loops in the directory structure:
2076  * After check_dirents(), if an inode backpointer doesn't exist that means it's
2077  * unreachable:
2078  */
2079 noinline_for_stack
2080 static int check_directory_structure(struct bch_fs *c)
2081 {
2082         struct btree_trans trans;
2083         struct btree_iter iter;
2084         struct bkey_s_c k;
2085         struct bch_inode_unpacked u;
2086         pathbuf path = { 0, };
2087         int ret;
2088
2089         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2090
2091         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
2092                            BTREE_ITER_INTENT|
2093                            BTREE_ITER_PREFETCH|
2094                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2095                 if (!bkey_is_inode(k.k))
2096                         continue;
2097
2098                 ret = bch2_inode_unpack(k, &u);
2099                 if (ret) {
2100                         /* Should have been caught earlier in fsck: */
2101                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
2102                         break;
2103                 }
2104
2105                 if (u.bi_flags & BCH_INODE_UNLINKED)
2106                         continue;
2107
2108                 ret = check_path(&trans, &path, &u, iter.pos.snapshot);
2109                 if (ret)
2110                         break;
2111         }
2112         bch2_trans_iter_exit(&trans, &iter);
2113
2114         darray_exit(&path);
2115
2116         bch2_trans_exit(&trans);
2117         return ret;
2118 }
2119
2120 struct nlink_table {
2121         size_t          nr;
2122         size_t          size;
2123
2124         struct nlink {
2125                 u64     inum;
2126                 u32     snapshot;
2127                 u32     count;
2128         }               *d;
2129 };
2130
2131 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2132                      u64 inum, u32 snapshot)
2133 {
2134         if (t->nr == t->size) {
2135                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2136                 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2137
2138                 if (!d) {
2139                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2140                                 new_size);
2141                         return -ENOMEM;
2142                 }
2143
2144                 if (t->d)
2145                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2146                 kvfree(t->d);
2147
2148                 t->d = d;
2149                 t->size = new_size;
2150         }
2151
2152
2153         t->d[t->nr++] = (struct nlink) {
2154                 .inum           = inum,
2155                 .snapshot       = snapshot,
2156         };
2157
2158         return 0;
2159 }
2160
2161 static int nlink_cmp(const void *_l, const void *_r)
2162 {
2163         const struct nlink *l = _l;
2164         const struct nlink *r = _r;
2165
2166         return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
2167 }
2168
2169 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2170                      struct nlink_table *links,
2171                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2172 {
2173         struct nlink *link, key = {
2174                 .inum = inum, .snapshot = U32_MAX,
2175         };
2176
2177         if (inum < range_start || inum >= range_end)
2178                 return;
2179
2180         link = __inline_bsearch(&key, links->d, links->nr,
2181                                 sizeof(links->d[0]), nlink_cmp);
2182         if (!link)
2183                 return;
2184
2185         while (link > links->d && link[0].inum == link[-1].inum)
2186                 --link;
2187
2188         for (; link < links->d + links->nr && link->inum == inum; link++)
2189                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2190                         link->count++;
2191                         if (link->snapshot >= snapshot)
2192                                 break;
2193                 }
2194 }
2195
2196 noinline_for_stack
2197 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2198                                        struct nlink_table *t,
2199                                        u64 start, u64 *end)
2200 {
2201         struct btree_trans trans;
2202         struct btree_iter iter;
2203         struct bkey_s_c k;
2204         struct bch_inode_unpacked u;
2205         int ret = 0;
2206
2207         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2208
2209         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2210                            POS(0, start),
2211                            BTREE_ITER_INTENT|
2212                            BTREE_ITER_PREFETCH|
2213                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2214                 if (!bkey_is_inode(k.k))
2215                         continue;
2216
2217                 /* Should never fail, checked by bch2_inode_invalid: */
2218                 BUG_ON(bch2_inode_unpack(k, &u));
2219
2220                 /*
2221                  * Backpointer and directory structure checks are sufficient for
2222                  * directories, since they can't have hardlinks:
2223                  */
2224                 if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2225                         continue;
2226
2227                 if (!u.bi_nlink)
2228                         continue;
2229
2230                 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2231                 if (ret) {
2232                         *end = k.k->p.offset;
2233                         ret = 0;
2234                         break;
2235                 }
2236
2237         }
2238         bch2_trans_iter_exit(&trans, &iter);
2239         bch2_trans_exit(&trans);
2240
2241         if (ret)
2242                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2243
2244         return ret;
2245 }
2246
2247 noinline_for_stack
2248 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2249                                      u64 range_start, u64 range_end)
2250 {
2251         struct btree_trans trans;
2252         struct snapshots_seen s;
2253         struct btree_iter iter;
2254         struct bkey_s_c k;
2255         struct bkey_s_c_dirent d;
2256         int ret;
2257
2258         snapshots_seen_init(&s);
2259
2260         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2261
2262         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
2263                            BTREE_ITER_INTENT|
2264                            BTREE_ITER_PREFETCH|
2265                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2266                 ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2267                 if (ret)
2268                         break;
2269
2270                 switch (k.k->type) {
2271                 case KEY_TYPE_dirent:
2272                         d = bkey_s_c_to_dirent(k);
2273
2274                         if (d.v->d_type != DT_DIR &&
2275                             d.v->d_type != DT_SUBVOL)
2276                                 inc_link(c, &s, links, range_start, range_end,
2277                                          le64_to_cpu(d.v->d_inum),
2278                                          bch2_snapshot_equiv(c, d.k->p.snapshot));
2279                         break;
2280                 }
2281         }
2282         bch2_trans_iter_exit(&trans, &iter);
2283
2284         if (ret)
2285                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
2286
2287         bch2_trans_exit(&trans);
2288         snapshots_seen_exit(&s);
2289         return ret;
2290 }
2291
2292 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2293                                      struct bkey_s_c k,
2294                                      struct nlink_table *links,
2295                                      size_t *idx, u64 range_end)
2296 {
2297         struct bch_fs *c = trans->c;
2298         struct bch_inode_unpacked u;
2299         struct nlink *link = &links->d[*idx];
2300         int ret = 0;
2301
2302         if (k.k->p.offset >= range_end)
2303                 return 1;
2304
2305         if (!bkey_is_inode(k.k))
2306                 return 0;
2307
2308         BUG_ON(bch2_inode_unpack(k, &u));
2309
2310         if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2311                 return 0;
2312
2313         if (!u.bi_nlink)
2314                 return 0;
2315
2316         while ((cmp_int(link->inum, k.k->p.offset) ?:
2317                 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2318                 BUG_ON(*idx == links->nr);
2319                 link = &links->d[++*idx];
2320         }
2321
2322         if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
2323                         "inode %llu type %s has wrong i_nlink (%u, should be %u)",
2324                         u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2325                         bch2_inode_nlink_get(&u), link->count)) {
2326                 bch2_inode_nlink_set(&u, link->count);
2327                 ret = __write_inode(trans, &u, k.k->p.snapshot);
2328         }
2329 fsck_err:
2330         return ret;
2331 }
2332
2333 noinline_for_stack
2334 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2335                                struct nlink_table *links,
2336                                u64 range_start, u64 range_end)
2337 {
2338         struct btree_trans trans;
2339         struct btree_iter iter;
2340         struct bkey_s_c k;
2341         size_t idx = 0;
2342         int ret = 0;
2343
2344         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2345
2346         ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
2347                         POS(0, range_start),
2348                         BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
2349                         NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
2350                 check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end));
2351
2352         bch2_trans_exit(&trans);
2353
2354         if (ret < 0) {
2355                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2356                 return ret;
2357         }
2358
2359         return 0;
2360 }
2361
2362 noinline_for_stack
2363 static int check_nlinks(struct bch_fs *c)
2364 {
2365         struct nlink_table links = { 0 };
2366         u64 this_iter_range_start, next_iter_range_start = 0;
2367         int ret = 0;
2368
2369         bch_verbose(c, "checking inode nlinks");
2370
2371         do {
2372                 this_iter_range_start = next_iter_range_start;
2373                 next_iter_range_start = U64_MAX;
2374
2375                 ret = check_nlinks_find_hardlinks(c, &links,
2376                                                   this_iter_range_start,
2377                                                   &next_iter_range_start);
2378
2379                 ret = check_nlinks_walk_dirents(c, &links,
2380                                           this_iter_range_start,
2381                                           next_iter_range_start);
2382                 if (ret)
2383                         break;
2384
2385                 ret = check_nlinks_update_hardlinks(c, &links,
2386                                          this_iter_range_start,
2387                                          next_iter_range_start);
2388                 if (ret)
2389                         break;
2390
2391                 links.nr = 0;
2392         } while (next_iter_range_start != U64_MAX);
2393
2394         kvfree(links.d);
2395
2396         return ret;
2397 }
2398
2399 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2400                              struct bkey_s_c k)
2401 {
2402         struct bkey_s_c_reflink_p p;
2403         struct bkey_i_reflink_p *u;
2404         int ret;
2405
2406         if (k.k->type != KEY_TYPE_reflink_p)
2407                 return 0;
2408
2409         p = bkey_s_c_to_reflink_p(k);
2410
2411         if (!p.v->front_pad && !p.v->back_pad)
2412                 return 0;
2413
2414         u = bch2_trans_kmalloc(trans, sizeof(*u));
2415         ret = PTR_ERR_OR_ZERO(u);
2416         if (ret)
2417                 return ret;
2418
2419         bkey_reassemble(&u->k_i, k);
2420         u->v.front_pad  = 0;
2421         u->v.back_pad   = 0;
2422
2423         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2424 }
2425
2426 noinline_for_stack
2427 static int fix_reflink_p(struct bch_fs *c)
2428 {
2429         struct btree_trans trans;
2430         struct btree_iter iter;
2431         struct bkey_s_c k;
2432         int ret;
2433
2434         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2435                 return 0;
2436
2437         bch_verbose(c, "fixing reflink_p keys");
2438
2439         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2440
2441         ret = for_each_btree_key_commit(&trans, iter,
2442                         BTREE_ID_extents, POS_MIN,
2443                         BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
2444                         NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
2445                 fix_reflink_p_key(&trans, &iter, k));
2446
2447         bch2_trans_exit(&trans);
2448         return ret;
2449 }
2450
2451 /*
2452  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
2453  * Doesn't fix them yet, mainly because they haven't yet been observed:
2454  */
2455 int bch2_fsck_full(struct bch_fs *c)
2456 {
2457         int ret;
2458 again:
2459         ret =   bch2_fs_check_snapshots(c) ?:
2460                 bch2_fs_check_subvols(c) ?:
2461                 bch2_delete_dead_snapshots(c) ?:
2462                 check_inodes(c, true) ?:
2463                 check_extents(c) ?:
2464                 check_dirents(c) ?:
2465                 check_xattrs(c) ?:
2466                 check_root(c) ?:
2467                 check_directory_structure(c) ?:
2468                 check_nlinks(c) ?:
2469                 fix_reflink_p(c);
2470
2471         if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) {
2472                 set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
2473                 goto again;
2474         }
2475
2476         return ret;
2477 }
2478
2479 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
2480 {
2481         return  bch2_fs_check_snapshots(c) ?:
2482                 bch2_fs_check_subvols(c) ?:
2483                 bch2_delete_dead_snapshots(c) ?:
2484                 check_inodes(c, false);
2485 }