]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
Update bcachefs sources to 386f00b639 bcachefs: Snapshot creation, deletion
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "dirent.h"
7 #include "error.h"
8 #include "fs-common.h"
9 #include "fsck.h"
10 #include "inode.h"
11 #include "keylist.h"
12 #include "subvolume.h"
13 #include "super.h"
14 #include "xattr.h"
15
16 #include <linux/bsearch.h>
17 #include <linux/dcache.h> /* struct qstr */
18
19 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
20
21 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
22                                     u32 snapshot)
23 {
24         struct btree_iter iter;
25         struct bkey_s_c k;
26         u64 sectors = 0;
27         int ret;
28
29         for_each_btree_key(trans, iter, BTREE_ID_extents,
30                            SPOS(inum, 0, snapshot), 0, k, ret) {
31                 if (k.k->p.inode != inum)
32                         break;
33
34                 if (bkey_extent_is_allocation(k.k))
35                         sectors += k.k->size;
36         }
37
38         bch2_trans_iter_exit(trans, &iter);
39
40         return ret ?: sectors;
41 }
42
43 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
44                                     u32 snapshot)
45 {
46         struct btree_iter iter;
47         struct bkey_s_c k;
48         struct bkey_s_c_dirent d;
49         u64 subdirs = 0;
50         int ret;
51
52         for_each_btree_key(trans, iter, BTREE_ID_dirents,
53                            SPOS(inum, 0, snapshot), 0, k, ret) {
54                 if (k.k->p.inode != inum)
55                         break;
56
57                 if (k.k->type != KEY_TYPE_dirent)
58                         continue;
59
60                 d = bkey_s_c_to_dirent(k);
61                 if (d.v->d_type == DT_DIR)
62                         subdirs++;
63         }
64
65         bch2_trans_iter_exit(trans, &iter);
66
67         return ret ?: subdirs;
68 }
69
70 static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
71                                     u32 *subvol)
72 {
73         struct btree_iter iter;
74         struct bkey_s_c k;
75         int ret;
76
77         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
78                              POS(0, snapshot), 0);
79         k = bch2_btree_iter_peek_slot(&iter);
80         ret = bkey_err(k);
81         if (ret)
82                 goto err;
83
84         if (k.k->type != KEY_TYPE_snapshot) {
85                 bch_err(trans->c, "snapshot %u not fonud", snapshot);
86                 ret = -ENOENT;
87                 goto err;
88         }
89
90         *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
91 err:
92         bch2_trans_iter_exit(trans, &iter);
93         return ret;
94
95 }
96
97 static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
98                                   u32 *subvol)
99 {
100         return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol));
101 }
102
103 static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
104                            u32 *snapshot, u64 *inum)
105 {
106         struct btree_iter iter;
107         struct bkey_s_c k;
108         int ret;
109
110         bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
111                              POS(0, subvol), 0);
112         k = bch2_btree_iter_peek_slot(&iter);
113         ret = bkey_err(k);
114         if (ret)
115                 goto err;
116
117         if (k.k->type != KEY_TYPE_subvolume) {
118                 bch_err(trans->c, "subvolume %u not fonud", subvol);
119                 ret = -ENOENT;
120                 goto err;
121         }
122
123         *snapshot = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
124         *inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode);
125 err:
126         bch2_trans_iter_exit(trans, &iter);
127         return ret;
128
129 }
130
131 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
132                          u32 *snapshot, u64 *inum)
133 {
134         return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
135 }
136
137 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
138                           struct bch_inode_unpacked *inode,
139                           u32 *snapshot)
140 {
141         struct btree_iter iter;
142         struct bkey_s_c k;
143         int ret;
144
145         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
146                              SPOS(0, inode_nr, *snapshot), 0);
147         k = bch2_btree_iter_peek_slot(&iter);
148         ret = bkey_err(k);
149         if (ret)
150                 goto err;
151
152         *snapshot = iter.pos.snapshot;
153         ret = k.k->type == KEY_TYPE_inode
154                 ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
155                 : -ENOENT;
156 err:
157         bch2_trans_iter_exit(trans, &iter);
158         return ret;
159 }
160
161 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
162                         struct bch_inode_unpacked *inode,
163                         u32 *snapshot)
164 {
165         return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
166 }
167
168 static int __lookup_dirent(struct btree_trans *trans,
169                            struct bch_hash_info hash_info,
170                            subvol_inum dir, struct qstr *name,
171                            u64 *target, unsigned *type)
172 {
173         struct btree_iter iter;
174         struct bkey_s_c_dirent d;
175         int ret;
176
177         ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
178                                &hash_info, dir, name, 0);
179         if (ret)
180                 return ret;
181
182         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
183         *target = le64_to_cpu(d.v->d_inum);
184         *type = d.v->d_type;
185         bch2_trans_iter_exit(trans, &iter);
186         return 0;
187 }
188
189 static int lookup_dirent(struct btree_trans *trans,
190                          struct bch_hash_info hash_info,
191                          subvol_inum dir, struct qstr *name,
192                          u64 *target, unsigned *type)
193 {
194         return lockrestart_do(trans,
195                 __lookup_dirent(trans, hash_info, dir, name, target, type));
196 }
197
198 static int __write_inode(struct btree_trans *trans,
199                          struct bch_inode_unpacked *inode,
200                          u32 snapshot)
201 {
202         struct btree_iter iter;
203         int ret;
204
205         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
206                             SPOS(0, inode->bi_inum, snapshot),
207                             BTREE_ITER_INTENT);
208
209         ret   = bch2_btree_iter_traverse(&iter) ?:
210                 bch2_inode_write(trans, &iter, inode);
211         bch2_trans_iter_exit(trans, &iter);
212         return ret;
213 }
214
215 static int write_inode(struct btree_trans *trans,
216                        struct bch_inode_unpacked *inode,
217                        u32 snapshot)
218 {
219         int ret = __bch2_trans_do(trans, NULL, NULL,
220                                   BTREE_INSERT_NOFAIL|
221                                   BTREE_INSERT_LAZY_RW,
222                                   __write_inode(trans, inode, snapshot));
223         if (ret)
224                 bch_err(trans->c, "error in fsck: error %i updating inode", ret);
225         return ret;
226 }
227
228 static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
229 {
230         struct btree_iter iter = { NULL };
231         struct bkey_i_inode_generation delete;
232         struct bch_inode_unpacked inode_u;
233         struct bkey_s_c k;
234         int ret;
235
236         ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
237                                               SPOS(inum, 0, snapshot),
238                                               SPOS(inum, U64_MAX, snapshot),
239                                               0, NULL) ?:
240                 bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
241                                               SPOS(inum, 0, snapshot),
242                                               SPOS(inum, U64_MAX, snapshot),
243                                               0, NULL) ?:
244                 bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
245                                               SPOS(inum, 0, snapshot),
246                                               SPOS(inum, U64_MAX, snapshot),
247                                               0, NULL);
248         if (ret)
249                 goto err;
250 retry:
251         bch2_trans_begin(trans);
252
253         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
254                              SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
255         k = bch2_btree_iter_peek_slot(&iter);
256
257         ret = bkey_err(k);
258         if (ret)
259                 goto err;
260
261         if (k.k->type != KEY_TYPE_inode) {
262                 bch2_fs_inconsistent(trans->c,
263                                      "inode %llu:%u not found when deleting",
264                                      inum, snapshot);
265                 ret = -EIO;
266                 goto err;
267         }
268
269         bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
270
271         /* Subvolume root? */
272         if (inode_u.bi_subvol) {
273                 ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1);
274                 if (ret)
275                         goto err;
276         }
277
278         bkey_inode_generation_init(&delete.k_i);
279         delete.k.p = iter.pos;
280         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
281
282         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
283                 bch2_trans_commit(trans, NULL, NULL,
284                                 BTREE_INSERT_NOFAIL);
285 err:
286         bch2_trans_iter_exit(trans, &iter);
287         if (ret == -EINTR)
288                 goto retry;
289
290         return ret;
291 }
292
293 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
294 {
295         struct bch_fs *c = trans->c;
296         struct btree_iter iter;
297         struct bch_inode_unpacked dir_inode;
298         struct bch_hash_info dir_hash_info;
299         int ret;
300
301         ret = lookup_inode(trans, pos.inode, &dir_inode, NULL);
302         if (ret)
303                 return ret;
304
305         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
306
307         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
308
309         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
310                                   &dir_hash_info, &iter, 0);
311         bch2_trans_iter_exit(trans, &iter);
312         return ret;
313 }
314
315 static int remove_dirent(struct btree_trans *trans, struct bpos pos)
316 {
317         int ret = __bch2_trans_do(trans, NULL, NULL,
318                                   BTREE_INSERT_NOFAIL|
319                                   BTREE_INSERT_LAZY_RW,
320                                   __remove_dirent(trans, pos));
321         if (ret)
322                 bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret);
323         return ret;
324 }
325
326 /* Get lost+found, create if it doesn't exist: */
327 static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
328                             struct bch_inode_unpacked *lostfound)
329 {
330         struct bch_fs *c = trans->c;
331         struct bch_inode_unpacked root;
332         struct bch_hash_info root_hash_info;
333         struct qstr lostfound_str = QSTR("lost+found");
334         subvol_inum root_inum = { .subvol = subvol };
335         u64 inum = 0;
336         unsigned d_type = 0;
337         u32 snapshot;
338         int ret;
339
340         ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
341         if (ret)
342                 return ret;
343
344         ret = lookup_inode(trans, root_inum.inum, &root, &snapshot);
345         if (ret) {
346                 bch_err(c, "error fetching subvol root: %i", ret);
347                 return ret;
348         }
349
350         root_hash_info = bch2_hash_info_init(c, &root);
351
352         ret = lookup_dirent(trans, root_hash_info, root_inum,
353                             &lostfound_str, &inum, &d_type);
354         if (ret == -ENOENT) {
355                 bch_notice(c, "creating lost+found");
356                 goto create_lostfound;
357         }
358
359         if (ret) {
360                 bch_err(c, "error looking up lost+found: %i", ret);
361                 return ret;
362         }
363
364         if (d_type != DT_DIR) {
365                 bch_err(c, "error looking up lost+found: not a directory");
366                 return ret;
367
368         }
369
370         ret = lookup_inode(trans, inum, lostfound, &snapshot);
371         if (ret && ret != -ENOENT) {
372                 /*
373                  * The check_dirents pass has already run, dangling dirents
374                  * shouldn't exist here:
375                  */
376                 bch_err(c, "error looking up lost+found: %i", ret);
377                 return ret;
378         }
379
380         if (ret == -ENOENT) {
381 create_lostfound:
382                 bch2_inode_init_early(c, lostfound);
383
384                 ret = __bch2_trans_do(trans, NULL, NULL,
385                                       BTREE_INSERT_NOFAIL|
386                                       BTREE_INSERT_LAZY_RW,
387                         bch2_create_trans(trans, root_inum, &root,
388                                           lostfound, &lostfound_str,
389                                           0, 0, S_IFDIR|0700, 0, NULL, NULL,
390                                           (subvol_inum) { }, 0));
391                 if (ret)
392                         bch_err(c, "error creating lost+found: %i", ret);
393         }
394
395         return 0;
396 }
397
398 static int reattach_inode(struct btree_trans *trans,
399                           struct bch_inode_unpacked *inode,
400                           u32 inode_snapshot)
401 {
402         struct bch_hash_info dir_hash;
403         struct bch_inode_unpacked lostfound;
404         char name_buf[20];
405         struct qstr name;
406         u64 dir_offset = 0;
407         u32 subvol;
408         int ret;
409
410         ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
411         if (ret)
412                 return ret;
413
414         ret = lookup_lostfound(trans, subvol, &lostfound);
415         if (ret)
416                 return ret;
417
418         if (S_ISDIR(inode->bi_mode)) {
419                 lostfound.bi_nlink++;
420
421                 ret = write_inode(trans, &lostfound, U32_MAX);
422                 if (ret)
423                         return ret;
424         }
425
426         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
427
428         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
429         name = (struct qstr) QSTR(name_buf);
430
431         ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
432                         bch2_dirent_create(trans,
433                                            (subvol_inum) {
434                                                 .subvol = subvol,
435                                                 .inum = lostfound.bi_inum,
436                                            },
437                                            &dir_hash,
438                                            mode_to_type(inode->bi_mode),
439                                            &name, inode->bi_inum, &dir_offset,
440                                            BCH_HASH_SET_MUST_CREATE));
441         if (ret) {
442                 bch_err(trans->c, "error %i reattaching inode %llu",
443                         ret, inode->bi_inum);
444                 return ret;
445         }
446
447         inode->bi_dir           = lostfound.bi_inum;
448         inode->bi_dir_offset    = dir_offset;
449
450         return write_inode(trans, inode, inode_snapshot);
451 }
452
453 static int remove_backpointer(struct btree_trans *trans,
454                               struct bch_inode_unpacked *inode)
455 {
456         struct btree_iter iter;
457         struct bkey_s_c k;
458         int ret;
459
460         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
461                              POS(inode->bi_dir, inode->bi_dir_offset), 0);
462         k = bch2_btree_iter_peek_slot(&iter);
463         ret = bkey_err(k);
464         if (ret)
465                 goto out;
466         if (k.k->type != KEY_TYPE_dirent) {
467                 ret = -ENOENT;
468                 goto out;
469         }
470
471         ret = remove_dirent(trans, k.k->p);
472 out:
473         bch2_trans_iter_exit(trans, &iter);
474         return ret;
475 }
476
477 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
478 {
479         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
480
481         if (bkey_cmp(s->pos, pos))
482                 s->nr = 0;
483         s->pos = pos;
484
485         /* Might get called multiple times due to lock restarts */
486         if (s->nr && s->d[s->nr - 1] == pos.snapshot)
487                 return 0;
488
489         return snapshots_seen_add(c, s, pos.snapshot);
490 }
491
492 /**
493  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
494  * and @ancestor hasn't been overwritten in @seen
495  *
496  * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
497  */
498 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
499                                     u32 id, u32 ancestor)
500 {
501         ssize_t i;
502
503         BUG_ON(id > ancestor);
504
505         id              = snapshot_t(c, id)->equiv;
506         ancestor        = snapshot_t(c, ancestor)->equiv;
507
508         /* @ancestor should be the snapshot most recently added to @seen */
509         BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
510         BUG_ON(seen->pos.snapshot != ancestor);
511
512         if (id == ancestor)
513                 return true;
514
515         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
516                 return false;
517
518         for (i = seen->nr - 2;
519              i >= 0 && seen->d[i] >= id;
520              --i)
521                 if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
522                     bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
523                         return false;
524
525         return true;
526 }
527
528 /**
529  * ref_visible - given a key with snapshot id @src that points to a key with
530  * snapshot id @dst, test whether there is some snapshot in which @dst is
531  * visible.
532  *
533  * This assumes we're visiting @src keys in natural key order.
534  *
535  * @s   - list of snapshot IDs already seen at @src
536  * @src - snapshot ID of src key
537  * @dst - snapshot ID of dst key
538  */
539 static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
540                        u32 src, u32 dst)
541 {
542         return dst <= src
543                 ? key_visible_in_snapshot(c, s, dst, src)
544                 : bch2_snapshot_is_ancestor(c, src, dst);
545 }
546
547 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)       \
548         for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
549                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
550
551 struct inode_walker {
552         bool                            first_this_inode;
553         u64                             cur_inum;
554
555         size_t                          nr;
556         size_t                          size;
557         struct inode_walker_entry {
558                 struct bch_inode_unpacked inode;
559                 u32                     snapshot;
560                 u64                     count;
561         } *d;
562 };
563
564 static void inode_walker_exit(struct inode_walker *w)
565 {
566         kfree(w->d);
567         w->d = NULL;
568 }
569
570 static struct inode_walker inode_walker_init(void)
571 {
572         return (struct inode_walker) { 0, };
573 }
574
575 static int inode_walker_realloc(struct inode_walker *w)
576 {
577         if (w->nr == w->size) {
578                 size_t new_size = max_t(size_t, 8UL, w->size * 2);
579                 void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
580                                    GFP_KERNEL);
581                 if (!d)
582                         return -ENOMEM;
583
584                 w->d = d;
585                 w->size = new_size;
586         }
587
588         return 0;
589 }
590
591 static int add_inode(struct bch_fs *c, struct inode_walker *w,
592                      struct bkey_s_c_inode inode)
593 {
594         struct bch_inode_unpacked u;
595         int ret;
596
597         ret = inode_walker_realloc(w);
598         if (ret)
599                 return ret;
600
601         BUG_ON(bch2_inode_unpack(inode, &u));
602
603         w->d[w->nr++] = (struct inode_walker_entry) {
604                 .inode          = u,
605                 .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
606         };
607
608         return 0;
609 }
610
611 static int __walk_inode(struct btree_trans *trans,
612                         struct inode_walker *w, struct bpos pos)
613 {
614         struct bch_fs *c = trans->c;
615         struct btree_iter iter;
616         struct bkey_s_c k;
617         unsigned i, ancestor_pos;
618         int ret;
619
620         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
621
622         if (pos.inode == w->cur_inum) {
623                 w->first_this_inode = false;
624                 goto lookup_snapshot;
625         }
626
627         w->nr = 0;
628
629         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
630                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
631                 if (k.k->p.offset != pos.inode)
632                         break;
633
634                 if (k.k->type == KEY_TYPE_inode)
635                         add_inode(c, w, bkey_s_c_to_inode(k));
636         }
637         bch2_trans_iter_exit(trans, &iter);
638
639         if (ret)
640                 return ret;
641
642         w->cur_inum             = pos.inode;
643         w->first_this_inode     = true;
644 lookup_snapshot:
645         for (i = 0; i < w->nr; i++)
646                 if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
647                         goto found;
648         return INT_MAX;
649 found:
650         BUG_ON(pos.snapshot > w->d[i].snapshot);
651
652         if (pos.snapshot != w->d[i].snapshot) {
653                 ancestor_pos = i;
654
655                 while (i && w->d[i - 1].snapshot > pos.snapshot)
656                         --i;
657
658                 ret = inode_walker_realloc(w);
659                 if (ret)
660                         return ret;
661
662                 array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
663                 w->d[i].snapshot = pos.snapshot;
664                 w->d[i].count   = 0;
665         }
666
667         return i;
668 }
669
670 static int walk_inode(struct btree_trans *trans,
671                       struct inode_walker *w, struct bpos pos)
672 {
673         return lockrestart_do(trans, __walk_inode(trans, w, pos));
674 }
675
676 static int __get_visible_inodes(struct btree_trans *trans,
677                                 struct inode_walker *w,
678                                 struct snapshots_seen *s,
679                                 u64 inum)
680 {
681         struct bch_fs *c = trans->c;
682         struct btree_iter iter;
683         struct bkey_s_c k;
684         int ret;
685
686         w->nr = 0;
687
688         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
689                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
690                 if (k.k->p.offset != inum)
691                         break;
692
693                 if (k.k->type != KEY_TYPE_inode)
694                         continue;
695
696                 if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
697                         add_inode(c, w, bkey_s_c_to_inode(k));
698                         if (k.k->p.snapshot >= s->pos.snapshot)
699                                 break;
700                 }
701         }
702         bch2_trans_iter_exit(trans, &iter);
703
704         return ret;
705 }
706
707 static int check_key_has_snapshot(struct btree_trans *trans,
708                                   struct btree_iter *iter,
709                                   struct bkey_s_c k)
710 {
711         struct bch_fs *c = trans->c;
712         char buf[200];
713         int ret = 0;
714
715         if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
716                         "key in missing snapshot: %s",
717                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) {
718                 ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
719                         bch2_btree_delete_at(trans, iter,
720                                              BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
721                 return ret ?: -EINTR;
722         }
723 fsck_err:
724         return ret;
725 }
726
727 static int hash_redo_key(struct btree_trans *trans,
728                          const struct bch_hash_desc desc,
729                          struct bch_hash_info *hash_info,
730                          struct btree_iter *k_iter, struct bkey_s_c k)
731 {
732         bch_err(trans->c, "hash_redo_key() not implemented yet");
733         return -EINVAL;
734 #if 0
735         struct bkey_i *delete;
736         struct bkey_i *tmp;
737
738         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
739         if (IS_ERR(delete))
740                 return PTR_ERR(delete);
741
742         tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
743         if (IS_ERR(tmp))
744                 return PTR_ERR(tmp);
745
746         bkey_reassemble(tmp, k);
747
748         bkey_init(&delete->k);
749         delete->k.p = k_iter->pos;
750         return  bch2_btree_iter_traverse(k_iter) ?:
751                 bch2_trans_update(trans, k_iter, delete, 0) ?:
752                 bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
753 #endif
754 }
755
756 static int fsck_hash_delete_at(struct btree_trans *trans,
757                                const struct bch_hash_desc desc,
758                                struct bch_hash_info *info,
759                                struct btree_iter *iter)
760 {
761         int ret;
762 retry:
763         ret   = bch2_hash_delete_at(trans, desc, info, iter, 0) ?:
764                 bch2_trans_commit(trans, NULL, NULL,
765                                   BTREE_INSERT_NOFAIL|
766                                   BTREE_INSERT_LAZY_RW);
767         if (ret == -EINTR) {
768                 ret = bch2_btree_iter_traverse(iter);
769                 if (!ret)
770                         goto retry;
771         }
772
773         return ret;
774 }
775
776 static int hash_check_key(struct btree_trans *trans,
777                           const struct bch_hash_desc desc,
778                           struct bch_hash_info *hash_info,
779                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
780 {
781         struct bch_fs *c = trans->c;
782         struct btree_iter iter = { NULL };
783         char buf[200];
784         struct bkey_s_c k;
785         u64 hash;
786         int ret = 0;
787
788         if (hash_k.k->type != desc.key_type)
789                 return 0;
790
791         hash = desc.hash_bkey(hash_info, hash_k);
792
793         if (likely(hash == hash_k.k->p.offset))
794                 return 0;
795
796         if (hash_k.k->p.offset < hash)
797                 goto bad_hash;
798
799         for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
800                            BTREE_ITER_SLOTS, k, ret) {
801                 if (!bkey_cmp(k.k->p, hash_k.k->p))
802                         break;
803
804                 if (fsck_err_on(k.k->type == desc.key_type &&
805                                 !desc.cmp_bkey(k, hash_k), c,
806                                 "duplicate hash table keys:\n%s",
807                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
808                                                        hash_k), buf))) {
809                         ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter);
810                         if (ret)
811                                 return ret;
812                         ret = 1;
813                         break;
814                 }
815
816                 if (bkey_deleted(k.k)) {
817                         bch2_trans_iter_exit(trans, &iter);
818                         goto bad_hash;
819                 }
820
821         }
822         bch2_trans_iter_exit(trans, &iter);
823         return ret;
824 bad_hash:
825         if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
826                      "hashed to %llu\n%s",
827                      desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
828                      (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
829                 return 0;
830
831         ret = __bch2_trans_do(trans, NULL, NULL,
832                               BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
833                 hash_redo_key(trans, desc, hash_info, k_iter, hash_k));
834         if (ret) {
835                 bch_err(c, "hash_redo_key err %i", ret);
836                 return ret;
837         }
838         return -EINTR;
839 fsck_err:
840         return ret;
841 }
842
843 static int check_inode(struct btree_trans *trans,
844                        struct btree_iter *iter,
845                        struct bch_inode_unpacked *prev,
846                        struct bch_inode_unpacked u)
847 {
848         struct bch_fs *c = trans->c;
849         bool do_update = false;
850         int ret = 0;
851
852         if (fsck_err_on(prev &&
853                         (prev->bi_hash_seed             != u.bi_hash_seed ||
854                          mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c,
855                         "inodes in different snapshots don't match")) {
856                 bch_err(c, "repair not implemented yet");
857                 return -EINVAL;
858         }
859
860         if (u.bi_flags & BCH_INODE_UNLINKED &&
861             (!c->sb.clean ||
862              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
863                       u.bi_inum))) {
864                 bch2_trans_unlock(trans);
865                 bch2_fs_lazy_rw(c);
866
867                 ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
868                 if (ret)
869                         bch_err(c, "error in fsck: error %i while deleting inode", ret);
870                 return ret;
871         }
872
873         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
874             (!c->sb.clean ||
875              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
876                       u.bi_inum))) {
877                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
878
879                 bch2_trans_unlock(trans);
880                 bch2_fs_lazy_rw(c);
881
882                 /*
883                  * XXX: need to truncate partial blocks too here - or ideally
884                  * just switch units to bytes and that issue goes away
885                  */
886                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
887                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
888                                      iter->pos.snapshot),
889                                 POS(u.bi_inum, U64_MAX),
890                                 0, NULL);
891                 if (ret) {
892                         bch_err(c, "error in fsck: error %i truncating inode", ret);
893                         return ret;
894                 }
895
896                 /*
897                  * We truncated without our normal sector accounting hook, just
898                  * make sure we recalculate it:
899                  */
900                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
901
902                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
903                 do_update = true;
904         }
905
906         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
907             (!c->sb.clean ||
908              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
909                       u.bi_inum))) {
910                 s64 sectors;
911
912                 bch_verbose(c, "recounting sectors for inode %llu",
913                             u.bi_inum);
914
915                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
916                 if (sectors < 0) {
917                         bch_err(c, "error in fsck: error %i recounting inode sectors",
918                                 (int) sectors);
919                         return sectors;
920                 }
921
922                 u.bi_sectors = sectors;
923                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
924                 do_update = true;
925         }
926
927         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
928                 u.bi_dir = 0;
929                 u.bi_dir_offset = 0;
930                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
931                 do_update = true;
932         }
933
934         if (do_update) {
935                 ret = write_inode(trans, &u, iter->pos.snapshot);
936                 if (ret)
937                         bch_err(c, "error in fsck: error %i "
938                                 "updating inode", ret);
939         }
940 fsck_err:
941         return ret;
942 }
943
944 noinline_for_stack
945 static int check_inodes(struct bch_fs *c, bool full)
946 {
947         struct btree_trans trans;
948         struct btree_iter iter;
949         struct bkey_s_c k;
950         struct bkey_s_c_inode inode;
951         struct bch_inode_unpacked prev, u;
952         int ret;
953
954         memset(&prev, 0, sizeof(prev));
955
956         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
957
958         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
959                            BTREE_ITER_INTENT|
960                            BTREE_ITER_PREFETCH|
961                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
962                 ret = check_key_has_snapshot(&trans, &iter, k);
963                 if (ret)
964                         break;
965
966                 /*
967                  * if snapshot id isn't a leaf node, skip it - deletion in
968                  * particular is not atomic, so on the internal snapshot nodes
969                  * we can see inodes marked for deletion after a clean shutdown
970                  */
971                 if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
972                         continue;
973
974                 if (k.k->type != KEY_TYPE_inode)
975                         continue;
976
977                 inode = bkey_s_c_to_inode(k);
978
979                 if (!full &&
980                     !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
981                                            BCH_INODE_I_SECTORS_DIRTY|
982                                            BCH_INODE_UNLINKED)))
983                         continue;
984
985                 BUG_ON(bch2_inode_unpack(inode, &u));
986
987                 ret = check_inode(&trans, &iter,
988                                   full && prev.bi_inum == u.bi_inum
989                                   ? &prev : NULL, u);
990                 if (ret)
991                         break;
992
993                 prev = u;
994         }
995         bch2_trans_iter_exit(&trans, &iter);
996
997         BUG_ON(ret == -EINTR);
998
999         return bch2_trans_exit(&trans) ?: ret;
1000 }
1001
1002 noinline_for_stack
1003 static int check_subvols(struct bch_fs *c)
1004 {
1005         struct btree_trans trans;
1006         struct btree_iter iter;
1007         struct bkey_s_c k;
1008         int ret;
1009
1010         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1011
1012         for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN,
1013                            0, k, ret) {
1014         }
1015         bch2_trans_iter_exit(&trans, &iter);
1016
1017         bch2_trans_exit(&trans);
1018         return ret;
1019 }
1020
1021 /*
1022  * Checking for overlapping extents needs to be reimplemented
1023  */
1024 #if 0
1025 static int fix_overlapping_extent(struct btree_trans *trans,
1026                                        struct bkey_s_c k, struct bpos cut_at)
1027 {
1028         struct btree_iter iter;
1029         struct bkey_i *u;
1030         int ret;
1031
1032         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1033         ret = PTR_ERR_OR_ZERO(u);
1034         if (ret)
1035                 return ret;
1036
1037         bkey_reassemble(u, k);
1038         bch2_cut_front(cut_at, u);
1039
1040
1041         /*
1042          * We don't want to go through the extent_handle_overwrites path:
1043          *
1044          * XXX: this is going to screw up disk accounting, extent triggers
1045          * assume things about extent overwrites - we should be running the
1046          * triggers manually here
1047          */
1048         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
1049                              BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
1050
1051         BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
1052         ret   = bch2_btree_iter_traverse(&iter) ?:
1053                 bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
1054                 bch2_trans_commit(trans, NULL, NULL,
1055                                   BTREE_INSERT_NOFAIL|
1056                                   BTREE_INSERT_LAZY_RW);
1057         bch2_trans_iter_exit(trans, &iter);
1058         return ret;
1059 }
1060 #endif
1061
1062 static int inode_backpointer_exists(struct btree_trans *trans,
1063                                     struct bch_inode_unpacked *inode,
1064                                     u32 snapshot)
1065 {
1066         struct btree_iter iter;
1067         struct bkey_s_c k;
1068         int ret;
1069
1070         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
1071                         SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0);
1072         k = bch2_btree_iter_peek_slot(&iter);
1073         ret = bkey_err(k);
1074         if (ret)
1075                 goto out;
1076         if (k.k->type != KEY_TYPE_dirent)
1077                 goto out;
1078
1079         ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
1080 out:
1081         bch2_trans_iter_exit(trans, &iter);
1082         return ret;
1083 }
1084
1085 static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
1086                                       struct bch_inode_unpacked *inode)
1087 {
1088         return d.k->p.inode == inode->bi_dir &&
1089                 d.k->p.offset == inode->bi_dir_offset;
1090 }
1091
1092 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1093 {
1094         struct bch_fs *c = trans->c;
1095         struct inode_walker_entry *i;
1096         int ret = 0, ret2 = 0;
1097         s64 count2;
1098
1099         for (i = w->d; i < w->d + w->nr; i++) {
1100                 if (i->inode.bi_sectors == i->count)
1101                         continue;
1102
1103                 count2 = lockrestart_do(trans,
1104                         bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
1105
1106                 if (i->count != count2) {
1107                         bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
1108                                 i->count, count2);
1109                         i->count = count2;
1110                         if (i->inode.bi_sectors == i->count)
1111                                 continue;
1112                 }
1113
1114                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
1115                             "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1116                             w->cur_inum, i->snapshot,
1117                             i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
1118                         continue;
1119
1120                 i->inode.bi_sectors = i->count;
1121                 ret = write_inode(trans, &i->inode, i->snapshot);
1122                 if (ret)
1123                         break;
1124                 ret2 = -EINTR;
1125         }
1126 fsck_err:
1127         return ret ?: ret2;
1128 }
1129
1130 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1131                         struct inode_walker *inode,
1132                         struct snapshots_seen *s)
1133 {
1134         struct bch_fs *c = trans->c;
1135         struct bkey_s_c k;
1136         struct inode_walker_entry *i;
1137         char buf[200];
1138         int ret = 0;
1139
1140         k = bch2_btree_iter_peek(iter);
1141         if (!k.k)
1142                 return 0;
1143
1144         ret = bkey_err(k);
1145         if (ret)
1146                 return ret;
1147
1148         ret = check_key_has_snapshot(trans, iter, k);
1149         if (ret)
1150                 return ret;
1151
1152         ret = snapshots_seen_update(c, s, k.k->p);
1153         if (ret)
1154                 return ret;
1155
1156         if (k.k->type == KEY_TYPE_whiteout)
1157                 return 0;
1158
1159         if (inode->cur_inum != k.k->p.inode) {
1160                 ret = check_i_sectors(trans, inode);
1161                 if (ret)
1162                         return ret;
1163         }
1164 #if 0
1165         if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
1166                 char buf1[200];
1167                 char buf2[200];
1168
1169                 bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
1170                 bch2_bkey_val_to_text(&PBUF(buf2), c, k);
1171
1172                 if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
1173                         return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
1174         }
1175 #endif
1176         ret = __walk_inode(trans, inode, k.k->p);
1177         if (ret < 0)
1178                 return ret;
1179
1180         if (fsck_err_on(ret == INT_MAX, c,
1181                         "extent in missing inode:\n  %s",
1182                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1183                 return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
1184                         bch2_btree_delete_at(trans, iter,
1185                                              BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
1186
1187         if (ret == INT_MAX)
1188                 return 0;
1189
1190         i = inode->d + ret;
1191         ret = 0;
1192
1193         if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
1194                         !S_ISLNK(i->inode.bi_mode), c,
1195                         "extent in non regular inode mode %o:\n  %s",
1196                         i->inode.bi_mode,
1197                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1198                 return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
1199                          bch2_btree_delete_at(trans, iter,
1200                                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
1201
1202         if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
1203                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
1204                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
1205                                         k.k->type != KEY_TYPE_reservation &&
1206                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
1207                                         "extent type %u offset %llu past end of inode %llu, i_size %llu",
1208                                         k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
1209                                 bch2_fs_lazy_rw(c);
1210                                 return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1211                                                 SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
1212                                                      k.k->p.snapshot),
1213                                                 POS(k.k->p.inode, U64_MAX),
1214                                                 0, NULL) ?: -EINTR;
1215                         }
1216                 }
1217         }
1218
1219         if (bkey_extent_is_allocation(k.k))
1220                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
1221                         i->count += k.k->size;
1222 #if 0
1223         bch2_bkey_buf_reassemble(&prev, c, k);
1224 #endif
1225
1226 fsck_err:
1227         return ret;
1228 }
1229
1230 /*
1231  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1232  * that i_size an i_sectors are consistent
1233  */
1234 noinline_for_stack
1235 static int check_extents(struct bch_fs *c)
1236 {
1237         struct inode_walker w = inode_walker_init();
1238         struct snapshots_seen s;
1239         struct btree_trans trans;
1240         struct btree_iter iter;
1241         int ret = 0;
1242
1243 #if 0
1244         struct bkey_buf prev;
1245         bch2_bkey_buf_init(&prev);
1246         prev.k->k = KEY(0, 0, 0);
1247 #endif
1248         snapshots_seen_init(&s);
1249         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1250
1251         bch_verbose(c, "checking extents");
1252
1253         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
1254                              POS(BCACHEFS_ROOT_INO, 0),
1255                              BTREE_ITER_INTENT|
1256                              BTREE_ITER_PREFETCH|
1257                              BTREE_ITER_ALL_SNAPSHOTS);
1258
1259         do {
1260                 ret = lockrestart_do(&trans,
1261                         check_extent(&trans, &iter, &w, &s));
1262                 if (ret)
1263                         break;
1264         } while (bch2_btree_iter_advance(&iter));
1265         bch2_trans_iter_exit(&trans, &iter);
1266 #if 0
1267         bch2_bkey_buf_exit(&prev, c);
1268 #endif
1269         inode_walker_exit(&w);
1270         bch2_trans_exit(&trans);
1271         snapshots_seen_exit(&s);
1272
1273         return ret;
1274 }
1275
1276 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1277 {
1278         struct bch_fs *c = trans->c;
1279         struct inode_walker_entry *i;
1280         int ret = 0, ret2 = 0;
1281         s64 count2;
1282
1283         for (i = w->d; i < w->d + w->nr; i++) {
1284                 if (i->inode.bi_nlink == i->count)
1285                         continue;
1286
1287                 count2 = lockrestart_do(trans,
1288                                 bch2_count_subdirs(trans, w->cur_inum, i->snapshot));
1289
1290                 if (i->count != count2) {
1291                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1292                                 i->count, count2);
1293                         i->count = count2;
1294                         if (i->inode.bi_nlink == i->count)
1295                                 continue;
1296                 }
1297
1298                 if (fsck_err_on(i->inode.bi_nlink != i->count, c,
1299                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1300                                 w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
1301                         i->inode.bi_nlink = i->count;
1302                         ret = write_inode(trans, &i->inode, i->snapshot);
1303                         if (ret)
1304                                 break;
1305                         ret2 = -EINTR;
1306                 }
1307         }
1308 fsck_err:
1309         return ret ?: ret2;
1310 }
1311
1312 static int check_dirent_target(struct btree_trans *trans,
1313                                struct btree_iter *iter,
1314                                struct bkey_s_c_dirent d,
1315                                struct bch_inode_unpacked *target,
1316                                u32 target_snapshot)
1317 {
1318         struct bch_fs *c = trans->c;
1319         bool backpointer_exists = true;
1320         char buf[200];
1321         int ret = 0;
1322
1323         if (!target->bi_dir &&
1324             !target->bi_dir_offset) {
1325                 target->bi_dir          = d.k->p.inode;
1326                 target->bi_dir_offset   = d.k->p.offset;
1327
1328                 ret = write_inode(trans, target, target_snapshot);
1329                 if (ret)
1330                         goto err;
1331         }
1332
1333         if (!inode_backpointer_matches(d, target)) {
1334                 ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
1335                 if (ret < 0)
1336                         goto err;
1337
1338                 backpointer_exists = ret;
1339                 ret = 0;
1340
1341                 if (fsck_err_on(S_ISDIR(target->bi_mode) &&
1342                                 backpointer_exists, c,
1343                                 "directory %llu with multiple links",
1344                                 target->bi_inum)) {
1345                         ret = remove_dirent(trans, d.k->p);
1346                         if (ret)
1347                                 goto err;
1348                         return 0;
1349                 }
1350
1351                 if (fsck_err_on(backpointer_exists &&
1352                                 !target->bi_nlink, c,
1353                                 "inode %llu has multiple links but i_nlink 0",
1354                                 target->bi_inum)) {
1355                         target->bi_nlink++;
1356                         target->bi_flags &= ~BCH_INODE_UNLINKED;
1357
1358                         ret = write_inode(trans, target, target_snapshot);
1359                         if (ret)
1360                                 goto err;
1361                 }
1362
1363                 if (fsck_err_on(!backpointer_exists, c,
1364                                 "inode %llu has wrong backpointer:\n"
1365                                 "got       %llu:%llu\n"
1366                                 "should be %llu:%llu",
1367                                 target->bi_inum,
1368                                 target->bi_dir,
1369                                 target->bi_dir_offset,
1370                                 d.k->p.inode,
1371                                 d.k->p.offset)) {
1372                         target->bi_dir          = d.k->p.inode;
1373                         target->bi_dir_offset   = d.k->p.offset;
1374
1375                         ret = write_inode(trans, target, target_snapshot);
1376                         if (ret)
1377                                 goto err;
1378                 }
1379         }
1380
1381         if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c,
1382                         "incorrect d_type: should be %u:\n%s",
1383                         mode_to_type(target->bi_mode),
1384                         (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
1385                 struct bkey_i_dirent *n;
1386
1387                 n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
1388                 if (!n) {
1389                         ret = -ENOMEM;
1390                         goto err;
1391                 }
1392
1393                 bkey_reassemble(&n->k_i, d.s_c);
1394                 n->v.d_type = mode_to_type(target->bi_mode);
1395
1396                 ret = __bch2_trans_do(trans, NULL, NULL,
1397                                       BTREE_INSERT_NOFAIL|
1398                                       BTREE_INSERT_LAZY_RW,
1399                         bch2_trans_update(trans, iter, &n->k_i, 0));
1400                 kfree(n);
1401                 if (ret)
1402                         goto err;
1403         }
1404 err:
1405 fsck_err:
1406         return ret;
1407 }
1408
1409 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1410                         struct bch_hash_info *hash_info,
1411                         struct inode_walker *dir,
1412                         struct inode_walker *target,
1413                         struct snapshots_seen *s)
1414 {
1415         struct bch_fs *c = trans->c;
1416         struct bkey_s_c k;
1417         struct bkey_s_c_dirent d;
1418         struct inode_walker_entry *i;
1419         u32 target_snapshot;
1420         u32 target_subvol;
1421         u64 target_inum;
1422         char buf[200];
1423         int ret;
1424
1425         k = bch2_btree_iter_peek(iter);
1426         if (!k.k)
1427                 return 0;
1428
1429         ret = bkey_err(k);
1430         if (ret)
1431                 return ret;
1432
1433         ret = check_key_has_snapshot(trans, iter, k);
1434         if (ret)
1435                 return ret;
1436
1437         ret = snapshots_seen_update(c, s, k.k->p);
1438         if (ret)
1439                 return ret;
1440
1441         if (k.k->type == KEY_TYPE_whiteout)
1442                 return 0;
1443
1444         if (dir->cur_inum != k.k->p.inode) {
1445                 ret = check_subdir_count(trans, dir);
1446                 if (ret)
1447                         return ret;
1448         }
1449
1450         ret = __walk_inode(trans, dir, k.k->p);
1451         if (ret < 0)
1452                 return ret;
1453
1454         if (fsck_err_on(ret == INT_MAX, c,
1455                         "dirent in nonexisting directory:\n%s",
1456                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1457                 return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
1458                                 bch2_btree_delete_at(trans, iter,
1459                                                      BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
1460
1461         if (ret == INT_MAX)
1462                 return 0;
1463
1464         i = dir->d + ret;
1465         ret = 0;
1466
1467         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
1468                         "dirent in non directory inode type %u:\n%s",
1469                         mode_to_type(i->inode.bi_mode),
1470                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1471                 return __bch2_trans_do(trans, NULL, NULL, 0,
1472                                 bch2_btree_delete_at(trans, iter, 0));
1473
1474         if (dir->first_this_inode)
1475                 *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
1476
1477         ret = hash_check_key(trans, bch2_dirent_hash_desc,
1478                              hash_info, iter, k);
1479         if (ret < 0)
1480                 return ret;
1481         if (ret) /* dirent has been deleted */
1482                 return 0;
1483
1484         if (k.k->type != KEY_TYPE_dirent)
1485                 return 0;
1486
1487         d = bkey_s_c_to_dirent(k);
1488
1489         ret = __bch2_dirent_read_target(trans, d,
1490                                         &target_subvol,
1491                                         &target_snapshot,
1492                                         &target_inum,
1493                                         true);
1494         if (ret && ret != -ENOENT)
1495                 return ret;
1496
1497         if (fsck_err_on(ret, c,
1498                         "dirent points to missing subvolume %llu",
1499                         le64_to_cpu(d.v->d_inum)))
1500                 return remove_dirent(trans, d.k->p);
1501
1502         if (target_subvol) {
1503                 struct bch_inode_unpacked subvol_root;
1504
1505                 ret = __lookup_inode(trans, target_inum,
1506                                    &subvol_root, &target_snapshot);
1507                 if (ret && ret != -ENOENT)
1508                         return ret;
1509
1510                 if (fsck_err_on(ret, c,
1511                                 "subvolume %u points to missing subvolume root %llu",
1512                                 target_subvol,
1513                                 target_inum)) {
1514                         bch_err(c, "repair not implemented yet");
1515                         return -EINVAL;
1516                 }
1517
1518                 if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
1519                                 "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1520                                 target_inum,
1521                                 subvol_root.bi_subvol, target_subvol)) {
1522                         subvol_root.bi_subvol = target_subvol;
1523                         ret = write_inode(trans, &subvol_root, target_snapshot);
1524                         if (ret)
1525                                 return ret;
1526                 }
1527
1528                 ret = check_dirent_target(trans, iter, d, &subvol_root,
1529                                           target_snapshot);
1530                 if (ret)
1531                         return ret;
1532         } else {
1533                 ret = __get_visible_inodes(trans, target, s, target_inum);
1534                 if (ret)
1535                         return ret;
1536
1537                 if (fsck_err_on(!target->nr, c,
1538                                 "dirent points to missing inode:\n%s",
1539                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
1540                                                        k), buf))) {
1541                         ret = remove_dirent(trans, d.k->p);
1542                         if (ret)
1543                                 return ret;
1544                 }
1545
1546                 for (i = target->d; i < target->d + target->nr; i++) {
1547                         ret = check_dirent_target(trans, iter, d,
1548                                                   &i->inode, i->snapshot);
1549                         if (ret)
1550                                 return ret;
1551                 }
1552         }
1553
1554         if (d.v->d_type == DT_DIR)
1555                 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
1556                         i->count++;
1557
1558 fsck_err:
1559         return ret;
1560 }
1561
1562 /*
1563  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1564  * validate d_type
1565  */
1566 noinline_for_stack
1567 static int check_dirents(struct bch_fs *c)
1568 {
1569         struct inode_walker dir = inode_walker_init();
1570         struct inode_walker target = inode_walker_init();
1571         struct snapshots_seen s;
1572         struct bch_hash_info hash_info;
1573         struct btree_trans trans;
1574         struct btree_iter iter;
1575         int ret = 0;
1576
1577         bch_verbose(c, "checking dirents");
1578
1579         snapshots_seen_init(&s);
1580         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1581
1582         bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
1583                              POS(BCACHEFS_ROOT_INO, 0),
1584                              BTREE_ITER_INTENT|
1585                              BTREE_ITER_PREFETCH|
1586                              BTREE_ITER_ALL_SNAPSHOTS);
1587
1588         do {
1589                 ret = lockrestart_do(&trans,
1590                         check_dirent(&trans, &iter, &hash_info,
1591                                      &dir, &target, &s));
1592                 if (ret)
1593                         break;
1594         } while (bch2_btree_iter_advance(&iter));
1595         bch2_trans_iter_exit(&trans, &iter);
1596
1597         bch2_trans_exit(&trans);
1598         snapshots_seen_exit(&s);
1599         inode_walker_exit(&dir);
1600         inode_walker_exit(&target);
1601         return ret;
1602 }
1603
1604 /*
1605  * Walk xattrs: verify that they all have a corresponding inode
1606  */
1607 noinline_for_stack
1608 static int check_xattrs(struct bch_fs *c)
1609 {
1610         struct inode_walker w = inode_walker_init();
1611         struct bch_hash_info hash_info;
1612         struct btree_trans trans;
1613         struct btree_iter iter;
1614         struct bkey_s_c k;
1615         int ret = 0;
1616
1617         bch_verbose(c, "checking xattrs");
1618
1619         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1620
1621         bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
1622                              POS(BCACHEFS_ROOT_INO, 0),
1623                              BTREE_ITER_INTENT|
1624                              BTREE_ITER_PREFETCH|
1625                              BTREE_ITER_ALL_SNAPSHOTS);
1626 retry:
1627         bch2_trans_begin(&trans);
1628
1629         while ((k = bch2_btree_iter_peek(&iter)).k &&
1630                !(ret = bkey_err(k))) {
1631                 ret = check_key_has_snapshot(&trans, &iter, k);
1632                 if (ret)
1633                         break;
1634
1635                 ret = walk_inode(&trans, &w, k.k->p);
1636                 if (ret < 0)
1637                         break;
1638
1639                 if (fsck_err_on(ret == INT_MAX, c,
1640                                 "xattr for missing inode %llu",
1641                                 k.k->p.inode)) {
1642                         ret = bch2_btree_delete_at(&trans, &iter, 0);
1643                         if (ret)
1644                                 break;
1645                         continue;
1646                 }
1647
1648                 if (ret == INT_MAX)
1649                         goto next;
1650                 ret = 0;
1651
1652                 if (w.first_this_inode)
1653                         hash_info = bch2_hash_info_init(c, &w.d[0].inode);
1654
1655                 ret = hash_check_key(&trans, bch2_xattr_hash_desc,
1656                                      &hash_info, &iter, k);
1657                 if (ret)
1658                         break;
1659 next:
1660                 bch2_btree_iter_advance(&iter);
1661         }
1662 fsck_err:
1663         if (ret == -EINTR)
1664                 goto retry;
1665
1666         bch2_trans_iter_exit(&trans, &iter);
1667         return bch2_trans_exit(&trans) ?: ret;
1668 }
1669
1670 /* Get root directory, create if it doesn't exist: */
1671 static int check_root(struct bch_fs *c)
1672 {
1673         struct btree_trans trans;
1674         struct bch_inode_unpacked root_inode;
1675         u32 snapshot;
1676         u64 inum;
1677         int ret;
1678
1679         bch2_trans_init(&trans, c, 0, 0);
1680
1681         bch_verbose(c, "checking root directory");
1682
1683         ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1684         if (ret && ret != -ENOENT)
1685                 return ret;
1686
1687         if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
1688                 struct bkey_i_subvolume root_subvol;
1689
1690                 snapshot        = U32_MAX;
1691                 inum            = BCACHEFS_ROOT_INO;
1692
1693                 bkey_subvolume_init(&root_subvol.k_i);
1694                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1695                 root_subvol.v.flags     = 0;
1696                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1697                 root_subvol.v.inode     = cpu_to_le64(inum);
1698                 ret = __bch2_trans_do(&trans, NULL, NULL,
1699                                       BTREE_INSERT_NOFAIL|
1700                                       BTREE_INSERT_LAZY_RW,
1701                         __bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i));
1702                 if (ret) {
1703                         bch_err(c, "error writing root subvol: %i", ret);
1704                         goto err;
1705                 }
1706
1707         }
1708
1709         ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1710         if (ret && ret != -ENOENT)
1711                 return ret;
1712
1713         if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
1714             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
1715                                 "root inode not a directory")) {
1716                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1717                                 0, NULL);
1718                 root_inode.bi_inum = inum;
1719
1720                 ret = write_inode(&trans, &root_inode, snapshot);
1721                 if (ret)
1722                         bch_err(c, "error writing root inode: %i", ret);
1723         }
1724 err:
1725 fsck_err:
1726         bch2_trans_exit(&trans);
1727         return ret;
1728 }
1729
1730 struct pathbuf {
1731         size_t          nr;
1732         size_t          size;
1733
1734         struct pathbuf_entry {
1735                 u64     inum;
1736         }               *entries;
1737 };
1738
1739 static int path_down(struct pathbuf *p, u64 inum)
1740 {
1741         if (p->nr == p->size) {
1742                 size_t new_size = max_t(size_t, 256UL, p->size * 2);
1743                 void *n = krealloc(p->entries,
1744                                    new_size * sizeof(p->entries[0]),
1745                                    GFP_KERNEL);
1746                 if (!n) {
1747                         return -ENOMEM;
1748                 }
1749
1750                 p->entries = n;
1751                 p->size = new_size;
1752         };
1753
1754         p->entries[p->nr++] = (struct pathbuf_entry) {
1755                 .inum = inum,
1756         };
1757         return 0;
1758 }
1759
1760 static int check_path(struct btree_trans *trans,
1761                       struct pathbuf *p,
1762                       struct bch_inode_unpacked *inode,
1763                       u32 snapshot)
1764 {
1765         struct bch_fs *c = trans->c;
1766         size_t i;
1767         int ret = 0;
1768
1769         snapshot = snapshot_t(c, snapshot)->equiv;
1770         p->nr = 0;
1771
1772         while (inode->bi_inum != BCACHEFS_ROOT_INO) {
1773                 ret = lockrestart_do(trans,
1774                         inode_backpointer_exists(trans, inode, snapshot));
1775                 if (ret < 0)
1776                         break;
1777
1778                 if (!ret) {
1779                         if (fsck_err(c,  "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu",
1780                                      inode->bi_inum, snapshot,
1781                                      mode_to_type(inode->bi_mode),
1782                                      inode->bi_nlink,
1783                                      inode->bi_dir,
1784                                      inode->bi_dir_offset))
1785                                 ret = reattach_inode(trans, inode, snapshot);
1786                         break;
1787                 }
1788                 ret = 0;
1789
1790                 if (!S_ISDIR(inode->bi_mode))
1791                         break;
1792
1793                 ret = path_down(p, inode->bi_inum);
1794                 if (ret) {
1795                         bch_err(c, "memory allocation failure");
1796                         return ret;
1797                 }
1798
1799                 for (i = 0; i < p->nr; i++) {
1800                         if (inode->bi_dir != p->entries[i].inum)
1801                                 continue;
1802
1803                         /* XXX print path */
1804                         if (!fsck_err(c, "directory structure loop"))
1805                                 return 0;
1806
1807                         ret = lockrestart_do(trans,
1808                                         remove_backpointer(trans, inode));
1809                         if (ret) {
1810                                 bch_err(c, "error removing dirent: %i", ret);
1811                                 break;
1812                         }
1813
1814                         ret = reattach_inode(trans, inode, snapshot);
1815                         break;
1816                 }
1817
1818                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
1819                 if (ret) {
1820                         /* Should have been caught in dirents pass */
1821                         bch_err(c, "error looking up parent directory: %i", ret);
1822                         break;
1823                 }
1824         }
1825 fsck_err:
1826         if (ret)
1827                 bch_err(c, "%s: err %i", __func__, ret);
1828         return ret;
1829 }
1830
1831 /*
1832  * Check for unreachable inodes, as well as loops in the directory structure:
1833  * After check_dirents(), if an inode backpointer doesn't exist that means it's
1834  * unreachable:
1835  */
1836 static int check_directory_structure(struct bch_fs *c)
1837 {
1838         struct btree_trans trans;
1839         struct btree_iter iter;
1840         struct bkey_s_c k;
1841         struct bch_inode_unpacked u;
1842         struct pathbuf path = { 0, 0, NULL };
1843         int ret;
1844
1845         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1846
1847         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
1848                            BTREE_ITER_INTENT|
1849                            BTREE_ITER_PREFETCH|
1850                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
1851                 if (k.k->type != KEY_TYPE_inode)
1852                         continue;
1853
1854                 ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
1855                 if (ret) {
1856                         /* Should have been caught earlier in fsck: */
1857                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
1858                         break;
1859                 }
1860
1861                 if (u.bi_flags & BCH_INODE_UNLINKED)
1862                         continue;
1863
1864                 ret = check_path(&trans, &path, &u, iter.pos.snapshot);
1865                 if (ret)
1866                         break;
1867         }
1868         bch2_trans_iter_exit(&trans, &iter);
1869
1870         BUG_ON(ret == -EINTR);
1871
1872         kfree(path.entries);
1873
1874         return bch2_trans_exit(&trans) ?: ret;
1875 }
1876
1877 struct nlink_table {
1878         size_t          nr;
1879         size_t          size;
1880
1881         struct nlink {
1882                 u64     inum;
1883                 u32     snapshot;
1884                 u32     count;
1885         }               *d;
1886 };
1887
1888 static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot)
1889 {
1890         if (t->nr == t->size) {
1891                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
1892                 void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
1893                 if (!d) {
1894                         return -ENOMEM;
1895                 }
1896
1897                 if (t->d)
1898                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
1899                 kvfree(t->d);
1900
1901                 t->d = d;
1902                 t->size = new_size;
1903         }
1904
1905
1906         t->d[t->nr++] = (struct nlink) {
1907                 .inum           = inum,
1908                 .snapshot       = snapshot,
1909         };
1910
1911         return 0;
1912 }
1913
1914 static int nlink_cmp(const void *_l, const void *_r)
1915 {
1916         const struct nlink *l = _l;
1917         const struct nlink *r = _r;
1918
1919         return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
1920 }
1921
1922 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
1923                      struct nlink_table *links,
1924                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
1925 {
1926         struct nlink *link, key = {
1927                 .inum = inum, .snapshot = U32_MAX,
1928         };
1929
1930         if (inum < range_start || inum >= range_end)
1931                 return;
1932
1933         link = __inline_bsearch(&key, links->d, links->nr,
1934                                 sizeof(links->d[0]), nlink_cmp);
1935         if (!link)
1936                 return;
1937
1938         while (link > links->d && link[0].inum == link[-1].inum)
1939                 --link;
1940
1941         for (; link < links->d + links->nr && link->inum == inum; link++)
1942                 if (ref_visible(c, s, snapshot, link->snapshot)) {
1943                         link->count++;
1944                         if (link->snapshot >= snapshot)
1945                                 break;
1946                 }
1947 }
1948
1949 noinline_for_stack
1950 static int check_nlinks_find_hardlinks(struct bch_fs *c,
1951                                        struct nlink_table *t,
1952                                        u64 start, u64 *end)
1953 {
1954         struct btree_trans trans;
1955         struct btree_iter iter;
1956         struct bkey_s_c k;
1957         struct bkey_s_c_inode inode;
1958         struct bch_inode_unpacked u;
1959         int ret = 0;
1960
1961         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1962
1963         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
1964                            POS(0, start),
1965                            BTREE_ITER_INTENT|
1966                            BTREE_ITER_PREFETCH|
1967                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
1968                 if (k.k->type != KEY_TYPE_inode)
1969                         continue;
1970
1971                 inode = bkey_s_c_to_inode(k);
1972
1973                 /*
1974                  * Backpointer and directory structure checks are sufficient for
1975                  * directories, since they can't have hardlinks:
1976                  */
1977                 if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
1978                         continue;
1979
1980                 /* Should never fail, checked by bch2_inode_invalid: */
1981                 BUG_ON(bch2_inode_unpack(inode, &u));
1982
1983                 if (!u.bi_nlink)
1984                         continue;
1985
1986                 ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot);
1987                 if (ret) {
1988                         *end = k.k->p.offset;
1989                         ret = 0;
1990                         break;
1991                 }
1992
1993         }
1994         bch2_trans_iter_exit(&trans, &iter);
1995         bch2_trans_exit(&trans);
1996
1997         if (ret)
1998                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
1999
2000         return ret;
2001 }
2002
2003 noinline_for_stack
2004 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2005                                      u64 range_start, u64 range_end)
2006 {
2007         struct btree_trans trans;
2008         struct snapshots_seen s;
2009         struct btree_iter iter;
2010         struct bkey_s_c k;
2011         struct bkey_s_c_dirent d;
2012         int ret;
2013
2014         snapshots_seen_init(&s);
2015
2016         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2017
2018         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
2019                            BTREE_ITER_INTENT|
2020                            BTREE_ITER_PREFETCH|
2021                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2022                 ret = snapshots_seen_update(c, &s, k.k->p);
2023                 if (ret)
2024                         break;
2025
2026                 switch (k.k->type) {
2027                 case KEY_TYPE_dirent:
2028                         d = bkey_s_c_to_dirent(k);
2029
2030                         if (d.v->d_type != DT_DIR &&
2031                             d.v->d_type != DT_SUBVOL)
2032                                 inc_link(c, &s, links, range_start, range_end,
2033                                          le64_to_cpu(d.v->d_inum),
2034                                          d.k->p.snapshot);
2035                         break;
2036                 }
2037
2038                 bch2_trans_cond_resched(&trans);
2039         }
2040         bch2_trans_iter_exit(&trans, &iter);
2041
2042         if (ret)
2043                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
2044
2045         bch2_trans_exit(&trans);
2046         snapshots_seen_exit(&s);
2047         return ret;
2048 }
2049
2050 noinline_for_stack
2051 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2052                                struct nlink_table *links,
2053                                u64 range_start, u64 range_end)
2054 {
2055         struct btree_trans trans;
2056         struct btree_iter iter;
2057         struct bkey_s_c k;
2058         struct bkey_s_c_inode inode;
2059         struct bch_inode_unpacked u;
2060         struct nlink *link = links->d;
2061         int ret = 0;
2062
2063         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2064
2065         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2066                            POS(0, range_start),
2067                            BTREE_ITER_INTENT|
2068                            BTREE_ITER_PREFETCH|
2069                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2070                 if (k.k->p.offset >= range_end)
2071                         break;
2072
2073                 if (k.k->type != KEY_TYPE_inode)
2074                         continue;
2075
2076                 inode = bkey_s_c_to_inode(k);
2077                 if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
2078                         continue;
2079
2080                 BUG_ON(bch2_inode_unpack(inode, &u));
2081
2082                 if (!u.bi_nlink)
2083                         continue;
2084
2085                 while ((cmp_int(link->inum, k.k->p.offset) ?:
2086                         cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2087                         link++;
2088                         BUG_ON(link >= links->d + links->nr);
2089                 }
2090
2091                 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
2092                                 "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
2093                                 u.bi_inum, mode_to_type(u.bi_mode),
2094                                 bch2_inode_nlink_get(&u), link->count)) {
2095                         bch2_inode_nlink_set(&u, link->count);
2096
2097                         ret = __bch2_trans_do(&trans, NULL, NULL,
2098                                               BTREE_INSERT_NOFAIL|
2099                                               BTREE_INSERT_LAZY_RW,
2100                                               bch2_btree_iter_traverse(&iter) ?:
2101                                         bch2_inode_write(&trans, &iter, &u));
2102                         if (ret)
2103                                 bch_err(c, "error in fsck: error %i updating inode", ret);
2104                 }
2105         }
2106 fsck_err:
2107         bch2_trans_iter_exit(&trans, &iter);
2108         bch2_trans_exit(&trans);
2109
2110         if (ret)
2111                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2112
2113         return ret;
2114 }
2115
2116 noinline_for_stack
2117 static int check_nlinks(struct bch_fs *c)
2118 {
2119         struct nlink_table links = { 0 };
2120         u64 this_iter_range_start, next_iter_range_start = 0;
2121         int ret = 0;
2122
2123         bch_verbose(c, "checking inode nlinks");
2124
2125         do {
2126                 this_iter_range_start = next_iter_range_start;
2127                 next_iter_range_start = U64_MAX;
2128
2129                 ret = check_nlinks_find_hardlinks(c, &links,
2130                                                   this_iter_range_start,
2131                                                   &next_iter_range_start);
2132
2133                 ret = check_nlinks_walk_dirents(c, &links,
2134                                           this_iter_range_start,
2135                                           next_iter_range_start);
2136                 if (ret)
2137                         break;
2138
2139                 ret = check_nlinks_update_hardlinks(c, &links,
2140                                          this_iter_range_start,
2141                                          next_iter_range_start);
2142                 if (ret)
2143                         break;
2144
2145                 links.nr = 0;
2146         } while (next_iter_range_start != U64_MAX);
2147
2148         kvfree(links.d);
2149
2150         return ret;
2151 }
2152
2153 /*
2154  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
2155  * Doesn't fix them yet, mainly because they haven't yet been observed:
2156  */
2157 int bch2_fsck_full(struct bch_fs *c)
2158 {
2159         return  bch2_fs_snapshots_check(c) ?:
2160                 check_inodes(c, true) ?:
2161                 check_subvols(c) ?:
2162                 check_extents(c) ?:
2163                 check_dirents(c) ?:
2164                 check_xattrs(c) ?:
2165                 check_root(c) ?:
2166                 check_directory_structure(c) ?:
2167                 check_nlinks(c);
2168 }
2169
2170 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
2171 {
2172         return check_inodes(c, false);
2173 }