]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
Update bcachefs sources to e027cf9aa0 fixup! bcachefs: Defer checking of alloc -...
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "darray.h"
7 #include "dirent.h"
8 #include "error.h"
9 #include "fs-common.h"
10 #include "fsck.h"
11 #include "inode.h"
12 #include "keylist.h"
13 #include "subvolume.h"
14 #include "super.h"
15 #include "xattr.h"
16
17 #include <linux/bsearch.h>
18 #include <linux/dcache.h> /* struct qstr */
19
20 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
21
22 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
23                                     u32 snapshot)
24 {
25         struct btree_iter iter;
26         struct bkey_s_c k;
27         u64 sectors = 0;
28         int ret;
29
30         for_each_btree_key(trans, iter, BTREE_ID_extents,
31                            SPOS(inum, 0, snapshot), 0, k, ret) {
32                 if (k.k->p.inode != inum)
33                         break;
34
35                 if (bkey_extent_is_allocation(k.k))
36                         sectors += k.k->size;
37         }
38
39         bch2_trans_iter_exit(trans, &iter);
40
41         return ret ?: sectors;
42 }
43
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45                                     u32 snapshot)
46 {
47         struct btree_iter iter;
48         struct bkey_s_c k;
49         struct bkey_s_c_dirent d;
50         u64 subdirs = 0;
51         int ret;
52
53         for_each_btree_key(trans, iter, BTREE_ID_dirents,
54                            SPOS(inum, 0, snapshot), 0, k, ret) {
55                 if (k.k->p.inode != inum)
56                         break;
57
58                 if (k.k->type != KEY_TYPE_dirent)
59                         continue;
60
61                 d = bkey_s_c_to_dirent(k);
62                 if (d.v->d_type == DT_DIR)
63                         subdirs++;
64         }
65
66         bch2_trans_iter_exit(trans, &iter);
67
68         return ret ?: subdirs;
69 }
70
71 static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
72                                     u32 *subvol)
73 {
74         struct btree_iter iter;
75         struct bkey_s_c k;
76         int ret;
77
78         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
79                              POS(0, snapshot), 0);
80         k = bch2_btree_iter_peek_slot(&iter);
81         ret = bkey_err(k);
82         if (ret)
83                 goto err;
84
85         if (k.k->type != KEY_TYPE_snapshot) {
86                 bch_err(trans->c, "snapshot %u not fonud", snapshot);
87                 ret = -ENOENT;
88                 goto err;
89         }
90
91         *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
92 err:
93         bch2_trans_iter_exit(trans, &iter);
94         return ret;
95
96 }
97
98 static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
99                            u32 *snapshot, u64 *inum)
100 {
101         struct bch_subvolume s;
102         int ret;
103
104         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
105
106         *snapshot = le32_to_cpu(s.snapshot);
107         *inum = le64_to_cpu(s.inode);
108         return ret;
109 }
110
111 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
112                          u32 *snapshot, u64 *inum)
113 {
114         return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
115 }
116
117 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
118                               struct bch_inode_unpacked *inode)
119 {
120         struct btree_iter iter;
121         struct bkey_s_c k;
122         int ret;
123
124         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
125                              POS(0, inode_nr),
126                              BTREE_ITER_ALL_SNAPSHOTS);
127         k = bch2_btree_iter_peek(&iter);
128         ret = bkey_err(k);
129         if (ret)
130                 goto err;
131
132         if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) {
133                 ret = -ENOENT;
134                 goto err;
135         }
136
137         ret = bch2_inode_unpack(k, inode);
138 err:
139         if (ret && ret != -EINTR)
140                 bch_err(trans->c, "error %i fetching inode %llu",
141                         ret, inode_nr);
142         bch2_trans_iter_exit(trans, &iter);
143         return ret;
144 }
145
146 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
147                           struct bch_inode_unpacked *inode,
148                           u32 *snapshot)
149 {
150         struct btree_iter iter;
151         struct bkey_s_c k;
152         int ret;
153
154         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
155                              SPOS(0, inode_nr, *snapshot), 0);
156         k = bch2_btree_iter_peek_slot(&iter);
157         ret = bkey_err(k);
158         if (ret)
159                 goto err;
160
161         ret = bkey_is_inode(k.k)
162                 ? bch2_inode_unpack(k, inode)
163                 : -ENOENT;
164         if (!ret)
165                 *snapshot = iter.pos.snapshot;
166 err:
167         if (ret && ret != -EINTR)
168                 bch_err(trans->c, "error %i fetching inode %llu:%u",
169                         ret, inode_nr, *snapshot);
170         bch2_trans_iter_exit(trans, &iter);
171         return ret;
172 }
173
174 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
175                         struct bch_inode_unpacked *inode,
176                         u32 *snapshot)
177 {
178         return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
179 }
180
181 static int __lookup_dirent(struct btree_trans *trans,
182                            struct bch_hash_info hash_info,
183                            subvol_inum dir, struct qstr *name,
184                            u64 *target, unsigned *type)
185 {
186         struct btree_iter iter;
187         struct bkey_s_c_dirent d;
188         int ret;
189
190         ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
191                                &hash_info, dir, name, 0);
192         if (ret)
193                 return ret;
194
195         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
196         *target = le64_to_cpu(d.v->d_inum);
197         *type = d.v->d_type;
198         bch2_trans_iter_exit(trans, &iter);
199         return 0;
200 }
201
202 static int __write_inode(struct btree_trans *trans,
203                          struct bch_inode_unpacked *inode,
204                          u32 snapshot)
205 {
206         struct btree_iter iter;
207         int ret;
208
209         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
210                             SPOS(0, inode->bi_inum, snapshot),
211                             BTREE_ITER_INTENT);
212
213         ret   = bch2_btree_iter_traverse(&iter) ?:
214                 bch2_inode_write(trans, &iter, inode);
215         bch2_trans_iter_exit(trans, &iter);
216         return ret;
217 }
218
219 static int write_inode(struct btree_trans *trans,
220                        struct bch_inode_unpacked *inode,
221                        u32 snapshot)
222 {
223         int ret = __bch2_trans_do(trans, NULL, NULL,
224                                   BTREE_INSERT_NOFAIL|
225                                   BTREE_INSERT_LAZY_RW,
226                                   __write_inode(trans, inode, snapshot));
227         if (ret)
228                 bch_err(trans->c, "error in fsck: error %i updating inode", ret);
229         return ret;
230 }
231
232 static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
233 {
234         struct btree_iter iter = { NULL };
235         struct bkey_i_inode_generation delete;
236         struct bch_inode_unpacked inode_u;
237         struct bkey_s_c k;
238         int ret;
239
240         ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
241                                               SPOS(inum, 0, snapshot),
242                                               SPOS(inum, U64_MAX, snapshot),
243                                               0, NULL) ?:
244                 bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
245                                               SPOS(inum, 0, snapshot),
246                                               SPOS(inum, U64_MAX, snapshot),
247                                               0, NULL) ?:
248                 bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
249                                               SPOS(inum, 0, snapshot),
250                                               SPOS(inum, U64_MAX, snapshot),
251                                               0, NULL);
252         if (ret)
253                 goto err;
254 retry:
255         bch2_trans_begin(trans);
256
257         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
258                              SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
259         k = bch2_btree_iter_peek_slot(&iter);
260
261         ret = bkey_err(k);
262         if (ret)
263                 goto err;
264
265         if (!bkey_is_inode(k.k)) {
266                 bch2_fs_inconsistent(trans->c,
267                                      "inode %llu:%u not found when deleting",
268                                      inum, snapshot);
269                 ret = -EIO;
270                 goto err;
271         }
272
273         bch2_inode_unpack(k, &inode_u);
274
275         /* Subvolume root? */
276         if (inode_u.bi_subvol) {
277                 ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
278                 if (ret)
279                         goto err;
280         }
281
282         bkey_inode_generation_init(&delete.k_i);
283         delete.k.p = iter.pos;
284         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
285
286         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
287                 bch2_trans_commit(trans, NULL, NULL,
288                                 BTREE_INSERT_NOFAIL);
289 err:
290         bch2_trans_iter_exit(trans, &iter);
291         if (ret == -EINTR)
292                 goto retry;
293
294         return ret;
295 }
296
297 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
298 {
299         struct bch_fs *c = trans->c;
300         struct btree_iter iter;
301         struct bch_inode_unpacked dir_inode;
302         struct bch_hash_info dir_hash_info;
303         int ret;
304
305         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
306         if (ret)
307                 return ret;
308
309         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
310
311         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
312
313         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
314                                   &dir_hash_info, &iter, 0);
315         bch2_trans_iter_exit(trans, &iter);
316         return ret;
317 }
318
319 /* Get lost+found, create if it doesn't exist: */
320 static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
321                             struct bch_inode_unpacked *lostfound)
322 {
323         struct bch_fs *c = trans->c;
324         struct bch_inode_unpacked root;
325         struct bch_hash_info root_hash_info;
326         struct qstr lostfound_str = QSTR("lost+found");
327         subvol_inum root_inum = { .subvol = subvol };
328         u64 inum = 0;
329         unsigned d_type = 0;
330         u32 snapshot;
331         int ret;
332
333         ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
334         if (ret)
335                 return ret;
336
337         ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
338         if (ret)
339                 return ret;
340
341         root_hash_info = bch2_hash_info_init(c, &root);
342
343         ret = __lookup_dirent(trans, root_hash_info, root_inum,
344                             &lostfound_str, &inum, &d_type);
345         if (ret == -ENOENT) {
346                 bch_notice(c, "creating lost+found");
347                 goto create_lostfound;
348         }
349
350         if (ret && ret != -EINTR)
351                 bch_err(c, "error looking up lost+found: %i", ret);
352         if (ret)
353                 return ret;
354
355         if (d_type != DT_DIR) {
356                 bch_err(c, "error looking up lost+found: not a directory");
357                 return ret;
358         }
359
360         /*
361          * The check_dirents pass has already run, dangling dirents
362          * shouldn't exist here:
363          */
364         return __lookup_inode(trans, inum, lostfound, &snapshot);
365
366 create_lostfound:
367         bch2_inode_init_early(c, lostfound);
368
369         ret = bch2_create_trans(trans, root_inum, &root,
370                                 lostfound, &lostfound_str,
371                                 0, 0, S_IFDIR|0700, 0, NULL, NULL,
372                                 (subvol_inum) { }, 0);
373         if (ret && ret != -EINTR)
374                 bch_err(c, "error creating lost+found: %i", ret);
375         return ret;
376 }
377
378 static int __reattach_inode(struct btree_trans *trans,
379                           struct bch_inode_unpacked *inode,
380                           u32 inode_snapshot)
381 {
382         struct bch_hash_info dir_hash;
383         struct bch_inode_unpacked lostfound;
384         char name_buf[20];
385         struct qstr name;
386         u64 dir_offset = 0;
387         u32 subvol;
388         int ret;
389
390         ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
391         if (ret)
392                 return ret;
393
394         ret = lookup_lostfound(trans, subvol, &lostfound);
395         if (ret)
396                 return ret;
397
398         if (S_ISDIR(inode->bi_mode)) {
399                 lostfound.bi_nlink++;
400
401                 ret = __write_inode(trans, &lostfound, U32_MAX);
402                 if (ret)
403                         return ret;
404         }
405
406         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
407
408         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
409         name = (struct qstr) QSTR(name_buf);
410
411         ret = bch2_dirent_create(trans,
412                                  (subvol_inum) {
413                                         .subvol = subvol,
414                                         .inum = lostfound.bi_inum,
415                                  },
416                                  &dir_hash,
417                                  inode_d_type(inode),
418                                  &name, inode->bi_inum, &dir_offset,
419                                  BCH_HASH_SET_MUST_CREATE);
420         if (ret)
421                 return ret;
422
423         inode->bi_dir           = lostfound.bi_inum;
424         inode->bi_dir_offset    = dir_offset;
425
426         return __write_inode(trans, inode, inode_snapshot);
427 }
428
429 static int reattach_inode(struct btree_trans *trans,
430                           struct bch_inode_unpacked *inode,
431                           u32 inode_snapshot)
432 {
433         int ret = __bch2_trans_do(trans, NULL, NULL,
434                                   BTREE_INSERT_LAZY_RW|
435                                   BTREE_INSERT_NOFAIL,
436                         __reattach_inode(trans, inode, inode_snapshot));
437         if (ret) {
438                 bch_err(trans->c, "error %i reattaching inode %llu",
439                         ret, inode->bi_inum);
440                 return ret;
441         }
442
443         return ret;
444 }
445
446 static int remove_backpointer(struct btree_trans *trans,
447                               struct bch_inode_unpacked *inode)
448 {
449         struct btree_iter iter;
450         struct bkey_s_c k;
451         int ret;
452
453         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
454                              POS(inode->bi_dir, inode->bi_dir_offset), 0);
455         k = bch2_btree_iter_peek_slot(&iter);
456         ret = bkey_err(k);
457         if (ret)
458                 goto out;
459         if (k.k->type != KEY_TYPE_dirent) {
460                 ret = -ENOENT;
461                 goto out;
462         }
463
464         ret = __remove_dirent(trans, k.k->p);
465 out:
466         bch2_trans_iter_exit(trans, &iter);
467         return ret;
468 }
469
470 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
471 {
472         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
473
474         if (bkey_cmp(s->pos, pos))
475                 s->ids.nr = 0;
476         s->pos = pos;
477
478         /* Might get called multiple times due to lock restarts */
479         if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot)
480                 return 0;
481
482         return snapshots_seen_add(c, s, pos.snapshot);
483 }
484
485 /**
486  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
487  * and @ancestor hasn't been overwritten in @seen
488  *
489  * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
490  */
491 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
492                                     u32 id, u32 ancestor)
493 {
494         ssize_t i;
495
496         BUG_ON(id > ancestor);
497
498         id              = snapshot_t(c, id)->equiv;
499         ancestor        = snapshot_t(c, ancestor)->equiv;
500
501         /* @ancestor should be the snapshot most recently added to @seen */
502         BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor);
503         BUG_ON(seen->pos.snapshot != ancestor);
504
505         if (id == ancestor)
506                 return true;
507
508         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
509                 return false;
510
511         for (i = seen->ids.nr - 2;
512              i >= 0 && seen->ids.data[i] >= id;
513              --i)
514                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) &&
515                     bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor))
516                         return false;
517
518         return true;
519 }
520
521 /**
522  * ref_visible - given a key with snapshot id @src that points to a key with
523  * snapshot id @dst, test whether there is some snapshot in which @dst is
524  * visible.
525  *
526  * This assumes we're visiting @src keys in natural key order.
527  *
528  * @s   - list of snapshot IDs already seen at @src
529  * @src - snapshot ID of src key
530  * @dst - snapshot ID of dst key
531  */
532 static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
533                        u32 src, u32 dst)
534 {
535         return dst <= src
536                 ? key_visible_in_snapshot(c, s, dst, src)
537                 : bch2_snapshot_is_ancestor(c, src, dst);
538 }
539
540 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)       \
541         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\
542                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
543
544 struct inode_walker_entry {
545         struct bch_inode_unpacked inode;
546         u32                     snapshot;
547         u64                     count;
548 };
549
550 struct inode_walker {
551         bool                            first_this_inode;
552         u64                             cur_inum;
553
554         DARRAY(struct inode_walker_entry) inodes;
555 };
556
557 static void inode_walker_exit(struct inode_walker *w)
558 {
559         darray_exit(w->inodes);
560 }
561
562 static struct inode_walker inode_walker_init(void)
563 {
564         return (struct inode_walker) { 0, };
565 }
566
567 static int add_inode(struct bch_fs *c, struct inode_walker *w,
568                      struct bkey_s_c inode)
569 {
570         struct bch_inode_unpacked u;
571
572         BUG_ON(bch2_inode_unpack(inode, &u));
573
574         return darray_push(w->inodes, ((struct inode_walker_entry) {
575                 .inode          = u,
576                 .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
577         }));
578 }
579
580 static int __walk_inode(struct btree_trans *trans,
581                         struct inode_walker *w, struct bpos pos)
582 {
583         struct bch_fs *c = trans->c;
584         struct btree_iter iter;
585         struct bkey_s_c k;
586         unsigned i, ancestor_pos;
587         int ret;
588
589         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
590
591         if (pos.inode == w->cur_inum) {
592                 w->first_this_inode = false;
593                 goto lookup_snapshot;
594         }
595
596         w->inodes.nr = 0;
597
598         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
599                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
600                 if (k.k->p.offset != pos.inode)
601                         break;
602
603                 if (bkey_is_inode(k.k))
604                         add_inode(c, w, k);
605         }
606         bch2_trans_iter_exit(trans, &iter);
607
608         if (ret)
609                 return ret;
610
611         w->cur_inum             = pos.inode;
612         w->first_this_inode     = true;
613 lookup_snapshot:
614         for (i = 0; i < w->inodes.nr; i++)
615                 if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot))
616                         goto found;
617         return INT_MAX;
618 found:
619         BUG_ON(pos.snapshot > w->inodes.data[i].snapshot);
620
621         if (pos.snapshot != w->inodes.data[i].snapshot) {
622                 ancestor_pos = i;
623
624                 while (i && w->inodes.data[i - 1].snapshot > pos.snapshot)
625                         --i;
626
627                 ret = darray_insert_item(w->inodes, i, w->inodes.data[ancestor_pos]);
628                 if (ret)
629                         return ret;
630
631                 w->inodes.data[i].snapshot = pos.snapshot;
632                 w->inodes.data[i].count = 0;
633         }
634
635         return i;
636 }
637
638 static int __get_visible_inodes(struct btree_trans *trans,
639                                 struct inode_walker *w,
640                                 struct snapshots_seen *s,
641                                 u64 inum)
642 {
643         struct bch_fs *c = trans->c;
644         struct btree_iter iter;
645         struct bkey_s_c k;
646         int ret;
647
648         w->inodes.nr = 0;
649
650         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
651                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
652                 if (k.k->p.offset != inum)
653                         break;
654
655                 if (!bkey_is_inode(k.k))
656                         continue;
657
658                 if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
659                         add_inode(c, w, k);
660                         if (k.k->p.snapshot >= s->pos.snapshot)
661                                 break;
662                 }
663         }
664         bch2_trans_iter_exit(trans, &iter);
665
666         return ret;
667 }
668
669 static int check_key_has_snapshot(struct btree_trans *trans,
670                                   struct btree_iter *iter,
671                                   struct bkey_s_c k)
672 {
673         struct bch_fs *c = trans->c;
674         struct printbuf buf = PRINTBUF;
675         int ret = 0;
676
677         if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
678                         "key in missing snapshot: %s",
679                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
680                 ret = bch2_btree_delete_at(trans, iter,
681                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
682 fsck_err:
683         printbuf_exit(&buf);
684         return ret;
685 }
686
687 static int hash_redo_key(struct btree_trans *trans,
688                          const struct bch_hash_desc desc,
689                          struct bch_hash_info *hash_info,
690                          struct btree_iter *k_iter, struct bkey_s_c k)
691 {
692         bch_err(trans->c, "hash_redo_key() not implemented yet");
693         return -EINVAL;
694 #if 0
695         struct bkey_i *delete;
696         struct bkey_i *tmp;
697
698         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
699         if (IS_ERR(delete))
700                 return PTR_ERR(delete);
701
702         tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
703         if (IS_ERR(tmp))
704                 return PTR_ERR(tmp);
705
706         bkey_reassemble(tmp, k);
707
708         bkey_init(&delete->k);
709         delete->k.p = k_iter->pos;
710         return  bch2_btree_iter_traverse(k_iter) ?:
711                 bch2_trans_update(trans, k_iter, delete, 0) ?:
712                 bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
713 #endif
714 }
715
716 static int hash_check_key(struct btree_trans *trans,
717                           const struct bch_hash_desc desc,
718                           struct bch_hash_info *hash_info,
719                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
720 {
721         struct bch_fs *c = trans->c;
722         struct btree_iter iter = { NULL };
723         struct printbuf buf = PRINTBUF;
724         struct bkey_s_c k;
725         u64 hash;
726         int ret = 0;
727
728         if (hash_k.k->type != desc.key_type)
729                 return 0;
730
731         hash = desc.hash_bkey(hash_info, hash_k);
732
733         if (likely(hash == hash_k.k->p.offset))
734                 return 0;
735
736         if (hash_k.k->p.offset < hash)
737                 goto bad_hash;
738
739         for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
740                            BTREE_ITER_SLOTS, k, ret) {
741                 if (!bkey_cmp(k.k->p, hash_k.k->p))
742                         break;
743
744                 if (fsck_err_on(k.k->type == desc.key_type &&
745                                 !desc.cmp_bkey(k, hash_k), c,
746                                 "duplicate hash table keys:\n%s",
747                                 (printbuf_reset(&buf),
748                                  bch2_bkey_val_to_text(&buf, c, hash_k),
749                                  buf.buf))) {
750                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
751                         break;
752                 }
753
754                 if (bkey_deleted(k.k)) {
755                         bch2_trans_iter_exit(trans, &iter);
756                         goto bad_hash;
757                 }
758
759         }
760 out:
761         bch2_trans_iter_exit(trans, &iter);
762         printbuf_exit(&buf);
763         return ret;
764 bad_hash:
765         if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
766                      "hashed to %llu\n%s",
767                      desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
768                      (printbuf_reset(&buf),
769                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE)
770                 return 0;
771
772         ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
773         if (ret) {
774                 bch_err(c, "hash_redo_key err %i", ret);
775                 return ret;
776         }
777         ret = -EINTR;
778 fsck_err:
779         goto out;
780 }
781
782 static int check_inode(struct btree_trans *trans,
783                        struct btree_iter *iter,
784                        struct bch_inode_unpacked *prev,
785                        bool full)
786 {
787         struct bch_fs *c = trans->c;
788         struct bkey_s_c k;
789         struct bch_inode_unpacked u;
790         bool do_update = false;
791         int ret;
792
793         k = bch2_btree_iter_peek(iter);
794         if (!k.k)
795                 return 0;
796
797         ret = bkey_err(k);
798         if (ret)
799                 return ret;
800
801         ret = check_key_has_snapshot(trans, iter, k);
802         if (ret)
803                 return ret < 0 ? ret : 0;
804
805         /*
806          * if snapshot id isn't a leaf node, skip it - deletion in
807          * particular is not atomic, so on the internal snapshot nodes
808          * we can see inodes marked for deletion after a clean shutdown
809          */
810         if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
811                 return 0;
812
813         if (!bkey_is_inode(k.k))
814                 return 0;
815
816         BUG_ON(bch2_inode_unpack(k, &u));
817
818         if (!full &&
819             !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
820                             BCH_INODE_I_SECTORS_DIRTY|
821                             BCH_INODE_UNLINKED)))
822                 return 0;
823
824         if (prev->bi_inum != u.bi_inum)
825                 *prev = u;
826
827         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
828                         inode_d_type(prev)      != inode_d_type(&u), c,
829                         "inodes in different snapshots don't match")) {
830                 bch_err(c, "repair not implemented yet");
831                 return -EINVAL;
832         }
833
834         if (u.bi_flags & BCH_INODE_UNLINKED &&
835             (!c->sb.clean ||
836              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
837                       u.bi_inum))) {
838                 bch2_trans_unlock(trans);
839                 bch2_fs_lazy_rw(c);
840
841                 ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
842                 if (ret)
843                         bch_err(c, "error in fsck: error %i while deleting inode", ret);
844                 return ret;
845         }
846
847         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
848             (!c->sb.clean ||
849              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
850                       u.bi_inum))) {
851                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
852
853                 bch2_trans_unlock(trans);
854                 bch2_fs_lazy_rw(c);
855
856                 /*
857                  * XXX: need to truncate partial blocks too here - or ideally
858                  * just switch units to bytes and that issue goes away
859                  */
860                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
861                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
862                                      iter->pos.snapshot),
863                                 POS(u.bi_inum, U64_MAX),
864                                 0, NULL);
865                 if (ret) {
866                         bch_err(c, "error in fsck: error %i truncating inode", ret);
867                         return ret;
868                 }
869
870                 /*
871                  * We truncated without our normal sector accounting hook, just
872                  * make sure we recalculate it:
873                  */
874                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
875
876                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
877                 do_update = true;
878         }
879
880         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
881             (!c->sb.clean ||
882              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
883                       u.bi_inum))) {
884                 s64 sectors;
885
886                 bch_verbose(c, "recounting sectors for inode %llu",
887                             u.bi_inum);
888
889                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
890                 if (sectors < 0) {
891                         bch_err(c, "error in fsck: error %i recounting inode sectors",
892                                 (int) sectors);
893                         return sectors;
894                 }
895
896                 u.bi_sectors = sectors;
897                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
898                 do_update = true;
899         }
900
901         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
902                 u.bi_dir = 0;
903                 u.bi_dir_offset = 0;
904                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
905                 do_update = true;
906         }
907
908         if (do_update) {
909                 ret = write_inode(trans, &u, iter->pos.snapshot);
910                 if (ret)
911                         bch_err(c, "error in fsck: error %i "
912                                 "updating inode", ret);
913         }
914 fsck_err:
915         return ret;
916 }
917
918 noinline_for_stack
919 static int check_inodes(struct bch_fs *c, bool full)
920 {
921         struct btree_trans trans;
922         struct btree_iter iter;
923         struct bch_inode_unpacked prev = { 0 };
924         int ret;
925
926         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
927
928         bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
929                              BTREE_ITER_INTENT|
930                              BTREE_ITER_PREFETCH|
931                              BTREE_ITER_ALL_SNAPSHOTS);
932
933         do {
934                 ret = __bch2_trans_do(&trans, NULL, NULL,
935                                       BTREE_INSERT_LAZY_RW|
936                                       BTREE_INSERT_NOFAIL,
937                         check_inode(&trans, &iter, &prev, full));
938                 if (ret)
939                         break;
940         } while (bch2_btree_iter_advance(&iter));
941         bch2_trans_iter_exit(&trans, &iter);
942
943         bch2_trans_exit(&trans);
944         return ret;
945 }
946
947 static int check_subvol(struct btree_trans *trans,
948                         struct btree_iter *iter)
949 {
950         struct bkey_s_c k;
951         struct bkey_s_c_subvolume subvol;
952         int ret;
953
954         k = bch2_btree_iter_peek(iter);
955         if (!k.k)
956                 return 0;
957
958         ret = bkey_err(k);
959         if (ret)
960                 return ret;
961
962         if (k.k->type != KEY_TYPE_subvolume)
963                 return 0;
964
965         subvol = bkey_s_c_to_subvolume(k);
966
967         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
968                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
969                 if (ret && ret != -EINTR)
970                         bch_err(trans->c, "error deleting subvolume %llu: %i",
971                                 iter->pos.offset, ret);
972                 if (ret)
973                         return ret;
974         }
975
976         return 0;
977 }
978
979 noinline_for_stack
980 static int check_subvols(struct bch_fs *c)
981 {
982         struct btree_trans trans;
983         struct btree_iter iter;
984         int ret;
985
986         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
987
988         bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
989                              POS_MIN,
990                              BTREE_ITER_INTENT|
991                              BTREE_ITER_PREFETCH);
992
993         do {
994                 ret = __bch2_trans_do(&trans, NULL, NULL,
995                                       BTREE_INSERT_LAZY_RW|
996                                       BTREE_INSERT_NOFAIL,
997                                       check_subvol(&trans, &iter));
998                 if (ret)
999                         break;
1000         } while (bch2_btree_iter_advance(&iter));
1001         bch2_trans_iter_exit(&trans, &iter);
1002
1003         bch2_trans_exit(&trans);
1004         return ret;
1005 }
1006
1007 /*
1008  * Checking for overlapping extents needs to be reimplemented
1009  */
1010 #if 0
1011 static int fix_overlapping_extent(struct btree_trans *trans,
1012                                        struct bkey_s_c k, struct bpos cut_at)
1013 {
1014         struct btree_iter iter;
1015         struct bkey_i *u;
1016         int ret;
1017
1018         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1019         ret = PTR_ERR_OR_ZERO(u);
1020         if (ret)
1021                 return ret;
1022
1023         bkey_reassemble(u, k);
1024         bch2_cut_front(cut_at, u);
1025
1026
1027         /*
1028          * We don't want to go through the extent_handle_overwrites path:
1029          *
1030          * XXX: this is going to screw up disk accounting, extent triggers
1031          * assume things about extent overwrites - we should be running the
1032          * triggers manually here
1033          */
1034         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
1035                              BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
1036
1037         BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
1038         ret   = bch2_btree_iter_traverse(&iter) ?:
1039                 bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
1040                 bch2_trans_commit(trans, NULL, NULL,
1041                                   BTREE_INSERT_NOFAIL|
1042                                   BTREE_INSERT_LAZY_RW);
1043         bch2_trans_iter_exit(trans, &iter);
1044         return ret;
1045 }
1046 #endif
1047
1048 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
1049                                                 struct btree_iter *iter,
1050                                                 struct bpos pos)
1051 {
1052         struct bkey_s_c k;
1053         int ret;
1054
1055         bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
1056         k = bch2_btree_iter_peek_slot(iter);
1057         ret = bkey_err(k);
1058         if (!ret && k.k->type != KEY_TYPE_dirent)
1059                 ret = -ENOENT;
1060         if (ret) {
1061                 bch2_trans_iter_exit(trans, iter);
1062                 return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
1063         }
1064
1065         return bkey_s_c_to_dirent(k);
1066 }
1067
1068 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
1069                                    struct bkey_s_c_dirent d)
1070 {
1071         return  inode->bi_dir           == d.k->p.inode &&
1072                 inode->bi_dir_offset    == d.k->p.offset;
1073 }
1074
1075 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
1076                                    struct bch_inode_unpacked *inode)
1077 {
1078         return d.v->d_type == DT_SUBVOL
1079                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
1080                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
1081 }
1082
1083 static int inode_backpointer_exists(struct btree_trans *trans,
1084                                     struct bch_inode_unpacked *inode,
1085                                     u32 snapshot)
1086 {
1087         struct btree_iter iter;
1088         struct bkey_s_c_dirent d;
1089         int ret;
1090
1091         d = dirent_get_by_pos(trans, &iter,
1092                         SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
1093         ret = bkey_err(d.s_c);
1094         if (ret)
1095                 return ret;
1096
1097         ret = dirent_points_to_inode(d, inode);
1098         bch2_trans_iter_exit(trans, &iter);
1099         return ret;
1100 }
1101
1102 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1103 {
1104         struct bch_fs *c = trans->c;
1105         struct inode_walker_entry *i;
1106         int ret = 0, ret2 = 0;
1107         s64 count2;
1108
1109         darray_for_each(w->inodes, i) {
1110                 if (i->inode.bi_sectors == i->count)
1111                         continue;
1112
1113                 count2 = lockrestart_do(trans,
1114                         bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
1115
1116                 if (i->count != count2) {
1117                         bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
1118                                 i->count, count2);
1119                         i->count = count2;
1120                         if (i->inode.bi_sectors == i->count)
1121                                 continue;
1122                 }
1123
1124                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
1125                             "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1126                             w->cur_inum, i->snapshot,
1127                             i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
1128                         continue;
1129
1130                 i->inode.bi_sectors = i->count;
1131                 ret = write_inode(trans, &i->inode, i->snapshot);
1132                 if (ret)
1133                         break;
1134                 ret2 = -EINTR;
1135         }
1136 fsck_err:
1137         return ret ?: ret2;
1138 }
1139
1140 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1141                         struct inode_walker *inode,
1142                         struct snapshots_seen *s)
1143 {
1144         struct bch_fs *c = trans->c;
1145         struct bkey_s_c k;
1146         struct inode_walker_entry *i;
1147         struct printbuf buf = PRINTBUF;
1148         int ret = 0;
1149
1150         k = bch2_btree_iter_peek(iter);
1151         if (!k.k)
1152                 goto out;
1153
1154         ret = bkey_err(k);
1155         if (ret)
1156                 goto err;
1157
1158         ret = check_key_has_snapshot(trans, iter, k);
1159         if (ret) {
1160                 ret = ret < 0 ? ret : 0;
1161                 goto out;
1162         }
1163
1164         ret = snapshots_seen_update(c, s, k.k->p);
1165         if (ret)
1166                 goto err;
1167
1168         if (k.k->type == KEY_TYPE_whiteout)
1169                 goto out;
1170
1171         if (inode->cur_inum != k.k->p.inode) {
1172                 ret = check_i_sectors(trans, inode);
1173                 if (ret)
1174                         goto err;
1175         }
1176 #if 0
1177         if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
1178                 char buf1[200];
1179                 char buf2[200];
1180
1181                 bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
1182                 bch2_bkey_val_to_text(&PBUF(buf2), c, k);
1183
1184                 if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
1185                         ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
1186                         goto out;
1187                 }
1188         }
1189 #endif
1190         ret = __walk_inode(trans, inode, k.k->p);
1191         if (ret < 0)
1192                 goto err;
1193
1194         if (fsck_err_on(ret == INT_MAX, c,
1195                         "extent in missing inode:\n  %s",
1196                         (printbuf_reset(&buf),
1197                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1198                 ret = bch2_btree_delete_at(trans, iter,
1199                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1200                 goto out;
1201         }
1202
1203         if (ret == INT_MAX) {
1204                 ret = 0;
1205                 goto out;
1206         }
1207
1208         i = inode->inodes.data + ret;
1209         ret = 0;
1210
1211         if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
1212                         !S_ISLNK(i->inode.bi_mode), c,
1213                         "extent in non regular inode mode %o:\n  %s",
1214                         i->inode.bi_mode,
1215                         (printbuf_reset(&buf),
1216                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1217                 ret = bch2_btree_delete_at(trans, iter,
1218                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1219                 goto out;
1220         }
1221
1222         if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
1223                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
1224                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
1225                                         k.k->type != KEY_TYPE_reservation &&
1226                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
1227                                         "extent type %u offset %llu past end of inode %llu, i_size %llu",
1228                                         k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
1229                                 bch2_fs_lazy_rw(c);
1230                                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1231                                                 SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
1232                                                      k.k->p.snapshot),
1233                                                 POS(k.k->p.inode, U64_MAX),
1234                                                 0, NULL) ?: -EINTR;
1235                                 goto out;
1236                         }
1237                 }
1238         }
1239
1240         if (bkey_extent_is_allocation(k.k))
1241                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
1242                         i->count += k.k->size;
1243 #if 0
1244         bch2_bkey_buf_reassemble(&prev, c, k);
1245 #endif
1246
1247 out:
1248 err:
1249 fsck_err:
1250         printbuf_exit(&buf);
1251         return ret;
1252 }
1253
1254 /*
1255  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1256  * that i_size an i_sectors are consistent
1257  */
1258 noinline_for_stack
1259 static int check_extents(struct bch_fs *c)
1260 {
1261         struct inode_walker w = inode_walker_init();
1262         struct snapshots_seen s;
1263         struct btree_trans trans;
1264         struct btree_iter iter;
1265         int ret = 0;
1266
1267 #if 0
1268         struct bkey_buf prev;
1269         bch2_bkey_buf_init(&prev);
1270         prev.k->k = KEY(0, 0, 0);
1271 #endif
1272         snapshots_seen_init(&s);
1273         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1274
1275         bch_verbose(c, "checking extents");
1276
1277         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
1278                              POS(BCACHEFS_ROOT_INO, 0),
1279                              BTREE_ITER_INTENT|
1280                              BTREE_ITER_PREFETCH|
1281                              BTREE_ITER_ALL_SNAPSHOTS);
1282
1283         do {
1284                 ret = __bch2_trans_do(&trans, NULL, NULL,
1285                                       BTREE_INSERT_LAZY_RW|
1286                                       BTREE_INSERT_NOFAIL,
1287                         check_extent(&trans, &iter, &w, &s));
1288                 if (ret)
1289                         break;
1290         } while (bch2_btree_iter_advance(&iter));
1291         bch2_trans_iter_exit(&trans, &iter);
1292 #if 0
1293         bch2_bkey_buf_exit(&prev, c);
1294 #endif
1295         inode_walker_exit(&w);
1296         bch2_trans_exit(&trans);
1297         snapshots_seen_exit(&s);
1298
1299         return ret;
1300 }
1301
1302 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1303 {
1304         struct bch_fs *c = trans->c;
1305         struct inode_walker_entry *i;
1306         int ret = 0, ret2 = 0;
1307         s64 count2;
1308
1309         darray_for_each(w->inodes, i) {
1310                 if (i->inode.bi_nlink == i->count)
1311                         continue;
1312
1313                 count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
1314                 if (count2 < 0)
1315                         return count2;
1316
1317                 if (i->count != count2) {
1318                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1319                                 i->count, count2);
1320                         i->count = count2;
1321                         if (i->inode.bi_nlink == i->count)
1322                                 continue;
1323                 }
1324
1325                 if (fsck_err_on(i->inode.bi_nlink != i->count, c,
1326                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1327                                 w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
1328                         i->inode.bi_nlink = i->count;
1329                         ret = write_inode(trans, &i->inode, i->snapshot);
1330                         if (ret)
1331                                 break;
1332                         ret2 = -EINTR;
1333                 }
1334         }
1335 fsck_err:
1336         return ret ?: ret2;
1337 }
1338
1339 static int check_dirent_target(struct btree_trans *trans,
1340                                struct btree_iter *iter,
1341                                struct bkey_s_c_dirent d,
1342                                struct bch_inode_unpacked *target,
1343                                u32 target_snapshot)
1344 {
1345         struct bch_fs *c = trans->c;
1346         struct bkey_i_dirent *n;
1347         bool backpointer_exists = true;
1348         struct printbuf buf = PRINTBUF;
1349         int ret = 0;
1350
1351         if (!target->bi_dir &&
1352             !target->bi_dir_offset) {
1353                 target->bi_dir          = d.k->p.inode;
1354                 target->bi_dir_offset   = d.k->p.offset;
1355
1356                 ret = __write_inode(trans, target, target_snapshot);
1357                 if (ret)
1358                         goto err;
1359         }
1360
1361         if (!inode_points_to_dirent(target, d)) {
1362                 ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
1363                 if (ret < 0)
1364                         goto err;
1365
1366                 backpointer_exists = ret;
1367                 ret = 0;
1368
1369                 if (fsck_err_on(S_ISDIR(target->bi_mode) &&
1370                                 backpointer_exists, c,
1371                                 "directory %llu with multiple links",
1372                                 target->bi_inum)) {
1373                         ret = __remove_dirent(trans, d.k->p);
1374                         goto out;
1375                 }
1376
1377                 if (fsck_err_on(backpointer_exists &&
1378                                 !target->bi_nlink, c,
1379                                 "inode %llu has multiple links but i_nlink 0",
1380                                 target->bi_inum)) {
1381                         target->bi_nlink++;
1382                         target->bi_flags &= ~BCH_INODE_UNLINKED;
1383
1384                         ret = __write_inode(trans, target, target_snapshot);
1385                         if (ret)
1386                                 goto err;
1387                 }
1388
1389                 if (fsck_err_on(!backpointer_exists, c,
1390                                 "inode %llu:%u has wrong backpointer:\n"
1391                                 "got       %llu:%llu\n"
1392                                 "should be %llu:%llu",
1393                                 target->bi_inum, target_snapshot,
1394                                 target->bi_dir,
1395                                 target->bi_dir_offset,
1396                                 d.k->p.inode,
1397                                 d.k->p.offset)) {
1398                         target->bi_dir          = d.k->p.inode;
1399                         target->bi_dir_offset   = d.k->p.offset;
1400
1401                         ret = __write_inode(trans, target, target_snapshot);
1402                         if (ret)
1403                                 goto err;
1404                 }
1405         }
1406
1407         if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
1408                         "incorrect d_type: got %s, should be %s:\n%s",
1409                         bch2_d_type_str(d.v->d_type),
1410                         bch2_d_type_str(inode_d_type(target)),
1411                         (printbuf_reset(&buf),
1412                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1413                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1414                 ret = PTR_ERR_OR_ZERO(n);
1415                 if (ret)
1416                         goto err;
1417
1418                 bkey_reassemble(&n->k_i, d.s_c);
1419                 n->v.d_type = inode_d_type(target);
1420
1421                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1422                 if (ret)
1423                         goto err;
1424
1425                 d = dirent_i_to_s_c(n);
1426         }
1427
1428         if (d.v->d_type == DT_SUBVOL &&
1429             target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
1430             (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
1431              fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
1432                       le32_to_cpu(d.v->d_parent_subvol),
1433                       target->bi_parent_subvol))) {
1434                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1435                 ret = PTR_ERR_OR_ZERO(n);
1436                 if (ret)
1437                         goto err;
1438
1439                 bkey_reassemble(&n->k_i, d.s_c);
1440                 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1441
1442                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1443                 if (ret)
1444                         goto err;
1445
1446                 d = dirent_i_to_s_c(n);
1447         }
1448 out:
1449 err:
1450 fsck_err:
1451         printbuf_exit(&buf);
1452         return ret;
1453 }
1454
1455 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1456                         struct bch_hash_info *hash_info,
1457                         struct inode_walker *dir,
1458                         struct inode_walker *target,
1459                         struct snapshots_seen *s)
1460 {
1461         struct bch_fs *c = trans->c;
1462         struct bkey_s_c k;
1463         struct bkey_s_c_dirent d;
1464         struct inode_walker_entry *i;
1465         struct printbuf buf = PRINTBUF;
1466         int ret = 0;
1467
1468         k = bch2_btree_iter_peek(iter);
1469         if (!k.k)
1470                 goto out;
1471
1472         ret = bkey_err(k);
1473         if (ret)
1474                 goto err;
1475
1476         ret = check_key_has_snapshot(trans, iter, k);
1477         if (ret) {
1478                 ret = ret < 0 ? ret : 0;
1479                 goto out;
1480         }
1481
1482         ret = snapshots_seen_update(c, s, k.k->p);
1483         if (ret)
1484                 goto err;
1485
1486         if (k.k->type == KEY_TYPE_whiteout)
1487                 goto out;
1488
1489         if (dir->cur_inum != k.k->p.inode) {
1490                 ret = check_subdir_count(trans, dir);
1491                 if (ret)
1492                         goto err;
1493         }
1494
1495         ret = __walk_inode(trans, dir, k.k->p);
1496         if (ret < 0)
1497                 goto err;
1498
1499         if (fsck_err_on(ret == INT_MAX, c,
1500                         "dirent in nonexisting directory:\n%s",
1501                         (printbuf_reset(&buf),
1502                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1503                 ret = bch2_btree_delete_at(trans, iter,
1504                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1505                 goto out;
1506         }
1507
1508         if (ret == INT_MAX) {
1509                 ret = 0;
1510                 goto out;
1511         }
1512
1513         i = dir->inodes.data + ret;
1514         ret = 0;
1515
1516         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
1517                         "dirent in non directory inode type %s:\n%s",
1518                         bch2_d_type_str(inode_d_type(&i->inode)),
1519                         (printbuf_reset(&buf),
1520                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1521                 ret = bch2_btree_delete_at(trans, iter, 0);
1522                 goto out;
1523         }
1524
1525         if (dir->first_this_inode)
1526                 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
1527
1528         ret = hash_check_key(trans, bch2_dirent_hash_desc,
1529                              hash_info, iter, k);
1530         if (ret < 0)
1531                 goto err;
1532         if (ret) {
1533                 /* dirent has been deleted */
1534                 ret = 0;
1535                 goto out;
1536         }
1537
1538         if (k.k->type != KEY_TYPE_dirent)
1539                 goto out;
1540
1541         d = bkey_s_c_to_dirent(k);
1542
1543         if (d.v->d_type == DT_SUBVOL) {
1544                 struct bch_inode_unpacked subvol_root;
1545                 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1546                 u32 target_snapshot;
1547                 u64 target_inum;
1548
1549                 ret = __subvol_lookup(trans, target_subvol,
1550                                       &target_snapshot, &target_inum);
1551                 if (ret && ret != -ENOENT)
1552                         goto err;
1553
1554                 if (fsck_err_on(ret, c,
1555                                 "dirent points to missing subvolume %llu",
1556                                 le64_to_cpu(d.v->d_child_subvol))) {
1557                         ret = __remove_dirent(trans, d.k->p);
1558                         goto err;
1559                 }
1560
1561                 ret = __lookup_inode(trans, target_inum,
1562                                    &subvol_root, &target_snapshot);
1563                 if (ret && ret != -ENOENT)
1564                         goto err;
1565
1566                 if (fsck_err_on(ret, c,
1567                                 "subvolume %u points to missing subvolume root %llu",
1568                                 target_subvol,
1569                                 target_inum)) {
1570                         bch_err(c, "repair not implemented yet");
1571                         ret = -EINVAL;
1572                         goto err;
1573                 }
1574
1575                 if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
1576                                 "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1577                                 target_inum,
1578                                 subvol_root.bi_subvol, target_subvol)) {
1579                         subvol_root.bi_subvol = target_subvol;
1580                         ret = __write_inode(trans, &subvol_root, target_snapshot);
1581                         if (ret)
1582                                 goto err;
1583                 }
1584
1585                 ret = check_dirent_target(trans, iter, d, &subvol_root,
1586                                           target_snapshot);
1587                 if (ret)
1588                         goto err;
1589         } else {
1590                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1591                 if (ret)
1592                         goto err;
1593
1594                 if (fsck_err_on(!target->inodes.nr, c,
1595                                 "dirent points to missing inode:\n%s",
1596                                 (printbuf_reset(&buf),
1597                                  bch2_bkey_val_to_text(&buf, c, k),
1598                                  buf.buf))) {
1599                         ret = __remove_dirent(trans, d.k->p);
1600                         if (ret)
1601                                 goto err;
1602                 }
1603
1604                 darray_for_each(target->inodes, i) {
1605                         ret = check_dirent_target(trans, iter, d,
1606                                                   &i->inode, i->snapshot);
1607                         if (ret)
1608                                 goto err;
1609                 }
1610         }
1611
1612         if (d.v->d_type == DT_DIR)
1613                 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
1614                         i->count++;
1615
1616 out:
1617 err:
1618 fsck_err:
1619         printbuf_exit(&buf);
1620         return ret;
1621 }
1622
1623 /*
1624  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1625  * validate d_type
1626  */
1627 noinline_for_stack
1628 static int check_dirents(struct bch_fs *c)
1629 {
1630         struct inode_walker dir = inode_walker_init();
1631         struct inode_walker target = inode_walker_init();
1632         struct snapshots_seen s;
1633         struct bch_hash_info hash_info;
1634         struct btree_trans trans;
1635         struct btree_iter iter;
1636         int ret = 0;
1637
1638         bch_verbose(c, "checking dirents");
1639
1640         snapshots_seen_init(&s);
1641         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1642
1643         bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
1644                              POS(BCACHEFS_ROOT_INO, 0),
1645                              BTREE_ITER_INTENT|
1646                              BTREE_ITER_PREFETCH|
1647                              BTREE_ITER_ALL_SNAPSHOTS);
1648
1649         do {
1650                 ret = __bch2_trans_do(&trans, NULL, NULL,
1651                                       BTREE_INSERT_LAZY_RW|
1652                                       BTREE_INSERT_NOFAIL,
1653                         check_dirent(&trans, &iter, &hash_info,
1654                                      &dir, &target, &s));
1655                 if (ret)
1656                         break;
1657         } while (bch2_btree_iter_advance(&iter));
1658         bch2_trans_iter_exit(&trans, &iter);
1659
1660         bch2_trans_exit(&trans);
1661         snapshots_seen_exit(&s);
1662         inode_walker_exit(&dir);
1663         inode_walker_exit(&target);
1664         return ret;
1665 }
1666
1667 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
1668                        struct bch_hash_info *hash_info,
1669                        struct inode_walker *inode)
1670 {
1671         struct bch_fs *c = trans->c;
1672         struct bkey_s_c k;
1673         int ret;
1674
1675         k = bch2_btree_iter_peek(iter);
1676         if (!k.k)
1677                 return 0;
1678
1679         ret = bkey_err(k);
1680         if (ret)
1681                 return ret;
1682
1683         ret = check_key_has_snapshot(trans, iter, k);
1684         if (ret)
1685                 return ret;
1686
1687         ret = __walk_inode(trans, inode, k.k->p);
1688         if (ret < 0)
1689                 return ret;
1690
1691         if (fsck_err_on(ret == INT_MAX, c,
1692                         "xattr for missing inode %llu",
1693                         k.k->p.inode))
1694                 return bch2_btree_delete_at(trans, iter, 0);
1695
1696         if (ret == INT_MAX)
1697                 return 0;
1698
1699         ret = 0;
1700
1701         if (inode->first_this_inode)
1702                 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
1703
1704         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
1705 fsck_err:
1706         return ret;
1707 }
1708
1709 /*
1710  * Walk xattrs: verify that they all have a corresponding inode
1711  */
1712 noinline_for_stack
1713 static int check_xattrs(struct bch_fs *c)
1714 {
1715         struct inode_walker inode = inode_walker_init();
1716         struct bch_hash_info hash_info;
1717         struct btree_trans trans;
1718         struct btree_iter iter;
1719         int ret = 0;
1720
1721         bch_verbose(c, "checking xattrs");
1722
1723         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1724
1725         bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
1726                              POS(BCACHEFS_ROOT_INO, 0),
1727                              BTREE_ITER_INTENT|
1728                              BTREE_ITER_PREFETCH|
1729                              BTREE_ITER_ALL_SNAPSHOTS);
1730
1731         do {
1732                 ret = __bch2_trans_do(&trans, NULL, NULL,
1733                                       BTREE_INSERT_LAZY_RW|
1734                                       BTREE_INSERT_NOFAIL,
1735                                       check_xattr(&trans, &iter, &hash_info,
1736                                                   &inode));
1737                 if (ret)
1738                         break;
1739         } while (bch2_btree_iter_advance(&iter));
1740         bch2_trans_iter_exit(&trans, &iter);
1741
1742         bch2_trans_exit(&trans);
1743         return ret;
1744 }
1745
1746 static int check_root_trans(struct btree_trans *trans)
1747 {
1748         struct bch_fs *c = trans->c;
1749         struct bch_inode_unpacked root_inode;
1750         u32 snapshot;
1751         u64 inum;
1752         int ret;
1753
1754         ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1755         if (ret && ret != -ENOENT)
1756                 return ret;
1757
1758         if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
1759                 struct bkey_i_subvolume root_subvol;
1760
1761                 snapshot        = U32_MAX;
1762                 inum            = BCACHEFS_ROOT_INO;
1763
1764                 bkey_subvolume_init(&root_subvol.k_i);
1765                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1766                 root_subvol.v.flags     = 0;
1767                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1768                 root_subvol.v.inode     = cpu_to_le64(inum);
1769                 ret = __bch2_trans_do(trans, NULL, NULL,
1770                                       BTREE_INSERT_NOFAIL|
1771                                       BTREE_INSERT_LAZY_RW,
1772                         __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
1773                 if (ret) {
1774                         bch_err(c, "error writing root subvol: %i", ret);
1775                         goto err;
1776                 }
1777
1778         }
1779
1780         ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1781         if (ret && ret != -ENOENT)
1782                 return ret;
1783
1784         if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
1785             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
1786                                 "root inode not a directory")) {
1787                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1788                                 0, NULL);
1789                 root_inode.bi_inum = inum;
1790
1791                 ret = __write_inode(trans, &root_inode, snapshot);
1792                 if (ret)
1793                         bch_err(c, "error writing root inode: %i", ret);
1794         }
1795 err:
1796 fsck_err:
1797         return ret;
1798 }
1799
1800 /* Get root directory, create if it doesn't exist: */
1801 noinline_for_stack
1802 static int check_root(struct bch_fs *c)
1803 {
1804         bch_verbose(c, "checking root directory");
1805
1806         return bch2_trans_do(c, NULL, NULL,
1807                              BTREE_INSERT_NOFAIL|
1808                              BTREE_INSERT_LAZY_RW,
1809                 check_root_trans(&trans));
1810 }
1811
1812 struct pathbuf_entry {
1813         u64     inum;
1814         u32     snapshot;
1815 };
1816
1817 typedef DARRAY(struct pathbuf_entry) pathbuf;
1818
1819 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
1820 {
1821         struct pathbuf_entry *i;
1822
1823         darray_for_each(*p, i)
1824                 if (i->inum     == inum &&
1825                     i->snapshot == snapshot)
1826                         return true;
1827
1828         return false;
1829 }
1830
1831 static int path_down(struct bch_fs *c, pathbuf *p,
1832                      u64 inum, u32 snapshot)
1833 {
1834         int ret = darray_push(*p, ((struct pathbuf_entry) {
1835                 .inum           = inum,
1836                 .snapshot       = snapshot,
1837         }));
1838
1839         if (ret)
1840                 bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
1841                         p->size);
1842         return ret;
1843 }
1844
1845 /*
1846  * Check that a given inode is reachable from the root:
1847  *
1848  * XXX: we should also be verifying that inodes are in the right subvolumes
1849  */
1850 static int check_path(struct btree_trans *trans,
1851                       pathbuf *p,
1852                       struct bch_inode_unpacked *inode,
1853                       u32 snapshot)
1854 {
1855         struct bch_fs *c = trans->c;
1856         int ret = 0;
1857
1858         snapshot = snapshot_t(c, snapshot)->equiv;
1859         p->nr = 0;
1860
1861         while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
1862                  inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
1863                 struct btree_iter dirent_iter;
1864                 struct bkey_s_c_dirent d;
1865                 u32 parent_snapshot = snapshot;
1866
1867                 if (inode->bi_subvol) {
1868                         u64 inum;
1869
1870                         ret = subvol_lookup(trans, inode->bi_parent_subvol,
1871                                             &parent_snapshot, &inum);
1872                         if (ret)
1873                                 break;
1874                 }
1875
1876                 ret = lockrestart_do(trans,
1877                         PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
1878                                           SPOS(inode->bi_dir, inode->bi_dir_offset,
1879                                                parent_snapshot))).k));
1880                 if (ret && ret != -ENOENT)
1881                         break;
1882
1883                 if (!ret && !dirent_points_to_inode(d, inode)) {
1884                         bch2_trans_iter_exit(trans, &dirent_iter);
1885                         ret = -ENOENT;
1886                 }
1887
1888                 if (ret == -ENOENT) {
1889                         if (fsck_err(c,  "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
1890                                      inode->bi_inum, snapshot,
1891                                      bch2_d_type_str(inode_d_type(inode)),
1892                                      inode->bi_nlink,
1893                                      inode->bi_dir,
1894                                      inode->bi_dir_offset))
1895                                 ret = reattach_inode(trans, inode, snapshot);
1896                         break;
1897                 }
1898
1899                 bch2_trans_iter_exit(trans, &dirent_iter);
1900
1901                 if (!S_ISDIR(inode->bi_mode))
1902                         break;
1903
1904                 ret = path_down(c, p, inode->bi_inum, snapshot);
1905                 if (ret) {
1906                         bch_err(c, "memory allocation failure");
1907                         return ret;
1908                 }
1909
1910                 snapshot = parent_snapshot;
1911
1912                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
1913                 if (ret) {
1914                         /* Should have been caught in dirents pass */
1915                         bch_err(c, "error looking up parent directory: %i", ret);
1916                         break;
1917                 }
1918
1919                 if (path_is_dup(p, inode->bi_inum, snapshot)) {
1920                         struct pathbuf_entry *i;
1921
1922                         /* XXX print path */
1923                         bch_err(c, "directory structure loop");
1924
1925                         darray_for_each(*p, i)
1926                                 pr_err("%llu:%u", i->inum, i->snapshot);
1927                         pr_err("%llu:%u", inode->bi_inum, snapshot);
1928
1929                         if (!fsck_err(c, "directory structure loop"))
1930                                 return 0;
1931
1932                         ret = __bch2_trans_do(trans, NULL, NULL,
1933                                               BTREE_INSERT_NOFAIL|
1934                                               BTREE_INSERT_LAZY_RW,
1935                                         remove_backpointer(trans, inode));
1936                         if (ret) {
1937                                 bch_err(c, "error removing dirent: %i", ret);
1938                                 break;
1939                         }
1940
1941                         ret = reattach_inode(trans, inode, snapshot);
1942                 }
1943         }
1944 fsck_err:
1945         if (ret)
1946                 bch_err(c, "%s: err %i", __func__, ret);
1947         return ret;
1948 }
1949
1950 /*
1951  * Check for unreachable inodes, as well as loops in the directory structure:
1952  * After check_dirents(), if an inode backpointer doesn't exist that means it's
1953  * unreachable:
1954  */
1955 noinline_for_stack
1956 static int check_directory_structure(struct bch_fs *c)
1957 {
1958         struct btree_trans trans;
1959         struct btree_iter iter;
1960         struct bkey_s_c k;
1961         struct bch_inode_unpacked u;
1962         pathbuf path = { 0, };
1963         int ret;
1964
1965         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1966
1967         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
1968                            BTREE_ITER_INTENT|
1969                            BTREE_ITER_PREFETCH|
1970                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
1971                 if (!bkey_is_inode(k.k))
1972                         continue;
1973
1974                 ret = bch2_inode_unpack(k, &u);
1975                 if (ret) {
1976                         /* Should have been caught earlier in fsck: */
1977                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
1978                         break;
1979                 }
1980
1981                 if (u.bi_flags & BCH_INODE_UNLINKED)
1982                         continue;
1983
1984                 ret = check_path(&trans, &path, &u, iter.pos.snapshot);
1985                 if (ret)
1986                         break;
1987         }
1988         bch2_trans_iter_exit(&trans, &iter);
1989
1990         BUG_ON(ret == -EINTR);
1991
1992         darray_exit(path);
1993
1994         bch2_trans_exit(&trans);
1995         return ret;
1996 }
1997
1998 struct nlink_table {
1999         size_t          nr;
2000         size_t          size;
2001
2002         struct nlink {
2003                 u64     inum;
2004                 u32     snapshot;
2005                 u32     count;
2006         }               *d;
2007 };
2008
2009 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2010                      u64 inum, u32 snapshot)
2011 {
2012         if (t->nr == t->size) {
2013                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2014                 void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
2015                 if (!d) {
2016                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2017                                 new_size);
2018                         return -ENOMEM;
2019                 }
2020
2021                 if (t->d)
2022                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2023                 kvfree(t->d);
2024
2025                 t->d = d;
2026                 t->size = new_size;
2027         }
2028
2029
2030         t->d[t->nr++] = (struct nlink) {
2031                 .inum           = inum,
2032                 .snapshot       = snapshot,
2033         };
2034
2035         return 0;
2036 }
2037
2038 static int nlink_cmp(const void *_l, const void *_r)
2039 {
2040         const struct nlink *l = _l;
2041         const struct nlink *r = _r;
2042
2043         return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
2044 }
2045
2046 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2047                      struct nlink_table *links,
2048                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2049 {
2050         struct nlink *link, key = {
2051                 .inum = inum, .snapshot = U32_MAX,
2052         };
2053
2054         if (inum < range_start || inum >= range_end)
2055                 return;
2056
2057         link = __inline_bsearch(&key, links->d, links->nr,
2058                                 sizeof(links->d[0]), nlink_cmp);
2059         if (!link)
2060                 return;
2061
2062         while (link > links->d && link[0].inum == link[-1].inum)
2063                 --link;
2064
2065         for (; link < links->d + links->nr && link->inum == inum; link++)
2066                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2067                         link->count++;
2068                         if (link->snapshot >= snapshot)
2069                                 break;
2070                 }
2071 }
2072
2073 noinline_for_stack
2074 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2075                                        struct nlink_table *t,
2076                                        u64 start, u64 *end)
2077 {
2078         struct btree_trans trans;
2079         struct btree_iter iter;
2080         struct bkey_s_c k;
2081         struct bch_inode_unpacked u;
2082         int ret = 0;
2083
2084         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2085
2086         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2087                            POS(0, start),
2088                            BTREE_ITER_INTENT|
2089                            BTREE_ITER_PREFETCH|
2090                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2091                 if (!bkey_is_inode(k.k))
2092                         continue;
2093
2094                 /* Should never fail, checked by bch2_inode_invalid: */
2095                 BUG_ON(bch2_inode_unpack(k, &u));
2096
2097                 /*
2098                  * Backpointer and directory structure checks are sufficient for
2099                  * directories, since they can't have hardlinks:
2100                  */
2101                 if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2102                         continue;
2103
2104                 if (!u.bi_nlink)
2105                         continue;
2106
2107                 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2108                 if (ret) {
2109                         *end = k.k->p.offset;
2110                         ret = 0;
2111                         break;
2112                 }
2113
2114         }
2115         bch2_trans_iter_exit(&trans, &iter);
2116         bch2_trans_exit(&trans);
2117
2118         if (ret)
2119                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2120
2121         return ret;
2122 }
2123
2124 noinline_for_stack
2125 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2126                                      u64 range_start, u64 range_end)
2127 {
2128         struct btree_trans trans;
2129         struct snapshots_seen s;
2130         struct btree_iter iter;
2131         struct bkey_s_c k;
2132         struct bkey_s_c_dirent d;
2133         int ret;
2134
2135         snapshots_seen_init(&s);
2136
2137         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2138
2139         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
2140                            BTREE_ITER_INTENT|
2141                            BTREE_ITER_PREFETCH|
2142                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2143                 ret = snapshots_seen_update(c, &s, k.k->p);
2144                 if (ret)
2145                         break;
2146
2147                 switch (k.k->type) {
2148                 case KEY_TYPE_dirent:
2149                         d = bkey_s_c_to_dirent(k);
2150
2151                         if (d.v->d_type != DT_DIR &&
2152                             d.v->d_type != DT_SUBVOL)
2153                                 inc_link(c, &s, links, range_start, range_end,
2154                                          le64_to_cpu(d.v->d_inum),
2155                                          d.k->p.snapshot);
2156                         break;
2157                 }
2158         }
2159         bch2_trans_iter_exit(&trans, &iter);
2160
2161         if (ret)
2162                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
2163
2164         bch2_trans_exit(&trans);
2165         snapshots_seen_exit(&s);
2166         return ret;
2167 }
2168
2169 noinline_for_stack
2170 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2171                                struct nlink_table *links,
2172                                u64 range_start, u64 range_end)
2173 {
2174         struct btree_trans trans;
2175         struct btree_iter iter;
2176         struct bkey_s_c k;
2177         struct bch_inode_unpacked u;
2178         struct nlink *link = links->d;
2179         int ret = 0;
2180
2181         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2182
2183         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2184                            POS(0, range_start),
2185                            BTREE_ITER_INTENT|
2186                            BTREE_ITER_PREFETCH|
2187                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2188                 if (k.k->p.offset >= range_end)
2189                         break;
2190
2191                 if (!bkey_is_inode(k.k))
2192                         continue;
2193
2194                 BUG_ON(bch2_inode_unpack(k, &u));
2195
2196                 if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2197                         continue;
2198
2199                 if (!u.bi_nlink)
2200                         continue;
2201
2202                 while ((cmp_int(link->inum, k.k->p.offset) ?:
2203                         cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2204                         link++;
2205                         BUG_ON(link >= links->d + links->nr);
2206                 }
2207
2208                 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
2209                                 "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
2210                                 u.bi_inum, mode_to_type(u.bi_mode),
2211                                 bch2_inode_nlink_get(&u), link->count)) {
2212                         bch2_inode_nlink_set(&u, link->count);
2213
2214                         ret = write_inode(&trans, &u, k.k->p.snapshot);
2215                         if (ret)
2216                                 bch_err(c, "error in fsck: error %i updating inode", ret);
2217                 }
2218         }
2219 fsck_err:
2220         bch2_trans_iter_exit(&trans, &iter);
2221         bch2_trans_exit(&trans);
2222
2223         if (ret)
2224                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2225
2226         return ret;
2227 }
2228
2229 noinline_for_stack
2230 static int check_nlinks(struct bch_fs *c)
2231 {
2232         struct nlink_table links = { 0 };
2233         u64 this_iter_range_start, next_iter_range_start = 0;
2234         int ret = 0;
2235
2236         bch_verbose(c, "checking inode nlinks");
2237
2238         do {
2239                 this_iter_range_start = next_iter_range_start;
2240                 next_iter_range_start = U64_MAX;
2241
2242                 ret = check_nlinks_find_hardlinks(c, &links,
2243                                                   this_iter_range_start,
2244                                                   &next_iter_range_start);
2245
2246                 ret = check_nlinks_walk_dirents(c, &links,
2247                                           this_iter_range_start,
2248                                           next_iter_range_start);
2249                 if (ret)
2250                         break;
2251
2252                 ret = check_nlinks_update_hardlinks(c, &links,
2253                                          this_iter_range_start,
2254                                          next_iter_range_start);
2255                 if (ret)
2256                         break;
2257
2258                 links.nr = 0;
2259         } while (next_iter_range_start != U64_MAX);
2260
2261         kvfree(links.d);
2262
2263         return ret;
2264 }
2265
2266 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
2267 {
2268         struct bkey_s_c k;
2269         struct bkey_s_c_reflink_p p;
2270         struct bkey_i_reflink_p *u;
2271         int ret;
2272
2273         k = bch2_btree_iter_peek(iter);
2274         if (!k.k)
2275                 return 0;
2276
2277         ret = bkey_err(k);
2278         if (ret)
2279                 return ret;
2280
2281         if (k.k->type != KEY_TYPE_reflink_p)
2282                 return 0;
2283
2284         p = bkey_s_c_to_reflink_p(k);
2285
2286         if (!p.v->front_pad && !p.v->back_pad)
2287                 return 0;
2288
2289         u = bch2_trans_kmalloc(trans, sizeof(*u));
2290         ret = PTR_ERR_OR_ZERO(u);
2291         if (ret)
2292                 return ret;
2293
2294         bkey_reassemble(&u->k_i, k);
2295         u->v.front_pad  = 0;
2296         u->v.back_pad   = 0;
2297
2298         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2299 }
2300
2301 noinline_for_stack
2302 static int fix_reflink_p(struct bch_fs *c)
2303 {
2304         struct btree_trans trans;
2305         struct btree_iter iter;
2306         struct bkey_s_c k;
2307         int ret;
2308
2309         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2310                 return 0;
2311
2312         bch_verbose(c, "fixing reflink_p keys");
2313
2314         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2315
2316         for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
2317                            BTREE_ITER_INTENT|
2318                            BTREE_ITER_PREFETCH|
2319                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2320                 if (k.k->type == KEY_TYPE_reflink_p) {
2321                         ret = __bch2_trans_do(&trans, NULL, NULL,
2322                                               BTREE_INSERT_NOFAIL|
2323                                               BTREE_INSERT_LAZY_RW,
2324                                               fix_reflink_p_key(&trans, &iter));
2325                         if (ret)
2326                                 break;
2327                 }
2328         }
2329         bch2_trans_iter_exit(&trans, &iter);
2330
2331         bch2_trans_exit(&trans);
2332         return ret;
2333 }
2334
2335 /*
2336  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
2337  * Doesn't fix them yet, mainly because they haven't yet been observed:
2338  */
2339 int bch2_fsck_full(struct bch_fs *c)
2340 {
2341         return  bch2_fs_snapshots_check(c) ?:
2342                 check_inodes(c, true) ?:
2343                 check_subvols(c) ?:
2344                 check_extents(c) ?:
2345                 check_dirents(c) ?:
2346                 check_xattrs(c) ?:
2347                 check_root(c) ?:
2348                 check_directory_structure(c) ?:
2349                 check_nlinks(c) ?:
2350                 fix_reflink_p(c);
2351 }
2352
2353 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
2354 {
2355         return check_inodes(c, false);
2356 }