]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
d2b155f07fc10a185dde5fbd80ca3cfa02db36a6
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "darray.h"
7 #include "dirent.h"
8 #include "error.h"
9 #include "fs-common.h"
10 #include "fsck.h"
11 #include "inode.h"
12 #include "keylist.h"
13 #include "subvolume.h"
14 #include "super.h"
15 #include "xattr.h"
16
17 #include <linux/bsearch.h>
18 #include <linux/dcache.h> /* struct qstr */
19
20 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
21
22 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
23                                     u32 snapshot)
24 {
25         struct btree_iter iter;
26         struct bkey_s_c k;
27         u64 sectors = 0;
28         int ret;
29
30         for_each_btree_key(trans, iter, BTREE_ID_extents,
31                            SPOS(inum, 0, snapshot), 0, k, ret) {
32                 if (k.k->p.inode != inum)
33                         break;
34
35                 if (bkey_extent_is_allocation(k.k))
36                         sectors += k.k->size;
37         }
38
39         bch2_trans_iter_exit(trans, &iter);
40
41         return ret ?: sectors;
42 }
43
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45                                     u32 snapshot)
46 {
47         struct btree_iter iter;
48         struct bkey_s_c k;
49         struct bkey_s_c_dirent d;
50         u64 subdirs = 0;
51         int ret;
52
53         for_each_btree_key(trans, iter, BTREE_ID_dirents,
54                            SPOS(inum, 0, snapshot), 0, k, ret) {
55                 if (k.k->p.inode != inum)
56                         break;
57
58                 if (k.k->type != KEY_TYPE_dirent)
59                         continue;
60
61                 d = bkey_s_c_to_dirent(k);
62                 if (d.v->d_type == DT_DIR)
63                         subdirs++;
64         }
65
66         bch2_trans_iter_exit(trans, &iter);
67
68         return ret ?: subdirs;
69 }
70
71 static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
72                                     u32 *subvol)
73 {
74         struct btree_iter iter;
75         struct bkey_s_c k;
76         int ret;
77
78         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
79                              POS(0, snapshot), 0);
80         k = bch2_btree_iter_peek_slot(&iter);
81         ret = bkey_err(k);
82         if (ret)
83                 goto err;
84
85         if (k.k->type != KEY_TYPE_snapshot) {
86                 bch_err(trans->c, "snapshot %u not fonud", snapshot);
87                 ret = -ENOENT;
88                 goto err;
89         }
90
91         *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
92 err:
93         bch2_trans_iter_exit(trans, &iter);
94         return ret;
95
96 }
97
98 static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
99                            u32 *snapshot, u64 *inum)
100 {
101         struct bch_subvolume s;
102         int ret;
103
104         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
105
106         *snapshot = le32_to_cpu(s.snapshot);
107         *inum = le64_to_cpu(s.inode);
108         return ret;
109 }
110
111 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
112                          u32 *snapshot, u64 *inum)
113 {
114         return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
115 }
116
117 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
118                               struct bch_inode_unpacked *inode)
119 {
120         struct btree_iter iter;
121         struct bkey_s_c k;
122         int ret;
123
124         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
125                              POS(0, inode_nr),
126                              BTREE_ITER_ALL_SNAPSHOTS);
127         k = bch2_btree_iter_peek(&iter);
128         ret = bkey_err(k);
129         if (ret)
130                 goto err;
131
132         if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) {
133                 ret = -ENOENT;
134                 goto err;
135         }
136
137         ret = bch2_inode_unpack(k, inode);
138 err:
139         if (ret && ret != -EINTR)
140                 bch_err(trans->c, "error %i fetching inode %llu",
141                         ret, inode_nr);
142         bch2_trans_iter_exit(trans, &iter);
143         return ret;
144 }
145
146 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
147                           struct bch_inode_unpacked *inode,
148                           u32 *snapshot)
149 {
150         struct btree_iter iter;
151         struct bkey_s_c k;
152         int ret;
153
154         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
155                              SPOS(0, inode_nr, *snapshot), 0);
156         k = bch2_btree_iter_peek_slot(&iter);
157         ret = bkey_err(k);
158         if (ret)
159                 goto err;
160
161         ret = bkey_is_inode(k.k)
162                 ? bch2_inode_unpack(k, inode)
163                 : -ENOENT;
164         if (!ret)
165                 *snapshot = iter.pos.snapshot;
166 err:
167         if (ret && ret != -EINTR)
168                 bch_err(trans->c, "error %i fetching inode %llu:%u",
169                         ret, inode_nr, *snapshot);
170         bch2_trans_iter_exit(trans, &iter);
171         return ret;
172 }
173
174 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
175                         struct bch_inode_unpacked *inode,
176                         u32 *snapshot)
177 {
178         return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
179 }
180
181 static int __lookup_dirent(struct btree_trans *trans,
182                            struct bch_hash_info hash_info,
183                            subvol_inum dir, struct qstr *name,
184                            u64 *target, unsigned *type)
185 {
186         struct btree_iter iter;
187         struct bkey_s_c_dirent d;
188         int ret;
189
190         ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
191                                &hash_info, dir, name, 0);
192         if (ret)
193                 return ret;
194
195         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
196         *target = le64_to_cpu(d.v->d_inum);
197         *type = d.v->d_type;
198         bch2_trans_iter_exit(trans, &iter);
199         return 0;
200 }
201
202 static int __write_inode(struct btree_trans *trans,
203                          struct bch_inode_unpacked *inode,
204                          u32 snapshot)
205 {
206         struct btree_iter iter;
207         int ret;
208
209         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
210                             SPOS(0, inode->bi_inum, snapshot),
211                             BTREE_ITER_INTENT);
212
213         ret   = bch2_btree_iter_traverse(&iter) ?:
214                 bch2_inode_write(trans, &iter, inode);
215         bch2_trans_iter_exit(trans, &iter);
216         return ret;
217 }
218
219 static int write_inode(struct btree_trans *trans,
220                        struct bch_inode_unpacked *inode,
221                        u32 snapshot)
222 {
223         int ret = __bch2_trans_do(trans, NULL, NULL,
224                                   BTREE_INSERT_NOFAIL|
225                                   BTREE_INSERT_LAZY_RW,
226                                   __write_inode(trans, inode, snapshot));
227         if (ret)
228                 bch_err(trans->c, "error in fsck: error %i updating inode", ret);
229         return ret;
230 }
231
232 static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
233 {
234         struct btree_iter iter = { NULL };
235         struct bkey_i_inode_generation delete;
236         struct bch_inode_unpacked inode_u;
237         struct bkey_s_c k;
238         int ret;
239
240         ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
241                                               SPOS(inum, 0, snapshot),
242                                               SPOS(inum, U64_MAX, snapshot),
243                                               0, NULL) ?:
244                 bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
245                                               SPOS(inum, 0, snapshot),
246                                               SPOS(inum, U64_MAX, snapshot),
247                                               0, NULL) ?:
248                 bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
249                                               SPOS(inum, 0, snapshot),
250                                               SPOS(inum, U64_MAX, snapshot),
251                                               0, NULL);
252         if (ret)
253                 goto err;
254 retry:
255         bch2_trans_begin(trans);
256
257         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
258                              SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
259         k = bch2_btree_iter_peek_slot(&iter);
260
261         ret = bkey_err(k);
262         if (ret)
263                 goto err;
264
265         if (!bkey_is_inode(k.k)) {
266                 bch2_fs_inconsistent(trans->c,
267                                      "inode %llu:%u not found when deleting",
268                                      inum, snapshot);
269                 ret = -EIO;
270                 goto err;
271         }
272
273         bch2_inode_unpack(k, &inode_u);
274
275         /* Subvolume root? */
276         if (inode_u.bi_subvol) {
277                 ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
278                 if (ret)
279                         goto err;
280         }
281
282         bkey_inode_generation_init(&delete.k_i);
283         delete.k.p = iter.pos;
284         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
285
286         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
287                 bch2_trans_commit(trans, NULL, NULL,
288                                 BTREE_INSERT_NOFAIL);
289 err:
290         bch2_trans_iter_exit(trans, &iter);
291         if (ret == -EINTR)
292                 goto retry;
293
294         return ret;
295 }
296
297 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
298 {
299         struct bch_fs *c = trans->c;
300         struct btree_iter iter;
301         struct bch_inode_unpacked dir_inode;
302         struct bch_hash_info dir_hash_info;
303         int ret;
304
305         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
306         if (ret)
307                 return ret;
308
309         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
310
311         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
312
313         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
314                                   &dir_hash_info, &iter, 0);
315         bch2_trans_iter_exit(trans, &iter);
316         return ret;
317 }
318
319 /* Get lost+found, create if it doesn't exist: */
320 static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
321                             struct bch_inode_unpacked *lostfound)
322 {
323         struct bch_fs *c = trans->c;
324         struct bch_inode_unpacked root;
325         struct bch_hash_info root_hash_info;
326         struct qstr lostfound_str = QSTR("lost+found");
327         subvol_inum root_inum = { .subvol = subvol };
328         u64 inum = 0;
329         unsigned d_type = 0;
330         u32 snapshot;
331         int ret;
332
333         ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
334         if (ret)
335                 return ret;
336
337         ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
338         if (ret)
339                 return ret;
340
341         root_hash_info = bch2_hash_info_init(c, &root);
342
343         ret = __lookup_dirent(trans, root_hash_info, root_inum,
344                             &lostfound_str, &inum, &d_type);
345         if (ret == -ENOENT) {
346                 bch_notice(c, "creating lost+found");
347                 goto create_lostfound;
348         }
349
350         if (ret && ret != -EINTR)
351                 bch_err(c, "error looking up lost+found: %i", ret);
352         if (ret)
353                 return ret;
354
355         if (d_type != DT_DIR) {
356                 bch_err(c, "error looking up lost+found: not a directory");
357                 return ret;
358         }
359
360         /*
361          * The check_dirents pass has already run, dangling dirents
362          * shouldn't exist here:
363          */
364         return __lookup_inode(trans, inum, lostfound, &snapshot);
365
366 create_lostfound:
367         bch2_inode_init_early(c, lostfound);
368
369         ret = bch2_create_trans(trans, root_inum, &root,
370                                 lostfound, &lostfound_str,
371                                 0, 0, S_IFDIR|0700, 0, NULL, NULL,
372                                 (subvol_inum) { }, 0);
373         if (ret && ret != -EINTR)
374                 bch_err(c, "error creating lost+found: %i", ret);
375         return ret;
376 }
377
378 static int __reattach_inode(struct btree_trans *trans,
379                           struct bch_inode_unpacked *inode,
380                           u32 inode_snapshot)
381 {
382         struct bch_hash_info dir_hash;
383         struct bch_inode_unpacked lostfound;
384         char name_buf[20];
385         struct qstr name;
386         u64 dir_offset = 0;
387         u32 subvol;
388         int ret;
389
390         ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
391         if (ret)
392                 return ret;
393
394         ret = lookup_lostfound(trans, subvol, &lostfound);
395         if (ret)
396                 return ret;
397
398         if (S_ISDIR(inode->bi_mode)) {
399                 lostfound.bi_nlink++;
400
401                 ret = __write_inode(trans, &lostfound, U32_MAX);
402                 if (ret)
403                         return ret;
404         }
405
406         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
407
408         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
409         name = (struct qstr) QSTR(name_buf);
410
411         ret = bch2_dirent_create(trans,
412                                  (subvol_inum) {
413                                         .subvol = subvol,
414                                         .inum = lostfound.bi_inum,
415                                  },
416                                  &dir_hash,
417                                  inode_d_type(inode),
418                                  &name, inode->bi_inum, &dir_offset,
419                                  BCH_HASH_SET_MUST_CREATE);
420         if (ret)
421                 return ret;
422
423         inode->bi_dir           = lostfound.bi_inum;
424         inode->bi_dir_offset    = dir_offset;
425
426         return __write_inode(trans, inode, inode_snapshot);
427 }
428
429 static int reattach_inode(struct btree_trans *trans,
430                           struct bch_inode_unpacked *inode,
431                           u32 inode_snapshot)
432 {
433         int ret = __bch2_trans_do(trans, NULL, NULL,
434                                   BTREE_INSERT_LAZY_RW|
435                                   BTREE_INSERT_NOFAIL,
436                         __reattach_inode(trans, inode, inode_snapshot));
437         if (ret) {
438                 bch_err(trans->c, "error %i reattaching inode %llu",
439                         ret, inode->bi_inum);
440                 return ret;
441         }
442
443         return ret;
444 }
445
446 static int remove_backpointer(struct btree_trans *trans,
447                               struct bch_inode_unpacked *inode)
448 {
449         struct btree_iter iter;
450         struct bkey_s_c k;
451         int ret;
452
453         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
454                              POS(inode->bi_dir, inode->bi_dir_offset), 0);
455         k = bch2_btree_iter_peek_slot(&iter);
456         ret = bkey_err(k);
457         if (ret)
458                 goto out;
459         if (k.k->type != KEY_TYPE_dirent) {
460                 ret = -ENOENT;
461                 goto out;
462         }
463
464         ret = __remove_dirent(trans, k.k->p);
465 out:
466         bch2_trans_iter_exit(trans, &iter);
467         return ret;
468 }
469
470 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
471 {
472         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
473
474         if (bkey_cmp(s->pos, pos))
475                 s->ids.nr = 0;
476         s->pos = pos;
477
478         /* Might get called multiple times due to lock restarts */
479         if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot)
480                 return 0;
481
482         return snapshots_seen_add(c, s, pos.snapshot);
483 }
484
485 /**
486  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
487  * and @ancestor hasn't been overwritten in @seen
488  *
489  * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
490  */
491 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
492                                     u32 id, u32 ancestor)
493 {
494         ssize_t i;
495
496         BUG_ON(id > ancestor);
497
498         id              = snapshot_t(c, id)->equiv;
499         ancestor        = snapshot_t(c, ancestor)->equiv;
500
501         /* @ancestor should be the snapshot most recently added to @seen */
502         BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor);
503         BUG_ON(seen->pos.snapshot != ancestor);
504
505         if (id == ancestor)
506                 return true;
507
508         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
509                 return false;
510
511         for (i = seen->ids.nr - 2;
512              i >= 0 && seen->ids.data[i] >= id;
513              --i)
514                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) &&
515                     bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor))
516                         return false;
517
518         return true;
519 }
520
521 /**
522  * ref_visible - given a key with snapshot id @src that points to a key with
523  * snapshot id @dst, test whether there is some snapshot in which @dst is
524  * visible.
525  *
526  * This assumes we're visiting @src keys in natural key order.
527  *
528  * @s   - list of snapshot IDs already seen at @src
529  * @src - snapshot ID of src key
530  * @dst - snapshot ID of dst key
531  */
532 static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
533                        u32 src, u32 dst)
534 {
535         return dst <= src
536                 ? key_visible_in_snapshot(c, s, dst, src)
537                 : bch2_snapshot_is_ancestor(c, src, dst);
538 }
539
540 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)       \
541         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\
542                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
543
544 struct inode_walker_entry {
545         struct bch_inode_unpacked inode;
546         u32                     snapshot;
547         u64                     count;
548 };
549
550 struct inode_walker {
551         bool                            first_this_inode;
552         u64                             cur_inum;
553
554         DARRAY(struct inode_walker_entry) inodes;
555 };
556
557 static void inode_walker_exit(struct inode_walker *w)
558 {
559         darray_exit(w->inodes);
560 }
561
562 static struct inode_walker inode_walker_init(void)
563 {
564         return (struct inode_walker) { 0, };
565 }
566
567 static int add_inode(struct bch_fs *c, struct inode_walker *w,
568                      struct bkey_s_c inode)
569 {
570         struct bch_inode_unpacked u;
571
572         BUG_ON(bch2_inode_unpack(inode, &u));
573
574         return darray_push(w->inodes, ((struct inode_walker_entry) {
575                 .inode          = u,
576                 .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
577         }));
578 }
579
580 static int __walk_inode(struct btree_trans *trans,
581                         struct inode_walker *w, struct bpos pos)
582 {
583         struct bch_fs *c = trans->c;
584         struct btree_iter iter;
585         struct bkey_s_c k;
586         unsigned i, ancestor_pos;
587         int ret;
588
589         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
590
591         if (pos.inode == w->cur_inum) {
592                 w->first_this_inode = false;
593                 goto lookup_snapshot;
594         }
595
596         w->inodes.nr = 0;
597
598         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
599                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
600                 if (k.k->p.offset != pos.inode)
601                         break;
602
603                 if (bkey_is_inode(k.k))
604                         add_inode(c, w, k);
605         }
606         bch2_trans_iter_exit(trans, &iter);
607
608         if (ret)
609                 return ret;
610
611         w->cur_inum             = pos.inode;
612         w->first_this_inode     = true;
613 lookup_snapshot:
614         for (i = 0; i < w->inodes.nr; i++)
615                 if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot))
616                         goto found;
617         return INT_MAX;
618 found:
619         BUG_ON(pos.snapshot > w->inodes.data[i].snapshot);
620
621         if (pos.snapshot != w->inodes.data[i].snapshot) {
622                 ancestor_pos = i;
623
624                 while (i && w->inodes.data[i - 1].snapshot > pos.snapshot)
625                         --i;
626
627                 ret = darray_insert_item(w->inodes, i, w->inodes.data[ancestor_pos]);
628                 if (ret)
629                         return ret;
630
631                 w->inodes.data[i].snapshot = pos.snapshot;
632                 w->inodes.data[i].count = 0;
633         }
634
635         return i;
636 }
637
638 static int __get_visible_inodes(struct btree_trans *trans,
639                                 struct inode_walker *w,
640                                 struct snapshots_seen *s,
641                                 u64 inum)
642 {
643         struct bch_fs *c = trans->c;
644         struct btree_iter iter;
645         struct bkey_s_c k;
646         int ret;
647
648         w->inodes.nr = 0;
649
650         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
651                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
652                 if (k.k->p.offset != inum)
653                         break;
654
655                 if (!bkey_is_inode(k.k))
656                         continue;
657
658                 if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
659                         add_inode(c, w, k);
660                         if (k.k->p.snapshot >= s->pos.snapshot)
661                                 break;
662                 }
663         }
664         bch2_trans_iter_exit(trans, &iter);
665
666         return ret;
667 }
668
669 static int check_key_has_snapshot(struct btree_trans *trans,
670                                   struct btree_iter *iter,
671                                   struct bkey_s_c k)
672 {
673         struct bch_fs *c = trans->c;
674         struct printbuf buf = PRINTBUF;
675         int ret = 0;
676
677         if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
678                         "key in missing snapshot: %s",
679                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
680                 ret = bch2_btree_delete_at(trans, iter,
681                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
682 fsck_err:
683         printbuf_exit(&buf);
684         return ret;
685 }
686
687 static int hash_redo_key(struct btree_trans *trans,
688                          const struct bch_hash_desc desc,
689                          struct bch_hash_info *hash_info,
690                          struct btree_iter *k_iter, struct bkey_s_c k)
691 {
692         bch_err(trans->c, "hash_redo_key() not implemented yet");
693         return -EINVAL;
694 #if 0
695         struct bkey_i *delete;
696         struct bkey_i *tmp;
697
698         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
699         if (IS_ERR(delete))
700                 return PTR_ERR(delete);
701
702         tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
703         if (IS_ERR(tmp))
704                 return PTR_ERR(tmp);
705
706         bkey_reassemble(tmp, k);
707
708         bkey_init(&delete->k);
709         delete->k.p = k_iter->pos;
710         return  bch2_btree_iter_traverse(k_iter) ?:
711                 bch2_trans_update(trans, k_iter, delete, 0) ?:
712                 bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
713 #endif
714 }
715
716 static int hash_check_key(struct btree_trans *trans,
717                           const struct bch_hash_desc desc,
718                           struct bch_hash_info *hash_info,
719                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
720 {
721         struct bch_fs *c = trans->c;
722         struct btree_iter iter = { NULL };
723         struct printbuf buf = PRINTBUF;
724         struct bkey_s_c k;
725         u64 hash;
726         int ret = 0;
727
728         if (hash_k.k->type != desc.key_type)
729                 return 0;
730
731         hash = desc.hash_bkey(hash_info, hash_k);
732
733         if (likely(hash == hash_k.k->p.offset))
734                 return 0;
735
736         if (hash_k.k->p.offset < hash)
737                 goto bad_hash;
738
739         for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
740                            BTREE_ITER_SLOTS, k, ret) {
741                 if (!bkey_cmp(k.k->p, hash_k.k->p))
742                         break;
743
744                 if (fsck_err_on(k.k->type == desc.key_type &&
745                                 !desc.cmp_bkey(k, hash_k), c,
746                                 "duplicate hash table keys:\n%s",
747                                 (printbuf_reset(&buf),
748                                  bch2_bkey_val_to_text(&buf, c, hash_k),
749                                  buf.buf))) {
750                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
751                         break;
752                 }
753
754                 if (bkey_deleted(k.k)) {
755                         bch2_trans_iter_exit(trans, &iter);
756                         goto bad_hash;
757                 }
758
759         }
760 out:
761         bch2_trans_iter_exit(trans, &iter);
762         printbuf_exit(&buf);
763         return ret;
764 bad_hash:
765         if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
766                      "hashed to %llu\n%s",
767                      desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
768                      (printbuf_reset(&buf),
769                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE)
770                 return 0;
771
772         ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
773         if (ret) {
774                 bch_err(c, "hash_redo_key err %i", ret);
775                 return ret;
776         }
777         ret = -EINTR;
778 fsck_err:
779         goto out;
780 }
781
782 static int check_inode(struct btree_trans *trans,
783                        struct btree_iter *iter,
784                        struct bch_inode_unpacked *prev,
785                        bool full)
786 {
787         struct bch_fs *c = trans->c;
788         struct bkey_s_c k;
789         struct bch_inode_unpacked u;
790         bool do_update = false;
791         int ret;
792
793         k = bch2_btree_iter_peek(iter);
794         if (!k.k)
795                 return 0;
796
797         ret = bkey_err(k);
798         if (ret)
799                 return ret;
800
801         ret = check_key_has_snapshot(trans, iter, k);
802         if (ret)
803                 return ret < 0 ? ret : 0;
804
805         /*
806          * if snapshot id isn't a leaf node, skip it - deletion in
807          * particular is not atomic, so on the internal snapshot nodes
808          * we can see inodes marked for deletion after a clean shutdown
809          */
810         if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
811                 return 0;
812
813         if (!bkey_is_inode(k.k))
814                 return 0;
815
816         BUG_ON(bch2_inode_unpack(k, &u));
817
818         if (!full &&
819             !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
820                             BCH_INODE_I_SECTORS_DIRTY|
821                             BCH_INODE_UNLINKED)))
822                 return 0;
823
824         if (prev->bi_inum != u.bi_inum)
825                 *prev = u;
826
827         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
828                         inode_d_type(prev)      != inode_d_type(&u), c,
829                         "inodes in different snapshots don't match")) {
830                 bch_err(c, "repair not implemented yet");
831                 return -EINVAL;
832         }
833
834         if (u.bi_flags & BCH_INODE_UNLINKED &&
835             (!c->sb.clean ||
836              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
837                       u.bi_inum))) {
838                 bch2_trans_unlock(trans);
839                 bch2_fs_lazy_rw(c);
840
841                 ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
842                 if (ret)
843                         bch_err(c, "error in fsck: error %i while deleting inode", ret);
844                 return ret;
845         }
846
847         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
848             (!c->sb.clean ||
849              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
850                       u.bi_inum))) {
851                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
852
853                 bch2_trans_unlock(trans);
854                 bch2_fs_lazy_rw(c);
855
856                 /*
857                  * XXX: need to truncate partial blocks too here - or ideally
858                  * just switch units to bytes and that issue goes away
859                  */
860                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
861                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
862                                      iter->pos.snapshot),
863                                 POS(u.bi_inum, U64_MAX),
864                                 0, NULL);
865                 if (ret) {
866                         bch_err(c, "error in fsck: error %i truncating inode", ret);
867                         return ret;
868                 }
869
870                 /*
871                  * We truncated without our normal sector accounting hook, just
872                  * make sure we recalculate it:
873                  */
874                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
875
876                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
877                 do_update = true;
878         }
879
880         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
881             (!c->sb.clean ||
882              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
883                       u.bi_inum))) {
884                 s64 sectors;
885
886                 bch_verbose(c, "recounting sectors for inode %llu",
887                             u.bi_inum);
888
889                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
890                 if (sectors < 0) {
891                         bch_err(c, "error in fsck: error %i recounting inode sectors",
892                                 (int) sectors);
893                         return sectors;
894                 }
895
896                 u.bi_sectors = sectors;
897                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
898                 do_update = true;
899         }
900
901         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
902                 u.bi_dir = 0;
903                 u.bi_dir_offset = 0;
904                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
905                 do_update = true;
906         }
907
908         if (do_update) {
909                 ret = write_inode(trans, &u, iter->pos.snapshot);
910                 if (ret)
911                         bch_err(c, "error in fsck: error %i "
912                                 "updating inode", ret);
913         }
914 fsck_err:
915         return ret;
916 }
917
918 noinline_for_stack
919 static int check_inodes(struct bch_fs *c, bool full)
920 {
921         struct btree_trans trans;
922         struct btree_iter iter;
923         struct bch_inode_unpacked prev = { 0 };
924         int ret;
925
926         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
927
928         bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
929                              BTREE_ITER_INTENT|
930                              BTREE_ITER_PREFETCH|
931                              BTREE_ITER_ALL_SNAPSHOTS);
932
933         do {
934                 ret = __bch2_trans_do(&trans, NULL, NULL,
935                                       BTREE_INSERT_LAZY_RW|
936                                       BTREE_INSERT_NOFAIL,
937                         check_inode(&trans, &iter, &prev, full));
938                 if (ret)
939                         break;
940         } while (bch2_btree_iter_advance(&iter));
941         bch2_trans_iter_exit(&trans, &iter);
942
943         bch2_trans_exit(&trans);
944         return ret;
945 }
946
947 static int check_subvol(struct btree_trans *trans,
948                         struct btree_iter *iter)
949 {
950         struct bkey_s_c k;
951         struct bkey_s_c_subvolume subvol;
952         int ret;
953
954         k = bch2_btree_iter_peek(iter);
955         if (!k.k)
956                 return 0;
957
958         ret = bkey_err(k);
959         if (ret)
960                 return ret;
961
962         if (k.k->type != KEY_TYPE_subvolume)
963                 return 0;
964
965         subvol = bkey_s_c_to_subvolume(k);
966
967         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
968                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
969                 if (ret && ret != -EINTR)
970                         bch_err(trans->c, "error deleting subvolume %llu: %i",
971                                 iter->pos.offset, ret);
972                 if (ret)
973                         return ret;
974         }
975
976         return 0;
977 }
978
979 noinline_for_stack
980 static int check_subvols(struct bch_fs *c)
981 {
982         struct btree_trans trans;
983         struct btree_iter iter;
984         int ret;
985
986         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
987
988         bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
989                              POS_MIN,
990                              BTREE_ITER_INTENT|
991                              BTREE_ITER_PREFETCH);
992
993         do {
994                 ret = __bch2_trans_do(&trans, NULL, NULL,
995                                       BTREE_INSERT_LAZY_RW|
996                                       BTREE_INSERT_NOFAIL,
997                                       check_subvol(&trans, &iter));
998                 if (ret)
999                         break;
1000         } while (bch2_btree_iter_advance(&iter));
1001         bch2_trans_iter_exit(&trans, &iter);
1002
1003         bch2_trans_exit(&trans);
1004         return ret;
1005 }
1006
1007 /*
1008  * Checking for overlapping extents needs to be reimplemented
1009  */
1010 #if 0
1011 static int fix_overlapping_extent(struct btree_trans *trans,
1012                                        struct bkey_s_c k, struct bpos cut_at)
1013 {
1014         struct btree_iter iter;
1015         struct bkey_i *u;
1016         int ret;
1017
1018         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1019         ret = PTR_ERR_OR_ZERO(u);
1020         if (ret)
1021                 return ret;
1022
1023         bkey_reassemble(u, k);
1024         bch2_cut_front(cut_at, u);
1025
1026
1027         /*
1028          * We don't want to go through the extent_handle_overwrites path:
1029          *
1030          * XXX: this is going to screw up disk accounting, extent triggers
1031          * assume things about extent overwrites - we should be running the
1032          * triggers manually here
1033          */
1034         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
1035                              BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
1036
1037         BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
1038         ret   = bch2_btree_iter_traverse(&iter) ?:
1039                 bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
1040                 bch2_trans_commit(trans, NULL, NULL,
1041                                   BTREE_INSERT_NOFAIL|
1042                                   BTREE_INSERT_LAZY_RW);
1043         bch2_trans_iter_exit(trans, &iter);
1044         return ret;
1045 }
1046 #endif
1047
1048 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
1049                                                 struct btree_iter *iter,
1050                                                 struct bpos pos)
1051 {
1052         struct bkey_s_c k;
1053         int ret;
1054
1055         bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
1056         k = bch2_btree_iter_peek_slot(iter);
1057         ret = bkey_err(k);
1058         if (!ret && k.k->type != KEY_TYPE_dirent)
1059                 ret = -ENOENT;
1060         if (ret) {
1061                 bch2_trans_iter_exit(trans, iter);
1062                 return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
1063         }
1064
1065         return bkey_s_c_to_dirent(k);
1066 }
1067
1068 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
1069                                    struct bkey_s_c_dirent d)
1070 {
1071         return  inode->bi_dir           == d.k->p.inode &&
1072                 inode->bi_dir_offset    == d.k->p.offset;
1073 }
1074
1075 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
1076                                    struct bch_inode_unpacked *inode)
1077 {
1078         return d.v->d_type == DT_SUBVOL
1079                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
1080                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
1081 }
1082
1083 static int inode_backpointer_exists(struct btree_trans *trans,
1084                                     struct bch_inode_unpacked *inode,
1085                                     u32 snapshot)
1086 {
1087         struct btree_iter iter;
1088         struct bkey_s_c_dirent d;
1089         int ret;
1090
1091         d = dirent_get_by_pos(trans, &iter,
1092                         SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
1093         ret = bkey_err(d.s_c);
1094         if (ret)
1095                 return ret;
1096
1097         ret = dirent_points_to_inode(d, inode);
1098         bch2_trans_iter_exit(trans, &iter);
1099         return ret;
1100 }
1101
1102 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1103 {
1104         struct bch_fs *c = trans->c;
1105         struct inode_walker_entry *i;
1106         int ret = 0, ret2 = 0;
1107         s64 count2;
1108
1109         darray_for_each(w->inodes, i) {
1110                 if (i->inode.bi_sectors == i->count)
1111                         continue;
1112
1113                 count2 = lockrestart_do(trans,
1114                         bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
1115
1116                 if (i->count != count2) {
1117                         bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
1118                                 i->count, count2);
1119                         i->count = count2;
1120                         if (i->inode.bi_sectors == i->count)
1121                                 continue;
1122                 }
1123
1124                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
1125                             "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1126                             w->cur_inum, i->snapshot,
1127                             i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
1128                         continue;
1129
1130                 i->inode.bi_sectors = i->count;
1131                 ret = write_inode(trans, &i->inode, i->snapshot);
1132                 if (ret)
1133                         break;
1134                 ret2 = -EINTR;
1135         }
1136 fsck_err:
1137         return ret ?: ret2;
1138 }
1139
1140 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1141                         struct inode_walker *inode,
1142                         struct snapshots_seen *s)
1143 {
1144         struct bch_fs *c = trans->c;
1145         struct bkey_s_c k;
1146         struct inode_walker_entry *i;
1147         struct printbuf buf = PRINTBUF;
1148         int ret = 0;
1149 peek:
1150         k = bch2_btree_iter_peek(iter);
1151         if (!k.k)
1152                 goto out;
1153
1154         ret = bkey_err(k);
1155         if (ret)
1156                 goto err;
1157
1158         ret = check_key_has_snapshot(trans, iter, k);
1159         if (ret) {
1160                 ret = ret < 0 ? ret : 0;
1161                 goto out;
1162         }
1163
1164         ret = snapshots_seen_update(c, s, k.k->p);
1165         if (ret)
1166                 goto err;
1167
1168         if (k.k->type == KEY_TYPE_whiteout)
1169                 goto out;
1170
1171         if (inode->cur_inum != k.k->p.inode) {
1172                 ret = check_i_sectors(trans, inode);
1173                 if (ret)
1174                         goto err;
1175         }
1176
1177         if (!iter->path->should_be_locked) {
1178                 /*
1179                  * hack: check_i_sectors may have handled a transaction restart,
1180                  * it shouldn't be but we need to fix the new i_sectors check
1181                  * code and delete the old bch2_count_inode_sectors() first
1182                  */
1183                 goto peek;
1184         }
1185 #if 0
1186         if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
1187                 char buf1[200];
1188                 char buf2[200];
1189
1190                 bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
1191                 bch2_bkey_val_to_text(&PBUF(buf2), c, k);
1192
1193                 if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
1194                         ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
1195                         goto out;
1196                 }
1197         }
1198 #endif
1199         ret = __walk_inode(trans, inode, k.k->p);
1200         if (ret < 0)
1201                 goto err;
1202
1203         if (fsck_err_on(ret == INT_MAX, c,
1204                         "extent in missing inode:\n  %s",
1205                         (printbuf_reset(&buf),
1206                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1207                 ret = bch2_btree_delete_at(trans, iter,
1208                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1209                 goto out;
1210         }
1211
1212         if (ret == INT_MAX) {
1213                 ret = 0;
1214                 goto out;
1215         }
1216
1217         i = inode->inodes.data + ret;
1218         ret = 0;
1219
1220         if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
1221                         !S_ISLNK(i->inode.bi_mode), c,
1222                         "extent in non regular inode mode %o:\n  %s",
1223                         i->inode.bi_mode,
1224                         (printbuf_reset(&buf),
1225                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1226                 ret = bch2_btree_delete_at(trans, iter,
1227                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1228                 goto out;
1229         }
1230
1231         if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
1232                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
1233                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
1234                                         k.k->type != KEY_TYPE_reservation &&
1235                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
1236                                         "extent type %u offset %llu past end of inode %llu, i_size %llu",
1237                                         k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
1238                                 bch2_fs_lazy_rw(c);
1239                                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1240                                                 SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
1241                                                      k.k->p.snapshot),
1242                                                 POS(k.k->p.inode, U64_MAX),
1243                                                 0, NULL) ?: -EINTR;
1244                                 goto out;
1245                         }
1246                 }
1247         }
1248
1249         if (bkey_extent_is_allocation(k.k))
1250                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
1251                         i->count += k.k->size;
1252 #if 0
1253         bch2_bkey_buf_reassemble(&prev, c, k);
1254 #endif
1255
1256 out:
1257 err:
1258 fsck_err:
1259         printbuf_exit(&buf);
1260         return ret;
1261 }
1262
1263 /*
1264  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1265  * that i_size an i_sectors are consistent
1266  */
1267 noinline_for_stack
1268 static int check_extents(struct bch_fs *c)
1269 {
1270         struct inode_walker w = inode_walker_init();
1271         struct snapshots_seen s;
1272         struct btree_trans trans;
1273         struct btree_iter iter;
1274         int ret = 0;
1275
1276 #if 0
1277         struct bkey_buf prev;
1278         bch2_bkey_buf_init(&prev);
1279         prev.k->k = KEY(0, 0, 0);
1280 #endif
1281         snapshots_seen_init(&s);
1282         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1283
1284         bch_verbose(c, "checking extents");
1285
1286         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
1287                              POS(BCACHEFS_ROOT_INO, 0),
1288                              BTREE_ITER_INTENT|
1289                              BTREE_ITER_PREFETCH|
1290                              BTREE_ITER_ALL_SNAPSHOTS);
1291
1292         do {
1293                 ret = __bch2_trans_do(&trans, NULL, NULL,
1294                                       BTREE_INSERT_LAZY_RW|
1295                                       BTREE_INSERT_NOFAIL,
1296                         check_extent(&trans, &iter, &w, &s));
1297                 if (ret)
1298                         break;
1299         } while (bch2_btree_iter_advance(&iter));
1300         bch2_trans_iter_exit(&trans, &iter);
1301 #if 0
1302         bch2_bkey_buf_exit(&prev, c);
1303 #endif
1304         inode_walker_exit(&w);
1305         bch2_trans_exit(&trans);
1306         snapshots_seen_exit(&s);
1307
1308         return ret;
1309 }
1310
1311 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1312 {
1313         struct bch_fs *c = trans->c;
1314         struct inode_walker_entry *i;
1315         int ret = 0, ret2 = 0;
1316         s64 count2;
1317
1318         darray_for_each(w->inodes, i) {
1319                 if (i->inode.bi_nlink == i->count)
1320                         continue;
1321
1322                 count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
1323                 if (count2 < 0)
1324                         return count2;
1325
1326                 if (i->count != count2) {
1327                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1328                                 i->count, count2);
1329                         i->count = count2;
1330                         if (i->inode.bi_nlink == i->count)
1331                                 continue;
1332                 }
1333
1334                 if (fsck_err_on(i->inode.bi_nlink != i->count, c,
1335                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1336                                 w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
1337                         i->inode.bi_nlink = i->count;
1338                         ret = write_inode(trans, &i->inode, i->snapshot);
1339                         if (ret)
1340                                 break;
1341                         ret2 = -EINTR;
1342                 }
1343         }
1344 fsck_err:
1345         return ret ?: ret2;
1346 }
1347
1348 static int check_dirent_target(struct btree_trans *trans,
1349                                struct btree_iter *iter,
1350                                struct bkey_s_c_dirent d,
1351                                struct bch_inode_unpacked *target,
1352                                u32 target_snapshot)
1353 {
1354         struct bch_fs *c = trans->c;
1355         struct bkey_i_dirent *n;
1356         bool backpointer_exists = true;
1357         struct printbuf buf = PRINTBUF;
1358         int ret = 0;
1359
1360         if (!target->bi_dir &&
1361             !target->bi_dir_offset) {
1362                 target->bi_dir          = d.k->p.inode;
1363                 target->bi_dir_offset   = d.k->p.offset;
1364
1365                 ret = __write_inode(trans, target, target_snapshot);
1366                 if (ret)
1367                         goto err;
1368         }
1369
1370         if (!inode_points_to_dirent(target, d)) {
1371                 ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
1372                 if (ret < 0)
1373                         goto err;
1374
1375                 backpointer_exists = ret;
1376                 ret = 0;
1377
1378                 if (fsck_err_on(S_ISDIR(target->bi_mode) &&
1379                                 backpointer_exists, c,
1380                                 "directory %llu with multiple links",
1381                                 target->bi_inum)) {
1382                         ret = __remove_dirent(trans, d.k->p);
1383                         goto out;
1384                 }
1385
1386                 if (fsck_err_on(backpointer_exists &&
1387                                 !target->bi_nlink, c,
1388                                 "inode %llu has multiple links but i_nlink 0",
1389                                 target->bi_inum)) {
1390                         target->bi_nlink++;
1391                         target->bi_flags &= ~BCH_INODE_UNLINKED;
1392
1393                         ret = __write_inode(trans, target, target_snapshot);
1394                         if (ret)
1395                                 goto err;
1396                 }
1397
1398                 if (fsck_err_on(!backpointer_exists, c,
1399                                 "inode %llu:%u has wrong backpointer:\n"
1400                                 "got       %llu:%llu\n"
1401                                 "should be %llu:%llu",
1402                                 target->bi_inum, target_snapshot,
1403                                 target->bi_dir,
1404                                 target->bi_dir_offset,
1405                                 d.k->p.inode,
1406                                 d.k->p.offset)) {
1407                         target->bi_dir          = d.k->p.inode;
1408                         target->bi_dir_offset   = d.k->p.offset;
1409
1410                         ret = __write_inode(trans, target, target_snapshot);
1411                         if (ret)
1412                                 goto err;
1413                 }
1414         }
1415
1416         if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
1417                         "incorrect d_type: got %s, should be %s:\n%s",
1418                         bch2_d_type_str(d.v->d_type),
1419                         bch2_d_type_str(inode_d_type(target)),
1420                         (printbuf_reset(&buf),
1421                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1422                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1423                 ret = PTR_ERR_OR_ZERO(n);
1424                 if (ret)
1425                         goto err;
1426
1427                 bkey_reassemble(&n->k_i, d.s_c);
1428                 n->v.d_type = inode_d_type(target);
1429
1430                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1431                 if (ret)
1432                         goto err;
1433
1434                 d = dirent_i_to_s_c(n);
1435         }
1436
1437         if (d.v->d_type == DT_SUBVOL &&
1438             target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
1439             (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
1440              fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
1441                       le32_to_cpu(d.v->d_parent_subvol),
1442                       target->bi_parent_subvol))) {
1443                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1444                 ret = PTR_ERR_OR_ZERO(n);
1445                 if (ret)
1446                         goto err;
1447
1448                 bkey_reassemble(&n->k_i, d.s_c);
1449                 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1450
1451                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1452                 if (ret)
1453                         goto err;
1454
1455                 d = dirent_i_to_s_c(n);
1456         }
1457 out:
1458 err:
1459 fsck_err:
1460         printbuf_exit(&buf);
1461         return ret;
1462 }
1463
1464 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1465                         struct bch_hash_info *hash_info,
1466                         struct inode_walker *dir,
1467                         struct inode_walker *target,
1468                         struct snapshots_seen *s)
1469 {
1470         struct bch_fs *c = trans->c;
1471         struct bkey_s_c k;
1472         struct bkey_s_c_dirent d;
1473         struct inode_walker_entry *i;
1474         struct printbuf buf = PRINTBUF;
1475         int ret = 0;
1476 peek:
1477         k = bch2_btree_iter_peek(iter);
1478         if (!k.k)
1479                 goto out;
1480
1481         ret = bkey_err(k);
1482         if (ret)
1483                 goto err;
1484
1485         ret = check_key_has_snapshot(trans, iter, k);
1486         if (ret) {
1487                 ret = ret < 0 ? ret : 0;
1488                 goto out;
1489         }
1490
1491         ret = snapshots_seen_update(c, s, k.k->p);
1492         if (ret)
1493                 goto err;
1494
1495         if (k.k->type == KEY_TYPE_whiteout)
1496                 goto out;
1497
1498         if (dir->cur_inum != k.k->p.inode) {
1499                 ret = check_subdir_count(trans, dir);
1500                 if (ret)
1501                         goto err;
1502         }
1503
1504         if (!iter->path->should_be_locked) {
1505                 /* hack: see check_extent() */
1506                 goto peek;
1507         }
1508
1509         ret = __walk_inode(trans, dir, k.k->p);
1510         if (ret < 0)
1511                 goto err;
1512
1513         if (fsck_err_on(ret == INT_MAX, c,
1514                         "dirent in nonexisting directory:\n%s",
1515                         (printbuf_reset(&buf),
1516                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1517                 ret = bch2_btree_delete_at(trans, iter,
1518                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1519                 goto out;
1520         }
1521
1522         if (ret == INT_MAX) {
1523                 ret = 0;
1524                 goto out;
1525         }
1526
1527         i = dir->inodes.data + ret;
1528         ret = 0;
1529
1530         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
1531                         "dirent in non directory inode type %s:\n%s",
1532                         bch2_d_type_str(inode_d_type(&i->inode)),
1533                         (printbuf_reset(&buf),
1534                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1535                 ret = bch2_btree_delete_at(trans, iter, 0);
1536                 goto out;
1537         }
1538
1539         if (dir->first_this_inode)
1540                 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
1541
1542         ret = hash_check_key(trans, bch2_dirent_hash_desc,
1543                              hash_info, iter, k);
1544         if (ret < 0)
1545                 goto err;
1546         if (ret) {
1547                 /* dirent has been deleted */
1548                 ret = 0;
1549                 goto out;
1550         }
1551
1552         if (k.k->type != KEY_TYPE_dirent)
1553                 goto out;
1554
1555         d = bkey_s_c_to_dirent(k);
1556
1557         if (d.v->d_type == DT_SUBVOL) {
1558                 struct bch_inode_unpacked subvol_root;
1559                 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1560                 u32 target_snapshot;
1561                 u64 target_inum;
1562
1563                 ret = __subvol_lookup(trans, target_subvol,
1564                                       &target_snapshot, &target_inum);
1565                 if (ret && ret != -ENOENT)
1566                         goto err;
1567
1568                 if (fsck_err_on(ret, c,
1569                                 "dirent points to missing subvolume %llu",
1570                                 le64_to_cpu(d.v->d_child_subvol))) {
1571                         ret = __remove_dirent(trans, d.k->p);
1572                         goto err;
1573                 }
1574
1575                 ret = __lookup_inode(trans, target_inum,
1576                                    &subvol_root, &target_snapshot);
1577                 if (ret && ret != -ENOENT)
1578                         goto err;
1579
1580                 if (fsck_err_on(ret, c,
1581                                 "subvolume %u points to missing subvolume root %llu",
1582                                 target_subvol,
1583                                 target_inum)) {
1584                         bch_err(c, "repair not implemented yet");
1585                         ret = -EINVAL;
1586                         goto err;
1587                 }
1588
1589                 if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
1590                                 "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1591                                 target_inum,
1592                                 subvol_root.bi_subvol, target_subvol)) {
1593                         subvol_root.bi_subvol = target_subvol;
1594                         ret = __write_inode(trans, &subvol_root, target_snapshot);
1595                         if (ret)
1596                                 goto err;
1597                 }
1598
1599                 ret = check_dirent_target(trans, iter, d, &subvol_root,
1600                                           target_snapshot);
1601                 if (ret)
1602                         goto err;
1603         } else {
1604                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1605                 if (ret)
1606                         goto err;
1607
1608                 if (fsck_err_on(!target->inodes.nr, c,
1609                                 "dirent points to missing inode:\n%s",
1610                                 (printbuf_reset(&buf),
1611                                  bch2_bkey_val_to_text(&buf, c, k),
1612                                  buf.buf))) {
1613                         ret = __remove_dirent(trans, d.k->p);
1614                         if (ret)
1615                                 goto err;
1616                 }
1617
1618                 darray_for_each(target->inodes, i) {
1619                         ret = check_dirent_target(trans, iter, d,
1620                                                   &i->inode, i->snapshot);
1621                         if (ret)
1622                                 goto err;
1623                 }
1624         }
1625
1626         if (d.v->d_type == DT_DIR)
1627                 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
1628                         i->count++;
1629
1630 out:
1631 err:
1632 fsck_err:
1633         printbuf_exit(&buf);
1634         return ret;
1635 }
1636
1637 /*
1638  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1639  * validate d_type
1640  */
1641 noinline_for_stack
1642 static int check_dirents(struct bch_fs *c)
1643 {
1644         struct inode_walker dir = inode_walker_init();
1645         struct inode_walker target = inode_walker_init();
1646         struct snapshots_seen s;
1647         struct bch_hash_info hash_info;
1648         struct btree_trans trans;
1649         struct btree_iter iter;
1650         int ret = 0;
1651
1652         bch_verbose(c, "checking dirents");
1653
1654         snapshots_seen_init(&s);
1655         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1656
1657         bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
1658                              POS(BCACHEFS_ROOT_INO, 0),
1659                              BTREE_ITER_INTENT|
1660                              BTREE_ITER_PREFETCH|
1661                              BTREE_ITER_ALL_SNAPSHOTS);
1662
1663         do {
1664                 ret = __bch2_trans_do(&trans, NULL, NULL,
1665                                       BTREE_INSERT_LAZY_RW|
1666                                       BTREE_INSERT_NOFAIL,
1667                         check_dirent(&trans, &iter, &hash_info,
1668                                      &dir, &target, &s));
1669                 if (ret)
1670                         break;
1671         } while (bch2_btree_iter_advance(&iter));
1672         bch2_trans_iter_exit(&trans, &iter);
1673
1674         bch2_trans_exit(&trans);
1675         snapshots_seen_exit(&s);
1676         inode_walker_exit(&dir);
1677         inode_walker_exit(&target);
1678         return ret;
1679 }
1680
1681 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
1682                        struct bch_hash_info *hash_info,
1683                        struct inode_walker *inode)
1684 {
1685         struct bch_fs *c = trans->c;
1686         struct bkey_s_c k;
1687         int ret;
1688
1689         k = bch2_btree_iter_peek(iter);
1690         if (!k.k)
1691                 return 0;
1692
1693         ret = bkey_err(k);
1694         if (ret)
1695                 return ret;
1696
1697         ret = check_key_has_snapshot(trans, iter, k);
1698         if (ret)
1699                 return ret;
1700
1701         ret = __walk_inode(trans, inode, k.k->p);
1702         if (ret < 0)
1703                 return ret;
1704
1705         if (fsck_err_on(ret == INT_MAX, c,
1706                         "xattr for missing inode %llu",
1707                         k.k->p.inode))
1708                 return bch2_btree_delete_at(trans, iter, 0);
1709
1710         if (ret == INT_MAX)
1711                 return 0;
1712
1713         ret = 0;
1714
1715         if (inode->first_this_inode)
1716                 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
1717
1718         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
1719 fsck_err:
1720         return ret;
1721 }
1722
1723 /*
1724  * Walk xattrs: verify that they all have a corresponding inode
1725  */
1726 noinline_for_stack
1727 static int check_xattrs(struct bch_fs *c)
1728 {
1729         struct inode_walker inode = inode_walker_init();
1730         struct bch_hash_info hash_info;
1731         struct btree_trans trans;
1732         struct btree_iter iter;
1733         int ret = 0;
1734
1735         bch_verbose(c, "checking xattrs");
1736
1737         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1738
1739         bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
1740                              POS(BCACHEFS_ROOT_INO, 0),
1741                              BTREE_ITER_INTENT|
1742                              BTREE_ITER_PREFETCH|
1743                              BTREE_ITER_ALL_SNAPSHOTS);
1744
1745         do {
1746                 ret = __bch2_trans_do(&trans, NULL, NULL,
1747                                       BTREE_INSERT_LAZY_RW|
1748                                       BTREE_INSERT_NOFAIL,
1749                                       check_xattr(&trans, &iter, &hash_info,
1750                                                   &inode));
1751                 if (ret)
1752                         break;
1753         } while (bch2_btree_iter_advance(&iter));
1754         bch2_trans_iter_exit(&trans, &iter);
1755
1756         bch2_trans_exit(&trans);
1757         return ret;
1758 }
1759
1760 static int check_root_trans(struct btree_trans *trans)
1761 {
1762         struct bch_fs *c = trans->c;
1763         struct bch_inode_unpacked root_inode;
1764         u32 snapshot;
1765         u64 inum;
1766         int ret;
1767
1768         ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1769         if (ret && ret != -ENOENT)
1770                 return ret;
1771
1772         if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
1773                 struct bkey_i_subvolume root_subvol;
1774
1775                 snapshot        = U32_MAX;
1776                 inum            = BCACHEFS_ROOT_INO;
1777
1778                 bkey_subvolume_init(&root_subvol.k_i);
1779                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1780                 root_subvol.v.flags     = 0;
1781                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1782                 root_subvol.v.inode     = cpu_to_le64(inum);
1783                 ret = __bch2_trans_do(trans, NULL, NULL,
1784                                       BTREE_INSERT_NOFAIL|
1785                                       BTREE_INSERT_LAZY_RW,
1786                         __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
1787                 if (ret) {
1788                         bch_err(c, "error writing root subvol: %i", ret);
1789                         goto err;
1790                 }
1791
1792         }
1793
1794         ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1795         if (ret && ret != -ENOENT)
1796                 return ret;
1797
1798         if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
1799             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
1800                                 "root inode not a directory")) {
1801                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1802                                 0, NULL);
1803                 root_inode.bi_inum = inum;
1804
1805                 ret = __write_inode(trans, &root_inode, snapshot);
1806                 if (ret)
1807                         bch_err(c, "error writing root inode: %i", ret);
1808         }
1809 err:
1810 fsck_err:
1811         return ret;
1812 }
1813
1814 /* Get root directory, create if it doesn't exist: */
1815 noinline_for_stack
1816 static int check_root(struct bch_fs *c)
1817 {
1818         bch_verbose(c, "checking root directory");
1819
1820         return bch2_trans_do(c, NULL, NULL,
1821                              BTREE_INSERT_NOFAIL|
1822                              BTREE_INSERT_LAZY_RW,
1823                 check_root_trans(&trans));
1824 }
1825
1826 struct pathbuf_entry {
1827         u64     inum;
1828         u32     snapshot;
1829 };
1830
1831 typedef DARRAY(struct pathbuf_entry) pathbuf;
1832
1833 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
1834 {
1835         struct pathbuf_entry *i;
1836
1837         darray_for_each(*p, i)
1838                 if (i->inum     == inum &&
1839                     i->snapshot == snapshot)
1840                         return true;
1841
1842         return false;
1843 }
1844
1845 static int path_down(struct bch_fs *c, pathbuf *p,
1846                      u64 inum, u32 snapshot)
1847 {
1848         int ret = darray_push(*p, ((struct pathbuf_entry) {
1849                 .inum           = inum,
1850                 .snapshot       = snapshot,
1851         }));
1852
1853         if (ret)
1854                 bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
1855                         p->size);
1856         return ret;
1857 }
1858
1859 /*
1860  * Check that a given inode is reachable from the root:
1861  *
1862  * XXX: we should also be verifying that inodes are in the right subvolumes
1863  */
1864 static int check_path(struct btree_trans *trans,
1865                       pathbuf *p,
1866                       struct bch_inode_unpacked *inode,
1867                       u32 snapshot)
1868 {
1869         struct bch_fs *c = trans->c;
1870         int ret = 0;
1871
1872         snapshot = snapshot_t(c, snapshot)->equiv;
1873         p->nr = 0;
1874
1875         while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
1876                  inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
1877                 struct btree_iter dirent_iter;
1878                 struct bkey_s_c_dirent d;
1879                 u32 parent_snapshot = snapshot;
1880
1881                 if (inode->bi_subvol) {
1882                         u64 inum;
1883
1884                         ret = subvol_lookup(trans, inode->bi_parent_subvol,
1885                                             &parent_snapshot, &inum);
1886                         if (ret)
1887                                 break;
1888                 }
1889
1890                 ret = lockrestart_do(trans,
1891                         PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
1892                                           SPOS(inode->bi_dir, inode->bi_dir_offset,
1893                                                parent_snapshot))).k));
1894                 if (ret && ret != -ENOENT)
1895                         break;
1896
1897                 if (!ret && !dirent_points_to_inode(d, inode)) {
1898                         bch2_trans_iter_exit(trans, &dirent_iter);
1899                         ret = -ENOENT;
1900                 }
1901
1902                 if (ret == -ENOENT) {
1903                         if (fsck_err(c,  "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
1904                                      inode->bi_inum, snapshot,
1905                                      bch2_d_type_str(inode_d_type(inode)),
1906                                      inode->bi_nlink,
1907                                      inode->bi_dir,
1908                                      inode->bi_dir_offset))
1909                                 ret = reattach_inode(trans, inode, snapshot);
1910                         break;
1911                 }
1912
1913                 bch2_trans_iter_exit(trans, &dirent_iter);
1914
1915                 if (!S_ISDIR(inode->bi_mode))
1916                         break;
1917
1918                 ret = path_down(c, p, inode->bi_inum, snapshot);
1919                 if (ret) {
1920                         bch_err(c, "memory allocation failure");
1921                         return ret;
1922                 }
1923
1924                 snapshot = parent_snapshot;
1925
1926                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
1927                 if (ret) {
1928                         /* Should have been caught in dirents pass */
1929                         bch_err(c, "error looking up parent directory: %i", ret);
1930                         break;
1931                 }
1932
1933                 if (path_is_dup(p, inode->bi_inum, snapshot)) {
1934                         struct pathbuf_entry *i;
1935
1936                         /* XXX print path */
1937                         bch_err(c, "directory structure loop");
1938
1939                         darray_for_each(*p, i)
1940                                 pr_err("%llu:%u", i->inum, i->snapshot);
1941                         pr_err("%llu:%u", inode->bi_inum, snapshot);
1942
1943                         if (!fsck_err(c, "directory structure loop"))
1944                                 return 0;
1945
1946                         ret = __bch2_trans_do(trans, NULL, NULL,
1947                                               BTREE_INSERT_NOFAIL|
1948                                               BTREE_INSERT_LAZY_RW,
1949                                         remove_backpointer(trans, inode));
1950                         if (ret) {
1951                                 bch_err(c, "error removing dirent: %i", ret);
1952                                 break;
1953                         }
1954
1955                         ret = reattach_inode(trans, inode, snapshot);
1956                 }
1957         }
1958 fsck_err:
1959         if (ret)
1960                 bch_err(c, "%s: err %i", __func__, ret);
1961         return ret;
1962 }
1963
1964 /*
1965  * Check for unreachable inodes, as well as loops in the directory structure:
1966  * After check_dirents(), if an inode backpointer doesn't exist that means it's
1967  * unreachable:
1968  */
1969 noinline_for_stack
1970 static int check_directory_structure(struct bch_fs *c)
1971 {
1972         struct btree_trans trans;
1973         struct btree_iter iter;
1974         struct bkey_s_c k;
1975         struct bch_inode_unpacked u;
1976         pathbuf path = { 0, };
1977         int ret;
1978
1979         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1980
1981         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
1982                            BTREE_ITER_INTENT|
1983                            BTREE_ITER_PREFETCH|
1984                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
1985                 if (!bkey_is_inode(k.k))
1986                         continue;
1987
1988                 ret = bch2_inode_unpack(k, &u);
1989                 if (ret) {
1990                         /* Should have been caught earlier in fsck: */
1991                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
1992                         break;
1993                 }
1994
1995                 if (u.bi_flags & BCH_INODE_UNLINKED)
1996                         continue;
1997
1998                 ret = check_path(&trans, &path, &u, iter.pos.snapshot);
1999                 if (ret)
2000                         break;
2001         }
2002         bch2_trans_iter_exit(&trans, &iter);
2003
2004         BUG_ON(ret == -EINTR);
2005
2006         darray_exit(path);
2007
2008         bch2_trans_exit(&trans);
2009         return ret;
2010 }
2011
2012 struct nlink_table {
2013         size_t          nr;
2014         size_t          size;
2015
2016         struct nlink {
2017                 u64     inum;
2018                 u32     snapshot;
2019                 u32     count;
2020         }               *d;
2021 };
2022
2023 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2024                      u64 inum, u32 snapshot)
2025 {
2026         if (t->nr == t->size) {
2027                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2028                 void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
2029                 if (!d) {
2030                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2031                                 new_size);
2032                         return -ENOMEM;
2033                 }
2034
2035                 if (t->d)
2036                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2037                 kvfree(t->d);
2038
2039                 t->d = d;
2040                 t->size = new_size;
2041         }
2042
2043
2044         t->d[t->nr++] = (struct nlink) {
2045                 .inum           = inum,
2046                 .snapshot       = snapshot,
2047         };
2048
2049         return 0;
2050 }
2051
2052 static int nlink_cmp(const void *_l, const void *_r)
2053 {
2054         const struct nlink *l = _l;
2055         const struct nlink *r = _r;
2056
2057         return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
2058 }
2059
2060 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2061                      struct nlink_table *links,
2062                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2063 {
2064         struct nlink *link, key = {
2065                 .inum = inum, .snapshot = U32_MAX,
2066         };
2067
2068         if (inum < range_start || inum >= range_end)
2069                 return;
2070
2071         link = __inline_bsearch(&key, links->d, links->nr,
2072                                 sizeof(links->d[0]), nlink_cmp);
2073         if (!link)
2074                 return;
2075
2076         while (link > links->d && link[0].inum == link[-1].inum)
2077                 --link;
2078
2079         for (; link < links->d + links->nr && link->inum == inum; link++)
2080                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2081                         link->count++;
2082                         if (link->snapshot >= snapshot)
2083                                 break;
2084                 }
2085 }
2086
2087 noinline_for_stack
2088 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2089                                        struct nlink_table *t,
2090                                        u64 start, u64 *end)
2091 {
2092         struct btree_trans trans;
2093         struct btree_iter iter;
2094         struct bkey_s_c k;
2095         struct bch_inode_unpacked u;
2096         int ret = 0;
2097
2098         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2099
2100         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2101                            POS(0, start),
2102                            BTREE_ITER_INTENT|
2103                            BTREE_ITER_PREFETCH|
2104                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2105                 if (!bkey_is_inode(k.k))
2106                         continue;
2107
2108                 /* Should never fail, checked by bch2_inode_invalid: */
2109                 BUG_ON(bch2_inode_unpack(k, &u));
2110
2111                 /*
2112                  * Backpointer and directory structure checks are sufficient for
2113                  * directories, since they can't have hardlinks:
2114                  */
2115                 if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2116                         continue;
2117
2118                 if (!u.bi_nlink)
2119                         continue;
2120
2121                 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2122                 if (ret) {
2123                         *end = k.k->p.offset;
2124                         ret = 0;
2125                         break;
2126                 }
2127
2128         }
2129         bch2_trans_iter_exit(&trans, &iter);
2130         bch2_trans_exit(&trans);
2131
2132         if (ret)
2133                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2134
2135         return ret;
2136 }
2137
2138 noinline_for_stack
2139 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2140                                      u64 range_start, u64 range_end)
2141 {
2142         struct btree_trans trans;
2143         struct snapshots_seen s;
2144         struct btree_iter iter;
2145         struct bkey_s_c k;
2146         struct bkey_s_c_dirent d;
2147         int ret;
2148
2149         snapshots_seen_init(&s);
2150
2151         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2152
2153         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
2154                            BTREE_ITER_INTENT|
2155                            BTREE_ITER_PREFETCH|
2156                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2157                 ret = snapshots_seen_update(c, &s, k.k->p);
2158                 if (ret)
2159                         break;
2160
2161                 switch (k.k->type) {
2162                 case KEY_TYPE_dirent:
2163                         d = bkey_s_c_to_dirent(k);
2164
2165                         if (d.v->d_type != DT_DIR &&
2166                             d.v->d_type != DT_SUBVOL)
2167                                 inc_link(c, &s, links, range_start, range_end,
2168                                          le64_to_cpu(d.v->d_inum),
2169                                          d.k->p.snapshot);
2170                         break;
2171                 }
2172         }
2173         bch2_trans_iter_exit(&trans, &iter);
2174
2175         if (ret)
2176                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
2177
2178         bch2_trans_exit(&trans);
2179         snapshots_seen_exit(&s);
2180         return ret;
2181 }
2182
2183 noinline_for_stack
2184 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2185                                struct nlink_table *links,
2186                                u64 range_start, u64 range_end)
2187 {
2188         struct btree_trans trans;
2189         struct btree_iter iter;
2190         struct bkey_s_c k;
2191         struct bch_inode_unpacked u;
2192         struct nlink *link = links->d;
2193         int ret = 0;
2194
2195         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2196
2197         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2198                            POS(0, range_start),
2199                            BTREE_ITER_INTENT|
2200                            BTREE_ITER_PREFETCH|
2201                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2202                 if (k.k->p.offset >= range_end)
2203                         break;
2204
2205                 if (!bkey_is_inode(k.k))
2206                         continue;
2207
2208                 BUG_ON(bch2_inode_unpack(k, &u));
2209
2210                 if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2211                         continue;
2212
2213                 if (!u.bi_nlink)
2214                         continue;
2215
2216                 while ((cmp_int(link->inum, k.k->p.offset) ?:
2217                         cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2218                         link++;
2219                         BUG_ON(link >= links->d + links->nr);
2220                 }
2221
2222                 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
2223                                 "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
2224                                 u.bi_inum, mode_to_type(u.bi_mode),
2225                                 bch2_inode_nlink_get(&u), link->count)) {
2226                         bch2_inode_nlink_set(&u, link->count);
2227
2228                         ret = write_inode(&trans, &u, k.k->p.snapshot);
2229                         if (ret)
2230                                 bch_err(c, "error in fsck: error %i updating inode", ret);
2231                 }
2232         }
2233 fsck_err:
2234         bch2_trans_iter_exit(&trans, &iter);
2235         bch2_trans_exit(&trans);
2236
2237         if (ret)
2238                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2239
2240         return ret;
2241 }
2242
2243 noinline_for_stack
2244 static int check_nlinks(struct bch_fs *c)
2245 {
2246         struct nlink_table links = { 0 };
2247         u64 this_iter_range_start, next_iter_range_start = 0;
2248         int ret = 0;
2249
2250         bch_verbose(c, "checking inode nlinks");
2251
2252         do {
2253                 this_iter_range_start = next_iter_range_start;
2254                 next_iter_range_start = U64_MAX;
2255
2256                 ret = check_nlinks_find_hardlinks(c, &links,
2257                                                   this_iter_range_start,
2258                                                   &next_iter_range_start);
2259
2260                 ret = check_nlinks_walk_dirents(c, &links,
2261                                           this_iter_range_start,
2262                                           next_iter_range_start);
2263                 if (ret)
2264                         break;
2265
2266                 ret = check_nlinks_update_hardlinks(c, &links,
2267                                          this_iter_range_start,
2268                                          next_iter_range_start);
2269                 if (ret)
2270                         break;
2271
2272                 links.nr = 0;
2273         } while (next_iter_range_start != U64_MAX);
2274
2275         kvfree(links.d);
2276
2277         return ret;
2278 }
2279
2280 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
2281 {
2282         struct bkey_s_c k;
2283         struct bkey_s_c_reflink_p p;
2284         struct bkey_i_reflink_p *u;
2285         int ret;
2286
2287         k = bch2_btree_iter_peek(iter);
2288         if (!k.k)
2289                 return 0;
2290
2291         ret = bkey_err(k);
2292         if (ret)
2293                 return ret;
2294
2295         if (k.k->type != KEY_TYPE_reflink_p)
2296                 return 0;
2297
2298         p = bkey_s_c_to_reflink_p(k);
2299
2300         if (!p.v->front_pad && !p.v->back_pad)
2301                 return 0;
2302
2303         u = bch2_trans_kmalloc(trans, sizeof(*u));
2304         ret = PTR_ERR_OR_ZERO(u);
2305         if (ret)
2306                 return ret;
2307
2308         bkey_reassemble(&u->k_i, k);
2309         u->v.front_pad  = 0;
2310         u->v.back_pad   = 0;
2311
2312         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2313 }
2314
2315 noinline_for_stack
2316 static int fix_reflink_p(struct bch_fs *c)
2317 {
2318         struct btree_trans trans;
2319         struct btree_iter iter;
2320         struct bkey_s_c k;
2321         int ret;
2322
2323         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2324                 return 0;
2325
2326         bch_verbose(c, "fixing reflink_p keys");
2327
2328         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2329
2330         for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
2331                            BTREE_ITER_INTENT|
2332                            BTREE_ITER_PREFETCH|
2333                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2334                 if (k.k->type == KEY_TYPE_reflink_p) {
2335                         ret = __bch2_trans_do(&trans, NULL, NULL,
2336                                               BTREE_INSERT_NOFAIL|
2337                                               BTREE_INSERT_LAZY_RW,
2338                                               fix_reflink_p_key(&trans, &iter));
2339                         if (ret)
2340                                 break;
2341                 }
2342         }
2343         bch2_trans_iter_exit(&trans, &iter);
2344
2345         bch2_trans_exit(&trans);
2346         return ret;
2347 }
2348
2349 /*
2350  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
2351  * Doesn't fix them yet, mainly because they haven't yet been observed:
2352  */
2353 int bch2_fsck_full(struct bch_fs *c)
2354 {
2355         return  bch2_fs_snapshots_check(c) ?:
2356                 check_inodes(c, true) ?:
2357                 check_subvols(c) ?:
2358                 check_extents(c) ?:
2359                 check_dirents(c) ?:
2360                 check_xattrs(c) ?:
2361                 check_root(c) ?:
2362                 check_directory_structure(c) ?:
2363                 check_nlinks(c) ?:
2364                 fix_reflink_p(c);
2365 }
2366
2367 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
2368 {
2369         return check_inodes(c, false);
2370 }