]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
Update bcachefs sources to 9b3aa5ec6c bcachefs: Add tabstops to printbufs
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "dirent.h"
7 #include "error.h"
8 #include "fs-common.h"
9 #include "fsck.h"
10 #include "inode.h"
11 #include "keylist.h"
12 #include "subvolume.h"
13 #include "super.h"
14 #include "xattr.h"
15
16 #include <linux/bsearch.h>
17 #include <linux/dcache.h> /* struct qstr */
18
19 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
20
21 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
22                                     u32 snapshot)
23 {
24         struct btree_iter iter;
25         struct bkey_s_c k;
26         u64 sectors = 0;
27         int ret;
28
29         for_each_btree_key(trans, iter, BTREE_ID_extents,
30                            SPOS(inum, 0, snapshot), 0, k, ret) {
31                 if (k.k->p.inode != inum)
32                         break;
33
34                 if (bkey_extent_is_allocation(k.k))
35                         sectors += k.k->size;
36         }
37
38         bch2_trans_iter_exit(trans, &iter);
39
40         return ret ?: sectors;
41 }
42
43 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
44                                     u32 snapshot)
45 {
46         struct btree_iter iter;
47         struct bkey_s_c k;
48         struct bkey_s_c_dirent d;
49         u64 subdirs = 0;
50         int ret;
51
52         for_each_btree_key(trans, iter, BTREE_ID_dirents,
53                            SPOS(inum, 0, snapshot), 0, k, ret) {
54                 if (k.k->p.inode != inum)
55                         break;
56
57                 if (k.k->type != KEY_TYPE_dirent)
58                         continue;
59
60                 d = bkey_s_c_to_dirent(k);
61                 if (d.v->d_type == DT_DIR)
62                         subdirs++;
63         }
64
65         bch2_trans_iter_exit(trans, &iter);
66
67         return ret ?: subdirs;
68 }
69
70 static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
71                                     u32 *subvol)
72 {
73         struct btree_iter iter;
74         struct bkey_s_c k;
75         int ret;
76
77         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
78                              POS(0, snapshot), 0);
79         k = bch2_btree_iter_peek_slot(&iter);
80         ret = bkey_err(k);
81         if (ret)
82                 goto err;
83
84         if (k.k->type != KEY_TYPE_snapshot) {
85                 bch_err(trans->c, "snapshot %u not fonud", snapshot);
86                 ret = -ENOENT;
87                 goto err;
88         }
89
90         *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
91 err:
92         bch2_trans_iter_exit(trans, &iter);
93         return ret;
94
95 }
96
97 static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
98                            u32 *snapshot, u64 *inum)
99 {
100         struct bch_subvolume s;
101         int ret;
102
103         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
104
105         *snapshot = le32_to_cpu(s.snapshot);
106         *inum = le64_to_cpu(s.inode);
107         return ret;
108 }
109
110 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
111                          u32 *snapshot, u64 *inum)
112 {
113         return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
114 }
115
116 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
117                               struct bch_inode_unpacked *inode)
118 {
119         struct btree_iter iter;
120         struct bkey_s_c k;
121         int ret;
122
123         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
124                              POS(0, inode_nr),
125                              BTREE_ITER_ALL_SNAPSHOTS);
126         k = bch2_btree_iter_peek(&iter);
127         ret = bkey_err(k);
128         if (ret)
129                 goto err;
130
131         if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) {
132                 ret = -ENOENT;
133                 goto err;
134         }
135
136         ret = bch2_inode_unpack(k, inode);
137 err:
138         if (ret && ret != -EINTR)
139                 bch_err(trans->c, "error %i fetching inode %llu",
140                         ret, inode_nr);
141         bch2_trans_iter_exit(trans, &iter);
142         return ret;
143 }
144
145 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
146                           struct bch_inode_unpacked *inode,
147                           u32 *snapshot)
148 {
149         struct btree_iter iter;
150         struct bkey_s_c k;
151         int ret;
152
153         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
154                              SPOS(0, inode_nr, *snapshot), 0);
155         k = bch2_btree_iter_peek_slot(&iter);
156         ret = bkey_err(k);
157         if (ret)
158                 goto err;
159
160         ret = bkey_is_inode(k.k)
161                 ? bch2_inode_unpack(k, inode)
162                 : -ENOENT;
163         if (!ret)
164                 *snapshot = iter.pos.snapshot;
165 err:
166         if (ret && ret != -EINTR)
167                 bch_err(trans->c, "error %i fetching inode %llu:%u",
168                         ret, inode_nr, *snapshot);
169         bch2_trans_iter_exit(trans, &iter);
170         return ret;
171 }
172
173 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
174                         struct bch_inode_unpacked *inode,
175                         u32 *snapshot)
176 {
177         return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
178 }
179
180 static int __lookup_dirent(struct btree_trans *trans,
181                            struct bch_hash_info hash_info,
182                            subvol_inum dir, struct qstr *name,
183                            u64 *target, unsigned *type)
184 {
185         struct btree_iter iter;
186         struct bkey_s_c_dirent d;
187         int ret;
188
189         ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
190                                &hash_info, dir, name, 0);
191         if (ret)
192                 return ret;
193
194         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
195         *target = le64_to_cpu(d.v->d_inum);
196         *type = d.v->d_type;
197         bch2_trans_iter_exit(trans, &iter);
198         return 0;
199 }
200
201 static int __write_inode(struct btree_trans *trans,
202                          struct bch_inode_unpacked *inode,
203                          u32 snapshot)
204 {
205         struct btree_iter iter;
206         int ret;
207
208         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
209                             SPOS(0, inode->bi_inum, snapshot),
210                             BTREE_ITER_INTENT);
211
212         ret   = bch2_btree_iter_traverse(&iter) ?:
213                 bch2_inode_write(trans, &iter, inode);
214         bch2_trans_iter_exit(trans, &iter);
215         return ret;
216 }
217
218 static int write_inode(struct btree_trans *trans,
219                        struct bch_inode_unpacked *inode,
220                        u32 snapshot)
221 {
222         int ret = __bch2_trans_do(trans, NULL, NULL,
223                                   BTREE_INSERT_NOFAIL|
224                                   BTREE_INSERT_LAZY_RW,
225                                   __write_inode(trans, inode, snapshot));
226         if (ret)
227                 bch_err(trans->c, "error in fsck: error %i updating inode", ret);
228         return ret;
229 }
230
231 static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
232 {
233         struct btree_iter iter = { NULL };
234         struct bkey_i_inode_generation delete;
235         struct bch_inode_unpacked inode_u;
236         struct bkey_s_c k;
237         int ret;
238
239         ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
240                                               SPOS(inum, 0, snapshot),
241                                               SPOS(inum, U64_MAX, snapshot),
242                                               0, NULL) ?:
243                 bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
244                                               SPOS(inum, 0, snapshot),
245                                               SPOS(inum, U64_MAX, snapshot),
246                                               0, NULL) ?:
247                 bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
248                                               SPOS(inum, 0, snapshot),
249                                               SPOS(inum, U64_MAX, snapshot),
250                                               0, NULL);
251         if (ret)
252                 goto err;
253 retry:
254         bch2_trans_begin(trans);
255
256         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
257                              SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
258         k = bch2_btree_iter_peek_slot(&iter);
259
260         ret = bkey_err(k);
261         if (ret)
262                 goto err;
263
264         if (!bkey_is_inode(k.k)) {
265                 bch2_fs_inconsistent(trans->c,
266                                      "inode %llu:%u not found when deleting",
267                                      inum, snapshot);
268                 ret = -EIO;
269                 goto err;
270         }
271
272         bch2_inode_unpack(k, &inode_u);
273
274         /* Subvolume root? */
275         if (inode_u.bi_subvol) {
276                 ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
277                 if (ret)
278                         goto err;
279         }
280
281         bkey_inode_generation_init(&delete.k_i);
282         delete.k.p = iter.pos;
283         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
284
285         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
286                 bch2_trans_commit(trans, NULL, NULL,
287                                 BTREE_INSERT_NOFAIL);
288 err:
289         bch2_trans_iter_exit(trans, &iter);
290         if (ret == -EINTR)
291                 goto retry;
292
293         return ret;
294 }
295
296 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
297 {
298         struct bch_fs *c = trans->c;
299         struct btree_iter iter;
300         struct bch_inode_unpacked dir_inode;
301         struct bch_hash_info dir_hash_info;
302         int ret;
303
304         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
305         if (ret)
306                 return ret;
307
308         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
309
310         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
311
312         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
313                                   &dir_hash_info, &iter, 0);
314         bch2_trans_iter_exit(trans, &iter);
315         return ret;
316 }
317
318 /* Get lost+found, create if it doesn't exist: */
319 static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
320                             struct bch_inode_unpacked *lostfound)
321 {
322         struct bch_fs *c = trans->c;
323         struct bch_inode_unpacked root;
324         struct bch_hash_info root_hash_info;
325         struct qstr lostfound_str = QSTR("lost+found");
326         subvol_inum root_inum = { .subvol = subvol };
327         u64 inum = 0;
328         unsigned d_type = 0;
329         u32 snapshot;
330         int ret;
331
332         ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
333         if (ret)
334                 return ret;
335
336         ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
337         if (ret)
338                 return ret;
339
340         root_hash_info = bch2_hash_info_init(c, &root);
341
342         ret = __lookup_dirent(trans, root_hash_info, root_inum,
343                             &lostfound_str, &inum, &d_type);
344         if (ret == -ENOENT) {
345                 bch_notice(c, "creating lost+found");
346                 goto create_lostfound;
347         }
348
349         if (ret && ret != -EINTR)
350                 bch_err(c, "error looking up lost+found: %i", ret);
351         if (ret)
352                 return ret;
353
354         if (d_type != DT_DIR) {
355                 bch_err(c, "error looking up lost+found: not a directory");
356                 return ret;
357         }
358
359         /*
360          * The check_dirents pass has already run, dangling dirents
361          * shouldn't exist here:
362          */
363         return __lookup_inode(trans, inum, lostfound, &snapshot);
364
365 create_lostfound:
366         bch2_inode_init_early(c, lostfound);
367
368         ret = bch2_create_trans(trans, root_inum, &root,
369                                 lostfound, &lostfound_str,
370                                 0, 0, S_IFDIR|0700, 0, NULL, NULL,
371                                 (subvol_inum) { }, 0);
372         if (ret && ret != -EINTR)
373                 bch_err(c, "error creating lost+found: %i", ret);
374         return ret;
375 }
376
377 static int __reattach_inode(struct btree_trans *trans,
378                           struct bch_inode_unpacked *inode,
379                           u32 inode_snapshot)
380 {
381         struct bch_hash_info dir_hash;
382         struct bch_inode_unpacked lostfound;
383         char name_buf[20];
384         struct qstr name;
385         u64 dir_offset = 0;
386         u32 subvol;
387         int ret;
388
389         ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
390         if (ret)
391                 return ret;
392
393         ret = lookup_lostfound(trans, subvol, &lostfound);
394         if (ret)
395                 return ret;
396
397         if (S_ISDIR(inode->bi_mode)) {
398                 lostfound.bi_nlink++;
399
400                 ret = __write_inode(trans, &lostfound, U32_MAX);
401                 if (ret)
402                         return ret;
403         }
404
405         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
406
407         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
408         name = (struct qstr) QSTR(name_buf);
409
410         ret = bch2_dirent_create(trans,
411                                  (subvol_inum) {
412                                         .subvol = subvol,
413                                         .inum = lostfound.bi_inum,
414                                  },
415                                  &dir_hash,
416                                  inode_d_type(inode),
417                                  &name, inode->bi_inum, &dir_offset,
418                                  BCH_HASH_SET_MUST_CREATE);
419         if (ret)
420                 return ret;
421
422         inode->bi_dir           = lostfound.bi_inum;
423         inode->bi_dir_offset    = dir_offset;
424
425         return __write_inode(trans, inode, inode_snapshot);
426 }
427
428 static int reattach_inode(struct btree_trans *trans,
429                           struct bch_inode_unpacked *inode,
430                           u32 inode_snapshot)
431 {
432         int ret = __bch2_trans_do(trans, NULL, NULL,
433                                   BTREE_INSERT_LAZY_RW|
434                                   BTREE_INSERT_NOFAIL,
435                         __reattach_inode(trans, inode, inode_snapshot));
436         if (ret) {
437                 bch_err(trans->c, "error %i reattaching inode %llu",
438                         ret, inode->bi_inum);
439                 return ret;
440         }
441
442         return ret;
443 }
444
445 static int remove_backpointer(struct btree_trans *trans,
446                               struct bch_inode_unpacked *inode)
447 {
448         struct btree_iter iter;
449         struct bkey_s_c k;
450         int ret;
451
452         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
453                              POS(inode->bi_dir, inode->bi_dir_offset), 0);
454         k = bch2_btree_iter_peek_slot(&iter);
455         ret = bkey_err(k);
456         if (ret)
457                 goto out;
458         if (k.k->type != KEY_TYPE_dirent) {
459                 ret = -ENOENT;
460                 goto out;
461         }
462
463         ret = __remove_dirent(trans, k.k->p);
464 out:
465         bch2_trans_iter_exit(trans, &iter);
466         return ret;
467 }
468
469 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
470 {
471         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
472
473         if (bkey_cmp(s->pos, pos))
474                 s->nr = 0;
475         s->pos = pos;
476
477         /* Might get called multiple times due to lock restarts */
478         if (s->nr && s->d[s->nr - 1] == pos.snapshot)
479                 return 0;
480
481         return snapshots_seen_add(c, s, pos.snapshot);
482 }
483
484 /**
485  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
486  * and @ancestor hasn't been overwritten in @seen
487  *
488  * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
489  */
490 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
491                                     u32 id, u32 ancestor)
492 {
493         ssize_t i;
494
495         BUG_ON(id > ancestor);
496
497         id              = snapshot_t(c, id)->equiv;
498         ancestor        = snapshot_t(c, ancestor)->equiv;
499
500         /* @ancestor should be the snapshot most recently added to @seen */
501         BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
502         BUG_ON(seen->pos.snapshot != ancestor);
503
504         if (id == ancestor)
505                 return true;
506
507         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
508                 return false;
509
510         for (i = seen->nr - 2;
511              i >= 0 && seen->d[i] >= id;
512              --i)
513                 if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
514                     bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
515                         return false;
516
517         return true;
518 }
519
520 /**
521  * ref_visible - given a key with snapshot id @src that points to a key with
522  * snapshot id @dst, test whether there is some snapshot in which @dst is
523  * visible.
524  *
525  * This assumes we're visiting @src keys in natural key order.
526  *
527  * @s   - list of snapshot IDs already seen at @src
528  * @src - snapshot ID of src key
529  * @dst - snapshot ID of dst key
530  */
531 static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
532                        u32 src, u32 dst)
533 {
534         return dst <= src
535                 ? key_visible_in_snapshot(c, s, dst, src)
536                 : bch2_snapshot_is_ancestor(c, src, dst);
537 }
538
539 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)       \
540         for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
541                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
542
543 struct inode_walker {
544         bool                            first_this_inode;
545         u64                             cur_inum;
546
547         size_t                          nr;
548         size_t                          size;
549         struct inode_walker_entry {
550                 struct bch_inode_unpacked inode;
551                 u32                     snapshot;
552                 u64                     count;
553         } *d;
554 };
555
556 static void inode_walker_exit(struct inode_walker *w)
557 {
558         kfree(w->d);
559         w->d = NULL;
560 }
561
562 static struct inode_walker inode_walker_init(void)
563 {
564         return (struct inode_walker) { 0, };
565 }
566
567 static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w)
568 {
569         if (w->nr == w->size) {
570                 size_t new_size = max_t(size_t, 8UL, w->size * 2);
571                 void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
572                                    GFP_KERNEL);
573                 if (!d) {
574                         bch_err(c, "fsck: error allocating memory for inode_walker, size %zu",
575                                 new_size);
576                         return -ENOMEM;
577                 }
578
579                 w->d = d;
580                 w->size = new_size;
581         }
582
583         return 0;
584 }
585
586 static int add_inode(struct bch_fs *c, struct inode_walker *w,
587                      struct bkey_s_c inode)
588 {
589         struct bch_inode_unpacked u;
590         int ret;
591
592         ret = inode_walker_realloc(c, w);
593         if (ret)
594                 return ret;
595
596         BUG_ON(bch2_inode_unpack(inode, &u));
597
598         w->d[w->nr++] = (struct inode_walker_entry) {
599                 .inode          = u,
600                 .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
601         };
602
603         return 0;
604 }
605
606 static int __walk_inode(struct btree_trans *trans,
607                         struct inode_walker *w, struct bpos pos)
608 {
609         struct bch_fs *c = trans->c;
610         struct btree_iter iter;
611         struct bkey_s_c k;
612         unsigned i, ancestor_pos;
613         int ret;
614
615         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
616
617         if (pos.inode == w->cur_inum) {
618                 w->first_this_inode = false;
619                 goto lookup_snapshot;
620         }
621
622         w->nr = 0;
623
624         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
625                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
626                 if (k.k->p.offset != pos.inode)
627                         break;
628
629                 if (bkey_is_inode(k.k))
630                         add_inode(c, w, k);
631         }
632         bch2_trans_iter_exit(trans, &iter);
633
634         if (ret)
635                 return ret;
636
637         w->cur_inum             = pos.inode;
638         w->first_this_inode     = true;
639 lookup_snapshot:
640         for (i = 0; i < w->nr; i++)
641                 if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
642                         goto found;
643         return INT_MAX;
644 found:
645         BUG_ON(pos.snapshot > w->d[i].snapshot);
646
647         if (pos.snapshot != w->d[i].snapshot) {
648                 ancestor_pos = i;
649
650                 while (i && w->d[i - 1].snapshot > pos.snapshot)
651                         --i;
652
653                 ret = inode_walker_realloc(c, w);
654                 if (ret)
655                         return ret;
656
657                 array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
658                 w->d[i].snapshot = pos.snapshot;
659                 w->d[i].count   = 0;
660         }
661
662         return i;
663 }
664
665 static int __get_visible_inodes(struct btree_trans *trans,
666                                 struct inode_walker *w,
667                                 struct snapshots_seen *s,
668                                 u64 inum)
669 {
670         struct bch_fs *c = trans->c;
671         struct btree_iter iter;
672         struct bkey_s_c k;
673         int ret;
674
675         w->nr = 0;
676
677         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
678                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
679                 if (k.k->p.offset != inum)
680                         break;
681
682                 if (!bkey_is_inode(k.k))
683                         continue;
684
685                 if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
686                         add_inode(c, w, k);
687                         if (k.k->p.snapshot >= s->pos.snapshot)
688                                 break;
689                 }
690         }
691         bch2_trans_iter_exit(trans, &iter);
692
693         return ret;
694 }
695
696 static int check_key_has_snapshot(struct btree_trans *trans,
697                                   struct btree_iter *iter,
698                                   struct bkey_s_c k)
699 {
700         struct bch_fs *c = trans->c;
701         char buf[200];
702         int ret = 0;
703
704         if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
705                         "key in missing snapshot: %s",
706                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
707                 return bch2_btree_delete_at(trans, iter,
708                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
709 fsck_err:
710         return ret;
711 }
712
713 static int hash_redo_key(struct btree_trans *trans,
714                          const struct bch_hash_desc desc,
715                          struct bch_hash_info *hash_info,
716                          struct btree_iter *k_iter, struct bkey_s_c k)
717 {
718         bch_err(trans->c, "hash_redo_key() not implemented yet");
719         return -EINVAL;
720 #if 0
721         struct bkey_i *delete;
722         struct bkey_i *tmp;
723
724         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
725         if (IS_ERR(delete))
726                 return PTR_ERR(delete);
727
728         tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
729         if (IS_ERR(tmp))
730                 return PTR_ERR(tmp);
731
732         bkey_reassemble(tmp, k);
733
734         bkey_init(&delete->k);
735         delete->k.p = k_iter->pos;
736         return  bch2_btree_iter_traverse(k_iter) ?:
737                 bch2_trans_update(trans, k_iter, delete, 0) ?:
738                 bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
739 #endif
740 }
741
742 static int hash_check_key(struct btree_trans *trans,
743                           const struct bch_hash_desc desc,
744                           struct bch_hash_info *hash_info,
745                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
746 {
747         struct bch_fs *c = trans->c;
748         struct btree_iter iter = { NULL };
749         char buf[200];
750         struct bkey_s_c k;
751         u64 hash;
752         int ret = 0;
753
754         if (hash_k.k->type != desc.key_type)
755                 return 0;
756
757         hash = desc.hash_bkey(hash_info, hash_k);
758
759         if (likely(hash == hash_k.k->p.offset))
760                 return 0;
761
762         if (hash_k.k->p.offset < hash)
763                 goto bad_hash;
764
765         for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
766                            BTREE_ITER_SLOTS, k, ret) {
767                 if (!bkey_cmp(k.k->p, hash_k.k->p))
768                         break;
769
770                 if (fsck_err_on(k.k->type == desc.key_type &&
771                                 !desc.cmp_bkey(k, hash_k), c,
772                                 "duplicate hash table keys:\n%s",
773                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
774                                                        hash_k), buf))) {
775                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
776                         break;
777                 }
778
779                 if (bkey_deleted(k.k)) {
780                         bch2_trans_iter_exit(trans, &iter);
781                         goto bad_hash;
782                 }
783
784         }
785         bch2_trans_iter_exit(trans, &iter);
786         return ret;
787 bad_hash:
788         if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
789                      "hashed to %llu\n%s",
790                      desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
791                      (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
792                 return 0;
793
794         ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
795         if (ret) {
796                 bch_err(c, "hash_redo_key err %i", ret);
797                 return ret;
798         }
799         return -EINTR;
800 fsck_err:
801         return ret;
802 }
803
804 static int check_inode(struct btree_trans *trans,
805                        struct btree_iter *iter,
806                        struct bch_inode_unpacked *prev,
807                        bool full)
808 {
809         struct bch_fs *c = trans->c;
810         struct bkey_s_c k;
811         struct bch_inode_unpacked u;
812         bool do_update = false;
813         int ret;
814
815         k = bch2_btree_iter_peek(iter);
816         if (!k.k)
817                 return 0;
818
819         ret = bkey_err(k);
820         if (ret)
821                 return ret;
822
823         ret = check_key_has_snapshot(trans, iter, k);
824         if (ret)
825                 return ret < 0 ? ret : 0;
826
827         /*
828          * if snapshot id isn't a leaf node, skip it - deletion in
829          * particular is not atomic, so on the internal snapshot nodes
830          * we can see inodes marked for deletion after a clean shutdown
831          */
832         if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
833                 return 0;
834
835         if (!bkey_is_inode(k.k))
836                 return 0;
837
838         BUG_ON(bch2_inode_unpack(k, &u));
839
840         if (!full &&
841             !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
842                             BCH_INODE_I_SECTORS_DIRTY|
843                             BCH_INODE_UNLINKED)))
844                 return 0;
845
846         if (prev->bi_inum != u.bi_inum)
847                 *prev = u;
848
849         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
850                         inode_d_type(prev)      != inode_d_type(&u), c,
851                         "inodes in different snapshots don't match")) {
852                 bch_err(c, "repair not implemented yet");
853                 return -EINVAL;
854         }
855
856         if (u.bi_flags & BCH_INODE_UNLINKED &&
857             (!c->sb.clean ||
858              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
859                       u.bi_inum))) {
860                 bch2_trans_unlock(trans);
861                 bch2_fs_lazy_rw(c);
862
863                 ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
864                 if (ret)
865                         bch_err(c, "error in fsck: error %i while deleting inode", ret);
866                 return ret;
867         }
868
869         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
870             (!c->sb.clean ||
871              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
872                       u.bi_inum))) {
873                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
874
875                 bch2_trans_unlock(trans);
876                 bch2_fs_lazy_rw(c);
877
878                 /*
879                  * XXX: need to truncate partial blocks too here - or ideally
880                  * just switch units to bytes and that issue goes away
881                  */
882                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
883                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
884                                      iter->pos.snapshot),
885                                 POS(u.bi_inum, U64_MAX),
886                                 0, NULL);
887                 if (ret) {
888                         bch_err(c, "error in fsck: error %i truncating inode", ret);
889                         return ret;
890                 }
891
892                 /*
893                  * We truncated without our normal sector accounting hook, just
894                  * make sure we recalculate it:
895                  */
896                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
897
898                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
899                 do_update = true;
900         }
901
902         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
903             (!c->sb.clean ||
904              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
905                       u.bi_inum))) {
906                 s64 sectors;
907
908                 bch_verbose(c, "recounting sectors for inode %llu",
909                             u.bi_inum);
910
911                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
912                 if (sectors < 0) {
913                         bch_err(c, "error in fsck: error %i recounting inode sectors",
914                                 (int) sectors);
915                         return sectors;
916                 }
917
918                 u.bi_sectors = sectors;
919                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
920                 do_update = true;
921         }
922
923         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
924                 u.bi_dir = 0;
925                 u.bi_dir_offset = 0;
926                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
927                 do_update = true;
928         }
929
930         if (do_update) {
931                 ret = write_inode(trans, &u, iter->pos.snapshot);
932                 if (ret)
933                         bch_err(c, "error in fsck: error %i "
934                                 "updating inode", ret);
935         }
936 fsck_err:
937         return ret;
938 }
939
940 noinline_for_stack
941 static int check_inodes(struct bch_fs *c, bool full)
942 {
943         struct btree_trans trans;
944         struct btree_iter iter;
945         struct bch_inode_unpacked prev = { 0 };
946         int ret;
947
948         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
949
950         bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
951                              BTREE_ITER_INTENT|
952                              BTREE_ITER_PREFETCH|
953                              BTREE_ITER_ALL_SNAPSHOTS);
954
955         do {
956                 ret = __bch2_trans_do(&trans, NULL, NULL,
957                                       BTREE_INSERT_LAZY_RW|
958                                       BTREE_INSERT_NOFAIL,
959                         check_inode(&trans, &iter, &prev, full));
960                 if (ret)
961                         break;
962         } while (bch2_btree_iter_advance(&iter));
963         bch2_trans_iter_exit(&trans, &iter);
964
965         bch2_trans_exit(&trans);
966         return ret;
967 }
968
969 static int check_subvol(struct btree_trans *trans,
970                         struct btree_iter *iter)
971 {
972         struct bkey_s_c k;
973         struct bkey_s_c_subvolume subvol;
974         int ret;
975
976         k = bch2_btree_iter_peek(iter);
977         if (!k.k)
978                 return 0;
979
980         ret = bkey_err(k);
981         if (ret)
982                 return ret;
983
984         if (k.k->type != KEY_TYPE_subvolume)
985                 return 0;
986
987         subvol = bkey_s_c_to_subvolume(k);
988
989         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
990                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
991                 if (ret && ret != -EINTR)
992                         bch_err(trans->c, "error deleting subvolume %llu: %i",
993                                 iter->pos.offset, ret);
994                 if (ret)
995                         return ret;
996         }
997
998         return 0;
999 }
1000
1001 noinline_for_stack
1002 static int check_subvols(struct bch_fs *c)
1003 {
1004         struct btree_trans trans;
1005         struct btree_iter iter;
1006         int ret;
1007
1008         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1009
1010         bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
1011                              POS_MIN,
1012                              BTREE_ITER_INTENT|
1013                              BTREE_ITER_PREFETCH);
1014
1015         do {
1016                 ret = __bch2_trans_do(&trans, NULL, NULL,
1017                                       BTREE_INSERT_LAZY_RW|
1018                                       BTREE_INSERT_NOFAIL,
1019                                       check_subvol(&trans, &iter));
1020                 if (ret)
1021                         break;
1022         } while (bch2_btree_iter_advance(&iter));
1023         bch2_trans_iter_exit(&trans, &iter);
1024
1025         bch2_trans_exit(&trans);
1026         return ret;
1027 }
1028
1029 /*
1030  * Checking for overlapping extents needs to be reimplemented
1031  */
1032 #if 0
1033 static int fix_overlapping_extent(struct btree_trans *trans,
1034                                        struct bkey_s_c k, struct bpos cut_at)
1035 {
1036         struct btree_iter iter;
1037         struct bkey_i *u;
1038         int ret;
1039
1040         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1041         ret = PTR_ERR_OR_ZERO(u);
1042         if (ret)
1043                 return ret;
1044
1045         bkey_reassemble(u, k);
1046         bch2_cut_front(cut_at, u);
1047
1048
1049         /*
1050          * We don't want to go through the extent_handle_overwrites path:
1051          *
1052          * XXX: this is going to screw up disk accounting, extent triggers
1053          * assume things about extent overwrites - we should be running the
1054          * triggers manually here
1055          */
1056         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
1057                              BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
1058
1059         BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
1060         ret   = bch2_btree_iter_traverse(&iter) ?:
1061                 bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
1062                 bch2_trans_commit(trans, NULL, NULL,
1063                                   BTREE_INSERT_NOFAIL|
1064                                   BTREE_INSERT_LAZY_RW);
1065         bch2_trans_iter_exit(trans, &iter);
1066         return ret;
1067 }
1068 #endif
1069
1070 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
1071                                                 struct btree_iter *iter,
1072                                                 struct bpos pos)
1073 {
1074         struct bkey_s_c k;
1075         int ret;
1076
1077         bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
1078         k = bch2_btree_iter_peek_slot(iter);
1079         ret = bkey_err(k);
1080         if (!ret && k.k->type != KEY_TYPE_dirent)
1081                 ret = -ENOENT;
1082         if (ret) {
1083                 bch2_trans_iter_exit(trans, iter);
1084                 return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
1085         }
1086
1087         return bkey_s_c_to_dirent(k);
1088 }
1089
1090 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
1091                                    struct bkey_s_c_dirent d)
1092 {
1093         return  inode->bi_dir           == d.k->p.inode &&
1094                 inode->bi_dir_offset    == d.k->p.offset;
1095 }
1096
1097 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
1098                                    struct bch_inode_unpacked *inode)
1099 {
1100         return d.v->d_type == DT_SUBVOL
1101                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
1102                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
1103 }
1104
1105 static int inode_backpointer_exists(struct btree_trans *trans,
1106                                     struct bch_inode_unpacked *inode,
1107                                     u32 snapshot)
1108 {
1109         struct btree_iter iter;
1110         struct bkey_s_c_dirent d;
1111         int ret;
1112
1113         d = dirent_get_by_pos(trans, &iter,
1114                         SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
1115         ret = bkey_err(d.s_c);
1116         if (ret)
1117                 return ret;
1118
1119         ret = dirent_points_to_inode(d, inode);
1120         bch2_trans_iter_exit(trans, &iter);
1121         return ret;
1122 }
1123
1124 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1125 {
1126         struct bch_fs *c = trans->c;
1127         struct inode_walker_entry *i;
1128         int ret = 0, ret2 = 0;
1129         s64 count2;
1130
1131         for (i = w->d; i < w->d + w->nr; i++) {
1132                 if (i->inode.bi_sectors == i->count)
1133                         continue;
1134
1135                 count2 = lockrestart_do(trans,
1136                         bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
1137
1138                 if (i->count != count2) {
1139                         bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
1140                                 i->count, count2);
1141                         i->count = count2;
1142                         if (i->inode.bi_sectors == i->count)
1143                                 continue;
1144                 }
1145
1146                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
1147                             "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1148                             w->cur_inum, i->snapshot,
1149                             i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
1150                         continue;
1151
1152                 i->inode.bi_sectors = i->count;
1153                 ret = write_inode(trans, &i->inode, i->snapshot);
1154                 if (ret)
1155                         break;
1156                 ret2 = -EINTR;
1157         }
1158 fsck_err:
1159         return ret ?: ret2;
1160 }
1161
1162 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1163                         struct inode_walker *inode,
1164                         struct snapshots_seen *s)
1165 {
1166         struct bch_fs *c = trans->c;
1167         struct bkey_s_c k;
1168         struct inode_walker_entry *i;
1169         char buf[200];
1170         int ret = 0;
1171
1172         k = bch2_btree_iter_peek(iter);
1173         if (!k.k)
1174                 return 0;
1175
1176         ret = bkey_err(k);
1177         if (ret)
1178                 return ret;
1179
1180         ret = check_key_has_snapshot(trans, iter, k);
1181         if (ret)
1182                 return ret < 0 ? ret : 0;
1183
1184         ret = snapshots_seen_update(c, s, k.k->p);
1185         if (ret)
1186                 return ret;
1187
1188         if (k.k->type == KEY_TYPE_whiteout)
1189                 return 0;
1190
1191         if (inode->cur_inum != k.k->p.inode) {
1192                 ret = check_i_sectors(trans, inode);
1193                 if (ret)
1194                         return ret;
1195         }
1196 #if 0
1197         if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
1198                 char buf1[200];
1199                 char buf2[200];
1200
1201                 bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
1202                 bch2_bkey_val_to_text(&PBUF(buf2), c, k);
1203
1204                 if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
1205                         return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
1206         }
1207 #endif
1208         ret = __walk_inode(trans, inode, k.k->p);
1209         if (ret < 0)
1210                 return ret;
1211
1212         if (fsck_err_on(ret == INT_MAX, c,
1213                         "extent in missing inode:\n  %s",
1214                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1215                 return bch2_btree_delete_at(trans, iter,
1216                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1217
1218         if (ret == INT_MAX)
1219                 return 0;
1220
1221         i = inode->d + ret;
1222         ret = 0;
1223
1224         if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
1225                         !S_ISLNK(i->inode.bi_mode), c,
1226                         "extent in non regular inode mode %o:\n  %s",
1227                         i->inode.bi_mode,
1228                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1229                 return bch2_btree_delete_at(trans, iter,
1230                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1231
1232         if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
1233                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
1234                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
1235                                         k.k->type != KEY_TYPE_reservation &&
1236                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
1237                                         "extent type %u offset %llu past end of inode %llu, i_size %llu",
1238                                         k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
1239                                 bch2_fs_lazy_rw(c);
1240                                 return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1241                                                 SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
1242                                                      k.k->p.snapshot),
1243                                                 POS(k.k->p.inode, U64_MAX),
1244                                                 0, NULL) ?: -EINTR;
1245                         }
1246                 }
1247         }
1248
1249         if (bkey_extent_is_allocation(k.k))
1250                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
1251                         i->count += k.k->size;
1252 #if 0
1253         bch2_bkey_buf_reassemble(&prev, c, k);
1254 #endif
1255
1256 fsck_err:
1257         return ret;
1258 }
1259
1260 /*
1261  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1262  * that i_size an i_sectors are consistent
1263  */
1264 noinline_for_stack
1265 static int check_extents(struct bch_fs *c)
1266 {
1267         struct inode_walker w = inode_walker_init();
1268         struct snapshots_seen s;
1269         struct btree_trans trans;
1270         struct btree_iter iter;
1271         int ret = 0;
1272
1273 #if 0
1274         struct bkey_buf prev;
1275         bch2_bkey_buf_init(&prev);
1276         prev.k->k = KEY(0, 0, 0);
1277 #endif
1278         snapshots_seen_init(&s);
1279         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1280
1281         bch_verbose(c, "checking extents");
1282
1283         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
1284                              POS(BCACHEFS_ROOT_INO, 0),
1285                              BTREE_ITER_INTENT|
1286                              BTREE_ITER_PREFETCH|
1287                              BTREE_ITER_ALL_SNAPSHOTS);
1288
1289         do {
1290                 ret = __bch2_trans_do(&trans, NULL, NULL,
1291                                       BTREE_INSERT_LAZY_RW|
1292                                       BTREE_INSERT_NOFAIL,
1293                         check_extent(&trans, &iter, &w, &s));
1294                 if (ret)
1295                         break;
1296         } while (bch2_btree_iter_advance(&iter));
1297         bch2_trans_iter_exit(&trans, &iter);
1298 #if 0
1299         bch2_bkey_buf_exit(&prev, c);
1300 #endif
1301         inode_walker_exit(&w);
1302         bch2_trans_exit(&trans);
1303         snapshots_seen_exit(&s);
1304
1305         return ret;
1306 }
1307
1308 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1309 {
1310         struct bch_fs *c = trans->c;
1311         struct inode_walker_entry *i;
1312         int ret = 0, ret2 = 0;
1313         s64 count2;
1314
1315         for (i = w->d; i < w->d + w->nr; i++) {
1316                 if (i->inode.bi_nlink == i->count)
1317                         continue;
1318
1319                 count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
1320                 if (count2 < 0)
1321                         return count2;
1322
1323                 if (i->count != count2) {
1324                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1325                                 i->count, count2);
1326                         i->count = count2;
1327                         if (i->inode.bi_nlink == i->count)
1328                                 continue;
1329                 }
1330
1331                 if (fsck_err_on(i->inode.bi_nlink != i->count, c,
1332                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1333                                 w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
1334                         i->inode.bi_nlink = i->count;
1335                         ret = write_inode(trans, &i->inode, i->snapshot);
1336                         if (ret)
1337                                 break;
1338                         ret2 = -EINTR;
1339                 }
1340         }
1341 fsck_err:
1342         return ret ?: ret2;
1343 }
1344
1345 static int check_dirent_target(struct btree_trans *trans,
1346                                struct btree_iter *iter,
1347                                struct bkey_s_c_dirent d,
1348                                struct bch_inode_unpacked *target,
1349                                u32 target_snapshot)
1350 {
1351         struct bch_fs *c = trans->c;
1352         struct bkey_i_dirent *n;
1353         bool backpointer_exists = true;
1354         char buf[200];
1355         int ret = 0;
1356
1357         if (!target->bi_dir &&
1358             !target->bi_dir_offset) {
1359                 target->bi_dir          = d.k->p.inode;
1360                 target->bi_dir_offset   = d.k->p.offset;
1361
1362                 ret = __write_inode(trans, target, target_snapshot);
1363                 if (ret)
1364                         goto err;
1365         }
1366
1367         if (!inode_points_to_dirent(target, d)) {
1368                 ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
1369                 if (ret < 0)
1370                         goto err;
1371
1372                 backpointer_exists = ret;
1373                 ret = 0;
1374
1375                 if (fsck_err_on(S_ISDIR(target->bi_mode) &&
1376                                 backpointer_exists, c,
1377                                 "directory %llu with multiple links",
1378                                 target->bi_inum)) {
1379                         ret = __remove_dirent(trans, d.k->p);
1380                         if (ret)
1381                                 goto err;
1382                         return 0;
1383                 }
1384
1385                 if (fsck_err_on(backpointer_exists &&
1386                                 !target->bi_nlink, c,
1387                                 "inode %llu has multiple links but i_nlink 0",
1388                                 target->bi_inum)) {
1389                         target->bi_nlink++;
1390                         target->bi_flags &= ~BCH_INODE_UNLINKED;
1391
1392                         ret = __write_inode(trans, target, target_snapshot);
1393                         if (ret)
1394                                 goto err;
1395                 }
1396
1397                 if (fsck_err_on(!backpointer_exists, c,
1398                                 "inode %llu:%u has wrong backpointer:\n"
1399                                 "got       %llu:%llu\n"
1400                                 "should be %llu:%llu",
1401                                 target->bi_inum, target_snapshot,
1402                                 target->bi_dir,
1403                                 target->bi_dir_offset,
1404                                 d.k->p.inode,
1405                                 d.k->p.offset)) {
1406                         target->bi_dir          = d.k->p.inode;
1407                         target->bi_dir_offset   = d.k->p.offset;
1408
1409                         ret = __write_inode(trans, target, target_snapshot);
1410                         if (ret)
1411                                 goto err;
1412                 }
1413         }
1414
1415         if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
1416                         "incorrect d_type: got %s, should be %s:\n%s",
1417                         bch2_d_type_str(d.v->d_type),
1418                         bch2_d_type_str(inode_d_type(target)),
1419                         (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
1420                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1421                 ret = PTR_ERR_OR_ZERO(n);
1422                 if (ret)
1423                         return ret;
1424
1425                 bkey_reassemble(&n->k_i, d.s_c);
1426                 n->v.d_type = inode_d_type(target);
1427
1428                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1429                 if (ret)
1430                         return ret;
1431
1432                 d = dirent_i_to_s_c(n);
1433         }
1434
1435         if (d.v->d_type == DT_SUBVOL &&
1436             target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
1437             (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
1438              fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
1439                       le32_to_cpu(d.v->d_parent_subvol),
1440                       target->bi_parent_subvol))) {
1441                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1442                 ret = PTR_ERR_OR_ZERO(n);
1443                 if (ret)
1444                         return ret;
1445
1446                 bkey_reassemble(&n->k_i, d.s_c);
1447                 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1448
1449                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1450                 if (ret)
1451                         return ret;
1452
1453                 d = dirent_i_to_s_c(n);
1454         }
1455 err:
1456 fsck_err:
1457         return ret;
1458 }
1459
1460 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1461                         struct bch_hash_info *hash_info,
1462                         struct inode_walker *dir,
1463                         struct inode_walker *target,
1464                         struct snapshots_seen *s)
1465 {
1466         struct bch_fs *c = trans->c;
1467         struct bkey_s_c k;
1468         struct bkey_s_c_dirent d;
1469         struct inode_walker_entry *i;
1470         char buf[200];
1471         int ret;
1472
1473         k = bch2_btree_iter_peek(iter);
1474         if (!k.k)
1475                 return 0;
1476
1477         ret = bkey_err(k);
1478         if (ret)
1479                 return ret;
1480
1481         ret = check_key_has_snapshot(trans, iter, k);
1482         if (ret)
1483                 return ret < 0 ? ret : 0;
1484
1485         ret = snapshots_seen_update(c, s, k.k->p);
1486         if (ret)
1487                 return ret;
1488
1489         if (k.k->type == KEY_TYPE_whiteout)
1490                 return 0;
1491
1492         if (dir->cur_inum != k.k->p.inode) {
1493                 ret = check_subdir_count(trans, dir);
1494                 if (ret)
1495                         return ret;
1496         }
1497
1498         ret = __walk_inode(trans, dir, k.k->p);
1499         if (ret < 0)
1500                 return ret;
1501
1502         if (fsck_err_on(ret == INT_MAX, c,
1503                         "dirent in nonexisting directory:\n%s",
1504                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1505                 return bch2_btree_delete_at(trans, iter,
1506                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1507
1508         if (ret == INT_MAX)
1509                 return 0;
1510
1511         i = dir->d + ret;
1512         ret = 0;
1513
1514         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
1515                         "dirent in non directory inode type %s:\n%s",
1516                         bch2_d_type_str(inode_d_type(&i->inode)),
1517                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1518                 return bch2_btree_delete_at(trans, iter, 0);
1519
1520         if (dir->first_this_inode)
1521                 *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
1522
1523         ret = hash_check_key(trans, bch2_dirent_hash_desc,
1524                              hash_info, iter, k);
1525         if (ret < 0)
1526                 return ret;
1527         if (ret) /* dirent has been deleted */
1528                 return 0;
1529
1530         if (k.k->type != KEY_TYPE_dirent)
1531                 return 0;
1532
1533         d = bkey_s_c_to_dirent(k);
1534
1535         if (d.v->d_type == DT_SUBVOL) {
1536                 struct bch_inode_unpacked subvol_root;
1537                 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1538                 u32 target_snapshot;
1539                 u64 target_inum;
1540
1541                 ret = __subvol_lookup(trans, target_subvol,
1542                                       &target_snapshot, &target_inum);
1543                 if (ret && ret != -ENOENT)
1544                         return ret;
1545
1546                 if (fsck_err_on(ret, c,
1547                                 "dirent points to missing subvolume %llu",
1548                                 le64_to_cpu(d.v->d_child_subvol)))
1549                         return __remove_dirent(trans, d.k->p);
1550
1551                 ret = __lookup_inode(trans, target_inum,
1552                                    &subvol_root, &target_snapshot);
1553                 if (ret && ret != -ENOENT)
1554                         return ret;
1555
1556                 if (fsck_err_on(ret, c,
1557                                 "subvolume %u points to missing subvolume root %llu",
1558                                 target_subvol,
1559                                 target_inum)) {
1560                         bch_err(c, "repair not implemented yet");
1561                         return -EINVAL;
1562                 }
1563
1564                 if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
1565                                 "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1566                                 target_inum,
1567                                 subvol_root.bi_subvol, target_subvol)) {
1568                         subvol_root.bi_subvol = target_subvol;
1569                         ret = __write_inode(trans, &subvol_root, target_snapshot);
1570                         if (ret)
1571                                 return ret;
1572                 }
1573
1574                 ret = check_dirent_target(trans, iter, d, &subvol_root,
1575                                           target_snapshot);
1576                 if (ret)
1577                         return ret;
1578         } else {
1579                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1580                 if (ret)
1581                         return ret;
1582
1583                 if (fsck_err_on(!target->nr, c,
1584                                 "dirent points to missing inode:\n%s",
1585                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
1586                                                        k), buf))) {
1587                         ret = __remove_dirent(trans, d.k->p);
1588                         if (ret)
1589                                 return ret;
1590                 }
1591
1592                 for (i = target->d; i < target->d + target->nr; i++) {
1593                         ret = check_dirent_target(trans, iter, d,
1594                                                   &i->inode, i->snapshot);
1595                         if (ret)
1596                                 return ret;
1597                 }
1598         }
1599
1600         if (d.v->d_type == DT_DIR)
1601                 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
1602                         i->count++;
1603
1604 fsck_err:
1605         return ret;
1606 }
1607
1608 /*
1609  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1610  * validate d_type
1611  */
1612 noinline_for_stack
1613 static int check_dirents(struct bch_fs *c)
1614 {
1615         struct inode_walker dir = inode_walker_init();
1616         struct inode_walker target = inode_walker_init();
1617         struct snapshots_seen s;
1618         struct bch_hash_info hash_info;
1619         struct btree_trans trans;
1620         struct btree_iter iter;
1621         int ret = 0;
1622
1623         bch_verbose(c, "checking dirents");
1624
1625         snapshots_seen_init(&s);
1626         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1627
1628         bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
1629                              POS(BCACHEFS_ROOT_INO, 0),
1630                              BTREE_ITER_INTENT|
1631                              BTREE_ITER_PREFETCH|
1632                              BTREE_ITER_ALL_SNAPSHOTS);
1633
1634         do {
1635                 ret = __bch2_trans_do(&trans, NULL, NULL,
1636                                       BTREE_INSERT_LAZY_RW|
1637                                       BTREE_INSERT_NOFAIL,
1638                         check_dirent(&trans, &iter, &hash_info,
1639                                      &dir, &target, &s));
1640                 if (ret)
1641                         break;
1642         } while (bch2_btree_iter_advance(&iter));
1643         bch2_trans_iter_exit(&trans, &iter);
1644
1645         bch2_trans_exit(&trans);
1646         snapshots_seen_exit(&s);
1647         inode_walker_exit(&dir);
1648         inode_walker_exit(&target);
1649         return ret;
1650 }
1651
1652 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
1653                        struct bch_hash_info *hash_info,
1654                        struct inode_walker *inode)
1655 {
1656         struct bch_fs *c = trans->c;
1657         struct bkey_s_c k;
1658         int ret;
1659
1660         k = bch2_btree_iter_peek(iter);
1661         if (!k.k)
1662                 return 0;
1663
1664         ret = bkey_err(k);
1665         if (ret)
1666                 return ret;
1667
1668         ret = check_key_has_snapshot(trans, iter, k);
1669         if (ret)
1670                 return ret;
1671
1672         ret = __walk_inode(trans, inode, k.k->p);
1673         if (ret < 0)
1674                 return ret;
1675
1676         if (fsck_err_on(ret == INT_MAX, c,
1677                         "xattr for missing inode %llu",
1678                         k.k->p.inode))
1679                 return bch2_btree_delete_at(trans, iter, 0);
1680
1681         if (ret == INT_MAX)
1682                 return 0;
1683
1684         ret = 0;
1685
1686         if (inode->first_this_inode)
1687                 *hash_info = bch2_hash_info_init(c, &inode->d[0].inode);
1688
1689         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
1690 fsck_err:
1691         return ret;
1692 }
1693
1694 /*
1695  * Walk xattrs: verify that they all have a corresponding inode
1696  */
1697 noinline_for_stack
1698 static int check_xattrs(struct bch_fs *c)
1699 {
1700         struct inode_walker inode = inode_walker_init();
1701         struct bch_hash_info hash_info;
1702         struct btree_trans trans;
1703         struct btree_iter iter;
1704         int ret = 0;
1705
1706         bch_verbose(c, "checking xattrs");
1707
1708         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1709
1710         bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
1711                              POS(BCACHEFS_ROOT_INO, 0),
1712                              BTREE_ITER_INTENT|
1713                              BTREE_ITER_PREFETCH|
1714                              BTREE_ITER_ALL_SNAPSHOTS);
1715
1716         do {
1717                 ret = __bch2_trans_do(&trans, NULL, NULL,
1718                                       BTREE_INSERT_LAZY_RW|
1719                                       BTREE_INSERT_NOFAIL,
1720                                       check_xattr(&trans, &iter, &hash_info,
1721                                                   &inode));
1722                 if (ret)
1723                         break;
1724         } while (bch2_btree_iter_advance(&iter));
1725         bch2_trans_iter_exit(&trans, &iter);
1726
1727         bch2_trans_exit(&trans);
1728         return ret;
1729 }
1730
1731 static int check_root_trans(struct btree_trans *trans)
1732 {
1733         struct bch_fs *c = trans->c;
1734         struct bch_inode_unpacked root_inode;
1735         u32 snapshot;
1736         u64 inum;
1737         int ret;
1738
1739         ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1740         if (ret && ret != -ENOENT)
1741                 return ret;
1742
1743         if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
1744                 struct bkey_i_subvolume root_subvol;
1745
1746                 snapshot        = U32_MAX;
1747                 inum            = BCACHEFS_ROOT_INO;
1748
1749                 bkey_subvolume_init(&root_subvol.k_i);
1750                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1751                 root_subvol.v.flags     = 0;
1752                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1753                 root_subvol.v.inode     = cpu_to_le64(inum);
1754                 ret = __bch2_trans_do(trans, NULL, NULL,
1755                                       BTREE_INSERT_NOFAIL|
1756                                       BTREE_INSERT_LAZY_RW,
1757                         __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
1758                 if (ret) {
1759                         bch_err(c, "error writing root subvol: %i", ret);
1760                         goto err;
1761                 }
1762
1763         }
1764
1765         ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1766         if (ret && ret != -ENOENT)
1767                 return ret;
1768
1769         if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
1770             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
1771                                 "root inode not a directory")) {
1772                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1773                                 0, NULL);
1774                 root_inode.bi_inum = inum;
1775
1776                 ret = __write_inode(trans, &root_inode, snapshot);
1777                 if (ret)
1778                         bch_err(c, "error writing root inode: %i", ret);
1779         }
1780 err:
1781 fsck_err:
1782         return ret;
1783 }
1784
1785 /* Get root directory, create if it doesn't exist: */
1786 noinline_for_stack
1787 static int check_root(struct bch_fs *c)
1788 {
1789         bch_verbose(c, "checking root directory");
1790
1791         return bch2_trans_do(c, NULL, NULL,
1792                              BTREE_INSERT_NOFAIL|
1793                              BTREE_INSERT_LAZY_RW,
1794                 check_root_trans(&trans));
1795 }
1796
1797 struct pathbuf {
1798         size_t          nr;
1799         size_t          size;
1800
1801         struct pathbuf_entry {
1802                 u64     inum;
1803                 u32     snapshot;
1804         }               *entries;
1805 };
1806
1807 static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
1808 {
1809         struct pathbuf_entry *i;
1810
1811         for (i = p->entries; i < p->entries + p->nr; i++)
1812                 if (i->inum     == inum &&
1813                     i->snapshot == snapshot)
1814                         return true;
1815
1816         return false;
1817 }
1818
1819 static int path_down(struct bch_fs *c, struct pathbuf *p,
1820                      u64 inum, u32 snapshot)
1821 {
1822         if (p->nr == p->size) {
1823                 size_t new_size = max_t(size_t, 256UL, p->size * 2);
1824                 void *n = krealloc(p->entries,
1825                                    new_size * sizeof(p->entries[0]),
1826                                    GFP_KERNEL);
1827                 if (!n) {
1828                         bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
1829                                 new_size);
1830                         return -ENOMEM;
1831                 }
1832
1833                 p->entries = n;
1834                 p->size = new_size;
1835         };
1836
1837         p->entries[p->nr++] = (struct pathbuf_entry) {
1838                 .inum           = inum,
1839                 .snapshot       = snapshot,
1840         };
1841         return 0;
1842 }
1843
1844 /*
1845  * Check that a given inode is reachable from the root:
1846  *
1847  * XXX: we should also be verifying that inodes are in the right subvolumes
1848  */
1849 static int check_path(struct btree_trans *trans,
1850                       struct pathbuf *p,
1851                       struct bch_inode_unpacked *inode,
1852                       u32 snapshot)
1853 {
1854         struct bch_fs *c = trans->c;
1855         int ret = 0;
1856
1857         snapshot = snapshot_t(c, snapshot)->equiv;
1858         p->nr = 0;
1859
1860         while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
1861                  inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
1862                 struct btree_iter dirent_iter;
1863                 struct bkey_s_c_dirent d;
1864                 u32 parent_snapshot = snapshot;
1865
1866                 if (inode->bi_subvol) {
1867                         u64 inum;
1868
1869                         ret = subvol_lookup(trans, inode->bi_parent_subvol,
1870                                             &parent_snapshot, &inum);
1871                         if (ret)
1872                                 break;
1873                 }
1874
1875                 ret = lockrestart_do(trans,
1876                         PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
1877                                           SPOS(inode->bi_dir, inode->bi_dir_offset,
1878                                                parent_snapshot))).k));
1879                 if (ret && ret != -ENOENT)
1880                         break;
1881
1882                 if (!ret && !dirent_points_to_inode(d, inode)) {
1883                         bch2_trans_iter_exit(trans, &dirent_iter);
1884                         ret = -ENOENT;
1885                 }
1886
1887                 if (ret == -ENOENT) {
1888                         if (fsck_err(c,  "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
1889                                      inode->bi_inum, snapshot,
1890                                      bch2_d_type_str(inode_d_type(inode)),
1891                                      inode->bi_nlink,
1892                                      inode->bi_dir,
1893                                      inode->bi_dir_offset))
1894                                 ret = reattach_inode(trans, inode, snapshot);
1895                         break;
1896                 }
1897
1898                 bch2_trans_iter_exit(trans, &dirent_iter);
1899
1900                 if (!S_ISDIR(inode->bi_mode))
1901                         break;
1902
1903                 ret = path_down(c, p, inode->bi_inum, snapshot);
1904                 if (ret) {
1905                         bch_err(c, "memory allocation failure");
1906                         return ret;
1907                 }
1908
1909                 snapshot = parent_snapshot;
1910
1911                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
1912                 if (ret) {
1913                         /* Should have been caught in dirents pass */
1914                         bch_err(c, "error looking up parent directory: %i", ret);
1915                         break;
1916                 }
1917
1918                 if (path_is_dup(p, inode->bi_inum, snapshot)) {
1919                         struct pathbuf_entry *i;
1920
1921                         /* XXX print path */
1922                         bch_err(c, "directory structure loop");
1923
1924                         for (i = p->entries; i < p->entries + p->nr; i++)
1925                                 pr_err("%llu:%u", i->inum, i->snapshot);
1926                         pr_err("%llu:%u", inode->bi_inum, snapshot);
1927
1928                         if (!fsck_err(c, "directory structure loop"))
1929                                 return 0;
1930
1931                         ret = __bch2_trans_do(trans, NULL, NULL,
1932                                               BTREE_INSERT_NOFAIL|
1933                                               BTREE_INSERT_LAZY_RW,
1934                                         remove_backpointer(trans, inode));
1935                         if (ret) {
1936                                 bch_err(c, "error removing dirent: %i", ret);
1937                                 break;
1938                         }
1939
1940                         ret = reattach_inode(trans, inode, snapshot);
1941                 }
1942         }
1943 fsck_err:
1944         if (ret)
1945                 bch_err(c, "%s: err %i", __func__, ret);
1946         return ret;
1947 }
1948
1949 /*
1950  * Check for unreachable inodes, as well as loops in the directory structure:
1951  * After check_dirents(), if an inode backpointer doesn't exist that means it's
1952  * unreachable:
1953  */
1954 noinline_for_stack
1955 static int check_directory_structure(struct bch_fs *c)
1956 {
1957         struct btree_trans trans;
1958         struct btree_iter iter;
1959         struct bkey_s_c k;
1960         struct bch_inode_unpacked u;
1961         struct pathbuf path = { 0, 0, NULL };
1962         int ret;
1963
1964         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1965
1966         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
1967                            BTREE_ITER_INTENT|
1968                            BTREE_ITER_PREFETCH|
1969                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
1970                 if (!bkey_is_inode(k.k))
1971                         continue;
1972
1973                 ret = bch2_inode_unpack(k, &u);
1974                 if (ret) {
1975                         /* Should have been caught earlier in fsck: */
1976                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
1977                         break;
1978                 }
1979
1980                 if (u.bi_flags & BCH_INODE_UNLINKED)
1981                         continue;
1982
1983                 ret = check_path(&trans, &path, &u, iter.pos.snapshot);
1984                 if (ret)
1985                         break;
1986         }
1987         bch2_trans_iter_exit(&trans, &iter);
1988
1989         BUG_ON(ret == -EINTR);
1990
1991         kfree(path.entries);
1992
1993         bch2_trans_exit(&trans);
1994         return ret;
1995 }
1996
1997 struct nlink_table {
1998         size_t          nr;
1999         size_t          size;
2000
2001         struct nlink {
2002                 u64     inum;
2003                 u32     snapshot;
2004                 u32     count;
2005         }               *d;
2006 };
2007
2008 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2009                      u64 inum, u32 snapshot)
2010 {
2011         if (t->nr == t->size) {
2012                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2013                 void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
2014                 if (!d) {
2015                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2016                                 new_size);
2017                         return -ENOMEM;
2018                 }
2019
2020                 if (t->d)
2021                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2022                 kvfree(t->d);
2023
2024                 t->d = d;
2025                 t->size = new_size;
2026         }
2027
2028
2029         t->d[t->nr++] = (struct nlink) {
2030                 .inum           = inum,
2031                 .snapshot       = snapshot,
2032         };
2033
2034         return 0;
2035 }
2036
2037 static int nlink_cmp(const void *_l, const void *_r)
2038 {
2039         const struct nlink *l = _l;
2040         const struct nlink *r = _r;
2041
2042         return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
2043 }
2044
2045 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2046                      struct nlink_table *links,
2047                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2048 {
2049         struct nlink *link, key = {
2050                 .inum = inum, .snapshot = U32_MAX,
2051         };
2052
2053         if (inum < range_start || inum >= range_end)
2054                 return;
2055
2056         link = __inline_bsearch(&key, links->d, links->nr,
2057                                 sizeof(links->d[0]), nlink_cmp);
2058         if (!link)
2059                 return;
2060
2061         while (link > links->d && link[0].inum == link[-1].inum)
2062                 --link;
2063
2064         for (; link < links->d + links->nr && link->inum == inum; link++)
2065                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2066                         link->count++;
2067                         if (link->snapshot >= snapshot)
2068                                 break;
2069                 }
2070 }
2071
2072 noinline_for_stack
2073 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2074                                        struct nlink_table *t,
2075                                        u64 start, u64 *end)
2076 {
2077         struct btree_trans trans;
2078         struct btree_iter iter;
2079         struct bkey_s_c k;
2080         struct bch_inode_unpacked u;
2081         int ret = 0;
2082
2083         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2084
2085         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2086                            POS(0, start),
2087                            BTREE_ITER_INTENT|
2088                            BTREE_ITER_PREFETCH|
2089                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2090                 if (!bkey_is_inode(k.k))
2091                         continue;
2092
2093                 /* Should never fail, checked by bch2_inode_invalid: */
2094                 BUG_ON(bch2_inode_unpack(k, &u));
2095
2096                 /*
2097                  * Backpointer and directory structure checks are sufficient for
2098                  * directories, since they can't have hardlinks:
2099                  */
2100                 if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2101                         continue;
2102
2103                 if (!u.bi_nlink)
2104                         continue;
2105
2106                 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2107                 if (ret) {
2108                         *end = k.k->p.offset;
2109                         ret = 0;
2110                         break;
2111                 }
2112
2113         }
2114         bch2_trans_iter_exit(&trans, &iter);
2115         bch2_trans_exit(&trans);
2116
2117         if (ret)
2118                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2119
2120         return ret;
2121 }
2122
2123 noinline_for_stack
2124 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2125                                      u64 range_start, u64 range_end)
2126 {
2127         struct btree_trans trans;
2128         struct snapshots_seen s;
2129         struct btree_iter iter;
2130         struct bkey_s_c k;
2131         struct bkey_s_c_dirent d;
2132         int ret;
2133
2134         snapshots_seen_init(&s);
2135
2136         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2137
2138         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
2139                            BTREE_ITER_INTENT|
2140                            BTREE_ITER_PREFETCH|
2141                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2142                 ret = snapshots_seen_update(c, &s, k.k->p);
2143                 if (ret)
2144                         break;
2145
2146                 switch (k.k->type) {
2147                 case KEY_TYPE_dirent:
2148                         d = bkey_s_c_to_dirent(k);
2149
2150                         if (d.v->d_type != DT_DIR &&
2151                             d.v->d_type != DT_SUBVOL)
2152                                 inc_link(c, &s, links, range_start, range_end,
2153                                          le64_to_cpu(d.v->d_inum),
2154                                          d.k->p.snapshot);
2155                         break;
2156                 }
2157         }
2158         bch2_trans_iter_exit(&trans, &iter);
2159
2160         if (ret)
2161                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
2162
2163         bch2_trans_exit(&trans);
2164         snapshots_seen_exit(&s);
2165         return ret;
2166 }
2167
2168 noinline_for_stack
2169 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2170                                struct nlink_table *links,
2171                                u64 range_start, u64 range_end)
2172 {
2173         struct btree_trans trans;
2174         struct btree_iter iter;
2175         struct bkey_s_c k;
2176         struct bch_inode_unpacked u;
2177         struct nlink *link = links->d;
2178         int ret = 0;
2179
2180         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2181
2182         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2183                            POS(0, range_start),
2184                            BTREE_ITER_INTENT|
2185                            BTREE_ITER_PREFETCH|
2186                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2187                 if (k.k->p.offset >= range_end)
2188                         break;
2189
2190                 if (!bkey_is_inode(k.k))
2191                         continue;
2192
2193                 BUG_ON(bch2_inode_unpack(k, &u));
2194
2195                 if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2196                         continue;
2197
2198                 if (!u.bi_nlink)
2199                         continue;
2200
2201                 while ((cmp_int(link->inum, k.k->p.offset) ?:
2202                         cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2203                         link++;
2204                         BUG_ON(link >= links->d + links->nr);
2205                 }
2206
2207                 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
2208                                 "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
2209                                 u.bi_inum, mode_to_type(u.bi_mode),
2210                                 bch2_inode_nlink_get(&u), link->count)) {
2211                         bch2_inode_nlink_set(&u, link->count);
2212
2213                         ret = write_inode(&trans, &u, k.k->p.snapshot);
2214                         if (ret)
2215                                 bch_err(c, "error in fsck: error %i updating inode", ret);
2216                 }
2217         }
2218 fsck_err:
2219         bch2_trans_iter_exit(&trans, &iter);
2220         bch2_trans_exit(&trans);
2221
2222         if (ret)
2223                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2224
2225         return ret;
2226 }
2227
2228 noinline_for_stack
2229 static int check_nlinks(struct bch_fs *c)
2230 {
2231         struct nlink_table links = { 0 };
2232         u64 this_iter_range_start, next_iter_range_start = 0;
2233         int ret = 0;
2234
2235         bch_verbose(c, "checking inode nlinks");
2236
2237         do {
2238                 this_iter_range_start = next_iter_range_start;
2239                 next_iter_range_start = U64_MAX;
2240
2241                 ret = check_nlinks_find_hardlinks(c, &links,
2242                                                   this_iter_range_start,
2243                                                   &next_iter_range_start);
2244
2245                 ret = check_nlinks_walk_dirents(c, &links,
2246                                           this_iter_range_start,
2247                                           next_iter_range_start);
2248                 if (ret)
2249                         break;
2250
2251                 ret = check_nlinks_update_hardlinks(c, &links,
2252                                          this_iter_range_start,
2253                                          next_iter_range_start);
2254                 if (ret)
2255                         break;
2256
2257                 links.nr = 0;
2258         } while (next_iter_range_start != U64_MAX);
2259
2260         kvfree(links.d);
2261
2262         return ret;
2263 }
2264
2265 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
2266 {
2267         struct bkey_s_c k;
2268         struct bkey_s_c_reflink_p p;
2269         struct bkey_i_reflink_p *u;
2270         int ret;
2271
2272         k = bch2_btree_iter_peek(iter);
2273         if (!k.k)
2274                 return 0;
2275
2276         ret = bkey_err(k);
2277         if (ret)
2278                 return ret;
2279
2280         if (k.k->type != KEY_TYPE_reflink_p)
2281                 return 0;
2282
2283         p = bkey_s_c_to_reflink_p(k);
2284
2285         if (!p.v->front_pad && !p.v->back_pad)
2286                 return 0;
2287
2288         u = bch2_trans_kmalloc(trans, sizeof(*u));
2289         ret = PTR_ERR_OR_ZERO(u);
2290         if (ret)
2291                 return ret;
2292
2293         bkey_reassemble(&u->k_i, k);
2294         u->v.front_pad  = 0;
2295         u->v.back_pad   = 0;
2296
2297         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2298 }
2299
2300 noinline_for_stack
2301 static int fix_reflink_p(struct bch_fs *c)
2302 {
2303         struct btree_trans trans;
2304         struct btree_iter iter;
2305         struct bkey_s_c k;
2306         int ret;
2307
2308         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2309                 return 0;
2310
2311         bch_verbose(c, "fixing reflink_p keys");
2312
2313         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2314
2315         for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
2316                            BTREE_ITER_INTENT|
2317                            BTREE_ITER_PREFETCH|
2318                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2319                 if (k.k->type == KEY_TYPE_reflink_p) {
2320                         ret = __bch2_trans_do(&trans, NULL, NULL,
2321                                               BTREE_INSERT_NOFAIL|
2322                                               BTREE_INSERT_LAZY_RW,
2323                                               fix_reflink_p_key(&trans, &iter));
2324                         if (ret)
2325                                 break;
2326                 }
2327         }
2328         bch2_trans_iter_exit(&trans, &iter);
2329
2330         bch2_trans_exit(&trans);
2331         return ret;
2332 }
2333
2334 /*
2335  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
2336  * Doesn't fix them yet, mainly because they haven't yet been observed:
2337  */
2338 int bch2_fsck_full(struct bch_fs *c)
2339 {
2340         return  bch2_fs_snapshots_check(c) ?:
2341                 check_inodes(c, true) ?:
2342                 check_subvols(c) ?:
2343                 check_extents(c) ?:
2344                 check_dirents(c) ?:
2345                 check_xattrs(c) ?:
2346                 check_root(c) ?:
2347                 check_directory_structure(c) ?:
2348                 check_nlinks(c) ?:
2349                 fix_reflink_p(c);
2350 }
2351
2352 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
2353 {
2354         return check_inodes(c, false);
2355 }