]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
Update bcachefs sources to b1899a0bd9 bcachefs: Move bch2_evict_subvolume_inodes...
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "dirent.h"
7 #include "error.h"
8 #include "fs-common.h"
9 #include "fsck.h"
10 #include "inode.h"
11 #include "keylist.h"
12 #include "subvolume.h"
13 #include "super.h"
14 #include "xattr.h"
15
16 #include <linux/bsearch.h>
17 #include <linux/dcache.h> /* struct qstr */
18
19 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
20
21 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
22                                     u32 snapshot)
23 {
24         struct btree_iter iter;
25         struct bkey_s_c k;
26         u64 sectors = 0;
27         int ret;
28
29         for_each_btree_key(trans, iter, BTREE_ID_extents,
30                            SPOS(inum, 0, snapshot), 0, k, ret) {
31                 if (k.k->p.inode != inum)
32                         break;
33
34                 if (bkey_extent_is_allocation(k.k))
35                         sectors += k.k->size;
36         }
37
38         bch2_trans_iter_exit(trans, &iter);
39
40         return ret ?: sectors;
41 }
42
43 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
44                                     u32 snapshot)
45 {
46         struct btree_iter iter;
47         struct bkey_s_c k;
48         struct bkey_s_c_dirent d;
49         u64 subdirs = 0;
50         int ret;
51
52         for_each_btree_key(trans, iter, BTREE_ID_dirents,
53                            SPOS(inum, 0, snapshot), 0, k, ret) {
54                 if (k.k->p.inode != inum)
55                         break;
56
57                 if (k.k->type != KEY_TYPE_dirent)
58                         continue;
59
60                 d = bkey_s_c_to_dirent(k);
61                 if (d.v->d_type == DT_DIR)
62                         subdirs++;
63         }
64
65         bch2_trans_iter_exit(trans, &iter);
66
67         return ret ?: subdirs;
68 }
69
70 static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
71                                     u32 *subvol)
72 {
73         struct btree_iter iter;
74         struct bkey_s_c k;
75         int ret;
76
77         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
78                              POS(0, snapshot), 0);
79         k = bch2_btree_iter_peek_slot(&iter);
80         ret = bkey_err(k);
81         if (ret)
82                 goto err;
83
84         if (k.k->type != KEY_TYPE_snapshot) {
85                 bch_err(trans->c, "snapshot %u not fonud", snapshot);
86                 ret = -ENOENT;
87                 goto err;
88         }
89
90         *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
91 err:
92         bch2_trans_iter_exit(trans, &iter);
93         return ret;
94
95 }
96
97 static int snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
98                                   u32 *subvol)
99 {
100         return lockrestart_do(trans, __snapshot_lookup_subvol(trans, snapshot, subvol));
101 }
102
103 static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
104                            u32 *snapshot, u64 *inum)
105 {
106         struct bch_subvolume s;
107         int ret;
108
109         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
110
111         *snapshot = le32_to_cpu(s.snapshot);
112         *inum = le64_to_cpu(s.inode);
113         return ret;
114 }
115
116 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
117                          u32 *snapshot, u64 *inum)
118 {
119         return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
120 }
121
122 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
123                           struct bch_inode_unpacked *inode,
124                           u32 *snapshot)
125 {
126         struct btree_iter iter;
127         struct bkey_s_c k;
128         int ret;
129
130         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
131                              SPOS(0, inode_nr, *snapshot), 0);
132         k = bch2_btree_iter_peek_slot(&iter);
133         ret = bkey_err(k);
134         if (ret)
135                 goto err;
136
137         ret = k.k->type == KEY_TYPE_inode
138                 ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
139                 : -ENOENT;
140         if (!ret)
141                 *snapshot = iter.pos.snapshot;
142 err:
143         bch2_trans_iter_exit(trans, &iter);
144         return ret;
145 }
146
147 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
148                         struct bch_inode_unpacked *inode,
149                         u32 *snapshot)
150 {
151         return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
152 }
153
154 static int __lookup_dirent(struct btree_trans *trans,
155                            struct bch_hash_info hash_info,
156                            subvol_inum dir, struct qstr *name,
157                            u64 *target, unsigned *type)
158 {
159         struct btree_iter iter;
160         struct bkey_s_c_dirent d;
161         int ret;
162
163         ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
164                                &hash_info, dir, name, 0);
165         if (ret)
166                 return ret;
167
168         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
169         *target = le64_to_cpu(d.v->d_inum);
170         *type = d.v->d_type;
171         bch2_trans_iter_exit(trans, &iter);
172         return 0;
173 }
174
175 static int lookup_dirent(struct btree_trans *trans,
176                          struct bch_hash_info hash_info,
177                          subvol_inum dir, struct qstr *name,
178                          u64 *target, unsigned *type)
179 {
180         return lockrestart_do(trans,
181                 __lookup_dirent(trans, hash_info, dir, name, target, type));
182 }
183
184 static int __write_inode(struct btree_trans *trans,
185                          struct bch_inode_unpacked *inode,
186                          u32 snapshot)
187 {
188         struct btree_iter iter;
189         int ret;
190
191         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
192                             SPOS(0, inode->bi_inum, snapshot),
193                             BTREE_ITER_INTENT);
194
195         ret   = bch2_btree_iter_traverse(&iter) ?:
196                 bch2_inode_write(trans, &iter, inode);
197         bch2_trans_iter_exit(trans, &iter);
198         return ret;
199 }
200
201 static int write_inode(struct btree_trans *trans,
202                        struct bch_inode_unpacked *inode,
203                        u32 snapshot)
204 {
205         int ret = __bch2_trans_do(trans, NULL, NULL,
206                                   BTREE_INSERT_NOFAIL|
207                                   BTREE_INSERT_LAZY_RW,
208                                   __write_inode(trans, inode, snapshot));
209         if (ret)
210                 bch_err(trans->c, "error in fsck: error %i updating inode", ret);
211         return ret;
212 }
213
214 static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
215 {
216         struct btree_iter iter = { NULL };
217         struct bkey_i_inode_generation delete;
218         struct bch_inode_unpacked inode_u;
219         struct bkey_s_c k;
220         int ret;
221
222         ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
223                                               SPOS(inum, 0, snapshot),
224                                               SPOS(inum, U64_MAX, snapshot),
225                                               0, NULL) ?:
226                 bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
227                                               SPOS(inum, 0, snapshot),
228                                               SPOS(inum, U64_MAX, snapshot),
229                                               0, NULL) ?:
230                 bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
231                                               SPOS(inum, 0, snapshot),
232                                               SPOS(inum, U64_MAX, snapshot),
233                                               0, NULL);
234         if (ret)
235                 goto err;
236 retry:
237         bch2_trans_begin(trans);
238
239         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
240                              SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
241         k = bch2_btree_iter_peek_slot(&iter);
242
243         ret = bkey_err(k);
244         if (ret)
245                 goto err;
246
247         if (k.k->type != KEY_TYPE_inode) {
248                 bch2_fs_inconsistent(trans->c,
249                                      "inode %llu:%u not found when deleting",
250                                      inum, snapshot);
251                 ret = -EIO;
252                 goto err;
253         }
254
255         bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
256
257         /* Subvolume root? */
258         if (inode_u.bi_subvol) {
259                 ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
260                 if (ret)
261                         goto err;
262         }
263
264         bkey_inode_generation_init(&delete.k_i);
265         delete.k.p = iter.pos;
266         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
267
268         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
269                 bch2_trans_commit(trans, NULL, NULL,
270                                 BTREE_INSERT_NOFAIL);
271 err:
272         bch2_trans_iter_exit(trans, &iter);
273         if (ret == -EINTR)
274                 goto retry;
275
276         return ret;
277 }
278
279 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
280 {
281         struct bch_fs *c = trans->c;
282         struct btree_iter iter;
283         struct bch_inode_unpacked dir_inode;
284         struct bch_hash_info dir_hash_info;
285         int ret;
286
287         ret = lookup_inode(trans, pos.inode, &dir_inode, NULL);
288         if (ret)
289                 return ret;
290
291         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
292
293         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
294
295         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
296                                   &dir_hash_info, &iter, 0);
297         bch2_trans_iter_exit(trans, &iter);
298         return ret;
299 }
300
301 static int remove_dirent(struct btree_trans *trans, struct bpos pos)
302 {
303         int ret = __bch2_trans_do(trans, NULL, NULL,
304                                   BTREE_INSERT_NOFAIL|
305                                   BTREE_INSERT_LAZY_RW,
306                                   __remove_dirent(trans, pos));
307         if (ret)
308                 bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret);
309         return ret;
310 }
311
312 /* Get lost+found, create if it doesn't exist: */
313 static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
314                             struct bch_inode_unpacked *lostfound)
315 {
316         struct bch_fs *c = trans->c;
317         struct bch_inode_unpacked root;
318         struct bch_hash_info root_hash_info;
319         struct qstr lostfound_str = QSTR("lost+found");
320         subvol_inum root_inum = { .subvol = subvol };
321         u64 inum = 0;
322         unsigned d_type = 0;
323         u32 snapshot;
324         int ret;
325
326         ret = subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
327         if (ret)
328                 return ret;
329
330         ret = lookup_inode(trans, root_inum.inum, &root, &snapshot);
331         if (ret) {
332                 bch_err(c, "error fetching subvol root: %i", ret);
333                 return ret;
334         }
335
336         root_hash_info = bch2_hash_info_init(c, &root);
337
338         ret = lookup_dirent(trans, root_hash_info, root_inum,
339                             &lostfound_str, &inum, &d_type);
340         if (ret == -ENOENT) {
341                 bch_notice(c, "creating lost+found");
342                 goto create_lostfound;
343         }
344
345         if (ret) {
346                 bch_err(c, "error looking up lost+found: %i", ret);
347                 return ret;
348         }
349
350         if (d_type != DT_DIR) {
351                 bch_err(c, "error looking up lost+found: not a directory");
352                 return ret;
353
354         }
355
356         ret = lookup_inode(trans, inum, lostfound, &snapshot);
357         if (ret && ret != -ENOENT) {
358                 /*
359                  * The check_dirents pass has already run, dangling dirents
360                  * shouldn't exist here:
361                  */
362                 bch_err(c, "error looking up lost+found: %i", ret);
363                 return ret;
364         }
365
366         if (ret == -ENOENT) {
367 create_lostfound:
368                 bch2_inode_init_early(c, lostfound);
369
370                 ret = __bch2_trans_do(trans, NULL, NULL,
371                                       BTREE_INSERT_NOFAIL|
372                                       BTREE_INSERT_LAZY_RW,
373                         bch2_create_trans(trans, root_inum, &root,
374                                           lostfound, &lostfound_str,
375                                           0, 0, S_IFDIR|0700, 0, NULL, NULL,
376                                           (subvol_inum) { }, 0));
377                 if (ret)
378                         bch_err(c, "error creating lost+found: %i", ret);
379         }
380
381         return 0;
382 }
383
384 static int reattach_inode(struct btree_trans *trans,
385                           struct bch_inode_unpacked *inode,
386                           u32 inode_snapshot)
387 {
388         struct bch_hash_info dir_hash;
389         struct bch_inode_unpacked lostfound;
390         char name_buf[20];
391         struct qstr name;
392         u64 dir_offset = 0;
393         u32 subvol;
394         int ret;
395
396         ret = snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
397         if (ret)
398                 return ret;
399
400         ret = lookup_lostfound(trans, subvol, &lostfound);
401         if (ret)
402                 return ret;
403
404         if (S_ISDIR(inode->bi_mode)) {
405                 lostfound.bi_nlink++;
406
407                 ret = write_inode(trans, &lostfound, U32_MAX);
408                 if (ret)
409                         return ret;
410         }
411
412         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
413
414         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
415         name = (struct qstr) QSTR(name_buf);
416
417         ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
418                         bch2_dirent_create(trans,
419                                            (subvol_inum) {
420                                                 .subvol = subvol,
421                                                 .inum = lostfound.bi_inum,
422                                            },
423                                            &dir_hash,
424                                            mode_to_type(inode->bi_mode),
425                                            &name, inode->bi_inum, &dir_offset,
426                                            BCH_HASH_SET_MUST_CREATE));
427         if (ret) {
428                 bch_err(trans->c, "error %i reattaching inode %llu",
429                         ret, inode->bi_inum);
430                 return ret;
431         }
432
433         inode->bi_dir           = lostfound.bi_inum;
434         inode->bi_dir_offset    = dir_offset;
435
436         return write_inode(trans, inode, inode_snapshot);
437 }
438
439 static int remove_backpointer(struct btree_trans *trans,
440                               struct bch_inode_unpacked *inode)
441 {
442         struct btree_iter iter;
443         struct bkey_s_c k;
444         int ret;
445
446         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
447                              POS(inode->bi_dir, inode->bi_dir_offset), 0);
448         k = bch2_btree_iter_peek_slot(&iter);
449         ret = bkey_err(k);
450         if (ret)
451                 goto out;
452         if (k.k->type != KEY_TYPE_dirent) {
453                 ret = -ENOENT;
454                 goto out;
455         }
456
457         ret = remove_dirent(trans, k.k->p);
458 out:
459         bch2_trans_iter_exit(trans, &iter);
460         return ret;
461 }
462
463 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
464 {
465         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
466
467         if (bkey_cmp(s->pos, pos))
468                 s->nr = 0;
469         s->pos = pos;
470
471         /* Might get called multiple times due to lock restarts */
472         if (s->nr && s->d[s->nr - 1] == pos.snapshot)
473                 return 0;
474
475         return snapshots_seen_add(c, s, pos.snapshot);
476 }
477
478 /**
479  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
480  * and @ancestor hasn't been overwritten in @seen
481  *
482  * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
483  */
484 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
485                                     u32 id, u32 ancestor)
486 {
487         ssize_t i;
488
489         BUG_ON(id > ancestor);
490
491         id              = snapshot_t(c, id)->equiv;
492         ancestor        = snapshot_t(c, ancestor)->equiv;
493
494         /* @ancestor should be the snapshot most recently added to @seen */
495         BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
496         BUG_ON(seen->pos.snapshot != ancestor);
497
498         if (id == ancestor)
499                 return true;
500
501         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
502                 return false;
503
504         for (i = seen->nr - 2;
505              i >= 0 && seen->d[i] >= id;
506              --i)
507                 if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
508                     bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
509                         return false;
510
511         return true;
512 }
513
514 /**
515  * ref_visible - given a key with snapshot id @src that points to a key with
516  * snapshot id @dst, test whether there is some snapshot in which @dst is
517  * visible.
518  *
519  * This assumes we're visiting @src keys in natural key order.
520  *
521  * @s   - list of snapshot IDs already seen at @src
522  * @src - snapshot ID of src key
523  * @dst - snapshot ID of dst key
524  */
525 static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
526                        u32 src, u32 dst)
527 {
528         return dst <= src
529                 ? key_visible_in_snapshot(c, s, dst, src)
530                 : bch2_snapshot_is_ancestor(c, src, dst);
531 }
532
533 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)       \
534         for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
535                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
536
537 struct inode_walker {
538         bool                            first_this_inode;
539         u64                             cur_inum;
540
541         size_t                          nr;
542         size_t                          size;
543         struct inode_walker_entry {
544                 struct bch_inode_unpacked inode;
545                 u32                     snapshot;
546                 u64                     count;
547         } *d;
548 };
549
550 static void inode_walker_exit(struct inode_walker *w)
551 {
552         kfree(w->d);
553         w->d = NULL;
554 }
555
556 static struct inode_walker inode_walker_init(void)
557 {
558         return (struct inode_walker) { 0, };
559 }
560
561 static int inode_walker_realloc(struct inode_walker *w)
562 {
563         if (w->nr == w->size) {
564                 size_t new_size = max_t(size_t, 8UL, w->size * 2);
565                 void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
566                                    GFP_KERNEL);
567                 if (!d)
568                         return -ENOMEM;
569
570                 w->d = d;
571                 w->size = new_size;
572         }
573
574         return 0;
575 }
576
577 static int add_inode(struct bch_fs *c, struct inode_walker *w,
578                      struct bkey_s_c_inode inode)
579 {
580         struct bch_inode_unpacked u;
581         int ret;
582
583         ret = inode_walker_realloc(w);
584         if (ret)
585                 return ret;
586
587         BUG_ON(bch2_inode_unpack(inode, &u));
588
589         w->d[w->nr++] = (struct inode_walker_entry) {
590                 .inode          = u,
591                 .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
592         };
593
594         return 0;
595 }
596
597 static int __walk_inode(struct btree_trans *trans,
598                         struct inode_walker *w, struct bpos pos)
599 {
600         struct bch_fs *c = trans->c;
601         struct btree_iter iter;
602         struct bkey_s_c k;
603         unsigned i, ancestor_pos;
604         int ret;
605
606         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
607
608         if (pos.inode == w->cur_inum) {
609                 w->first_this_inode = false;
610                 goto lookup_snapshot;
611         }
612
613         w->nr = 0;
614
615         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
616                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
617                 if (k.k->p.offset != pos.inode)
618                         break;
619
620                 if (k.k->type == KEY_TYPE_inode)
621                         add_inode(c, w, bkey_s_c_to_inode(k));
622         }
623         bch2_trans_iter_exit(trans, &iter);
624
625         if (ret)
626                 return ret;
627
628         w->cur_inum             = pos.inode;
629         w->first_this_inode     = true;
630 lookup_snapshot:
631         for (i = 0; i < w->nr; i++)
632                 if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
633                         goto found;
634         return INT_MAX;
635 found:
636         BUG_ON(pos.snapshot > w->d[i].snapshot);
637
638         if (pos.snapshot != w->d[i].snapshot) {
639                 ancestor_pos = i;
640
641                 while (i && w->d[i - 1].snapshot > pos.snapshot)
642                         --i;
643
644                 ret = inode_walker_realloc(w);
645                 if (ret)
646                         return ret;
647
648                 array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
649                 w->d[i].snapshot = pos.snapshot;
650                 w->d[i].count   = 0;
651         }
652
653         return i;
654 }
655
656 static int walk_inode(struct btree_trans *trans,
657                       struct inode_walker *w, struct bpos pos)
658 {
659         return lockrestart_do(trans, __walk_inode(trans, w, pos));
660 }
661
662 static int __get_visible_inodes(struct btree_trans *trans,
663                                 struct inode_walker *w,
664                                 struct snapshots_seen *s,
665                                 u64 inum)
666 {
667         struct bch_fs *c = trans->c;
668         struct btree_iter iter;
669         struct bkey_s_c k;
670         int ret;
671
672         w->nr = 0;
673
674         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
675                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
676                 if (k.k->p.offset != inum)
677                         break;
678
679                 if (k.k->type != KEY_TYPE_inode)
680                         continue;
681
682                 if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
683                         add_inode(c, w, bkey_s_c_to_inode(k));
684                         if (k.k->p.snapshot >= s->pos.snapshot)
685                                 break;
686                 }
687         }
688         bch2_trans_iter_exit(trans, &iter);
689
690         return ret;
691 }
692
693 static int check_key_has_snapshot(struct btree_trans *trans,
694                                   struct btree_iter *iter,
695                                   struct bkey_s_c k)
696 {
697         struct bch_fs *c = trans->c;
698         char buf[200];
699         int ret = 0;
700
701         if (fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
702                         "key in missing snapshot: %s",
703                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) {
704                 ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
705                         bch2_btree_delete_at(trans, iter,
706                                              BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
707                 return ret ?: -EINTR;
708         }
709 fsck_err:
710         return ret;
711 }
712
713 static int hash_redo_key(struct btree_trans *trans,
714                          const struct bch_hash_desc desc,
715                          struct bch_hash_info *hash_info,
716                          struct btree_iter *k_iter, struct bkey_s_c k)
717 {
718         bch_err(trans->c, "hash_redo_key() not implemented yet");
719         return -EINVAL;
720 #if 0
721         struct bkey_i *delete;
722         struct bkey_i *tmp;
723
724         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
725         if (IS_ERR(delete))
726                 return PTR_ERR(delete);
727
728         tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
729         if (IS_ERR(tmp))
730                 return PTR_ERR(tmp);
731
732         bkey_reassemble(tmp, k);
733
734         bkey_init(&delete->k);
735         delete->k.p = k_iter->pos;
736         return  bch2_btree_iter_traverse(k_iter) ?:
737                 bch2_trans_update(trans, k_iter, delete, 0) ?:
738                 bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
739 #endif
740 }
741
742 static int fsck_hash_delete_at(struct btree_trans *trans,
743                                const struct bch_hash_desc desc,
744                                struct bch_hash_info *info,
745                                struct btree_iter *iter)
746 {
747         int ret;
748 retry:
749         ret   = bch2_hash_delete_at(trans, desc, info, iter, 0) ?:
750                 bch2_trans_commit(trans, NULL, NULL,
751                                   BTREE_INSERT_NOFAIL|
752                                   BTREE_INSERT_LAZY_RW);
753         if (ret == -EINTR) {
754                 ret = bch2_btree_iter_traverse(iter);
755                 if (!ret)
756                         goto retry;
757         }
758
759         return ret;
760 }
761
762 static int hash_check_key(struct btree_trans *trans,
763                           const struct bch_hash_desc desc,
764                           struct bch_hash_info *hash_info,
765                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
766 {
767         struct bch_fs *c = trans->c;
768         struct btree_iter iter = { NULL };
769         char buf[200];
770         struct bkey_s_c k;
771         u64 hash;
772         int ret = 0;
773
774         if (hash_k.k->type != desc.key_type)
775                 return 0;
776
777         hash = desc.hash_bkey(hash_info, hash_k);
778
779         if (likely(hash == hash_k.k->p.offset))
780                 return 0;
781
782         if (hash_k.k->p.offset < hash)
783                 goto bad_hash;
784
785         for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
786                            BTREE_ITER_SLOTS, k, ret) {
787                 if (!bkey_cmp(k.k->p, hash_k.k->p))
788                         break;
789
790                 if (fsck_err_on(k.k->type == desc.key_type &&
791                                 !desc.cmp_bkey(k, hash_k), c,
792                                 "duplicate hash table keys:\n%s",
793                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
794                                                        hash_k), buf))) {
795                         ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter);
796                         if (ret)
797                                 return ret;
798                         ret = 1;
799                         break;
800                 }
801
802                 if (bkey_deleted(k.k)) {
803                         bch2_trans_iter_exit(trans, &iter);
804                         goto bad_hash;
805                 }
806
807         }
808         bch2_trans_iter_exit(trans, &iter);
809         return ret;
810 bad_hash:
811         if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
812                      "hashed to %llu\n%s",
813                      desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
814                      (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
815                 return 0;
816
817         ret = __bch2_trans_do(trans, NULL, NULL,
818                               BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
819                 hash_redo_key(trans, desc, hash_info, k_iter, hash_k));
820         if (ret) {
821                 bch_err(c, "hash_redo_key err %i", ret);
822                 return ret;
823         }
824         return -EINTR;
825 fsck_err:
826         return ret;
827 }
828
829 static int check_inode(struct btree_trans *trans,
830                        struct btree_iter *iter,
831                        struct bch_inode_unpacked *prev,
832                        struct bch_inode_unpacked u)
833 {
834         struct bch_fs *c = trans->c;
835         bool do_update = false;
836         int ret = 0;
837
838         if (fsck_err_on(prev &&
839                         (prev->bi_hash_seed             != u.bi_hash_seed ||
840                          mode_to_type(prev->bi_mode) != mode_to_type(u.bi_mode)), c,
841                         "inodes in different snapshots don't match")) {
842                 bch_err(c, "repair not implemented yet");
843                 return -EINVAL;
844         }
845
846         if (u.bi_flags & BCH_INODE_UNLINKED &&
847             (!c->sb.clean ||
848              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
849                       u.bi_inum))) {
850                 bch2_trans_unlock(trans);
851                 bch2_fs_lazy_rw(c);
852
853                 ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
854                 if (ret)
855                         bch_err(c, "error in fsck: error %i while deleting inode", ret);
856                 return ret;
857         }
858
859         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
860             (!c->sb.clean ||
861              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
862                       u.bi_inum))) {
863                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
864
865                 bch2_trans_unlock(trans);
866                 bch2_fs_lazy_rw(c);
867
868                 /*
869                  * XXX: need to truncate partial blocks too here - or ideally
870                  * just switch units to bytes and that issue goes away
871                  */
872                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
873                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
874                                      iter->pos.snapshot),
875                                 POS(u.bi_inum, U64_MAX),
876                                 0, NULL);
877                 if (ret) {
878                         bch_err(c, "error in fsck: error %i truncating inode", ret);
879                         return ret;
880                 }
881
882                 /*
883                  * We truncated without our normal sector accounting hook, just
884                  * make sure we recalculate it:
885                  */
886                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
887
888                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
889                 do_update = true;
890         }
891
892         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
893             (!c->sb.clean ||
894              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
895                       u.bi_inum))) {
896                 s64 sectors;
897
898                 bch_verbose(c, "recounting sectors for inode %llu",
899                             u.bi_inum);
900
901                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
902                 if (sectors < 0) {
903                         bch_err(c, "error in fsck: error %i recounting inode sectors",
904                                 (int) sectors);
905                         return sectors;
906                 }
907
908                 u.bi_sectors = sectors;
909                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
910                 do_update = true;
911         }
912
913         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
914                 u.bi_dir = 0;
915                 u.bi_dir_offset = 0;
916                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
917                 do_update = true;
918         }
919
920         if (do_update) {
921                 ret = write_inode(trans, &u, iter->pos.snapshot);
922                 if (ret)
923                         bch_err(c, "error in fsck: error %i "
924                                 "updating inode", ret);
925         }
926 fsck_err:
927         return ret;
928 }
929
930 noinline_for_stack
931 static int check_inodes(struct bch_fs *c, bool full)
932 {
933         struct btree_trans trans;
934         struct btree_iter iter;
935         struct bkey_s_c k;
936         struct bkey_s_c_inode inode;
937         struct bch_inode_unpacked prev, u;
938         int ret;
939
940         memset(&prev, 0, sizeof(prev));
941
942         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
943
944         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
945                            BTREE_ITER_INTENT|
946                            BTREE_ITER_PREFETCH|
947                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
948                 ret = check_key_has_snapshot(&trans, &iter, k);
949                 if (ret)
950                         break;
951
952                 /*
953                  * if snapshot id isn't a leaf node, skip it - deletion in
954                  * particular is not atomic, so on the internal snapshot nodes
955                  * we can see inodes marked for deletion after a clean shutdown
956                  */
957                 if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
958                         continue;
959
960                 if (k.k->type != KEY_TYPE_inode)
961                         continue;
962
963                 inode = bkey_s_c_to_inode(k);
964
965                 if (!full &&
966                     !(inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
967                                            BCH_INODE_I_SECTORS_DIRTY|
968                                            BCH_INODE_UNLINKED)))
969                         continue;
970
971                 BUG_ON(bch2_inode_unpack(inode, &u));
972
973                 ret = check_inode(&trans, &iter,
974                                   full && prev.bi_inum == u.bi_inum
975                                   ? &prev : NULL, u);
976                 if (ret)
977                         break;
978
979                 prev = u;
980         }
981         bch2_trans_iter_exit(&trans, &iter);
982
983         BUG_ON(ret == -EINTR);
984
985         bch2_trans_exit(&trans);
986         return ret;
987 }
988
989 noinline_for_stack
990 static int check_subvols(struct bch_fs *c)
991 {
992         struct btree_trans trans;
993         struct btree_iter iter;
994         struct bkey_s_c k;
995         struct bkey_s_c_subvolume subvol;
996         int ret;
997
998         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
999
1000         for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN,
1001                            0, k, ret) {
1002                 if (k.k->type != KEY_TYPE_subvolume)
1003                         continue;
1004
1005                 subvol = bkey_s_c_to_subvolume(k);
1006
1007                 if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
1008                         ret = __bch2_trans_do(&trans,  NULL, NULL,
1009                                               BTREE_INSERT_LAZY_RW,
1010                                         bch2_subvolume_delete(&trans, iter.pos.offset));
1011                         if (ret) {
1012                                 bch_err(c, "error deleting subvolume %llu: %i",
1013                                         iter.pos.offset, ret);
1014                                 break;
1015                         }
1016                 }
1017         }
1018         bch2_trans_iter_exit(&trans, &iter);
1019
1020         bch2_trans_exit(&trans);
1021         return ret;
1022 }
1023
1024 /*
1025  * Checking for overlapping extents needs to be reimplemented
1026  */
1027 #if 0
1028 static int fix_overlapping_extent(struct btree_trans *trans,
1029                                        struct bkey_s_c k, struct bpos cut_at)
1030 {
1031         struct btree_iter iter;
1032         struct bkey_i *u;
1033         int ret;
1034
1035         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1036         ret = PTR_ERR_OR_ZERO(u);
1037         if (ret)
1038                 return ret;
1039
1040         bkey_reassemble(u, k);
1041         bch2_cut_front(cut_at, u);
1042
1043
1044         /*
1045          * We don't want to go through the extent_handle_overwrites path:
1046          *
1047          * XXX: this is going to screw up disk accounting, extent triggers
1048          * assume things about extent overwrites - we should be running the
1049          * triggers manually here
1050          */
1051         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
1052                              BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
1053
1054         BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
1055         ret   = bch2_btree_iter_traverse(&iter) ?:
1056                 bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
1057                 bch2_trans_commit(trans, NULL, NULL,
1058                                   BTREE_INSERT_NOFAIL|
1059                                   BTREE_INSERT_LAZY_RW);
1060         bch2_trans_iter_exit(trans, &iter);
1061         return ret;
1062 }
1063 #endif
1064
1065 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
1066                                                 struct btree_iter *iter,
1067                                                 struct bpos pos)
1068 {
1069         struct bkey_s_c k;
1070         int ret;
1071
1072         bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
1073         k = bch2_btree_iter_peek_slot(iter);
1074         ret = bkey_err(k);
1075         if (!ret && k.k->type != KEY_TYPE_dirent)
1076                 ret = -ENOENT;
1077         if (ret) {
1078                 bch2_trans_iter_exit(trans, iter);
1079                 return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
1080         }
1081
1082         return bkey_s_c_to_dirent(k);
1083 }
1084
1085 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
1086                                    struct bkey_s_c_dirent d)
1087 {
1088         return  inode->bi_dir           == d.k->p.inode &&
1089                 inode->bi_dir_offset    == d.k->p.offset;
1090 }
1091
1092 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
1093                                    struct bch_inode_unpacked *inode)
1094 {
1095         return d.v->d_type == DT_SUBVOL
1096                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
1097                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
1098 }
1099
1100 static int inode_backpointer_exists(struct btree_trans *trans,
1101                                     struct bch_inode_unpacked *inode,
1102                                     u32 snapshot)
1103 {
1104         struct btree_iter iter;
1105         struct bkey_s_c_dirent d;
1106         int ret;
1107
1108         d = dirent_get_by_pos(trans, &iter,
1109                         SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
1110         ret = bkey_err(d.s_c);
1111         if (ret)
1112                 return ret;
1113
1114         ret = dirent_points_to_inode(d, inode);
1115         bch2_trans_iter_exit(trans, &iter);
1116         return ret;
1117 }
1118
1119 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1120 {
1121         struct bch_fs *c = trans->c;
1122         struct inode_walker_entry *i;
1123         int ret = 0, ret2 = 0;
1124         s64 count2;
1125
1126         for (i = w->d; i < w->d + w->nr; i++) {
1127                 if (i->inode.bi_sectors == i->count)
1128                         continue;
1129
1130                 count2 = lockrestart_do(trans,
1131                         bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
1132
1133                 if (i->count != count2) {
1134                         bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
1135                                 i->count, count2);
1136                         i->count = count2;
1137                         if (i->inode.bi_sectors == i->count)
1138                                 continue;
1139                 }
1140
1141                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
1142                             "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1143                             w->cur_inum, i->snapshot,
1144                             i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
1145                         continue;
1146
1147                 i->inode.bi_sectors = i->count;
1148                 ret = write_inode(trans, &i->inode, i->snapshot);
1149                 if (ret)
1150                         break;
1151                 ret2 = -EINTR;
1152         }
1153 fsck_err:
1154         return ret ?: ret2;
1155 }
1156
1157 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1158                         struct inode_walker *inode,
1159                         struct snapshots_seen *s)
1160 {
1161         struct bch_fs *c = trans->c;
1162         struct bkey_s_c k;
1163         struct inode_walker_entry *i;
1164         char buf[200];
1165         int ret = 0;
1166
1167         k = bch2_btree_iter_peek(iter);
1168         if (!k.k)
1169                 return 0;
1170
1171         ret = bkey_err(k);
1172         if (ret)
1173                 return ret;
1174
1175         ret = check_key_has_snapshot(trans, iter, k);
1176         if (ret)
1177                 return ret;
1178
1179         ret = snapshots_seen_update(c, s, k.k->p);
1180         if (ret)
1181                 return ret;
1182
1183         if (k.k->type == KEY_TYPE_whiteout)
1184                 return 0;
1185
1186         if (inode->cur_inum != k.k->p.inode) {
1187                 ret = check_i_sectors(trans, inode);
1188                 if (ret)
1189                         return ret;
1190         }
1191 #if 0
1192         if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
1193                 char buf1[200];
1194                 char buf2[200];
1195
1196                 bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
1197                 bch2_bkey_val_to_text(&PBUF(buf2), c, k);
1198
1199                 if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
1200                         return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
1201         }
1202 #endif
1203         ret = __walk_inode(trans, inode, k.k->p);
1204         if (ret < 0)
1205                 return ret;
1206
1207         if (fsck_err_on(ret == INT_MAX, c,
1208                         "extent in missing inode:\n  %s",
1209                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1210                 return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
1211                         bch2_btree_delete_at(trans, iter,
1212                                              BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
1213
1214         if (ret == INT_MAX)
1215                 return 0;
1216
1217         i = inode->d + ret;
1218         ret = 0;
1219
1220         if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
1221                         !S_ISLNK(i->inode.bi_mode), c,
1222                         "extent in non regular inode mode %o:\n  %s",
1223                         i->inode.bi_mode,
1224                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1225                 return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
1226                          bch2_btree_delete_at(trans, iter,
1227                                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
1228
1229         if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
1230                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
1231                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
1232                                         k.k->type != KEY_TYPE_reservation &&
1233                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
1234                                         "extent type %u offset %llu past end of inode %llu, i_size %llu",
1235                                         k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
1236                                 bch2_fs_lazy_rw(c);
1237                                 return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1238                                                 SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
1239                                                      k.k->p.snapshot),
1240                                                 POS(k.k->p.inode, U64_MAX),
1241                                                 0, NULL) ?: -EINTR;
1242                         }
1243                 }
1244         }
1245
1246         if (bkey_extent_is_allocation(k.k))
1247                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
1248                         i->count += k.k->size;
1249 #if 0
1250         bch2_bkey_buf_reassemble(&prev, c, k);
1251 #endif
1252
1253 fsck_err:
1254         return ret;
1255 }
1256
1257 /*
1258  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1259  * that i_size an i_sectors are consistent
1260  */
1261 noinline_for_stack
1262 static int check_extents(struct bch_fs *c)
1263 {
1264         struct inode_walker w = inode_walker_init();
1265         struct snapshots_seen s;
1266         struct btree_trans trans;
1267         struct btree_iter iter;
1268         int ret = 0;
1269
1270 #if 0
1271         struct bkey_buf prev;
1272         bch2_bkey_buf_init(&prev);
1273         prev.k->k = KEY(0, 0, 0);
1274 #endif
1275         snapshots_seen_init(&s);
1276         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1277
1278         bch_verbose(c, "checking extents");
1279
1280         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
1281                              POS(BCACHEFS_ROOT_INO, 0),
1282                              BTREE_ITER_INTENT|
1283                              BTREE_ITER_PREFETCH|
1284                              BTREE_ITER_ALL_SNAPSHOTS);
1285
1286         do {
1287                 ret = lockrestart_do(&trans,
1288                         check_extent(&trans, &iter, &w, &s));
1289                 if (ret)
1290                         break;
1291         } while (bch2_btree_iter_advance(&iter));
1292         bch2_trans_iter_exit(&trans, &iter);
1293 #if 0
1294         bch2_bkey_buf_exit(&prev, c);
1295 #endif
1296         inode_walker_exit(&w);
1297         bch2_trans_exit(&trans);
1298         snapshots_seen_exit(&s);
1299
1300         return ret;
1301 }
1302
1303 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1304 {
1305         struct bch_fs *c = trans->c;
1306         struct inode_walker_entry *i;
1307         int ret = 0, ret2 = 0;
1308         s64 count2;
1309
1310         for (i = w->d; i < w->d + w->nr; i++) {
1311                 if (i->inode.bi_nlink == i->count)
1312                         continue;
1313
1314                 count2 = lockrestart_do(trans,
1315                                 bch2_count_subdirs(trans, w->cur_inum, i->snapshot));
1316
1317                 if (i->count != count2) {
1318                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1319                                 i->count, count2);
1320                         i->count = count2;
1321                         if (i->inode.bi_nlink == i->count)
1322                                 continue;
1323                 }
1324
1325                 if (fsck_err_on(i->inode.bi_nlink != i->count, c,
1326                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1327                                 w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
1328                         i->inode.bi_nlink = i->count;
1329                         ret = write_inode(trans, &i->inode, i->snapshot);
1330                         if (ret)
1331                                 break;
1332                         ret2 = -EINTR;
1333                 }
1334         }
1335 fsck_err:
1336         return ret ?: ret2;
1337 }
1338
1339 static int check_dirent_target(struct btree_trans *trans,
1340                                struct btree_iter *iter,
1341                                struct bkey_s_c_dirent d,
1342                                struct bch_inode_unpacked *target,
1343                                u32 target_snapshot)
1344 {
1345         struct bch_fs *c = trans->c;
1346         bool backpointer_exists = true;
1347         char buf[200];
1348         int ret = 0;
1349
1350         if (!target->bi_dir &&
1351             !target->bi_dir_offset) {
1352                 target->bi_dir          = d.k->p.inode;
1353                 target->bi_dir_offset   = d.k->p.offset;
1354
1355                 ret = write_inode(trans, target, target_snapshot);
1356                 if (ret)
1357                         goto err;
1358         }
1359
1360         if (!inode_points_to_dirent(target, d)) {
1361                 ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
1362                 if (ret < 0)
1363                         goto err;
1364
1365                 backpointer_exists = ret;
1366                 ret = 0;
1367
1368                 if (fsck_err_on(S_ISDIR(target->bi_mode) &&
1369                                 backpointer_exists, c,
1370                                 "directory %llu with multiple links",
1371                                 target->bi_inum)) {
1372                         ret = remove_dirent(trans, d.k->p);
1373                         if (ret)
1374                                 goto err;
1375                         return 0;
1376                 }
1377
1378                 if (fsck_err_on(backpointer_exists &&
1379                                 !target->bi_nlink, c,
1380                                 "inode %llu has multiple links but i_nlink 0",
1381                                 target->bi_inum)) {
1382                         target->bi_nlink++;
1383                         target->bi_flags &= ~BCH_INODE_UNLINKED;
1384
1385                         ret = write_inode(trans, target, target_snapshot);
1386                         if (ret)
1387                                 goto err;
1388                 }
1389
1390                 if (fsck_err_on(!backpointer_exists, c,
1391                                 "inode %llu:%u has wrong backpointer:\n"
1392                                 "got       %llu:%llu\n"
1393                                 "should be %llu:%llu",
1394                                 target->bi_inum, target_snapshot,
1395                                 target->bi_dir,
1396                                 target->bi_dir_offset,
1397                                 d.k->p.inode,
1398                                 d.k->p.offset)) {
1399                         target->bi_dir          = d.k->p.inode;
1400                         target->bi_dir_offset   = d.k->p.offset;
1401
1402                         ret = write_inode(trans, target, target_snapshot);
1403                         if (ret)
1404                                 goto err;
1405                 }
1406         }
1407
1408         if (fsck_err_on(vfs_d_type(d.v->d_type) != mode_to_type(target->bi_mode), c,
1409                         "incorrect d_type: should be %u:\n%s",
1410                         mode_to_type(target->bi_mode),
1411                         (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
1412                 struct bkey_i_dirent *n;
1413
1414                 n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
1415                 if (!n) {
1416                         ret = -ENOMEM;
1417                         goto err;
1418                 }
1419
1420                 bkey_reassemble(&n->k_i, d.s_c);
1421                 n->v.d_type = mode_to_type(target->bi_mode);
1422
1423                 ret = __bch2_trans_do(trans, NULL, NULL,
1424                                       BTREE_INSERT_NOFAIL|
1425                                       BTREE_INSERT_LAZY_RW,
1426                         bch2_trans_update(trans, iter, &n->k_i, 0));
1427                 kfree(n);
1428
1429                 return ret ?: -EINTR;
1430         }
1431
1432         if (d.v->d_type == DT_SUBVOL &&
1433             target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
1434             (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
1435              fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
1436                       le32_to_cpu(d.v->d_parent_subvol),
1437                       target->bi_parent_subvol))) {
1438                 struct bkey_i_dirent *n;
1439
1440                 n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
1441                 if (!n) {
1442                         ret = -ENOMEM;
1443                         goto err;
1444                 }
1445
1446                 bkey_reassemble(&n->k_i, d.s_c);
1447                 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1448
1449                 ret = __bch2_trans_do(trans, NULL, NULL,
1450                                       BTREE_INSERT_NOFAIL|
1451                                       BTREE_INSERT_LAZY_RW,
1452                         bch2_trans_update(trans, iter, &n->k_i, 0));
1453                 kfree(n);
1454
1455                 return ret ?: -EINTR;
1456         }
1457 err:
1458 fsck_err:
1459         return ret;
1460 }
1461
1462 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1463                         struct bch_hash_info *hash_info,
1464                         struct inode_walker *dir,
1465                         struct inode_walker *target,
1466                         struct snapshots_seen *s)
1467 {
1468         struct bch_fs *c = trans->c;
1469         struct bkey_s_c k;
1470         struct bkey_s_c_dirent d;
1471         struct inode_walker_entry *i;
1472         char buf[200];
1473         int ret;
1474
1475         k = bch2_btree_iter_peek(iter);
1476         if (!k.k)
1477                 return 0;
1478
1479         ret = bkey_err(k);
1480         if (ret)
1481                 return ret;
1482
1483         ret = check_key_has_snapshot(trans, iter, k);
1484         if (ret)
1485                 return ret;
1486
1487         ret = snapshots_seen_update(c, s, k.k->p);
1488         if (ret)
1489                 return ret;
1490
1491         if (k.k->type == KEY_TYPE_whiteout)
1492                 return 0;
1493
1494         if (dir->cur_inum != k.k->p.inode) {
1495                 ret = check_subdir_count(trans, dir);
1496                 if (ret)
1497                         return ret;
1498         }
1499
1500         ret = __walk_inode(trans, dir, k.k->p);
1501         if (ret < 0)
1502                 return ret;
1503
1504         if (fsck_err_on(ret == INT_MAX, c,
1505                         "dirent in nonexisting directory:\n%s",
1506                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1507                 return __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
1508                                 bch2_btree_delete_at(trans, iter,
1509                                                      BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
1510
1511         if (ret == INT_MAX)
1512                 return 0;
1513
1514         i = dir->d + ret;
1515         ret = 0;
1516
1517         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
1518                         "dirent in non directory inode type %u:\n%s",
1519                         mode_to_type(i->inode.bi_mode),
1520                         (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
1521                 return __bch2_trans_do(trans, NULL, NULL, 0,
1522                                 bch2_btree_delete_at(trans, iter, 0));
1523
1524         if (dir->first_this_inode)
1525                 *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
1526
1527         ret = hash_check_key(trans, bch2_dirent_hash_desc,
1528                              hash_info, iter, k);
1529         if (ret < 0)
1530                 return ret;
1531         if (ret) /* dirent has been deleted */
1532                 return 0;
1533
1534         if (k.k->type != KEY_TYPE_dirent)
1535                 return 0;
1536
1537         d = bkey_s_c_to_dirent(k);
1538
1539         if (d.v->d_type == DT_SUBVOL) {
1540                 struct bch_inode_unpacked subvol_root;
1541                 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1542                 u32 target_snapshot;
1543                 u64 target_inum;
1544
1545                 ret = __subvol_lookup(trans, target_subvol,
1546                                       &target_snapshot, &target_inum);
1547                 if (ret && ret != -ENOENT)
1548                         return ret;
1549
1550                 if (fsck_err_on(ret, c,
1551                                 "dirent points to missing subvolume %llu",
1552                                 le64_to_cpu(d.v->d_child_subvol)))
1553                         return remove_dirent(trans, d.k->p);
1554
1555                 ret = __lookup_inode(trans, target_inum,
1556                                    &subvol_root, &target_snapshot);
1557                 if (ret && ret != -ENOENT)
1558                         return ret;
1559
1560                 if (fsck_err_on(ret, c,
1561                                 "subvolume %u points to missing subvolume root %llu",
1562                                 target_subvol,
1563                                 target_inum)) {
1564                         bch_err(c, "repair not implemented yet");
1565                         return -EINVAL;
1566                 }
1567
1568                 if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
1569                                 "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1570                                 target_inum,
1571                                 subvol_root.bi_subvol, target_subvol)) {
1572                         subvol_root.bi_subvol = target_subvol;
1573                         ret = write_inode(trans, &subvol_root, target_snapshot);
1574                         if (ret)
1575                                 return ret;
1576                 }
1577
1578                 ret = check_dirent_target(trans, iter, d, &subvol_root,
1579                                           target_snapshot);
1580                 if (ret)
1581                         return ret;
1582         } else {
1583                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1584                 if (ret)
1585                         return ret;
1586
1587                 if (fsck_err_on(!target->nr, c,
1588                                 "dirent points to missing inode:\n%s",
1589                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
1590                                                        k), buf))) {
1591                         ret = remove_dirent(trans, d.k->p);
1592                         if (ret)
1593                                 return ret;
1594                 }
1595
1596                 for (i = target->d; i < target->d + target->nr; i++) {
1597                         ret = check_dirent_target(trans, iter, d,
1598                                                   &i->inode, i->snapshot);
1599                         if (ret)
1600                                 return ret;
1601                 }
1602         }
1603
1604         if (d.v->d_type == DT_DIR)
1605                 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
1606                         i->count++;
1607
1608 fsck_err:
1609         return ret;
1610 }
1611
1612 /*
1613  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1614  * validate d_type
1615  */
1616 noinline_for_stack
1617 static int check_dirents(struct bch_fs *c)
1618 {
1619         struct inode_walker dir = inode_walker_init();
1620         struct inode_walker target = inode_walker_init();
1621         struct snapshots_seen s;
1622         struct bch_hash_info hash_info;
1623         struct btree_trans trans;
1624         struct btree_iter iter;
1625         int ret = 0;
1626
1627         bch_verbose(c, "checking dirents");
1628
1629         snapshots_seen_init(&s);
1630         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1631
1632         bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
1633                              POS(BCACHEFS_ROOT_INO, 0),
1634                              BTREE_ITER_INTENT|
1635                              BTREE_ITER_PREFETCH|
1636                              BTREE_ITER_ALL_SNAPSHOTS);
1637
1638         do {
1639                 ret = lockrestart_do(&trans,
1640                         check_dirent(&trans, &iter, &hash_info,
1641                                      &dir, &target, &s));
1642                 if (ret)
1643                         break;
1644         } while (bch2_btree_iter_advance(&iter));
1645         bch2_trans_iter_exit(&trans, &iter);
1646
1647         bch2_trans_exit(&trans);
1648         snapshots_seen_exit(&s);
1649         inode_walker_exit(&dir);
1650         inode_walker_exit(&target);
1651         return ret;
1652 }
1653
1654 /*
1655  * Walk xattrs: verify that they all have a corresponding inode
1656  */
1657 noinline_for_stack
1658 static int check_xattrs(struct bch_fs *c)
1659 {
1660         struct inode_walker w = inode_walker_init();
1661         struct bch_hash_info hash_info;
1662         struct btree_trans trans;
1663         struct btree_iter iter;
1664         struct bkey_s_c k;
1665         int ret = 0;
1666
1667         bch_verbose(c, "checking xattrs");
1668
1669         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1670
1671         bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
1672                              POS(BCACHEFS_ROOT_INO, 0),
1673                              BTREE_ITER_INTENT|
1674                              BTREE_ITER_PREFETCH|
1675                              BTREE_ITER_ALL_SNAPSHOTS);
1676 retry:
1677         bch2_trans_begin(&trans);
1678
1679         while ((k = bch2_btree_iter_peek(&iter)).k &&
1680                !(ret = bkey_err(k))) {
1681                 ret = check_key_has_snapshot(&trans, &iter, k);
1682                 if (ret)
1683                         break;
1684
1685                 ret = walk_inode(&trans, &w, k.k->p);
1686                 if (ret < 0)
1687                         break;
1688
1689                 if (fsck_err_on(ret == INT_MAX, c,
1690                                 "xattr for missing inode %llu",
1691                                 k.k->p.inode)) {
1692                         ret = bch2_btree_delete_at(&trans, &iter, 0);
1693                         if (ret)
1694                                 break;
1695                         continue;
1696                 }
1697
1698                 if (ret == INT_MAX)
1699                         goto next;
1700                 ret = 0;
1701
1702                 if (w.first_this_inode)
1703                         hash_info = bch2_hash_info_init(c, &w.d[0].inode);
1704
1705                 ret = hash_check_key(&trans, bch2_xattr_hash_desc,
1706                                      &hash_info, &iter, k);
1707                 if (ret)
1708                         break;
1709 next:
1710                 bch2_btree_iter_advance(&iter);
1711         }
1712 fsck_err:
1713         if (ret == -EINTR)
1714                 goto retry;
1715
1716         bch2_trans_iter_exit(&trans, &iter);
1717         bch2_trans_exit(&trans);
1718         return ret;
1719 }
1720
1721 /* Get root directory, create if it doesn't exist: */
1722 static int check_root(struct bch_fs *c)
1723 {
1724         struct btree_trans trans;
1725         struct bch_inode_unpacked root_inode;
1726         u32 snapshot;
1727         u64 inum;
1728         int ret;
1729
1730         bch2_trans_init(&trans, c, 0, 0);
1731
1732         bch_verbose(c, "checking root directory");
1733
1734         ret = subvol_lookup(&trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1735         if (ret && ret != -ENOENT)
1736                 return ret;
1737
1738         if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
1739                 struct bkey_i_subvolume root_subvol;
1740
1741                 snapshot        = U32_MAX;
1742                 inum            = BCACHEFS_ROOT_INO;
1743
1744                 bkey_subvolume_init(&root_subvol.k_i);
1745                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1746                 root_subvol.v.flags     = 0;
1747                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1748                 root_subvol.v.inode     = cpu_to_le64(inum);
1749                 ret = __bch2_trans_do(&trans, NULL, NULL,
1750                                       BTREE_INSERT_NOFAIL|
1751                                       BTREE_INSERT_LAZY_RW,
1752                         __bch2_btree_insert(&trans, BTREE_ID_subvolumes, &root_subvol.k_i));
1753                 if (ret) {
1754                         bch_err(c, "error writing root subvol: %i", ret);
1755                         goto err;
1756                 }
1757
1758         }
1759
1760         ret = lookup_inode(&trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1761         if (ret && ret != -ENOENT)
1762                 return ret;
1763
1764         if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
1765             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
1766                                 "root inode not a directory")) {
1767                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1768                                 0, NULL);
1769                 root_inode.bi_inum = inum;
1770
1771                 ret = write_inode(&trans, &root_inode, snapshot);
1772                 if (ret)
1773                         bch_err(c, "error writing root inode: %i", ret);
1774         }
1775 err:
1776 fsck_err:
1777         bch2_trans_exit(&trans);
1778         return ret;
1779 }
1780
1781 struct pathbuf {
1782         size_t          nr;
1783         size_t          size;
1784
1785         struct pathbuf_entry {
1786                 u64     inum;
1787                 u32     snapshot;
1788         }               *entries;
1789 };
1790
1791 static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
1792 {
1793         struct pathbuf_entry *i;
1794
1795         for (i = p->entries; i < p->entries + p->nr; i++)
1796                 if (i->inum     == inum &&
1797                     i->snapshot == snapshot)
1798                         return true;
1799
1800         return false;
1801 }
1802
1803 static int path_down(struct pathbuf *p, u64 inum, u32 snapshot)
1804 {
1805         if (p->nr == p->size) {
1806                 size_t new_size = max_t(size_t, 256UL, p->size * 2);
1807                 void *n = krealloc(p->entries,
1808                                    new_size * sizeof(p->entries[0]),
1809                                    GFP_KERNEL);
1810                 if (!n) {
1811                         return -ENOMEM;
1812                 }
1813
1814                 p->entries = n;
1815                 p->size = new_size;
1816         };
1817
1818         p->entries[p->nr++] = (struct pathbuf_entry) {
1819                 .inum           = inum,
1820                 .snapshot       = snapshot,
1821         };
1822         return 0;
1823 }
1824
1825 /*
1826  * Check that a given inode is reachable from the root:
1827  *
1828  * XXX: we should also be verifying that inodes are in the right subvolumes
1829  */
1830 static int check_path(struct btree_trans *trans,
1831                       struct pathbuf *p,
1832                       struct bch_inode_unpacked *inode,
1833                       u32 snapshot)
1834 {
1835         struct bch_fs *c = trans->c;
1836         int ret = 0;
1837
1838         snapshot = snapshot_t(c, snapshot)->equiv;
1839         p->nr = 0;
1840
1841         while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
1842                  inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
1843                 struct btree_iter dirent_iter;
1844                 struct bkey_s_c_dirent d;
1845                 u32 parent_snapshot = snapshot;
1846
1847                 if (inode->bi_subvol) {
1848                         u64 inum;
1849
1850                         ret = subvol_lookup(trans, inode->bi_parent_subvol,
1851                                             &parent_snapshot, &inum);
1852                         if (ret)
1853                                 break;
1854                 }
1855
1856                 ret = lockrestart_do(trans,
1857                         PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
1858                                           SPOS(inode->bi_dir, inode->bi_dir_offset,
1859                                                parent_snapshot))).k));
1860                 if (ret && ret != -ENOENT)
1861                         break;
1862
1863                 if (!ret && !dirent_points_to_inode(d, inode)) {
1864                         bch2_trans_iter_exit(trans, &dirent_iter);
1865                         ret = -ENOENT;
1866                 }
1867
1868                 if (ret == -ENOENT) {
1869                         if (fsck_err(c,  "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu",
1870                                      inode->bi_inum, snapshot,
1871                                      mode_to_type(inode->bi_mode),
1872                                      inode->bi_nlink,
1873                                      inode->bi_dir,
1874                                      inode->bi_dir_offset))
1875                                 ret = reattach_inode(trans, inode, snapshot);
1876                         break;
1877                 }
1878
1879                 bch2_trans_iter_exit(trans, &dirent_iter);
1880
1881                 if (!S_ISDIR(inode->bi_mode))
1882                         break;
1883
1884                 ret = path_down(p, inode->bi_inum, snapshot);
1885                 if (ret) {
1886                         bch_err(c, "memory allocation failure");
1887                         return ret;
1888                 }
1889
1890                 snapshot = parent_snapshot;
1891
1892                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
1893                 if (ret) {
1894                         /* Should have been caught in dirents pass */
1895                         bch_err(c, "error looking up parent directory: %i", ret);
1896                         break;
1897                 }
1898
1899                 if (path_is_dup(p, inode->bi_inum, snapshot)) {
1900                         struct pathbuf_entry *i;
1901
1902                         /* XXX print path */
1903                         bch_err(c, "directory structure loop");
1904
1905                         for (i = p->entries; i < p->entries + p->nr; i++)
1906                                 pr_err("%llu:%u", i->inum, i->snapshot);
1907                         pr_err("%llu:%u", inode->bi_inum, snapshot);
1908
1909                         if (!fsck_err(c, "directory structure loop"))
1910                                 return 0;
1911
1912                         ret = lockrestart_do(trans,
1913                                         remove_backpointer(trans, inode));
1914                         if (ret) {
1915                                 bch_err(c, "error removing dirent: %i", ret);
1916                                 break;
1917                         }
1918
1919                         ret = reattach_inode(trans, inode, snapshot);
1920                 }
1921         }
1922 fsck_err:
1923         if (ret)
1924                 bch_err(c, "%s: err %i", __func__, ret);
1925         return ret;
1926 }
1927
1928 /*
1929  * Check for unreachable inodes, as well as loops in the directory structure:
1930  * After check_dirents(), if an inode backpointer doesn't exist that means it's
1931  * unreachable:
1932  */
1933 static int check_directory_structure(struct bch_fs *c)
1934 {
1935         struct btree_trans trans;
1936         struct btree_iter iter;
1937         struct bkey_s_c k;
1938         struct bch_inode_unpacked u;
1939         struct pathbuf path = { 0, 0, NULL };
1940         int ret;
1941
1942         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1943
1944         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
1945                            BTREE_ITER_INTENT|
1946                            BTREE_ITER_PREFETCH|
1947                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
1948                 if (k.k->type != KEY_TYPE_inode)
1949                         continue;
1950
1951                 ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
1952                 if (ret) {
1953                         /* Should have been caught earlier in fsck: */
1954                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
1955                         break;
1956                 }
1957
1958                 if (u.bi_flags & BCH_INODE_UNLINKED)
1959                         continue;
1960
1961                 ret = check_path(&trans, &path, &u, iter.pos.snapshot);
1962                 if (ret)
1963                         break;
1964         }
1965         bch2_trans_iter_exit(&trans, &iter);
1966
1967         BUG_ON(ret == -EINTR);
1968
1969         kfree(path.entries);
1970
1971         bch2_trans_exit(&trans);
1972         return ret;
1973 }
1974
1975 struct nlink_table {
1976         size_t          nr;
1977         size_t          size;
1978
1979         struct nlink {
1980                 u64     inum;
1981                 u32     snapshot;
1982                 u32     count;
1983         }               *d;
1984 };
1985
1986 static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot)
1987 {
1988         if (t->nr == t->size) {
1989                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
1990                 void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
1991                 if (!d) {
1992                         return -ENOMEM;
1993                 }
1994
1995                 if (t->d)
1996                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
1997                 kvfree(t->d);
1998
1999                 t->d = d;
2000                 t->size = new_size;
2001         }
2002
2003
2004         t->d[t->nr++] = (struct nlink) {
2005                 .inum           = inum,
2006                 .snapshot       = snapshot,
2007         };
2008
2009         return 0;
2010 }
2011
2012 static int nlink_cmp(const void *_l, const void *_r)
2013 {
2014         const struct nlink *l = _l;
2015         const struct nlink *r = _r;
2016
2017         return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
2018 }
2019
2020 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2021                      struct nlink_table *links,
2022                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2023 {
2024         struct nlink *link, key = {
2025                 .inum = inum, .snapshot = U32_MAX,
2026         };
2027
2028         if (inum < range_start || inum >= range_end)
2029                 return;
2030
2031         link = __inline_bsearch(&key, links->d, links->nr,
2032                                 sizeof(links->d[0]), nlink_cmp);
2033         if (!link)
2034                 return;
2035
2036         while (link > links->d && link[0].inum == link[-1].inum)
2037                 --link;
2038
2039         for (; link < links->d + links->nr && link->inum == inum; link++)
2040                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2041                         link->count++;
2042                         if (link->snapshot >= snapshot)
2043                                 break;
2044                 }
2045 }
2046
2047 noinline_for_stack
2048 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2049                                        struct nlink_table *t,
2050                                        u64 start, u64 *end)
2051 {
2052         struct btree_trans trans;
2053         struct btree_iter iter;
2054         struct bkey_s_c k;
2055         struct bkey_s_c_inode inode;
2056         struct bch_inode_unpacked u;
2057         int ret = 0;
2058
2059         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2060
2061         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2062                            POS(0, start),
2063                            BTREE_ITER_INTENT|
2064                            BTREE_ITER_PREFETCH|
2065                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2066                 if (k.k->type != KEY_TYPE_inode)
2067                         continue;
2068
2069                 inode = bkey_s_c_to_inode(k);
2070
2071                 /*
2072                  * Backpointer and directory structure checks are sufficient for
2073                  * directories, since they can't have hardlinks:
2074                  */
2075                 if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
2076                         continue;
2077
2078                 /* Should never fail, checked by bch2_inode_invalid: */
2079                 BUG_ON(bch2_inode_unpack(inode, &u));
2080
2081                 if (!u.bi_nlink)
2082                         continue;
2083
2084                 ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot);
2085                 if (ret) {
2086                         *end = k.k->p.offset;
2087                         ret = 0;
2088                         break;
2089                 }
2090
2091         }
2092         bch2_trans_iter_exit(&trans, &iter);
2093         bch2_trans_exit(&trans);
2094
2095         if (ret)
2096                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2097
2098         return ret;
2099 }
2100
2101 noinline_for_stack
2102 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2103                                      u64 range_start, u64 range_end)
2104 {
2105         struct btree_trans trans;
2106         struct snapshots_seen s;
2107         struct btree_iter iter;
2108         struct bkey_s_c k;
2109         struct bkey_s_c_dirent d;
2110         int ret;
2111
2112         snapshots_seen_init(&s);
2113
2114         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2115
2116         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
2117                            BTREE_ITER_INTENT|
2118                            BTREE_ITER_PREFETCH|
2119                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2120                 ret = snapshots_seen_update(c, &s, k.k->p);
2121                 if (ret)
2122                         break;
2123
2124                 switch (k.k->type) {
2125                 case KEY_TYPE_dirent:
2126                         d = bkey_s_c_to_dirent(k);
2127
2128                         if (d.v->d_type != DT_DIR &&
2129                             d.v->d_type != DT_SUBVOL)
2130                                 inc_link(c, &s, links, range_start, range_end,
2131                                          le64_to_cpu(d.v->d_inum),
2132                                          d.k->p.snapshot);
2133                         break;
2134                 }
2135         }
2136         bch2_trans_iter_exit(&trans, &iter);
2137
2138         if (ret)
2139                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
2140
2141         bch2_trans_exit(&trans);
2142         snapshots_seen_exit(&s);
2143         return ret;
2144 }
2145
2146 noinline_for_stack
2147 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2148                                struct nlink_table *links,
2149                                u64 range_start, u64 range_end)
2150 {
2151         struct btree_trans trans;
2152         struct btree_iter iter;
2153         struct bkey_s_c k;
2154         struct bkey_s_c_inode inode;
2155         struct bch_inode_unpacked u;
2156         struct nlink *link = links->d;
2157         int ret = 0;
2158
2159         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2160
2161         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2162                            POS(0, range_start),
2163                            BTREE_ITER_INTENT|
2164                            BTREE_ITER_PREFETCH|
2165                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2166                 if (k.k->p.offset >= range_end)
2167                         break;
2168
2169                 if (k.k->type != KEY_TYPE_inode)
2170                         continue;
2171
2172                 inode = bkey_s_c_to_inode(k);
2173                 if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
2174                         continue;
2175
2176                 BUG_ON(bch2_inode_unpack(inode, &u));
2177
2178                 if (!u.bi_nlink)
2179                         continue;
2180
2181                 while ((cmp_int(link->inum, k.k->p.offset) ?:
2182                         cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2183                         link++;
2184                         BUG_ON(link >= links->d + links->nr);
2185                 }
2186
2187                 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
2188                                 "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
2189                                 u.bi_inum, mode_to_type(u.bi_mode),
2190                                 bch2_inode_nlink_get(&u), link->count)) {
2191                         bch2_inode_nlink_set(&u, link->count);
2192
2193                         ret = write_inode(&trans, &u, k.k->p.snapshot);
2194                         if (ret)
2195                                 bch_err(c, "error in fsck: error %i updating inode", ret);
2196                 }
2197         }
2198 fsck_err:
2199         bch2_trans_iter_exit(&trans, &iter);
2200         bch2_trans_exit(&trans);
2201
2202         if (ret)
2203                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2204
2205         return ret;
2206 }
2207
2208 noinline_for_stack
2209 static int check_nlinks(struct bch_fs *c)
2210 {
2211         struct nlink_table links = { 0 };
2212         u64 this_iter_range_start, next_iter_range_start = 0;
2213         int ret = 0;
2214
2215         bch_verbose(c, "checking inode nlinks");
2216
2217         do {
2218                 this_iter_range_start = next_iter_range_start;
2219                 next_iter_range_start = U64_MAX;
2220
2221                 ret = check_nlinks_find_hardlinks(c, &links,
2222                                                   this_iter_range_start,
2223                                                   &next_iter_range_start);
2224
2225                 ret = check_nlinks_walk_dirents(c, &links,
2226                                           this_iter_range_start,
2227                                           next_iter_range_start);
2228                 if (ret)
2229                         break;
2230
2231                 ret = check_nlinks_update_hardlinks(c, &links,
2232                                          this_iter_range_start,
2233                                          next_iter_range_start);
2234                 if (ret)
2235                         break;
2236
2237                 links.nr = 0;
2238         } while (next_iter_range_start != U64_MAX);
2239
2240         kvfree(links.d);
2241
2242         return ret;
2243 }
2244
2245 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
2246 {
2247         struct bkey_s_c k;
2248         struct bkey_s_c_reflink_p p;
2249         struct bkey_i_reflink_p *u;
2250         int ret;
2251
2252         k = bch2_btree_iter_peek(iter);
2253         if (!k.k)
2254                 return 0;
2255
2256         ret = bkey_err(k);
2257         if (ret)
2258                 return ret;
2259
2260         if (k.k->type != KEY_TYPE_reflink_p)
2261                 return 0;
2262
2263         p = bkey_s_c_to_reflink_p(k);
2264
2265         if (!p.v->front_pad && !p.v->back_pad)
2266                 return 0;
2267
2268         u = bch2_trans_kmalloc(trans, sizeof(*u));
2269         ret = PTR_ERR_OR_ZERO(u);
2270         if (ret)
2271                 return ret;
2272
2273         bkey_reassemble(&u->k_i, k);
2274         u->v.front_pad  = 0;
2275         u->v.back_pad   = 0;
2276
2277         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2278 }
2279
2280 static int fix_reflink_p(struct bch_fs *c)
2281 {
2282         struct btree_trans trans;
2283         struct btree_iter iter;
2284         struct bkey_s_c k;
2285         int ret;
2286
2287         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2288                 return 0;
2289
2290         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2291
2292         for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
2293                            BTREE_ITER_INTENT|
2294                            BTREE_ITER_PREFETCH|
2295                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2296                 if (k.k->type == KEY_TYPE_reflink_p) {
2297                         ret = __bch2_trans_do(&trans, NULL, NULL,
2298                                               BTREE_INSERT_NOFAIL|
2299                                               BTREE_INSERT_LAZY_RW,
2300                                               fix_reflink_p_key(&trans, &iter));
2301                         if (ret)
2302                                 break;
2303                 }
2304         }
2305         bch2_trans_iter_exit(&trans, &iter);
2306
2307         bch2_trans_exit(&trans);
2308         return ret;
2309 }
2310
2311 /*
2312  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
2313  * Doesn't fix them yet, mainly because they haven't yet been observed:
2314  */
2315 int bch2_fsck_full(struct bch_fs *c)
2316 {
2317         return  bch2_fs_snapshots_check(c) ?:
2318                 check_inodes(c, true) ?:
2319                 check_subvols(c) ?:
2320                 check_extents(c) ?:
2321                 check_dirents(c) ?:
2322                 check_xattrs(c) ?:
2323                 check_root(c) ?:
2324                 check_directory_structure(c) ?:
2325                 check_nlinks(c) ?:
2326                 fix_reflink_p(c);
2327 }
2328
2329 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
2330 {
2331         return check_inodes(c, false);
2332 }