]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
8783b950055e5c9adfc30ba8fe7229cb3ebb188a
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "dirent.h"
7 #include "error.h"
8 #include "fs-common.h"
9 #include "fsck.h"
10 #include "inode.h"
11 #include "keylist.h"
12 #include "subvolume.h"
13 #include "super.h"
14 #include "xattr.h"
15
16 #include <linux/bsearch.h>
17 #include <linux/dcache.h> /* struct qstr */
18
19 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
20
21 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
22                                     u32 snapshot)
23 {
24         struct btree_iter iter;
25         struct bkey_s_c k;
26         u64 sectors = 0;
27         int ret;
28
29         for_each_btree_key(trans, iter, BTREE_ID_extents,
30                            SPOS(inum, 0, snapshot), 0, k, ret) {
31                 if (k.k->p.inode != inum)
32                         break;
33
34                 if (bkey_extent_is_allocation(k.k))
35                         sectors += k.k->size;
36         }
37
38         bch2_trans_iter_exit(trans, &iter);
39
40         return ret ?: sectors;
41 }
42
43 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
44                                     u32 snapshot)
45 {
46         struct btree_iter iter;
47         struct bkey_s_c k;
48         struct bkey_s_c_dirent d;
49         u64 subdirs = 0;
50         int ret;
51
52         for_each_btree_key(trans, iter, BTREE_ID_dirents,
53                            SPOS(inum, 0, snapshot), 0, k, ret) {
54                 if (k.k->p.inode != inum)
55                         break;
56
57                 if (k.k->type != KEY_TYPE_dirent)
58                         continue;
59
60                 d = bkey_s_c_to_dirent(k);
61                 if (d.v->d_type == DT_DIR)
62                         subdirs++;
63         }
64
65         bch2_trans_iter_exit(trans, &iter);
66
67         return ret ?: subdirs;
68 }
69
70 static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
71                                     u32 *subvol)
72 {
73         struct btree_iter iter;
74         struct bkey_s_c k;
75         int ret;
76
77         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
78                              POS(0, snapshot), 0);
79         k = bch2_btree_iter_peek_slot(&iter);
80         ret = bkey_err(k);
81         if (ret)
82                 goto err;
83
84         if (k.k->type != KEY_TYPE_snapshot) {
85                 bch_err(trans->c, "snapshot %u not fonud", snapshot);
86                 ret = -ENOENT;
87                 goto err;
88         }
89
90         *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
91 err:
92         bch2_trans_iter_exit(trans, &iter);
93         return ret;
94
95 }
96
97 static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
98                            u32 *snapshot, u64 *inum)
99 {
100         struct bch_subvolume s;
101         int ret;
102
103         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
104
105         *snapshot = le32_to_cpu(s.snapshot);
106         *inum = le64_to_cpu(s.inode);
107         return ret;
108 }
109
110 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
111                          u32 *snapshot, u64 *inum)
112 {
113         return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
114 }
115
116 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
117                               struct bch_inode_unpacked *inode)
118 {
119         struct btree_iter iter;
120         struct bkey_s_c k;
121         int ret;
122
123         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
124                              POS(0, inode_nr),
125                              BTREE_ITER_ALL_SNAPSHOTS);
126         k = bch2_btree_iter_peek(&iter);
127         ret = bkey_err(k);
128         if (ret)
129                 goto err;
130
131         if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) {
132                 ret = -ENOENT;
133                 goto err;
134         }
135
136         ret = bch2_inode_unpack(k, inode);
137 err:
138         if (ret && ret != -EINTR)
139                 bch_err(trans->c, "error %i fetching inode %llu",
140                         ret, inode_nr);
141         bch2_trans_iter_exit(trans, &iter);
142         return ret;
143 }
144
145 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
146                           struct bch_inode_unpacked *inode,
147                           u32 *snapshot)
148 {
149         struct btree_iter iter;
150         struct bkey_s_c k;
151         int ret;
152
153         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
154                              SPOS(0, inode_nr, *snapshot), 0);
155         k = bch2_btree_iter_peek_slot(&iter);
156         ret = bkey_err(k);
157         if (ret)
158                 goto err;
159
160         ret = bkey_is_inode(k.k)
161                 ? bch2_inode_unpack(k, inode)
162                 : -ENOENT;
163         if (!ret)
164                 *snapshot = iter.pos.snapshot;
165 err:
166         if (ret && ret != -EINTR)
167                 bch_err(trans->c, "error %i fetching inode %llu:%u",
168                         ret, inode_nr, *snapshot);
169         bch2_trans_iter_exit(trans, &iter);
170         return ret;
171 }
172
173 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
174                         struct bch_inode_unpacked *inode,
175                         u32 *snapshot)
176 {
177         return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
178 }
179
180 static int __lookup_dirent(struct btree_trans *trans,
181                            struct bch_hash_info hash_info,
182                            subvol_inum dir, struct qstr *name,
183                            u64 *target, unsigned *type)
184 {
185         struct btree_iter iter;
186         struct bkey_s_c_dirent d;
187         int ret;
188
189         ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
190                                &hash_info, dir, name, 0);
191         if (ret)
192                 return ret;
193
194         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
195         *target = le64_to_cpu(d.v->d_inum);
196         *type = d.v->d_type;
197         bch2_trans_iter_exit(trans, &iter);
198         return 0;
199 }
200
201 static int __write_inode(struct btree_trans *trans,
202                          struct bch_inode_unpacked *inode,
203                          u32 snapshot)
204 {
205         struct btree_iter iter;
206         int ret;
207
208         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
209                             SPOS(0, inode->bi_inum, snapshot),
210                             BTREE_ITER_INTENT);
211
212         ret   = bch2_btree_iter_traverse(&iter) ?:
213                 bch2_inode_write(trans, &iter, inode);
214         bch2_trans_iter_exit(trans, &iter);
215         return ret;
216 }
217
218 static int write_inode(struct btree_trans *trans,
219                        struct bch_inode_unpacked *inode,
220                        u32 snapshot)
221 {
222         int ret = __bch2_trans_do(trans, NULL, NULL,
223                                   BTREE_INSERT_NOFAIL|
224                                   BTREE_INSERT_LAZY_RW,
225                                   __write_inode(trans, inode, snapshot));
226         if (ret)
227                 bch_err(trans->c, "error in fsck: error %i updating inode", ret);
228         return ret;
229 }
230
231 static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
232 {
233         struct btree_iter iter = { NULL };
234         struct bkey_i_inode_generation delete;
235         struct bch_inode_unpacked inode_u;
236         struct bkey_s_c k;
237         int ret;
238
239         ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
240                                               SPOS(inum, 0, snapshot),
241                                               SPOS(inum, U64_MAX, snapshot),
242                                               0, NULL) ?:
243                 bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
244                                               SPOS(inum, 0, snapshot),
245                                               SPOS(inum, U64_MAX, snapshot),
246                                               0, NULL) ?:
247                 bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
248                                               SPOS(inum, 0, snapshot),
249                                               SPOS(inum, U64_MAX, snapshot),
250                                               0, NULL);
251         if (ret)
252                 goto err;
253 retry:
254         bch2_trans_begin(trans);
255
256         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
257                              SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
258         k = bch2_btree_iter_peek_slot(&iter);
259
260         ret = bkey_err(k);
261         if (ret)
262                 goto err;
263
264         if (!bkey_is_inode(k.k)) {
265                 bch2_fs_inconsistent(trans->c,
266                                      "inode %llu:%u not found when deleting",
267                                      inum, snapshot);
268                 ret = -EIO;
269                 goto err;
270         }
271
272         bch2_inode_unpack(k, &inode_u);
273
274         /* Subvolume root? */
275         if (inode_u.bi_subvol) {
276                 ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
277                 if (ret)
278                         goto err;
279         }
280
281         bkey_inode_generation_init(&delete.k_i);
282         delete.k.p = iter.pos;
283         delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
284
285         ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
286                 bch2_trans_commit(trans, NULL, NULL,
287                                 BTREE_INSERT_NOFAIL);
288 err:
289         bch2_trans_iter_exit(trans, &iter);
290         if (ret == -EINTR)
291                 goto retry;
292
293         return ret;
294 }
295
296 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
297 {
298         struct bch_fs *c = trans->c;
299         struct btree_iter iter;
300         struct bch_inode_unpacked dir_inode;
301         struct bch_hash_info dir_hash_info;
302         int ret;
303
304         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
305         if (ret)
306                 return ret;
307
308         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
309
310         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
311
312         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
313                                   &dir_hash_info, &iter, 0);
314         bch2_trans_iter_exit(trans, &iter);
315         return ret;
316 }
317
318 /* Get lost+found, create if it doesn't exist: */
319 static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
320                             struct bch_inode_unpacked *lostfound)
321 {
322         struct bch_fs *c = trans->c;
323         struct bch_inode_unpacked root;
324         struct bch_hash_info root_hash_info;
325         struct qstr lostfound_str = QSTR("lost+found");
326         subvol_inum root_inum = { .subvol = subvol };
327         u64 inum = 0;
328         unsigned d_type = 0;
329         u32 snapshot;
330         int ret;
331
332         ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
333         if (ret)
334                 return ret;
335
336         ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
337         if (ret)
338                 return ret;
339
340         root_hash_info = bch2_hash_info_init(c, &root);
341
342         ret = __lookup_dirent(trans, root_hash_info, root_inum,
343                             &lostfound_str, &inum, &d_type);
344         if (ret == -ENOENT) {
345                 bch_notice(c, "creating lost+found");
346                 goto create_lostfound;
347         }
348
349         if (ret && ret != -EINTR)
350                 bch_err(c, "error looking up lost+found: %i", ret);
351         if (ret)
352                 return ret;
353
354         if (d_type != DT_DIR) {
355                 bch_err(c, "error looking up lost+found: not a directory");
356                 return ret;
357         }
358
359         /*
360          * The check_dirents pass has already run, dangling dirents
361          * shouldn't exist here:
362          */
363         return __lookup_inode(trans, inum, lostfound, &snapshot);
364
365 create_lostfound:
366         bch2_inode_init_early(c, lostfound);
367
368         ret = bch2_create_trans(trans, root_inum, &root,
369                                 lostfound, &lostfound_str,
370                                 0, 0, S_IFDIR|0700, 0, NULL, NULL,
371                                 (subvol_inum) { }, 0);
372         if (ret && ret != -EINTR)
373                 bch_err(c, "error creating lost+found: %i", ret);
374         return ret;
375 }
376
377 static int __reattach_inode(struct btree_trans *trans,
378                           struct bch_inode_unpacked *inode,
379                           u32 inode_snapshot)
380 {
381         struct bch_hash_info dir_hash;
382         struct bch_inode_unpacked lostfound;
383         char name_buf[20];
384         struct qstr name;
385         u64 dir_offset = 0;
386         u32 subvol;
387         int ret;
388
389         ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
390         if (ret)
391                 return ret;
392
393         ret = lookup_lostfound(trans, subvol, &lostfound);
394         if (ret)
395                 return ret;
396
397         if (S_ISDIR(inode->bi_mode)) {
398                 lostfound.bi_nlink++;
399
400                 ret = __write_inode(trans, &lostfound, U32_MAX);
401                 if (ret)
402                         return ret;
403         }
404
405         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
406
407         snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
408         name = (struct qstr) QSTR(name_buf);
409
410         ret = bch2_dirent_create(trans,
411                                  (subvol_inum) {
412                                         .subvol = subvol,
413                                         .inum = lostfound.bi_inum,
414                                  },
415                                  &dir_hash,
416                                  inode_d_type(inode),
417                                  &name, inode->bi_inum, &dir_offset,
418                                  BCH_HASH_SET_MUST_CREATE);
419         if (ret)
420                 return ret;
421
422         inode->bi_dir           = lostfound.bi_inum;
423         inode->bi_dir_offset    = dir_offset;
424
425         return __write_inode(trans, inode, inode_snapshot);
426 }
427
428 static int reattach_inode(struct btree_trans *trans,
429                           struct bch_inode_unpacked *inode,
430                           u32 inode_snapshot)
431 {
432         int ret = __bch2_trans_do(trans, NULL, NULL,
433                                   BTREE_INSERT_LAZY_RW|
434                                   BTREE_INSERT_NOFAIL,
435                         __reattach_inode(trans, inode, inode_snapshot));
436         if (ret) {
437                 bch_err(trans->c, "error %i reattaching inode %llu",
438                         ret, inode->bi_inum);
439                 return ret;
440         }
441
442         return ret;
443 }
444
445 static int remove_backpointer(struct btree_trans *trans,
446                               struct bch_inode_unpacked *inode)
447 {
448         struct btree_iter iter;
449         struct bkey_s_c k;
450         int ret;
451
452         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
453                              POS(inode->bi_dir, inode->bi_dir_offset), 0);
454         k = bch2_btree_iter_peek_slot(&iter);
455         ret = bkey_err(k);
456         if (ret)
457                 goto out;
458         if (k.k->type != KEY_TYPE_dirent) {
459                 ret = -ENOENT;
460                 goto out;
461         }
462
463         ret = __remove_dirent(trans, k.k->p);
464 out:
465         bch2_trans_iter_exit(trans, &iter);
466         return ret;
467 }
468
469 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
470 {
471         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
472
473         if (bkey_cmp(s->pos, pos))
474                 s->nr = 0;
475         s->pos = pos;
476
477         /* Might get called multiple times due to lock restarts */
478         if (s->nr && s->d[s->nr - 1] == pos.snapshot)
479                 return 0;
480
481         return snapshots_seen_add(c, s, pos.snapshot);
482 }
483
484 /**
485  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
486  * and @ancestor hasn't been overwritten in @seen
487  *
488  * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
489  */
490 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
491                                     u32 id, u32 ancestor)
492 {
493         ssize_t i;
494
495         BUG_ON(id > ancestor);
496
497         id              = snapshot_t(c, id)->equiv;
498         ancestor        = snapshot_t(c, ancestor)->equiv;
499
500         /* @ancestor should be the snapshot most recently added to @seen */
501         BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
502         BUG_ON(seen->pos.snapshot != ancestor);
503
504         if (id == ancestor)
505                 return true;
506
507         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
508                 return false;
509
510         for (i = seen->nr - 2;
511              i >= 0 && seen->d[i] >= id;
512              --i)
513                 if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
514                     bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
515                         return false;
516
517         return true;
518 }
519
520 /**
521  * ref_visible - given a key with snapshot id @src that points to a key with
522  * snapshot id @dst, test whether there is some snapshot in which @dst is
523  * visible.
524  *
525  * This assumes we're visiting @src keys in natural key order.
526  *
527  * @s   - list of snapshot IDs already seen at @src
528  * @src - snapshot ID of src key
529  * @dst - snapshot ID of dst key
530  */
531 static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
532                        u32 src, u32 dst)
533 {
534         return dst <= src
535                 ? key_visible_in_snapshot(c, s, dst, src)
536                 : bch2_snapshot_is_ancestor(c, src, dst);
537 }
538
539 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)       \
540         for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
541                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
542
543 struct inode_walker {
544         bool                            first_this_inode;
545         u64                             cur_inum;
546
547         size_t                          nr;
548         size_t                          size;
549         struct inode_walker_entry {
550                 struct bch_inode_unpacked inode;
551                 u32                     snapshot;
552                 u64                     count;
553         } *d;
554 };
555
556 static void inode_walker_exit(struct inode_walker *w)
557 {
558         kfree(w->d);
559         w->d = NULL;
560 }
561
562 static struct inode_walker inode_walker_init(void)
563 {
564         return (struct inode_walker) { 0, };
565 }
566
567 static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w)
568 {
569         if (w->nr == w->size) {
570                 size_t new_size = max_t(size_t, 8UL, w->size * 2);
571                 void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
572                                    GFP_KERNEL);
573                 if (!d) {
574                         bch_err(c, "fsck: error allocating memory for inode_walker, size %zu",
575                                 new_size);
576                         return -ENOMEM;
577                 }
578
579                 w->d = d;
580                 w->size = new_size;
581         }
582
583         return 0;
584 }
585
586 static int add_inode(struct bch_fs *c, struct inode_walker *w,
587                      struct bkey_s_c inode)
588 {
589         struct bch_inode_unpacked u;
590         int ret;
591
592         ret = inode_walker_realloc(c, w);
593         if (ret)
594                 return ret;
595
596         BUG_ON(bch2_inode_unpack(inode, &u));
597
598         w->d[w->nr++] = (struct inode_walker_entry) {
599                 .inode          = u,
600                 .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
601         };
602
603         return 0;
604 }
605
606 static int __walk_inode(struct btree_trans *trans,
607                         struct inode_walker *w, struct bpos pos)
608 {
609         struct bch_fs *c = trans->c;
610         struct btree_iter iter;
611         struct bkey_s_c k;
612         unsigned i, ancestor_pos;
613         int ret;
614
615         pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
616
617         if (pos.inode == w->cur_inum) {
618                 w->first_this_inode = false;
619                 goto lookup_snapshot;
620         }
621
622         w->nr = 0;
623
624         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
625                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
626                 if (k.k->p.offset != pos.inode)
627                         break;
628
629                 if (bkey_is_inode(k.k))
630                         add_inode(c, w, k);
631         }
632         bch2_trans_iter_exit(trans, &iter);
633
634         if (ret)
635                 return ret;
636
637         w->cur_inum             = pos.inode;
638         w->first_this_inode     = true;
639 lookup_snapshot:
640         for (i = 0; i < w->nr; i++)
641                 if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
642                         goto found;
643         return INT_MAX;
644 found:
645         BUG_ON(pos.snapshot > w->d[i].snapshot);
646
647         if (pos.snapshot != w->d[i].snapshot) {
648                 ancestor_pos = i;
649
650                 while (i && w->d[i - 1].snapshot > pos.snapshot)
651                         --i;
652
653                 ret = inode_walker_realloc(c, w);
654                 if (ret)
655                         return ret;
656
657                 array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
658                 w->d[i].snapshot = pos.snapshot;
659                 w->d[i].count   = 0;
660         }
661
662         return i;
663 }
664
665 static int __get_visible_inodes(struct btree_trans *trans,
666                                 struct inode_walker *w,
667                                 struct snapshots_seen *s,
668                                 u64 inum)
669 {
670         struct bch_fs *c = trans->c;
671         struct btree_iter iter;
672         struct bkey_s_c k;
673         int ret;
674
675         w->nr = 0;
676
677         for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
678                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
679                 if (k.k->p.offset != inum)
680                         break;
681
682                 if (!bkey_is_inode(k.k))
683                         continue;
684
685                 if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
686                         add_inode(c, w, k);
687                         if (k.k->p.snapshot >= s->pos.snapshot)
688                                 break;
689                 }
690         }
691         bch2_trans_iter_exit(trans, &iter);
692
693         return ret;
694 }
695
696 static int check_key_has_snapshot(struct btree_trans *trans,
697                                   struct btree_iter *iter,
698                                   struct bkey_s_c k)
699 {
700         struct bch_fs *c = trans->c;
701         struct printbuf buf = PRINTBUF;
702         int ret = 0;
703
704         if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
705                         "key in missing snapshot: %s",
706                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
707                 ret = bch2_btree_delete_at(trans, iter,
708                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
709 fsck_err:
710         printbuf_exit(&buf);
711         return ret;
712 }
713
714 static int hash_redo_key(struct btree_trans *trans,
715                          const struct bch_hash_desc desc,
716                          struct bch_hash_info *hash_info,
717                          struct btree_iter *k_iter, struct bkey_s_c k)
718 {
719         bch_err(trans->c, "hash_redo_key() not implemented yet");
720         return -EINVAL;
721 #if 0
722         struct bkey_i *delete;
723         struct bkey_i *tmp;
724
725         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
726         if (IS_ERR(delete))
727                 return PTR_ERR(delete);
728
729         tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
730         if (IS_ERR(tmp))
731                 return PTR_ERR(tmp);
732
733         bkey_reassemble(tmp, k);
734
735         bkey_init(&delete->k);
736         delete->k.p = k_iter->pos;
737         return  bch2_btree_iter_traverse(k_iter) ?:
738                 bch2_trans_update(trans, k_iter, delete, 0) ?:
739                 bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
740 #endif
741 }
742
743 static int hash_check_key(struct btree_trans *trans,
744                           const struct bch_hash_desc desc,
745                           struct bch_hash_info *hash_info,
746                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
747 {
748         struct bch_fs *c = trans->c;
749         struct btree_iter iter = { NULL };
750         struct printbuf buf = PRINTBUF;
751         struct bkey_s_c k;
752         u64 hash;
753         int ret = 0;
754
755         if (hash_k.k->type != desc.key_type)
756                 return 0;
757
758         hash = desc.hash_bkey(hash_info, hash_k);
759
760         if (likely(hash == hash_k.k->p.offset))
761                 return 0;
762
763         if (hash_k.k->p.offset < hash)
764                 goto bad_hash;
765
766         for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
767                            BTREE_ITER_SLOTS, k, ret) {
768                 if (!bkey_cmp(k.k->p, hash_k.k->p))
769                         break;
770
771                 if (fsck_err_on(k.k->type == desc.key_type &&
772                                 !desc.cmp_bkey(k, hash_k), c,
773                                 "duplicate hash table keys:\n%s",
774                                 (printbuf_reset(&buf),
775                                  bch2_bkey_val_to_text(&buf, c, hash_k),
776                                  buf.buf))) {
777                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
778                         break;
779                 }
780
781                 if (bkey_deleted(k.k)) {
782                         bch2_trans_iter_exit(trans, &iter);
783                         goto bad_hash;
784                 }
785
786         }
787 out:
788         bch2_trans_iter_exit(trans, &iter);
789         printbuf_exit(&buf);
790         return ret;
791 bad_hash:
792         if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
793                      "hashed to %llu\n%s",
794                      desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
795                      (printbuf_reset(&buf),
796                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)) == FSCK_ERR_IGNORE)
797                 return 0;
798
799         ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
800         if (ret) {
801                 bch_err(c, "hash_redo_key err %i", ret);
802                 return ret;
803         }
804         ret = -EINTR;
805 fsck_err:
806         goto out;
807 }
808
809 static int check_inode(struct btree_trans *trans,
810                        struct btree_iter *iter,
811                        struct bch_inode_unpacked *prev,
812                        bool full)
813 {
814         struct bch_fs *c = trans->c;
815         struct bkey_s_c k;
816         struct bch_inode_unpacked u;
817         bool do_update = false;
818         int ret;
819
820         k = bch2_btree_iter_peek(iter);
821         if (!k.k)
822                 return 0;
823
824         ret = bkey_err(k);
825         if (ret)
826                 return ret;
827
828         ret = check_key_has_snapshot(trans, iter, k);
829         if (ret)
830                 return ret < 0 ? ret : 0;
831
832         /*
833          * if snapshot id isn't a leaf node, skip it - deletion in
834          * particular is not atomic, so on the internal snapshot nodes
835          * we can see inodes marked for deletion after a clean shutdown
836          */
837         if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
838                 return 0;
839
840         if (!bkey_is_inode(k.k))
841                 return 0;
842
843         BUG_ON(bch2_inode_unpack(k, &u));
844
845         if (!full &&
846             !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
847                             BCH_INODE_I_SECTORS_DIRTY|
848                             BCH_INODE_UNLINKED)))
849                 return 0;
850
851         if (prev->bi_inum != u.bi_inum)
852                 *prev = u;
853
854         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
855                         inode_d_type(prev)      != inode_d_type(&u), c,
856                         "inodes in different snapshots don't match")) {
857                 bch_err(c, "repair not implemented yet");
858                 return -EINVAL;
859         }
860
861         if (u.bi_flags & BCH_INODE_UNLINKED &&
862             (!c->sb.clean ||
863              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
864                       u.bi_inum))) {
865                 bch2_trans_unlock(trans);
866                 bch2_fs_lazy_rw(c);
867
868                 ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
869                 if (ret)
870                         bch_err(c, "error in fsck: error %i while deleting inode", ret);
871                 return ret;
872         }
873
874         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
875             (!c->sb.clean ||
876              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
877                       u.bi_inum))) {
878                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
879
880                 bch2_trans_unlock(trans);
881                 bch2_fs_lazy_rw(c);
882
883                 /*
884                  * XXX: need to truncate partial blocks too here - or ideally
885                  * just switch units to bytes and that issue goes away
886                  */
887                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
888                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
889                                      iter->pos.snapshot),
890                                 POS(u.bi_inum, U64_MAX),
891                                 0, NULL);
892                 if (ret) {
893                         bch_err(c, "error in fsck: error %i truncating inode", ret);
894                         return ret;
895                 }
896
897                 /*
898                  * We truncated without our normal sector accounting hook, just
899                  * make sure we recalculate it:
900                  */
901                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
902
903                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
904                 do_update = true;
905         }
906
907         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
908             (!c->sb.clean ||
909              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
910                       u.bi_inum))) {
911                 s64 sectors;
912
913                 bch_verbose(c, "recounting sectors for inode %llu",
914                             u.bi_inum);
915
916                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
917                 if (sectors < 0) {
918                         bch_err(c, "error in fsck: error %i recounting inode sectors",
919                                 (int) sectors);
920                         return sectors;
921                 }
922
923                 u.bi_sectors = sectors;
924                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
925                 do_update = true;
926         }
927
928         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
929                 u.bi_dir = 0;
930                 u.bi_dir_offset = 0;
931                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
932                 do_update = true;
933         }
934
935         if (do_update) {
936                 ret = write_inode(trans, &u, iter->pos.snapshot);
937                 if (ret)
938                         bch_err(c, "error in fsck: error %i "
939                                 "updating inode", ret);
940         }
941 fsck_err:
942         return ret;
943 }
944
945 noinline_for_stack
946 static int check_inodes(struct bch_fs *c, bool full)
947 {
948         struct btree_trans trans;
949         struct btree_iter iter;
950         struct bch_inode_unpacked prev = { 0 };
951         int ret;
952
953         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
954
955         bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
956                              BTREE_ITER_INTENT|
957                              BTREE_ITER_PREFETCH|
958                              BTREE_ITER_ALL_SNAPSHOTS);
959
960         do {
961                 ret = __bch2_trans_do(&trans, NULL, NULL,
962                                       BTREE_INSERT_LAZY_RW|
963                                       BTREE_INSERT_NOFAIL,
964                         check_inode(&trans, &iter, &prev, full));
965                 if (ret)
966                         break;
967         } while (bch2_btree_iter_advance(&iter));
968         bch2_trans_iter_exit(&trans, &iter);
969
970         bch2_trans_exit(&trans);
971         return ret;
972 }
973
974 static int check_subvol(struct btree_trans *trans,
975                         struct btree_iter *iter)
976 {
977         struct bkey_s_c k;
978         struct bkey_s_c_subvolume subvol;
979         int ret;
980
981         k = bch2_btree_iter_peek(iter);
982         if (!k.k)
983                 return 0;
984
985         ret = bkey_err(k);
986         if (ret)
987                 return ret;
988
989         if (k.k->type != KEY_TYPE_subvolume)
990                 return 0;
991
992         subvol = bkey_s_c_to_subvolume(k);
993
994         if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
995                 ret = bch2_subvolume_delete(trans, iter->pos.offset);
996                 if (ret && ret != -EINTR)
997                         bch_err(trans->c, "error deleting subvolume %llu: %i",
998                                 iter->pos.offset, ret);
999                 if (ret)
1000                         return ret;
1001         }
1002
1003         return 0;
1004 }
1005
1006 noinline_for_stack
1007 static int check_subvols(struct bch_fs *c)
1008 {
1009         struct btree_trans trans;
1010         struct btree_iter iter;
1011         int ret;
1012
1013         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1014
1015         bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
1016                              POS_MIN,
1017                              BTREE_ITER_INTENT|
1018                              BTREE_ITER_PREFETCH);
1019
1020         do {
1021                 ret = __bch2_trans_do(&trans, NULL, NULL,
1022                                       BTREE_INSERT_LAZY_RW|
1023                                       BTREE_INSERT_NOFAIL,
1024                                       check_subvol(&trans, &iter));
1025                 if (ret)
1026                         break;
1027         } while (bch2_btree_iter_advance(&iter));
1028         bch2_trans_iter_exit(&trans, &iter);
1029
1030         bch2_trans_exit(&trans);
1031         return ret;
1032 }
1033
1034 /*
1035  * Checking for overlapping extents needs to be reimplemented
1036  */
1037 #if 0
1038 static int fix_overlapping_extent(struct btree_trans *trans,
1039                                        struct bkey_s_c k, struct bpos cut_at)
1040 {
1041         struct btree_iter iter;
1042         struct bkey_i *u;
1043         int ret;
1044
1045         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1046         ret = PTR_ERR_OR_ZERO(u);
1047         if (ret)
1048                 return ret;
1049
1050         bkey_reassemble(u, k);
1051         bch2_cut_front(cut_at, u);
1052
1053
1054         /*
1055          * We don't want to go through the extent_handle_overwrites path:
1056          *
1057          * XXX: this is going to screw up disk accounting, extent triggers
1058          * assume things about extent overwrites - we should be running the
1059          * triggers manually here
1060          */
1061         bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
1062                              BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
1063
1064         BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
1065         ret   = bch2_btree_iter_traverse(&iter) ?:
1066                 bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
1067                 bch2_trans_commit(trans, NULL, NULL,
1068                                   BTREE_INSERT_NOFAIL|
1069                                   BTREE_INSERT_LAZY_RW);
1070         bch2_trans_iter_exit(trans, &iter);
1071         return ret;
1072 }
1073 #endif
1074
1075 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
1076                                                 struct btree_iter *iter,
1077                                                 struct bpos pos)
1078 {
1079         struct bkey_s_c k;
1080         int ret;
1081
1082         bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
1083         k = bch2_btree_iter_peek_slot(iter);
1084         ret = bkey_err(k);
1085         if (!ret && k.k->type != KEY_TYPE_dirent)
1086                 ret = -ENOENT;
1087         if (ret) {
1088                 bch2_trans_iter_exit(trans, iter);
1089                 return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
1090         }
1091
1092         return bkey_s_c_to_dirent(k);
1093 }
1094
1095 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
1096                                    struct bkey_s_c_dirent d)
1097 {
1098         return  inode->bi_dir           == d.k->p.inode &&
1099                 inode->bi_dir_offset    == d.k->p.offset;
1100 }
1101
1102 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
1103                                    struct bch_inode_unpacked *inode)
1104 {
1105         return d.v->d_type == DT_SUBVOL
1106                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
1107                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
1108 }
1109
1110 static int inode_backpointer_exists(struct btree_trans *trans,
1111                                     struct bch_inode_unpacked *inode,
1112                                     u32 snapshot)
1113 {
1114         struct btree_iter iter;
1115         struct bkey_s_c_dirent d;
1116         int ret;
1117
1118         d = dirent_get_by_pos(trans, &iter,
1119                         SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
1120         ret = bkey_err(d.s_c);
1121         if (ret)
1122                 return ret;
1123
1124         ret = dirent_points_to_inode(d, inode);
1125         bch2_trans_iter_exit(trans, &iter);
1126         return ret;
1127 }
1128
1129 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1130 {
1131         struct bch_fs *c = trans->c;
1132         struct inode_walker_entry *i;
1133         int ret = 0, ret2 = 0;
1134         s64 count2;
1135
1136         for (i = w->d; i < w->d + w->nr; i++) {
1137                 if (i->inode.bi_sectors == i->count)
1138                         continue;
1139
1140                 count2 = lockrestart_do(trans,
1141                         bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
1142
1143                 if (i->count != count2) {
1144                         bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu",
1145                                 i->count, count2);
1146                         i->count = count2;
1147                         if (i->inode.bi_sectors == i->count)
1148                                 continue;
1149                 }
1150
1151                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
1152                             "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1153                             w->cur_inum, i->snapshot,
1154                             i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
1155                         continue;
1156
1157                 i->inode.bi_sectors = i->count;
1158                 ret = write_inode(trans, &i->inode, i->snapshot);
1159                 if (ret)
1160                         break;
1161                 ret2 = -EINTR;
1162         }
1163 fsck_err:
1164         return ret ?: ret2;
1165 }
1166
1167 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1168                         struct inode_walker *inode,
1169                         struct snapshots_seen *s)
1170 {
1171         struct bch_fs *c = trans->c;
1172         struct bkey_s_c k;
1173         struct inode_walker_entry *i;
1174         struct printbuf buf = PRINTBUF;
1175         int ret = 0;
1176
1177         k = bch2_btree_iter_peek(iter);
1178         if (!k.k)
1179                 goto out;
1180
1181         ret = bkey_err(k);
1182         if (ret)
1183                 goto err;
1184
1185         ret = check_key_has_snapshot(trans, iter, k);
1186         if (ret) {
1187                 ret = ret < 0 ? ret : 0;
1188                 goto out;
1189         }
1190
1191         ret = snapshots_seen_update(c, s, k.k->p);
1192         if (ret)
1193                 goto err;
1194
1195         if (k.k->type == KEY_TYPE_whiteout)
1196                 goto out;
1197
1198         if (inode->cur_inum != k.k->p.inode) {
1199                 ret = check_i_sectors(trans, inode);
1200                 if (ret)
1201                         goto err;
1202         }
1203 #if 0
1204         if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
1205                 char buf1[200];
1206                 char buf2[200];
1207
1208                 bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
1209                 bch2_bkey_val_to_text(&PBUF(buf2), c, k);
1210
1211                 if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
1212                         ret = fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
1213                         goto out;
1214                 }
1215         }
1216 #endif
1217         ret = __walk_inode(trans, inode, k.k->p);
1218         if (ret < 0)
1219                 goto err;
1220
1221         if (fsck_err_on(ret == INT_MAX, c,
1222                         "extent in missing inode:\n  %s",
1223                         (printbuf_reset(&buf),
1224                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1225                 ret = bch2_btree_delete_at(trans, iter,
1226                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1227                 goto out;
1228         }
1229
1230         if (ret == INT_MAX) {
1231                 ret = 0;
1232                 goto out;
1233         }
1234
1235         i = inode->d + ret;
1236         ret = 0;
1237
1238         if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
1239                         !S_ISLNK(i->inode.bi_mode), c,
1240                         "extent in non regular inode mode %o:\n  %s",
1241                         i->inode.bi_mode,
1242                         (printbuf_reset(&buf),
1243                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1244                 ret = bch2_btree_delete_at(trans, iter,
1245                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1246                 goto out;
1247         }
1248
1249         if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
1250                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
1251                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
1252                                         k.k->type != KEY_TYPE_reservation &&
1253                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
1254                                         "extent type %u offset %llu past end of inode %llu, i_size %llu",
1255                                         k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
1256                                 bch2_fs_lazy_rw(c);
1257                                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1258                                                 SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
1259                                                      k.k->p.snapshot),
1260                                                 POS(k.k->p.inode, U64_MAX),
1261                                                 0, NULL) ?: -EINTR;
1262                                 goto out;
1263                         }
1264                 }
1265         }
1266
1267         if (bkey_extent_is_allocation(k.k))
1268                 for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
1269                         i->count += k.k->size;
1270 #if 0
1271         bch2_bkey_buf_reassemble(&prev, c, k);
1272 #endif
1273
1274 out:
1275 err:
1276 fsck_err:
1277         printbuf_exit(&buf);
1278         return ret;
1279 }
1280
1281 /*
1282  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1283  * that i_size an i_sectors are consistent
1284  */
1285 noinline_for_stack
1286 static int check_extents(struct bch_fs *c)
1287 {
1288         struct inode_walker w = inode_walker_init();
1289         struct snapshots_seen s;
1290         struct btree_trans trans;
1291         struct btree_iter iter;
1292         int ret = 0;
1293
1294 #if 0
1295         struct bkey_buf prev;
1296         bch2_bkey_buf_init(&prev);
1297         prev.k->k = KEY(0, 0, 0);
1298 #endif
1299         snapshots_seen_init(&s);
1300         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1301
1302         bch_verbose(c, "checking extents");
1303
1304         bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
1305                              POS(BCACHEFS_ROOT_INO, 0),
1306                              BTREE_ITER_INTENT|
1307                              BTREE_ITER_PREFETCH|
1308                              BTREE_ITER_ALL_SNAPSHOTS);
1309
1310         do {
1311                 ret = __bch2_trans_do(&trans, NULL, NULL,
1312                                       BTREE_INSERT_LAZY_RW|
1313                                       BTREE_INSERT_NOFAIL,
1314                         check_extent(&trans, &iter, &w, &s));
1315                 if (ret)
1316                         break;
1317         } while (bch2_btree_iter_advance(&iter));
1318         bch2_trans_iter_exit(&trans, &iter);
1319 #if 0
1320         bch2_bkey_buf_exit(&prev, c);
1321 #endif
1322         inode_walker_exit(&w);
1323         bch2_trans_exit(&trans);
1324         snapshots_seen_exit(&s);
1325
1326         return ret;
1327 }
1328
1329 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1330 {
1331         struct bch_fs *c = trans->c;
1332         struct inode_walker_entry *i;
1333         int ret = 0, ret2 = 0;
1334         s64 count2;
1335
1336         for (i = w->d; i < w->d + w->nr; i++) {
1337                 if (i->inode.bi_nlink == i->count)
1338                         continue;
1339
1340                 count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
1341                 if (count2 < 0)
1342                         return count2;
1343
1344                 if (i->count != count2) {
1345                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1346                                 i->count, count2);
1347                         i->count = count2;
1348                         if (i->inode.bi_nlink == i->count)
1349                                 continue;
1350                 }
1351
1352                 if (fsck_err_on(i->inode.bi_nlink != i->count, c,
1353                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1354                                 w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
1355                         i->inode.bi_nlink = i->count;
1356                         ret = write_inode(trans, &i->inode, i->snapshot);
1357                         if (ret)
1358                                 break;
1359                         ret2 = -EINTR;
1360                 }
1361         }
1362 fsck_err:
1363         return ret ?: ret2;
1364 }
1365
1366 static int check_dirent_target(struct btree_trans *trans,
1367                                struct btree_iter *iter,
1368                                struct bkey_s_c_dirent d,
1369                                struct bch_inode_unpacked *target,
1370                                u32 target_snapshot)
1371 {
1372         struct bch_fs *c = trans->c;
1373         struct bkey_i_dirent *n;
1374         bool backpointer_exists = true;
1375         struct printbuf buf = PRINTBUF;
1376         int ret = 0;
1377
1378         if (!target->bi_dir &&
1379             !target->bi_dir_offset) {
1380                 target->bi_dir          = d.k->p.inode;
1381                 target->bi_dir_offset   = d.k->p.offset;
1382
1383                 ret = __write_inode(trans, target, target_snapshot);
1384                 if (ret)
1385                         goto err;
1386         }
1387
1388         if (!inode_points_to_dirent(target, d)) {
1389                 ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
1390                 if (ret < 0)
1391                         goto err;
1392
1393                 backpointer_exists = ret;
1394                 ret = 0;
1395
1396                 if (fsck_err_on(S_ISDIR(target->bi_mode) &&
1397                                 backpointer_exists, c,
1398                                 "directory %llu with multiple links",
1399                                 target->bi_inum)) {
1400                         ret = __remove_dirent(trans, d.k->p);
1401                         goto out;
1402                 }
1403
1404                 if (fsck_err_on(backpointer_exists &&
1405                                 !target->bi_nlink, c,
1406                                 "inode %llu has multiple links but i_nlink 0",
1407                                 target->bi_inum)) {
1408                         target->bi_nlink++;
1409                         target->bi_flags &= ~BCH_INODE_UNLINKED;
1410
1411                         ret = __write_inode(trans, target, target_snapshot);
1412                         if (ret)
1413                                 goto err;
1414                 }
1415
1416                 if (fsck_err_on(!backpointer_exists, c,
1417                                 "inode %llu:%u has wrong backpointer:\n"
1418                                 "got       %llu:%llu\n"
1419                                 "should be %llu:%llu",
1420                                 target->bi_inum, target_snapshot,
1421                                 target->bi_dir,
1422                                 target->bi_dir_offset,
1423                                 d.k->p.inode,
1424                                 d.k->p.offset)) {
1425                         target->bi_dir          = d.k->p.inode;
1426                         target->bi_dir_offset   = d.k->p.offset;
1427
1428                         ret = __write_inode(trans, target, target_snapshot);
1429                         if (ret)
1430                                 goto err;
1431                 }
1432         }
1433
1434         if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
1435                         "incorrect d_type: got %s, should be %s:\n%s",
1436                         bch2_d_type_str(d.v->d_type),
1437                         bch2_d_type_str(inode_d_type(target)),
1438                         (printbuf_reset(&buf),
1439                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1440                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1441                 ret = PTR_ERR_OR_ZERO(n);
1442                 if (ret)
1443                         goto err;
1444
1445                 bkey_reassemble(&n->k_i, d.s_c);
1446                 n->v.d_type = inode_d_type(target);
1447
1448                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1449                 if (ret)
1450                         goto err;
1451
1452                 d = dirent_i_to_s_c(n);
1453         }
1454
1455         if (d.v->d_type == DT_SUBVOL &&
1456             target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
1457             (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
1458              fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
1459                       le32_to_cpu(d.v->d_parent_subvol),
1460                       target->bi_parent_subvol))) {
1461                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1462                 ret = PTR_ERR_OR_ZERO(n);
1463                 if (ret)
1464                         goto err;
1465
1466                 bkey_reassemble(&n->k_i, d.s_c);
1467                 n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1468
1469                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1470                 if (ret)
1471                         goto err;
1472
1473                 d = dirent_i_to_s_c(n);
1474         }
1475 out:
1476 err:
1477 fsck_err:
1478         printbuf_exit(&buf);
1479         return ret;
1480 }
1481
1482 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1483                         struct bch_hash_info *hash_info,
1484                         struct inode_walker *dir,
1485                         struct inode_walker *target,
1486                         struct snapshots_seen *s)
1487 {
1488         struct bch_fs *c = trans->c;
1489         struct bkey_s_c k;
1490         struct bkey_s_c_dirent d;
1491         struct inode_walker_entry *i;
1492         struct printbuf buf = PRINTBUF;
1493         int ret = 0;
1494
1495         k = bch2_btree_iter_peek(iter);
1496         if (!k.k)
1497                 goto out;
1498
1499         ret = bkey_err(k);
1500         if (ret)
1501                 goto err;
1502
1503         ret = check_key_has_snapshot(trans, iter, k);
1504         if (ret) {
1505                 ret = ret < 0 ? ret : 0;
1506                 goto out;
1507         }
1508
1509         ret = snapshots_seen_update(c, s, k.k->p);
1510         if (ret)
1511                 goto err;
1512
1513         if (k.k->type == KEY_TYPE_whiteout)
1514                 goto out;
1515
1516         if (dir->cur_inum != k.k->p.inode) {
1517                 ret = check_subdir_count(trans, dir);
1518                 if (ret)
1519                         goto err;
1520         }
1521
1522         ret = __walk_inode(trans, dir, k.k->p);
1523         if (ret < 0)
1524                 goto err;
1525
1526         if (fsck_err_on(ret == INT_MAX, c,
1527                         "dirent in nonexisting directory:\n%s",
1528                         (printbuf_reset(&buf),
1529                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1530                 ret = bch2_btree_delete_at(trans, iter,
1531                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1532                 goto out;
1533         }
1534
1535         if (ret == INT_MAX) {
1536                 ret = 0;
1537                 goto out;
1538         }
1539
1540         i = dir->d + ret;
1541         ret = 0;
1542
1543         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
1544                         "dirent in non directory inode type %s:\n%s",
1545                         bch2_d_type_str(inode_d_type(&i->inode)),
1546                         (printbuf_reset(&buf),
1547                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1548                 ret = bch2_btree_delete_at(trans, iter, 0);
1549                 goto out;
1550         }
1551
1552         if (dir->first_this_inode)
1553                 *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
1554
1555         ret = hash_check_key(trans, bch2_dirent_hash_desc,
1556                              hash_info, iter, k);
1557         if (ret < 0)
1558                 goto err;
1559         if (ret) {
1560                 /* dirent has been deleted */
1561                 ret = 0;
1562                 goto out;
1563         }
1564
1565         if (k.k->type != KEY_TYPE_dirent)
1566                 goto out;
1567
1568         d = bkey_s_c_to_dirent(k);
1569
1570         if (d.v->d_type == DT_SUBVOL) {
1571                 struct bch_inode_unpacked subvol_root;
1572                 u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1573                 u32 target_snapshot;
1574                 u64 target_inum;
1575
1576                 ret = __subvol_lookup(trans, target_subvol,
1577                                       &target_snapshot, &target_inum);
1578                 if (ret && ret != -ENOENT)
1579                         goto err;
1580
1581                 if (fsck_err_on(ret, c,
1582                                 "dirent points to missing subvolume %llu",
1583                                 le64_to_cpu(d.v->d_child_subvol))) {
1584                         ret = __remove_dirent(trans, d.k->p);
1585                         goto err;
1586                 }
1587
1588                 ret = __lookup_inode(trans, target_inum,
1589                                    &subvol_root, &target_snapshot);
1590                 if (ret && ret != -ENOENT)
1591                         goto err;
1592
1593                 if (fsck_err_on(ret, c,
1594                                 "subvolume %u points to missing subvolume root %llu",
1595                                 target_subvol,
1596                                 target_inum)) {
1597                         bch_err(c, "repair not implemented yet");
1598                         ret = -EINVAL;
1599                         goto err;
1600                 }
1601
1602                 if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
1603                                 "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
1604                                 target_inum,
1605                                 subvol_root.bi_subvol, target_subvol)) {
1606                         subvol_root.bi_subvol = target_subvol;
1607                         ret = __write_inode(trans, &subvol_root, target_snapshot);
1608                         if (ret)
1609                                 goto err;
1610                 }
1611
1612                 ret = check_dirent_target(trans, iter, d, &subvol_root,
1613                                           target_snapshot);
1614                 if (ret)
1615                         goto err;
1616         } else {
1617                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1618                 if (ret)
1619                         goto err;
1620
1621                 if (fsck_err_on(!target->nr, c,
1622                                 "dirent points to missing inode:\n%s",
1623                                 (printbuf_reset(&buf),
1624                                  bch2_bkey_val_to_text(&buf, c, k),
1625                                  buf.buf))) {
1626                         ret = __remove_dirent(trans, d.k->p);
1627                         if (ret)
1628                                 goto err;
1629                 }
1630
1631                 for (i = target->d; i < target->d + target->nr; i++) {
1632                         ret = check_dirent_target(trans, iter, d,
1633                                                   &i->inode, i->snapshot);
1634                         if (ret)
1635                                 goto err;
1636                 }
1637         }
1638
1639         if (d.v->d_type == DT_DIR)
1640                 for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
1641                         i->count++;
1642
1643 out:
1644 err:
1645 fsck_err:
1646         printbuf_exit(&buf);
1647         return ret;
1648 }
1649
1650 /*
1651  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1652  * validate d_type
1653  */
1654 noinline_for_stack
1655 static int check_dirents(struct bch_fs *c)
1656 {
1657         struct inode_walker dir = inode_walker_init();
1658         struct inode_walker target = inode_walker_init();
1659         struct snapshots_seen s;
1660         struct bch_hash_info hash_info;
1661         struct btree_trans trans;
1662         struct btree_iter iter;
1663         int ret = 0;
1664
1665         bch_verbose(c, "checking dirents");
1666
1667         snapshots_seen_init(&s);
1668         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1669
1670         bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
1671                              POS(BCACHEFS_ROOT_INO, 0),
1672                              BTREE_ITER_INTENT|
1673                              BTREE_ITER_PREFETCH|
1674                              BTREE_ITER_ALL_SNAPSHOTS);
1675
1676         do {
1677                 ret = __bch2_trans_do(&trans, NULL, NULL,
1678                                       BTREE_INSERT_LAZY_RW|
1679                                       BTREE_INSERT_NOFAIL,
1680                         check_dirent(&trans, &iter, &hash_info,
1681                                      &dir, &target, &s));
1682                 if (ret)
1683                         break;
1684         } while (bch2_btree_iter_advance(&iter));
1685         bch2_trans_iter_exit(&trans, &iter);
1686
1687         bch2_trans_exit(&trans);
1688         snapshots_seen_exit(&s);
1689         inode_walker_exit(&dir);
1690         inode_walker_exit(&target);
1691         return ret;
1692 }
1693
1694 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
1695                        struct bch_hash_info *hash_info,
1696                        struct inode_walker *inode)
1697 {
1698         struct bch_fs *c = trans->c;
1699         struct bkey_s_c k;
1700         int ret;
1701
1702         k = bch2_btree_iter_peek(iter);
1703         if (!k.k)
1704                 return 0;
1705
1706         ret = bkey_err(k);
1707         if (ret)
1708                 return ret;
1709
1710         ret = check_key_has_snapshot(trans, iter, k);
1711         if (ret)
1712                 return ret;
1713
1714         ret = __walk_inode(trans, inode, k.k->p);
1715         if (ret < 0)
1716                 return ret;
1717
1718         if (fsck_err_on(ret == INT_MAX, c,
1719                         "xattr for missing inode %llu",
1720                         k.k->p.inode))
1721                 return bch2_btree_delete_at(trans, iter, 0);
1722
1723         if (ret == INT_MAX)
1724                 return 0;
1725
1726         ret = 0;
1727
1728         if (inode->first_this_inode)
1729                 *hash_info = bch2_hash_info_init(c, &inode->d[0].inode);
1730
1731         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
1732 fsck_err:
1733         return ret;
1734 }
1735
1736 /*
1737  * Walk xattrs: verify that they all have a corresponding inode
1738  */
1739 noinline_for_stack
1740 static int check_xattrs(struct bch_fs *c)
1741 {
1742         struct inode_walker inode = inode_walker_init();
1743         struct bch_hash_info hash_info;
1744         struct btree_trans trans;
1745         struct btree_iter iter;
1746         int ret = 0;
1747
1748         bch_verbose(c, "checking xattrs");
1749
1750         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1751
1752         bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
1753                              POS(BCACHEFS_ROOT_INO, 0),
1754                              BTREE_ITER_INTENT|
1755                              BTREE_ITER_PREFETCH|
1756                              BTREE_ITER_ALL_SNAPSHOTS);
1757
1758         do {
1759                 ret = __bch2_trans_do(&trans, NULL, NULL,
1760                                       BTREE_INSERT_LAZY_RW|
1761                                       BTREE_INSERT_NOFAIL,
1762                                       check_xattr(&trans, &iter, &hash_info,
1763                                                   &inode));
1764                 if (ret)
1765                         break;
1766         } while (bch2_btree_iter_advance(&iter));
1767         bch2_trans_iter_exit(&trans, &iter);
1768
1769         bch2_trans_exit(&trans);
1770         return ret;
1771 }
1772
1773 static int check_root_trans(struct btree_trans *trans)
1774 {
1775         struct bch_fs *c = trans->c;
1776         struct bch_inode_unpacked root_inode;
1777         u32 snapshot;
1778         u64 inum;
1779         int ret;
1780
1781         ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
1782         if (ret && ret != -ENOENT)
1783                 return ret;
1784
1785         if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
1786                 struct bkey_i_subvolume root_subvol;
1787
1788                 snapshot        = U32_MAX;
1789                 inum            = BCACHEFS_ROOT_INO;
1790
1791                 bkey_subvolume_init(&root_subvol.k_i);
1792                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
1793                 root_subvol.v.flags     = 0;
1794                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
1795                 root_subvol.v.inode     = cpu_to_le64(inum);
1796                 ret = __bch2_trans_do(trans, NULL, NULL,
1797                                       BTREE_INSERT_NOFAIL|
1798                                       BTREE_INSERT_LAZY_RW,
1799                         __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
1800                 if (ret) {
1801                         bch_err(c, "error writing root subvol: %i", ret);
1802                         goto err;
1803                 }
1804
1805         }
1806
1807         ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
1808         if (ret && ret != -ENOENT)
1809                 return ret;
1810
1811         if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
1812             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
1813                                 "root inode not a directory")) {
1814                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
1815                                 0, NULL);
1816                 root_inode.bi_inum = inum;
1817
1818                 ret = __write_inode(trans, &root_inode, snapshot);
1819                 if (ret)
1820                         bch_err(c, "error writing root inode: %i", ret);
1821         }
1822 err:
1823 fsck_err:
1824         return ret;
1825 }
1826
1827 /* Get root directory, create if it doesn't exist: */
1828 noinline_for_stack
1829 static int check_root(struct bch_fs *c)
1830 {
1831         bch_verbose(c, "checking root directory");
1832
1833         return bch2_trans_do(c, NULL, NULL,
1834                              BTREE_INSERT_NOFAIL|
1835                              BTREE_INSERT_LAZY_RW,
1836                 check_root_trans(&trans));
1837 }
1838
1839 struct pathbuf {
1840         size_t          nr;
1841         size_t          size;
1842
1843         struct pathbuf_entry {
1844                 u64     inum;
1845                 u32     snapshot;
1846         }               *entries;
1847 };
1848
1849 static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
1850 {
1851         struct pathbuf_entry *i;
1852
1853         for (i = p->entries; i < p->entries + p->nr; i++)
1854                 if (i->inum     == inum &&
1855                     i->snapshot == snapshot)
1856                         return true;
1857
1858         return false;
1859 }
1860
1861 static int path_down(struct bch_fs *c, struct pathbuf *p,
1862                      u64 inum, u32 snapshot)
1863 {
1864         if (p->nr == p->size) {
1865                 size_t new_size = max_t(size_t, 256UL, p->size * 2);
1866                 void *n = krealloc(p->entries,
1867                                    new_size * sizeof(p->entries[0]),
1868                                    GFP_KERNEL);
1869                 if (!n) {
1870                         bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
1871                                 new_size);
1872                         return -ENOMEM;
1873                 }
1874
1875                 p->entries = n;
1876                 p->size = new_size;
1877         };
1878
1879         p->entries[p->nr++] = (struct pathbuf_entry) {
1880                 .inum           = inum,
1881                 .snapshot       = snapshot,
1882         };
1883         return 0;
1884 }
1885
1886 /*
1887  * Check that a given inode is reachable from the root:
1888  *
1889  * XXX: we should also be verifying that inodes are in the right subvolumes
1890  */
1891 static int check_path(struct btree_trans *trans,
1892                       struct pathbuf *p,
1893                       struct bch_inode_unpacked *inode,
1894                       u32 snapshot)
1895 {
1896         struct bch_fs *c = trans->c;
1897         int ret = 0;
1898
1899         snapshot = snapshot_t(c, snapshot)->equiv;
1900         p->nr = 0;
1901
1902         while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
1903                  inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
1904                 struct btree_iter dirent_iter;
1905                 struct bkey_s_c_dirent d;
1906                 u32 parent_snapshot = snapshot;
1907
1908                 if (inode->bi_subvol) {
1909                         u64 inum;
1910
1911                         ret = subvol_lookup(trans, inode->bi_parent_subvol,
1912                                             &parent_snapshot, &inum);
1913                         if (ret)
1914                                 break;
1915                 }
1916
1917                 ret = lockrestart_do(trans,
1918                         PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
1919                                           SPOS(inode->bi_dir, inode->bi_dir_offset,
1920                                                parent_snapshot))).k));
1921                 if (ret && ret != -ENOENT)
1922                         break;
1923
1924                 if (!ret && !dirent_points_to_inode(d, inode)) {
1925                         bch2_trans_iter_exit(trans, &dirent_iter);
1926                         ret = -ENOENT;
1927                 }
1928
1929                 if (ret == -ENOENT) {
1930                         if (fsck_err(c,  "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
1931                                      inode->bi_inum, snapshot,
1932                                      bch2_d_type_str(inode_d_type(inode)),
1933                                      inode->bi_nlink,
1934                                      inode->bi_dir,
1935                                      inode->bi_dir_offset))
1936                                 ret = reattach_inode(trans, inode, snapshot);
1937                         break;
1938                 }
1939
1940                 bch2_trans_iter_exit(trans, &dirent_iter);
1941
1942                 if (!S_ISDIR(inode->bi_mode))
1943                         break;
1944
1945                 ret = path_down(c, p, inode->bi_inum, snapshot);
1946                 if (ret) {
1947                         bch_err(c, "memory allocation failure");
1948                         return ret;
1949                 }
1950
1951                 snapshot = parent_snapshot;
1952
1953                 ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
1954                 if (ret) {
1955                         /* Should have been caught in dirents pass */
1956                         bch_err(c, "error looking up parent directory: %i", ret);
1957                         break;
1958                 }
1959
1960                 if (path_is_dup(p, inode->bi_inum, snapshot)) {
1961                         struct pathbuf_entry *i;
1962
1963                         /* XXX print path */
1964                         bch_err(c, "directory structure loop");
1965
1966                         for (i = p->entries; i < p->entries + p->nr; i++)
1967                                 pr_err("%llu:%u", i->inum, i->snapshot);
1968                         pr_err("%llu:%u", inode->bi_inum, snapshot);
1969
1970                         if (!fsck_err(c, "directory structure loop"))
1971                                 return 0;
1972
1973                         ret = __bch2_trans_do(trans, NULL, NULL,
1974                                               BTREE_INSERT_NOFAIL|
1975                                               BTREE_INSERT_LAZY_RW,
1976                                         remove_backpointer(trans, inode));
1977                         if (ret) {
1978                                 bch_err(c, "error removing dirent: %i", ret);
1979                                 break;
1980                         }
1981
1982                         ret = reattach_inode(trans, inode, snapshot);
1983                 }
1984         }
1985 fsck_err:
1986         if (ret)
1987                 bch_err(c, "%s: err %i", __func__, ret);
1988         return ret;
1989 }
1990
1991 /*
1992  * Check for unreachable inodes, as well as loops in the directory structure:
1993  * After check_dirents(), if an inode backpointer doesn't exist that means it's
1994  * unreachable:
1995  */
1996 noinline_for_stack
1997 static int check_directory_structure(struct bch_fs *c)
1998 {
1999         struct btree_trans trans;
2000         struct btree_iter iter;
2001         struct bkey_s_c k;
2002         struct bch_inode_unpacked u;
2003         struct pathbuf path = { 0, 0, NULL };
2004         int ret;
2005
2006         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2007
2008         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
2009                            BTREE_ITER_INTENT|
2010                            BTREE_ITER_PREFETCH|
2011                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2012                 if (!bkey_is_inode(k.k))
2013                         continue;
2014
2015                 ret = bch2_inode_unpack(k, &u);
2016                 if (ret) {
2017                         /* Should have been caught earlier in fsck: */
2018                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
2019                         break;
2020                 }
2021
2022                 if (u.bi_flags & BCH_INODE_UNLINKED)
2023                         continue;
2024
2025                 ret = check_path(&trans, &path, &u, iter.pos.snapshot);
2026                 if (ret)
2027                         break;
2028         }
2029         bch2_trans_iter_exit(&trans, &iter);
2030
2031         BUG_ON(ret == -EINTR);
2032
2033         kfree(path.entries);
2034
2035         bch2_trans_exit(&trans);
2036         return ret;
2037 }
2038
2039 struct nlink_table {
2040         size_t          nr;
2041         size_t          size;
2042
2043         struct nlink {
2044                 u64     inum;
2045                 u32     snapshot;
2046                 u32     count;
2047         }               *d;
2048 };
2049
2050 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2051                      u64 inum, u32 snapshot)
2052 {
2053         if (t->nr == t->size) {
2054                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2055                 void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
2056                 if (!d) {
2057                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2058                                 new_size);
2059                         return -ENOMEM;
2060                 }
2061
2062                 if (t->d)
2063                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2064                 kvfree(t->d);
2065
2066                 t->d = d;
2067                 t->size = new_size;
2068         }
2069
2070
2071         t->d[t->nr++] = (struct nlink) {
2072                 .inum           = inum,
2073                 .snapshot       = snapshot,
2074         };
2075
2076         return 0;
2077 }
2078
2079 static int nlink_cmp(const void *_l, const void *_r)
2080 {
2081         const struct nlink *l = _l;
2082         const struct nlink *r = _r;
2083
2084         return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
2085 }
2086
2087 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2088                      struct nlink_table *links,
2089                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2090 {
2091         struct nlink *link, key = {
2092                 .inum = inum, .snapshot = U32_MAX,
2093         };
2094
2095         if (inum < range_start || inum >= range_end)
2096                 return;
2097
2098         link = __inline_bsearch(&key, links->d, links->nr,
2099                                 sizeof(links->d[0]), nlink_cmp);
2100         if (!link)
2101                 return;
2102
2103         while (link > links->d && link[0].inum == link[-1].inum)
2104                 --link;
2105
2106         for (; link < links->d + links->nr && link->inum == inum; link++)
2107                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2108                         link->count++;
2109                         if (link->snapshot >= snapshot)
2110                                 break;
2111                 }
2112 }
2113
2114 noinline_for_stack
2115 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2116                                        struct nlink_table *t,
2117                                        u64 start, u64 *end)
2118 {
2119         struct btree_trans trans;
2120         struct btree_iter iter;
2121         struct bkey_s_c k;
2122         struct bch_inode_unpacked u;
2123         int ret = 0;
2124
2125         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2126
2127         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2128                            POS(0, start),
2129                            BTREE_ITER_INTENT|
2130                            BTREE_ITER_PREFETCH|
2131                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2132                 if (!bkey_is_inode(k.k))
2133                         continue;
2134
2135                 /* Should never fail, checked by bch2_inode_invalid: */
2136                 BUG_ON(bch2_inode_unpack(k, &u));
2137
2138                 /*
2139                  * Backpointer and directory structure checks are sufficient for
2140                  * directories, since they can't have hardlinks:
2141                  */
2142                 if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2143                         continue;
2144
2145                 if (!u.bi_nlink)
2146                         continue;
2147
2148                 ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2149                 if (ret) {
2150                         *end = k.k->p.offset;
2151                         ret = 0;
2152                         break;
2153                 }
2154
2155         }
2156         bch2_trans_iter_exit(&trans, &iter);
2157         bch2_trans_exit(&trans);
2158
2159         if (ret)
2160                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2161
2162         return ret;
2163 }
2164
2165 noinline_for_stack
2166 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2167                                      u64 range_start, u64 range_end)
2168 {
2169         struct btree_trans trans;
2170         struct snapshots_seen s;
2171         struct btree_iter iter;
2172         struct bkey_s_c k;
2173         struct bkey_s_c_dirent d;
2174         int ret;
2175
2176         snapshots_seen_init(&s);
2177
2178         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2179
2180         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
2181                            BTREE_ITER_INTENT|
2182                            BTREE_ITER_PREFETCH|
2183                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2184                 ret = snapshots_seen_update(c, &s, k.k->p);
2185                 if (ret)
2186                         break;
2187
2188                 switch (k.k->type) {
2189                 case KEY_TYPE_dirent:
2190                         d = bkey_s_c_to_dirent(k);
2191
2192                         if (d.v->d_type != DT_DIR &&
2193                             d.v->d_type != DT_SUBVOL)
2194                                 inc_link(c, &s, links, range_start, range_end,
2195                                          le64_to_cpu(d.v->d_inum),
2196                                          d.k->p.snapshot);
2197                         break;
2198                 }
2199         }
2200         bch2_trans_iter_exit(&trans, &iter);
2201
2202         if (ret)
2203                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
2204
2205         bch2_trans_exit(&trans);
2206         snapshots_seen_exit(&s);
2207         return ret;
2208 }
2209
2210 noinline_for_stack
2211 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2212                                struct nlink_table *links,
2213                                u64 range_start, u64 range_end)
2214 {
2215         struct btree_trans trans;
2216         struct btree_iter iter;
2217         struct bkey_s_c k;
2218         struct bch_inode_unpacked u;
2219         struct nlink *link = links->d;
2220         int ret = 0;
2221
2222         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2223
2224         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
2225                            POS(0, range_start),
2226                            BTREE_ITER_INTENT|
2227                            BTREE_ITER_PREFETCH|
2228                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2229                 if (k.k->p.offset >= range_end)
2230                         break;
2231
2232                 if (!bkey_is_inode(k.k))
2233                         continue;
2234
2235                 BUG_ON(bch2_inode_unpack(k, &u));
2236
2237                 if (S_ISDIR(le16_to_cpu(u.bi_mode)))
2238                         continue;
2239
2240                 if (!u.bi_nlink)
2241                         continue;
2242
2243                 while ((cmp_int(link->inum, k.k->p.offset) ?:
2244                         cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2245                         link++;
2246                         BUG_ON(link >= links->d + links->nr);
2247                 }
2248
2249                 if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
2250                                 "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
2251                                 u.bi_inum, mode_to_type(u.bi_mode),
2252                                 bch2_inode_nlink_get(&u), link->count)) {
2253                         bch2_inode_nlink_set(&u, link->count);
2254
2255                         ret = write_inode(&trans, &u, k.k->p.snapshot);
2256                         if (ret)
2257                                 bch_err(c, "error in fsck: error %i updating inode", ret);
2258                 }
2259         }
2260 fsck_err:
2261         bch2_trans_iter_exit(&trans, &iter);
2262         bch2_trans_exit(&trans);
2263
2264         if (ret)
2265                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
2266
2267         return ret;
2268 }
2269
2270 noinline_for_stack
2271 static int check_nlinks(struct bch_fs *c)
2272 {
2273         struct nlink_table links = { 0 };
2274         u64 this_iter_range_start, next_iter_range_start = 0;
2275         int ret = 0;
2276
2277         bch_verbose(c, "checking inode nlinks");
2278
2279         do {
2280                 this_iter_range_start = next_iter_range_start;
2281                 next_iter_range_start = U64_MAX;
2282
2283                 ret = check_nlinks_find_hardlinks(c, &links,
2284                                                   this_iter_range_start,
2285                                                   &next_iter_range_start);
2286
2287                 ret = check_nlinks_walk_dirents(c, &links,
2288                                           this_iter_range_start,
2289                                           next_iter_range_start);
2290                 if (ret)
2291                         break;
2292
2293                 ret = check_nlinks_update_hardlinks(c, &links,
2294                                          this_iter_range_start,
2295                                          next_iter_range_start);
2296                 if (ret)
2297                         break;
2298
2299                 links.nr = 0;
2300         } while (next_iter_range_start != U64_MAX);
2301
2302         kvfree(links.d);
2303
2304         return ret;
2305 }
2306
2307 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
2308 {
2309         struct bkey_s_c k;
2310         struct bkey_s_c_reflink_p p;
2311         struct bkey_i_reflink_p *u;
2312         int ret;
2313
2314         k = bch2_btree_iter_peek(iter);
2315         if (!k.k)
2316                 return 0;
2317
2318         ret = bkey_err(k);
2319         if (ret)
2320                 return ret;
2321
2322         if (k.k->type != KEY_TYPE_reflink_p)
2323                 return 0;
2324
2325         p = bkey_s_c_to_reflink_p(k);
2326
2327         if (!p.v->front_pad && !p.v->back_pad)
2328                 return 0;
2329
2330         u = bch2_trans_kmalloc(trans, sizeof(*u));
2331         ret = PTR_ERR_OR_ZERO(u);
2332         if (ret)
2333                 return ret;
2334
2335         bkey_reassemble(&u->k_i, k);
2336         u->v.front_pad  = 0;
2337         u->v.back_pad   = 0;
2338
2339         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2340 }
2341
2342 noinline_for_stack
2343 static int fix_reflink_p(struct bch_fs *c)
2344 {
2345         struct btree_trans trans;
2346         struct btree_iter iter;
2347         struct bkey_s_c k;
2348         int ret;
2349
2350         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2351                 return 0;
2352
2353         bch_verbose(c, "fixing reflink_p keys");
2354
2355         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
2356
2357         for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
2358                            BTREE_ITER_INTENT|
2359                            BTREE_ITER_PREFETCH|
2360                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
2361                 if (k.k->type == KEY_TYPE_reflink_p) {
2362                         ret = __bch2_trans_do(&trans, NULL, NULL,
2363                                               BTREE_INSERT_NOFAIL|
2364                                               BTREE_INSERT_LAZY_RW,
2365                                               fix_reflink_p_key(&trans, &iter));
2366                         if (ret)
2367                                 break;
2368                 }
2369         }
2370         bch2_trans_iter_exit(&trans, &iter);
2371
2372         bch2_trans_exit(&trans);
2373         return ret;
2374 }
2375
2376 /*
2377  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
2378  * Doesn't fix them yet, mainly because they haven't yet been observed:
2379  */
2380 int bch2_fsck_full(struct bch_fs *c)
2381 {
2382         return  bch2_fs_snapshots_check(c) ?:
2383                 check_inodes(c, true) ?:
2384                 check_subvols(c) ?:
2385                 check_extents(c) ?:
2386                 check_dirents(c) ?:
2387                 check_xattrs(c) ?:
2388                 check_root(c) ?:
2389                 check_directory_structure(c) ?:
2390                 check_nlinks(c) ?:
2391                 fix_reflink_p(c);
2392 }
2393
2394 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
2395 {
2396         return check_inodes(c, false);
2397 }