]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
Update bcachefs sources to a8b3ce7599 fixup! bcachefs: Eliminate more PAGE_SIZE uses
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "dirent.h"
7 #include "error.h"
8 #include "fs-common.h"
9 #include "fsck.h"
10 #include "inode.h"
11 #include "keylist.h"
12 #include "super.h"
13 #include "xattr.h"
14
15 #include <linux/dcache.h> /* struct qstr */
16 #include <linux/generic-radix-tree.h>
17
18 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
19
20 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
21 {
22         struct btree_iter *iter;
23         struct bkey_s_c k;
24         u64 sectors = 0;
25         int ret;
26
27         for_each_btree_key(trans, iter, BTREE_ID_extents,
28                            POS(inum, 0), 0, k, ret) {
29                 if (k.k->p.inode != inum)
30                         break;
31
32                 if (bkey_extent_is_allocation(k.k))
33                         sectors += k.k->size;
34         }
35
36         bch2_trans_iter_free(trans, iter);
37
38         return ret ?: sectors;
39 }
40
41 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
42                         struct bch_inode_unpacked *inode,
43                         u32 *snapshot)
44 {
45         struct btree_iter *iter;
46         struct bkey_s_c k;
47         int ret;
48
49         iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
50                         POS(0, inode_nr), 0);
51         k = bch2_btree_iter_peek_slot(iter);
52         ret = bkey_err(k);
53         if (ret)
54                 goto err;
55
56         if (snapshot)
57                 *snapshot = iter->pos.snapshot;
58         ret = k.k->type == KEY_TYPE_inode
59                 ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
60                 : -ENOENT;
61 err:
62         bch2_trans_iter_put(trans, iter);
63         return ret;
64 }
65
66 static int write_inode(struct btree_trans *trans,
67                        struct bch_inode_unpacked *inode,
68                        u32 snapshot)
69 {
70         struct btree_iter *inode_iter =
71                 bch2_trans_get_iter(trans, BTREE_ID_inodes,
72                                     SPOS(0, inode->bi_inum, snapshot),
73                                     BTREE_ITER_INTENT);
74         int ret = __bch2_trans_do(trans, NULL, NULL,
75                                   BTREE_INSERT_NOFAIL|
76                                   BTREE_INSERT_LAZY_RW,
77                                   bch2_inode_write(trans, inode_iter, inode));
78         bch2_trans_iter_put(trans, inode_iter);
79         if (ret)
80                 bch_err(trans->c, "error in fsck: error %i updating inode", ret);
81         return ret;
82 }
83
84 static int __remove_dirent(struct btree_trans *trans,
85                            struct bkey_s_c_dirent dirent)
86 {
87         struct bch_fs *c = trans->c;
88         struct qstr name;
89         struct bch_inode_unpacked dir_inode;
90         struct bch_hash_info dir_hash_info;
91         u64 dir_inum = dirent.k->p.inode;
92         int ret;
93         char *buf;
94
95         name.len = bch2_dirent_name_bytes(dirent);
96         buf = bch2_trans_kmalloc(trans, name.len + 1);
97         if (IS_ERR(buf))
98                 return PTR_ERR(buf);
99
100         memcpy(buf, dirent.v->d_name, name.len);
101         buf[name.len] = '\0';
102         name.name = buf;
103
104         ret = lookup_inode(trans, dir_inum, &dir_inode, NULL);
105         if (ret && ret != -EINTR)
106                 bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
107         if (ret)
108                 return ret;
109
110         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
111
112         ret = bch2_hash_delete(trans, bch2_dirent_hash_desc,
113                                &dir_hash_info, dir_inum, &name);
114         if (ret && ret != -EINTR)
115                 bch_err(c, "remove_dirent: err %i deleting dirent", ret);
116         if (ret)
117                 return ret;
118
119         return 0;
120 }
121
122 static int remove_dirent(struct btree_trans *trans,
123                          struct bkey_s_c_dirent dirent)
124 {
125         return __bch2_trans_do(trans, NULL, NULL,
126                                BTREE_INSERT_NOFAIL|
127                                BTREE_INSERT_LAZY_RW,
128                                __remove_dirent(trans, dirent));
129 }
130
131 static int reattach_inode(struct bch_fs *c,
132                           struct bch_inode_unpacked *lostfound_inode,
133                           u64 inum)
134 {
135         struct bch_inode_unpacked dir_u, inode_u;
136         char name_buf[20];
137         struct qstr name;
138         int ret;
139
140         snprintf(name_buf, sizeof(name_buf), "%llu", inum);
141         name = (struct qstr) QSTR(name_buf);
142
143         ret = bch2_trans_do(c, NULL, NULL,
144                             BTREE_INSERT_LAZY_RW,
145                 bch2_link_trans(&trans, lostfound_inode->bi_inum,
146                                 inum, &dir_u, &inode_u, &name));
147         if (ret)
148                 bch_err(c, "error %i reattaching inode %llu", ret, inum);
149
150         return ret;
151 }
152
153 struct inode_walker {
154         bool                    first_this_inode;
155         bool                    have_inode;
156         u64                     cur_inum;
157         u32                     snapshot;
158         struct bch_inode_unpacked inode;
159 };
160
161 static struct inode_walker inode_walker_init(void)
162 {
163         return (struct inode_walker) {
164                 .cur_inum       = -1,
165                 .have_inode     = false,
166         };
167 }
168
169 static int walk_inode(struct btree_trans *trans,
170                       struct inode_walker *w, u64 inum)
171 {
172         if (inum != w->cur_inum) {
173                 int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot);
174
175                 if (ret && ret != -ENOENT)
176                         return ret;
177
178                 w->have_inode   = !ret;
179                 w->cur_inum     = inum;
180                 w->first_this_inode = true;
181         } else {
182                 w->first_this_inode = false;
183         }
184
185         return 0;
186 }
187
188 static int hash_redo_key(struct btree_trans *trans,
189                          const struct bch_hash_desc desc,
190                          struct bch_hash_info *hash_info,
191                          struct btree_iter *k_iter, struct bkey_s_c k)
192 {
193         struct bkey_i delete;
194         struct bkey_i *tmp;
195
196         tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
197         if (IS_ERR(tmp))
198                 return PTR_ERR(tmp);
199
200         bkey_reassemble(tmp, k);
201
202         bkey_init(&delete.k);
203         delete.k.p = k_iter->pos;
204         bch2_trans_update(trans, k_iter, &delete, 0);
205
206         return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode,
207                              tmp, 0);
208 }
209
210 static int fsck_hash_delete_at(struct btree_trans *trans,
211                                const struct bch_hash_desc desc,
212                                struct bch_hash_info *info,
213                                struct btree_iter *iter)
214 {
215         int ret;
216 retry:
217         ret   = bch2_hash_delete_at(trans, desc, info, iter) ?:
218                 bch2_trans_commit(trans, NULL, NULL,
219                                   BTREE_INSERT_NOFAIL|
220                                   BTREE_INSERT_LAZY_RW);
221         if (ret == -EINTR) {
222                 ret = bch2_btree_iter_traverse(iter);
223                 if (!ret)
224                         goto retry;
225         }
226
227         return ret;
228 }
229
230 static int hash_check_key(struct btree_trans *trans,
231                           const struct bch_hash_desc desc,
232                           struct bch_hash_info *hash_info,
233                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
234 {
235         struct bch_fs *c = trans->c;
236         struct btree_iter *iter = NULL;
237         char buf[200];
238         struct bkey_s_c k;
239         u64 hash;
240         int ret = 0;
241
242         if (hash_k.k->type != desc.key_type)
243                 return 0;
244
245         hash = desc.hash_bkey(hash_info, hash_k);
246
247         if (likely(hash == hash_k.k->p.offset))
248                 return 0;
249
250         if (hash_k.k->p.offset < hash)
251                 goto bad_hash;
252
253         for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
254                            BTREE_ITER_SLOTS, k, ret) {
255                 if (!bkey_cmp(k.k->p, hash_k.k->p))
256                         break;
257
258                 if (fsck_err_on(k.k->type == desc.key_type &&
259                                 !desc.cmp_bkey(k, hash_k), c,
260                                 "duplicate hash table keys:\n%s",
261                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
262                                                        hash_k), buf))) {
263                         ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter);
264                         if (ret)
265                                 return ret;
266                         ret = 1;
267                         break;
268                 }
269
270                 if (bkey_deleted(k.k)) {
271                         bch2_trans_iter_free(trans, iter);
272                         goto bad_hash;
273                 }
274
275         }
276         bch2_trans_iter_free(trans, iter);
277         return ret;
278 bad_hash:
279         if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
280                      "hashed to %llu should be at %llu\n%s",
281                      desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset,
282                      hash, iter->pos.offset,
283                      (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
284                 return 0;
285
286         ret = __bch2_trans_do(trans, NULL, NULL,
287                               BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
288                 hash_redo_key(trans, desc, hash_info, k_iter, hash_k));
289         if (ret) {
290                 bch_err(c, "hash_redo_key err %i", ret);
291                 return ret;
292         }
293         return -EINTR;
294 fsck_err:
295         return ret;
296 }
297
298 static int check_inode(struct btree_trans *trans,
299                        struct btree_iter *iter,
300                        struct bkey_s_c_inode inode)
301 {
302         struct bch_fs *c = trans->c;
303         struct bch_inode_unpacked u;
304         bool do_update = false;
305         int ret = 0;
306
307         ret = bch2_inode_unpack(inode, &u);
308
309         if (bch2_fs_inconsistent_on(ret, c,
310                          "error unpacking inode %llu in fsck",
311                          inode.k->p.inode))
312                 return ret;
313
314         if (u.bi_flags & BCH_INODE_UNLINKED &&
315             (!c->sb.clean ||
316              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
317                       u.bi_inum))) {
318                 bch_verbose(c, "deleting inode %llu", u.bi_inum);
319
320                 bch2_trans_unlock(trans);
321                 bch2_fs_lazy_rw(c);
322
323                 ret = bch2_inode_rm(c, u.bi_inum, false);
324                 if (ret)
325                         bch_err(c, "error in fsck: error %i while deleting inode", ret);
326                 return ret;
327         }
328
329         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
330             (!c->sb.clean ||
331              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
332                       u.bi_inum))) {
333                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
334
335                 bch2_trans_unlock(trans);
336                 bch2_fs_lazy_rw(c);
337
338                 /*
339                  * XXX: need to truncate partial blocks too here - or ideally
340                  * just switch units to bytes and that issue goes away
341                  */
342                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
343                                 POS(u.bi_inum, round_up(u.bi_size, block_bytes(c))),
344                                 POS(u.bi_inum, U64_MAX),
345                                 NULL);
346                 if (ret) {
347                         bch_err(c, "error in fsck: error %i truncating inode", ret);
348                         return ret;
349                 }
350
351                 /*
352                  * We truncated without our normal sector accounting hook, just
353                  * make sure we recalculate it:
354                  */
355                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
356
357                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
358                 do_update = true;
359         }
360
361         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
362             (!c->sb.clean ||
363              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
364                       u.bi_inum))) {
365                 s64 sectors;
366
367                 bch_verbose(c, "recounting sectors for inode %llu",
368                             u.bi_inum);
369
370                 sectors = bch2_count_inode_sectors(trans, u.bi_inum);
371                 if (sectors < 0) {
372                         bch_err(c, "error in fsck: error %i recounting inode sectors",
373                                 (int) sectors);
374                         return sectors;
375                 }
376
377                 u.bi_sectors = sectors;
378                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
379                 do_update = true;
380         }
381
382         if (!S_ISDIR(u.bi_mode) &&
383             u.bi_nlink &&
384             !(u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
385             (fsck_err_on(c->sb.version >= bcachefs_metadata_version_inode_backpointers, c,
386                          "inode missing BCH_INODE_BACKPTR_UNTRUSTED flags") ||
387              c->opts.version_upgrade)) {
388                 u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
389                 do_update = true;
390         }
391
392         if (do_update) {
393                 struct bkey_inode_buf p;
394
395                 bch2_inode_pack(c, &p, &u);
396                 p.inode.k.p = iter->pos;
397
398                 ret = __bch2_trans_do(trans, NULL, NULL,
399                                       BTREE_INSERT_NOFAIL|
400                                       BTREE_INSERT_LAZY_RW,
401                         (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0));
402                 if (ret)
403                         bch_err(c, "error in fsck: error %i "
404                                 "updating inode", ret);
405         }
406 fsck_err:
407         return ret;
408 }
409
410 noinline_for_stack
411 static int check_inodes(struct bch_fs *c, bool full)
412 {
413         struct btree_trans trans;
414         struct btree_iter *iter;
415         struct bkey_s_c k;
416         struct bkey_s_c_inode inode;
417         int ret;
418
419         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
420
421         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
422                 if (k.k->type != KEY_TYPE_inode)
423                         continue;
424
425                 inode = bkey_s_c_to_inode(k);
426
427                 if (full ||
428                     (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
429                                           BCH_INODE_I_SECTORS_DIRTY|
430                                           BCH_INODE_UNLINKED))) {
431                         ret = check_inode(&trans, iter, inode);
432                         if (ret)
433                                 break;
434                 }
435         }
436         bch2_trans_iter_put(&trans, iter);
437
438         BUG_ON(ret == -EINTR);
439
440         return bch2_trans_exit(&trans) ?: ret;
441 }
442
443 static int fix_overlapping_extent(struct btree_trans *trans,
444                                        struct bkey_s_c k, struct bpos cut_at)
445 {
446         struct btree_iter *iter;
447         struct bkey_i *u;
448         int ret;
449
450         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
451         ret = PTR_ERR_OR_ZERO(u);
452         if (ret)
453                 return ret;
454
455         bkey_reassemble(u, k);
456         bch2_cut_front(cut_at, u);
457
458
459         /*
460          * We don't want to go through the extent_handle_overwrites path:
461          *
462          * XXX: this is going to screw up disk accounting, extent triggers
463          * assume things about extent overwrites - we should be running the
464          * triggers manually here
465          */
466         iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p,
467                                    BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
468
469         BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
470         bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN);
471         bch2_trans_iter_put(trans, iter);
472
473         return bch2_trans_commit(trans, NULL, NULL,
474                                  BTREE_INSERT_NOFAIL|
475                                  BTREE_INSERT_LAZY_RW);
476 }
477
478 static int inode_backpointer_exists(struct btree_trans *trans,
479                                     struct bch_inode_unpacked *inode)
480 {
481         struct btree_iter *iter;
482         struct bkey_s_c k;
483         int ret;
484
485         iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
486                                    POS(inode->bi_dir, inode->bi_dir_offset), 0);
487         k = bch2_btree_iter_peek_slot(iter);
488         ret = bkey_err(k);
489         if (ret)
490                 goto out;
491         if (k.k->type != KEY_TYPE_dirent)
492                 goto out;
493
494         ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
495 out:
496         bch2_trans_iter_free(trans, iter);
497         return ret;
498 }
499
500 static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
501                                       struct bch_inode_unpacked *inode)
502 {
503         return d.k->p.inode == inode->bi_dir &&
504                 d.k->p.offset == inode->bi_dir_offset;
505 }
506
507 /*
508  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
509  * that i_size an i_sectors are consistent
510  */
511 noinline_for_stack
512 static int check_extents(struct bch_fs *c)
513 {
514         struct inode_walker w = inode_walker_init();
515         struct btree_trans trans;
516         struct btree_iter *iter;
517         struct bkey_s_c k;
518         struct bkey_buf prev;
519         u64 i_sectors = 0;
520         int ret = 0;
521
522         bch2_bkey_buf_init(&prev);
523         prev.k->k = KEY(0, 0, 0);
524         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
525
526         bch_verbose(c, "checking extents");
527
528         iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
529                                    POS(BCACHEFS_ROOT_INO, 0),
530                                    BTREE_ITER_INTENT);
531 retry:
532         while ((k = bch2_btree_iter_peek(iter)).k &&
533                !(ret = bkey_err(k))) {
534                 if (w.have_inode &&
535                     w.cur_inum != k.k->p.inode &&
536                     !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
537                     fsck_err_on(w.inode.bi_sectors != i_sectors, c,
538                                 "inode %llu has incorrect i_sectors: got %llu, should be %llu",
539                                 w.inode.bi_inum,
540                                 w.inode.bi_sectors, i_sectors)) {
541                         w.inode.bi_sectors = i_sectors;
542
543                         ret = write_inode(&trans, &w.inode, w.snapshot);
544                         if (ret)
545                                 break;
546                 }
547
548                 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
549                         char buf1[200];
550                         char buf2[200];
551
552                         bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
553                         bch2_bkey_val_to_text(&PBUF(buf2), c, k);
554
555                         if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
556                                 return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
557                 }
558
559                 ret = walk_inode(&trans, &w, k.k->p.inode);
560                 if (ret)
561                         break;
562
563                 if (w.first_this_inode)
564                         i_sectors = 0;
565
566                 if (fsck_err_on(!w.have_inode, c,
567                                 "extent type %u for missing inode %llu",
568                                 k.k->type, k.k->p.inode) ||
569                     fsck_err_on(w.have_inode &&
570                                 !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
571                                 "extent type %u for non regular file, inode %llu mode %o",
572                                 k.k->type, k.k->p.inode, w.inode.bi_mode)) {
573                         bch2_fs_lazy_rw(c);
574                         return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
575                                                        POS(k.k->p.inode, 0),
576                                                        POS(k.k->p.inode, U64_MAX),
577                                                        NULL) ?: -EINTR;
578                 }
579
580                 if (fsck_err_on(w.have_inode &&
581                                 !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
582                                 k.k->type != KEY_TYPE_reservation &&
583                                 k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
584                                 "extent type %u offset %llu past end of inode %llu, i_size %llu",
585                                 k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
586                         bch2_fs_lazy_rw(c);
587                         return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
588                                         POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c))),
589                                         POS(k.k->p.inode, U64_MAX),
590                                         NULL) ?: -EINTR;
591                 }
592
593                 if (bkey_extent_is_allocation(k.k))
594                         i_sectors += k.k->size;
595                 bch2_bkey_buf_reassemble(&prev, c, k);
596
597                 bch2_btree_iter_advance(iter);
598         }
599 fsck_err:
600         if (ret == -EINTR)
601                 goto retry;
602         bch2_trans_iter_put(&trans, iter);
603         bch2_bkey_buf_exit(&prev, c);
604         return bch2_trans_exit(&trans) ?: ret;
605 }
606
607 /*
608  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
609  * validate d_type
610  */
611 noinline_for_stack
612 static int check_dirents(struct bch_fs *c)
613 {
614         struct inode_walker w = inode_walker_init();
615         struct bch_hash_info hash_info;
616         struct btree_trans trans;
617         struct btree_iter *iter;
618         struct bkey_s_c k;
619         char buf[200];
620         unsigned nr_subdirs = 0;
621         int ret = 0;
622
623         bch_verbose(c, "checking dirents");
624
625         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
626
627         iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
628                                    POS(BCACHEFS_ROOT_INO, 0), 0);
629 retry:
630         while ((k = bch2_btree_iter_peek(iter)).k &&
631                !(ret = bkey_err(k))) {
632                 struct bkey_s_c_dirent d;
633                 struct bch_inode_unpacked target;
634                 u32 target_snapshot;
635                 bool have_target;
636                 bool backpointer_exists = true;
637                 u64 d_inum;
638
639                 if (w.have_inode &&
640                     w.cur_inum != k.k->p.inode &&
641                     fsck_err_on(w.inode.bi_nlink != nr_subdirs, c,
642                                 "directory %llu with wrong i_nlink: got %u, should be %u",
643                                 w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) {
644                         w.inode.bi_nlink = nr_subdirs;
645                         ret = write_inode(&trans, &w.inode, w.snapshot);
646                         if (ret)
647                                 break;
648                 }
649
650                 ret = walk_inode(&trans, &w, k.k->p.inode);
651                 if (ret)
652                         break;
653
654                 if (w.first_this_inode)
655                         nr_subdirs = 0;
656
657                 if (fsck_err_on(!w.have_inode, c,
658                                 "dirent in nonexisting directory:\n%s",
659                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
660                                                        k), buf)) ||
661                     fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
662                                 "dirent in non directory inode type %u:\n%s",
663                                 mode_to_type(w.inode.bi_mode),
664                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
665                                                        k), buf))) {
666                         ret = bch2_btree_delete_at(&trans, iter, 0);
667                         if (ret)
668                                 goto err;
669                         goto next;
670                 }
671
672                 if (!w.have_inode)
673                         goto next;
674
675                 if (w.first_this_inode)
676                         hash_info = bch2_hash_info_init(c, &w.inode);
677
678                 ret = hash_check_key(&trans, bch2_dirent_hash_desc,
679                                      &hash_info, iter, k);
680                 if (ret > 0) {
681                         ret = 0;
682                         goto next;
683                 }
684                 if (ret)
685                         goto fsck_err;
686
687                 if (k.k->type != KEY_TYPE_dirent)
688                         goto next;
689
690                 d = bkey_s_c_to_dirent(k);
691                 d_inum = le64_to_cpu(d.v->d_inum);
692
693                 ret = lookup_inode(&trans, d_inum, &target, &target_snapshot);
694                 if (ret && ret != -ENOENT)
695                         break;
696
697                 have_target = !ret;
698                 ret = 0;
699
700                 if (fsck_err_on(!have_target, c,
701                                 "dirent points to missing inode:\n%s",
702                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
703                                                        k), buf))) {
704                         ret = remove_dirent(&trans, d);
705                         if (ret)
706                                 goto err;
707                         goto next;
708                 }
709
710                 if (!have_target)
711                         goto next;
712
713                 if (!inode_backpointer_matches(d, &target)) {
714                         ret = inode_backpointer_exists(&trans, &target);
715                         if (ret < 0)
716                                 goto err;
717
718                         backpointer_exists = ret;
719                         ret = 0;
720                 }
721
722                 if (fsck_err_on(S_ISDIR(target.bi_mode) &&
723                                 !inode_backpointer_matches(d, &target) &&
724                                 backpointer_exists, c,
725                                 "directory %llu with multiple links",
726                                 target.bi_inum)) {
727                         ret = remove_dirent(&trans, d);
728                         if (ret)
729                                 goto err;
730                         continue;
731                 }
732
733                 if (!inode_backpointer_matches(d, &target) &&
734                     (S_ISDIR(target.bi_mode) || !target.bi_nlink)) {
735                         if (backpointer_exists) {
736                                 if (!fsck_err(c, "inode %llu has multiple links but i_nlink 0",
737                                               d_inum))
738                                         goto check_type;
739
740                                 target.bi_nlink++;
741                                 target.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
742                         } else {
743                                 if (c->sb.version >= bcachefs_metadata_version_inode_backpointers &&
744                                     !(target.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) &&
745                                     !fsck_err(c, "inode %llu has wrong backpointer:\n"
746                                               "got       %llu:%llu\n"
747                                               "should be %llu:%llu",
748                                               d_inum,
749                                               target.bi_dir,
750                                               target.bi_dir_offset,
751                                               k.k->p.inode,
752                                               k.k->p.offset))
753                                         goto check_type;
754
755                                 target.bi_dir           = k.k->p.inode;
756                                 target.bi_dir_offset    = k.k->p.offset;
757                                 target.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
758                         }
759
760                         ret = write_inode(&trans, &target, target_snapshot);
761                         if (ret)
762                                 goto err;
763                         continue;
764                 }
765 check_type:
766                 if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
767                                 "incorrect d_type: should be %u:\n%s",
768                                 mode_to_type(target.bi_mode),
769                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
770                                                        k), buf))) {
771                         struct bkey_i_dirent *n;
772
773                         n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
774                         if (!n) {
775                                 ret = -ENOMEM;
776                                 goto err;
777                         }
778
779                         bkey_reassemble(&n->k_i, d.s_c);
780                         n->v.d_type = mode_to_type(target.bi_mode);
781
782                         ret = __bch2_trans_do(&trans, NULL, NULL,
783                                               BTREE_INSERT_NOFAIL|
784                                               BTREE_INSERT_LAZY_RW,
785                                 (bch2_trans_update(&trans, iter, &n->k_i, 0), 0));
786                         kfree(n);
787                         if (ret)
788                                 goto err;
789
790                 }
791
792                 nr_subdirs += d.v->d_type == DT_DIR;
793 next:
794                 bch2_btree_iter_advance(iter);
795         }
796 err:
797 fsck_err:
798         if (ret == -EINTR)
799                 goto retry;
800
801         bch2_trans_iter_put(&trans, iter);
802         return bch2_trans_exit(&trans) ?: ret;
803 }
804
805 /*
806  * Walk xattrs: verify that they all have a corresponding inode
807  */
808 noinline_for_stack
809 static int check_xattrs(struct bch_fs *c)
810 {
811         struct inode_walker w = inode_walker_init();
812         struct bch_hash_info hash_info;
813         struct btree_trans trans;
814         struct btree_iter *iter;
815         struct bkey_s_c k;
816         int ret = 0;
817
818         bch_verbose(c, "checking xattrs");
819
820         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
821
822         iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
823                                    POS(BCACHEFS_ROOT_INO, 0), 0);
824 retry:
825         while ((k = bch2_btree_iter_peek(iter)).k &&
826                !(ret = bkey_err(k))) {
827                 ret = walk_inode(&trans, &w, k.k->p.inode);
828                 if (ret)
829                         break;
830
831                 if (fsck_err_on(!w.have_inode, c,
832                                 "xattr for missing inode %llu",
833                                 k.k->p.inode)) {
834                         ret = bch2_btree_delete_at(&trans, iter, 0);
835                         if (ret)
836                                 break;
837                         continue;
838                 }
839
840                 if (w.first_this_inode && w.have_inode)
841                         hash_info = bch2_hash_info_init(c, &w.inode);
842
843                 ret = hash_check_key(&trans, bch2_xattr_hash_desc,
844                                      &hash_info, iter, k);
845                 if (ret)
846                         break;
847
848                 bch2_btree_iter_advance(iter);
849         }
850 fsck_err:
851         if (ret == -EINTR)
852                 goto retry;
853
854         bch2_trans_iter_put(&trans, iter);
855         return bch2_trans_exit(&trans) ?: ret;
856 }
857
858 /* Get root directory, create if it doesn't exist: */
859 static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
860 {
861         struct bkey_inode_buf packed;
862         int ret;
863
864         bch_verbose(c, "checking root directory");
865
866         ret = bch2_trans_do(c, NULL, NULL, 0,
867                 __bch2_inode_find_by_inum_trans(&trans, BCACHEFS_ROOT_INO,
868                                                 root_inode, 0));
869         if (ret && ret != -ENOENT)
870                 return ret;
871
872         if (fsck_err_on(ret, c, "root directory missing"))
873                 goto create_root;
874
875         if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
876                         "root inode not a directory"))
877                 goto create_root;
878
879         return 0;
880 fsck_err:
881         return ret;
882 create_root:
883         bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
884                         0, NULL);
885         root_inode->bi_inum = BCACHEFS_ROOT_INO;
886
887         bch2_inode_pack(c, &packed, root_inode);
888
889         return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
890                                  NULL, NULL,
891                                  BTREE_INSERT_NOFAIL|
892                                  BTREE_INSERT_LAZY_RW);
893 }
894
895 /* Get lost+found, create if it doesn't exist: */
896 static int check_lostfound(struct bch_fs *c,
897                            struct bch_inode_unpacked *root_inode,
898                            struct bch_inode_unpacked *lostfound_inode)
899 {
900         struct qstr lostfound = QSTR("lost+found");
901         struct bch_hash_info root_hash_info =
902                 bch2_hash_info_init(c, root_inode);
903         u64 inum;
904         int ret;
905
906         bch_verbose(c, "checking lost+found");
907
908         inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
909                                  &lostfound);
910         if (!inum) {
911                 bch_notice(c, "creating lost+found");
912                 goto create_lostfound;
913         }
914
915         ret = bch2_trans_do(c, NULL, NULL, 0,
916                 __bch2_inode_find_by_inum_trans(&trans, inum, lostfound_inode, 0));
917         if (ret && ret != -ENOENT)
918                 return ret;
919
920         if (fsck_err_on(ret, c, "lost+found missing"))
921                 goto create_lostfound;
922
923         if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c,
924                         "lost+found inode not a directory"))
925                 goto create_lostfound;
926
927         return 0;
928 fsck_err:
929         return ret;
930 create_lostfound:
931         bch2_inode_init_early(c, lostfound_inode);
932
933         ret = bch2_trans_do(c, NULL, NULL,
934                             BTREE_INSERT_NOFAIL|
935                             BTREE_INSERT_LAZY_RW,
936                 bch2_create_trans(&trans,
937                                   BCACHEFS_ROOT_INO, root_inode,
938                                   lostfound_inode, &lostfound,
939                                   0, 0, S_IFDIR|0700, 0, NULL, NULL));
940         if (ret)
941                 bch_err(c, "error creating lost+found: %i", ret);
942
943         return ret;
944 }
945
946 typedef GENRADIX(unsigned long) inode_bitmap;
947
948 static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr)
949 {
950         unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG);
951         return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false;
952 }
953
954 static inline int inode_bitmap_set(inode_bitmap *b, size_t nr)
955 {
956         unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL);
957
958         if (!w)
959                 return -ENOMEM;
960
961         *w |= 1UL << (nr & (BITS_PER_LONG - 1));
962         return 0;
963 }
964
965 struct pathbuf {
966         size_t          nr;
967         size_t          size;
968
969         struct pathbuf_entry {
970                 u64     inum;
971                 u64     offset;
972         }               *entries;
973 };
974
975 static int path_down(struct pathbuf *p, u64 inum)
976 {
977         if (p->nr == p->size) {
978                 size_t new_size = max_t(size_t, 256UL, p->size * 2);
979                 void *n = krealloc(p->entries,
980                                    new_size * sizeof(p->entries[0]),
981                                    GFP_KERNEL);
982                 if (!n)
983                         return -ENOMEM;
984
985                 p->entries = n;
986                 p->size = new_size;
987         };
988
989         p->entries[p->nr++] = (struct pathbuf_entry) {
990                 .inum = inum,
991                 .offset = 0,
992         };
993         return 0;
994 }
995
996 noinline_for_stack
997 static int check_directory_structure(struct bch_fs *c,
998                                      struct bch_inode_unpacked *lostfound_inode)
999 {
1000         inode_bitmap dirs_done;
1001         struct pathbuf path = { 0, 0, NULL };
1002         struct pathbuf_entry *e;
1003         struct btree_trans trans;
1004         struct btree_iter *iter;
1005         struct bkey_s_c k;
1006         struct bkey_s_c_dirent dirent;
1007         bool had_unreachable;
1008         u64 d_inum;
1009         int ret = 0;
1010
1011         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1012
1013         bch_verbose(c, "checking directory structure");
1014
1015         /* DFS: */
1016 restart_dfs:
1017         genradix_init(&dirs_done);
1018         had_unreachable = false;
1019
1020         ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
1021         if (ret) {
1022                 bch_err(c, "memory allocation failure in inode_bitmap_set()");
1023                 goto err;
1024         }
1025
1026         ret = path_down(&path, BCACHEFS_ROOT_INO);
1027         if (ret)
1028                 goto err;
1029
1030         while (path.nr) {
1031 next:
1032                 e = &path.entries[path.nr - 1];
1033
1034                 if (e->offset == U64_MAX)
1035                         goto up;
1036
1037                 for_each_btree_key(&trans, iter, BTREE_ID_dirents,
1038                                    POS(e->inum, e->offset + 1), 0, k, ret) {
1039                         if (k.k->p.inode != e->inum)
1040                                 break;
1041
1042                         e->offset = k.k->p.offset;
1043
1044                         if (k.k->type != KEY_TYPE_dirent)
1045                                 continue;
1046
1047                         dirent = bkey_s_c_to_dirent(k);
1048
1049                         if (dirent.v->d_type != DT_DIR)
1050                                 continue;
1051
1052                         d_inum = le64_to_cpu(dirent.v->d_inum);
1053
1054                         if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c,
1055                                         "directory %llu has multiple hardlinks",
1056                                         d_inum)) {
1057                                 ret = remove_dirent(&trans, dirent);
1058                                 if (ret)
1059                                         goto err;
1060                                 continue;
1061                         }
1062
1063                         ret = inode_bitmap_set(&dirs_done, d_inum);
1064                         if (ret) {
1065                                 bch_err(c, "memory allocation failure in inode_bitmap_set()");
1066                                 goto err;
1067                         }
1068
1069                         ret = path_down(&path, d_inum);
1070                         if (ret) {
1071                                 goto err;
1072                         }
1073
1074                         ret = bch2_trans_iter_free(&trans, iter);
1075                         if (ret) {
1076                                 bch_err(c, "btree error %i in fsck", ret);
1077                                 goto err;
1078                         }
1079                         goto next;
1080                 }
1081                 ret = bch2_trans_iter_free(&trans, iter) ?: ret;
1082                 if (ret) {
1083                         bch_err(c, "btree error %i in fsck", ret);
1084                         goto err;
1085                 }
1086 up:
1087                 path.nr--;
1088         }
1089
1090         iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS_MIN, 0);
1091 retry:
1092         for_each_btree_key_continue(iter, 0, k, ret) {
1093                 if (k.k->type != KEY_TYPE_inode)
1094                         continue;
1095
1096                 if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode)))
1097                         continue;
1098
1099                 ret = bch2_empty_dir_trans(&trans, k.k->p.inode);
1100                 if (ret == -EINTR)
1101                         goto retry;
1102                 if (!ret)
1103                         continue;
1104
1105                 if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c,
1106                                 "unreachable directory found (inum %llu)",
1107                                 k.k->p.offset)) {
1108                         bch2_trans_unlock(&trans);
1109
1110                         ret = reattach_inode(c, lostfound_inode, k.k->p.offset);
1111                         if (ret) {
1112                                 goto err;
1113                         }
1114
1115                         had_unreachable = true;
1116                 }
1117         }
1118         bch2_trans_iter_free(&trans, iter);
1119         if (ret)
1120                 goto err;
1121
1122         if (had_unreachable) {
1123                 bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
1124                 genradix_free(&dirs_done);
1125                 kfree(path.entries);
1126                 memset(&dirs_done, 0, sizeof(dirs_done));
1127                 memset(&path, 0, sizeof(path));
1128                 goto restart_dfs;
1129         }
1130 err:
1131 fsck_err:
1132         ret = bch2_trans_exit(&trans) ?: ret;
1133         genradix_free(&dirs_done);
1134         kfree(path.entries);
1135         return ret;
1136 }
1137
1138 struct nlink {
1139         u32     count;
1140 };
1141
1142 typedef GENRADIX(struct nlink) nlink_table;
1143
1144 static void inc_link(struct bch_fs *c, nlink_table *links,
1145                      u64 range_start, u64 *range_end, u64 inum)
1146 {
1147         struct nlink *link;
1148
1149         if (inum < range_start || inum >= *range_end)
1150                 return;
1151
1152         if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) {
1153                 *range_end = inum;
1154                 return;
1155         }
1156
1157         link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
1158         if (!link) {
1159                 bch_verbose(c, "allocation failed during fsck - will need another pass");
1160                 *range_end = inum;
1161                 return;
1162         }
1163
1164         link->count++;
1165 }
1166
1167 noinline_for_stack
1168 static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
1169                                u64 range_start, u64 *range_end)
1170 {
1171         struct btree_trans trans;
1172         struct btree_iter *iter;
1173         struct bkey_s_c k;
1174         struct bkey_s_c_dirent d;
1175         int ret;
1176
1177         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1178
1179         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) {
1180                 switch (k.k->type) {
1181                 case KEY_TYPE_dirent:
1182                         d = bkey_s_c_to_dirent(k);
1183
1184                         if (d.v->d_type != DT_DIR)
1185                                 inc_link(c, links, range_start, range_end,
1186                                          le64_to_cpu(d.v->d_inum));
1187                         break;
1188                 }
1189
1190                 bch2_trans_cond_resched(&trans);
1191         }
1192         bch2_trans_iter_put(&trans, iter);
1193
1194         ret = bch2_trans_exit(&trans) ?: ret;
1195         if (ret)
1196                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
1197
1198         return ret;
1199 }
1200
1201 static int check_inode_nlink(struct btree_trans *trans,
1202                              struct bch_inode_unpacked *lostfound_inode,
1203                              struct btree_iter *iter,
1204                              struct bkey_s_c_inode inode,
1205                              unsigned nlink)
1206 {
1207         struct bch_fs *c = trans->c;
1208         struct bch_inode_unpacked u;
1209         int ret = 0;
1210
1211         /*
1212          * Backpointer and directory structure checks are sufficient for
1213          * directories, since they can't have hardlinks:
1214          */
1215         if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
1216                 return 0;
1217
1218         ret = bch2_inode_unpack(inode, &u);
1219
1220         /* Should never happen, checked by bch2_inode_invalid: */
1221         if (bch2_fs_inconsistent_on(ret, c,
1222                          "error unpacking inode %llu in fsck",
1223                          inode.k->p.inode))
1224                 return ret;
1225
1226         /* Improved directory structure pass will catch this: */
1227         if (fsck_err_on(!nlink, c,
1228                         "unreachable inode %llu not marked as unlinked (type %u)",
1229                         u.bi_inum, mode_to_type(u.bi_mode))) {
1230                 ret = reattach_inode(c, lostfound_inode, u.bi_inum);
1231                 if (ret)
1232                         return ret;
1233
1234                 nlink = 1;
1235         }
1236
1237         if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c,
1238                         "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
1239                         u.bi_inum, mode_to_type(u.bi_mode),
1240                         bch2_inode_nlink_get(&u), nlink)) {
1241                 struct bkey_inode_buf p;
1242
1243                 if (nlink > 1)
1244                         u.bi_flags |= BCH_INODE_BACKPTR_UNTRUSTED;
1245
1246                 bch2_inode_nlink_set(&u, nlink);
1247                 bch2_inode_pack(c, &p, &u);
1248                 p.inode.k.p = iter->pos;
1249
1250                 ret = __bch2_trans_do(trans, NULL, NULL,
1251                                       BTREE_INSERT_NOFAIL|
1252                                       BTREE_INSERT_LAZY_RW,
1253                         (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0));
1254                 if (ret)
1255                         bch_err(c, "error in fsck: error %i updating inode", ret);
1256         }
1257 fsck_err:
1258         return ret;
1259 }
1260
1261 noinline_for_stack
1262 static int bch2_gc_walk_inodes(struct bch_fs *c,
1263                                struct bch_inode_unpacked *lostfound_inode,
1264                                nlink_table *links,
1265                                u64 range_start, u64 range_end)
1266 {
1267         struct btree_trans trans;
1268         struct btree_iter *iter;
1269         struct bkey_s_c k;
1270         struct nlink *link;
1271         int ret = 0;
1272
1273         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1274
1275         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
1276                            POS(0, range_start), 0, k, ret) {
1277                 if (!k.k || k.k->p.offset >= range_end)
1278                         break;
1279
1280                 if (k.k->type != KEY_TYPE_inode)
1281                         continue;
1282
1283                 link = genradix_ptr(links, k.k->p.offset - range_start);
1284                 ret = check_inode_nlink(&trans, lostfound_inode, iter,
1285                                         bkey_s_c_to_inode(k), link ? link->count : 0);
1286                 if (ret)
1287                         break;
1288
1289         }
1290         bch2_trans_iter_put(&trans, iter);
1291         bch2_trans_exit(&trans);
1292
1293         if (ret)
1294                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
1295
1296         return ret;
1297 }
1298
1299 noinline_for_stack
1300 static int check_nlinks(struct bch_fs *c,
1301                               struct bch_inode_unpacked *lostfound_inode)
1302 {
1303         nlink_table links;
1304         u64 this_iter_range_start, next_iter_range_start = 0;
1305         int ret = 0;
1306
1307         bch_verbose(c, "checking inode nlinks");
1308
1309         genradix_init(&links);
1310
1311         do {
1312                 this_iter_range_start = next_iter_range_start;
1313                 next_iter_range_start = U64_MAX;
1314
1315                 ret = bch2_gc_walk_dirents(c, &links,
1316                                           this_iter_range_start,
1317                                           &next_iter_range_start);
1318                 if (ret)
1319                         break;
1320
1321                 ret = bch2_gc_walk_inodes(c, lostfound_inode, &links,
1322                                          this_iter_range_start,
1323                                          next_iter_range_start);
1324                 if (ret)
1325                         break;
1326
1327                 genradix_free(&links);
1328         } while (next_iter_range_start != U64_MAX);
1329
1330         genradix_free(&links);
1331
1332         return ret;
1333 }
1334
1335 /*
1336  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
1337  * Doesn't fix them yet, mainly because they haven't yet been observed:
1338  */
1339 int bch2_fsck_full(struct bch_fs *c)
1340 {
1341         struct bch_inode_unpacked root_inode, lostfound_inode;
1342
1343         return  check_inodes(c, true) ?:
1344                 check_extents(c) ?:
1345                 check_dirents(c) ?:
1346                 check_xattrs(c) ?:
1347                 check_root(c, &root_inode) ?:
1348                 check_lostfound(c, &root_inode, &lostfound_inode) ?:
1349                 check_directory_structure(c, &lostfound_inode) ?:
1350                 check_nlinks(c, &lostfound_inode);
1351 }
1352
1353 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
1354 {
1355         return check_inodes(c, false);
1356 }