]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
eb8ac164b0b618d3ad1eb0d1224a0a0caeab1588
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "dirent.h"
7 #include "error.h"
8 #include "fs-common.h"
9 #include "fsck.h"
10 #include "inode.h"
11 #include "keylist.h"
12 #include "super.h"
13 #include "xattr.h"
14
15 #include <linux/dcache.h> /* struct qstr */
16 #include <linux/generic-radix-tree.h>
17
18 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
19
20 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
21 {
22         struct btree_iter *iter;
23         struct bkey_s_c k;
24         u64 sectors = 0;
25         int ret;
26
27         for_each_btree_key(trans, iter, BTREE_ID_extents,
28                            POS(inum, 0), 0, k, ret) {
29                 if (k.k->p.inode != inum)
30                         break;
31
32                 if (bkey_extent_is_allocation(k.k))
33                         sectors += k.k->size;
34         }
35
36         bch2_trans_iter_free(trans, iter);
37
38         return ret ?: sectors;
39 }
40
41 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
42                         struct bch_inode_unpacked *inode,
43                         u32 *snapshot)
44 {
45         struct btree_iter *iter;
46         struct bkey_s_c k;
47         int ret;
48
49         iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
50                         POS(0, inode_nr), 0);
51         k = bch2_btree_iter_peek_slot(iter);
52         ret = bkey_err(k);
53         if (ret)
54                 goto err;
55
56         if (snapshot)
57                 *snapshot = iter->pos.snapshot;
58         ret = k.k->type == KEY_TYPE_inode
59                 ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
60                 : -ENOENT;
61 err:
62         bch2_trans_iter_free(trans, iter);
63         return ret;
64 }
65
66 static int write_inode(struct btree_trans *trans,
67                        struct bch_inode_unpacked *inode,
68                        u32 snapshot)
69 {
70         struct btree_iter *inode_iter =
71                 bch2_trans_get_iter(trans, BTREE_ID_inodes,
72                                     SPOS(0, inode->bi_inum, snapshot),
73                                     BTREE_ITER_INTENT);
74         int ret = __bch2_trans_do(trans, NULL, NULL,
75                                   BTREE_INSERT_NOFAIL|
76                                   BTREE_INSERT_LAZY_RW,
77                                   bch2_inode_write(trans, inode_iter, inode));
78         bch2_trans_iter_put(trans, inode_iter);
79         if (ret)
80                 bch_err(trans->c, "error in fsck: error %i updating inode", ret);
81         return ret;
82 }
83
84 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
85 {
86         struct bch_fs *c = trans->c;
87         struct btree_iter *iter;
88         struct bch_inode_unpacked dir_inode;
89         struct bch_hash_info dir_hash_info;
90         int ret;
91
92         ret = lookup_inode(trans, pos.inode, &dir_inode, NULL);
93         if (ret)
94                 return ret;
95
96         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
97
98         iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
99
100         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
101                                   &dir_hash_info, iter);
102         bch2_trans_iter_put(trans, iter);
103         return ret;
104 }
105
106 static int remove_dirent(struct btree_trans *trans, struct bpos pos)
107 {
108         int ret = __bch2_trans_do(trans, NULL, NULL,
109                                   BTREE_INSERT_NOFAIL|
110                                   BTREE_INSERT_LAZY_RW,
111                                   __remove_dirent(trans, pos));
112         if (ret)
113                 bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret);
114         return ret;
115 }
116
117 static int __reattach_inode(struct btree_trans *trans,
118                             struct bch_inode_unpacked *lostfound,
119                             u64 inum)
120 {
121         struct bch_hash_info dir_hash =
122                 bch2_hash_info_init(trans->c, lostfound);
123         struct bch_inode_unpacked inode_u;
124         char name_buf[20];
125         struct qstr name;
126         u64 dir_offset = 0;
127         u32 snapshot;
128         int ret;
129
130         snprintf(name_buf, sizeof(name_buf), "%llu", inum);
131         name = (struct qstr) QSTR(name_buf);
132
133         ret = lookup_inode(trans, inum, &inode_u, &snapshot);
134         if (ret)
135                 return ret;
136
137         if (S_ISDIR(inode_u.bi_mode)) {
138                 lostfound->bi_nlink++;
139
140                 ret = write_inode(trans, lostfound, U32_MAX);
141                 if (ret)
142                         return ret;
143         }
144
145         ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash,
146                                  mode_to_type(inode_u.bi_mode),
147                                  &name, inum, &dir_offset,
148                                  BCH_HASH_SET_MUST_CREATE);
149         if (ret)
150                 return ret;
151
152         inode_u.bi_dir          = lostfound->bi_inum;
153         inode_u.bi_dir_offset   = dir_offset;
154
155         return write_inode(trans, &inode_u, U32_MAX);
156 }
157
158 static int reattach_inode(struct btree_trans *trans,
159                           struct bch_inode_unpacked *lostfound,
160                           u64 inum)
161 {
162         int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
163                               __reattach_inode(trans, lostfound, inum));
164         if (ret)
165                 bch_err(trans->c, "error %i reattaching inode %llu", ret, inum);
166
167         return ret;
168 }
169
170 static int remove_backpointer(struct btree_trans *trans,
171                               struct bch_inode_unpacked *inode)
172 {
173         struct btree_iter *iter;
174         struct bkey_s_c k;
175         int ret;
176
177         iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
178                                    POS(inode->bi_dir, inode->bi_dir_offset), 0);
179         k = bch2_btree_iter_peek_slot(iter);
180         ret = bkey_err(k);
181         if (ret)
182                 goto out;
183         if (k.k->type != KEY_TYPE_dirent) {
184                 ret = -ENOENT;
185                 goto out;
186         }
187
188         ret = remove_dirent(trans, k.k->p);
189 out:
190         bch2_trans_iter_put(trans, iter);
191         return ret;
192 }
193
194 struct inode_walker {
195         bool                    first_this_inode;
196         bool                    have_inode;
197         u64                     cur_inum;
198         u32                     snapshot;
199         struct bch_inode_unpacked inode;
200 };
201
202 static struct inode_walker inode_walker_init(void)
203 {
204         return (struct inode_walker) {
205                 .cur_inum       = -1,
206                 .have_inode     = false,
207         };
208 }
209
210 static int walk_inode(struct btree_trans *trans,
211                       struct inode_walker *w, u64 inum)
212 {
213         if (inum != w->cur_inum) {
214                 int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot);
215
216                 if (ret && ret != -ENOENT)
217                         return ret;
218
219                 w->have_inode   = !ret;
220                 w->cur_inum     = inum;
221                 w->first_this_inode = true;
222         } else {
223                 w->first_this_inode = false;
224         }
225
226         return 0;
227 }
228
229 static int hash_redo_key(struct btree_trans *trans,
230                          const struct bch_hash_desc desc,
231                          struct bch_hash_info *hash_info,
232                          struct btree_iter *k_iter, struct bkey_s_c k)
233 {
234         struct bkey_i delete;
235         struct bkey_i *tmp;
236
237         tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
238         if (IS_ERR(tmp))
239                 return PTR_ERR(tmp);
240
241         bkey_reassemble(tmp, k);
242
243         bkey_init(&delete.k);
244         delete.k.p = k_iter->pos;
245         bch2_trans_update(trans, k_iter, &delete, 0);
246
247         return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode,
248                              tmp, 0);
249 }
250
251 static int fsck_hash_delete_at(struct btree_trans *trans,
252                                const struct bch_hash_desc desc,
253                                struct bch_hash_info *info,
254                                struct btree_iter *iter)
255 {
256         int ret;
257 retry:
258         ret   = bch2_hash_delete_at(trans, desc, info, iter) ?:
259                 bch2_trans_commit(trans, NULL, NULL,
260                                   BTREE_INSERT_NOFAIL|
261                                   BTREE_INSERT_LAZY_RW);
262         if (ret == -EINTR) {
263                 ret = bch2_btree_iter_traverse(iter);
264                 if (!ret)
265                         goto retry;
266         }
267
268         return ret;
269 }
270
271 static int hash_check_key(struct btree_trans *trans,
272                           const struct bch_hash_desc desc,
273                           struct bch_hash_info *hash_info,
274                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
275 {
276         struct bch_fs *c = trans->c;
277         struct btree_iter *iter = NULL;
278         char buf[200];
279         struct bkey_s_c k;
280         u64 hash;
281         int ret = 0;
282
283         if (hash_k.k->type != desc.key_type)
284                 return 0;
285
286         hash = desc.hash_bkey(hash_info, hash_k);
287
288         if (likely(hash == hash_k.k->p.offset))
289                 return 0;
290
291         if (hash_k.k->p.offset < hash)
292                 goto bad_hash;
293
294         for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
295                            BTREE_ITER_SLOTS, k, ret) {
296                 if (!bkey_cmp(k.k->p, hash_k.k->p))
297                         break;
298
299                 if (fsck_err_on(k.k->type == desc.key_type &&
300                                 !desc.cmp_bkey(k, hash_k), c,
301                                 "duplicate hash table keys:\n%s",
302                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
303                                                        hash_k), buf))) {
304                         ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter);
305                         if (ret)
306                                 return ret;
307                         ret = 1;
308                         break;
309                 }
310
311                 if (bkey_deleted(k.k)) {
312                         bch2_trans_iter_free(trans, iter);
313                         goto bad_hash;
314                 }
315
316         }
317         bch2_trans_iter_free(trans, iter);
318         return ret;
319 bad_hash:
320         if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
321                      "hashed to %llu should be at %llu\n%s",
322                      desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset,
323                      hash, iter->pos.offset,
324                      (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
325                 return 0;
326
327         ret = __bch2_trans_do(trans, NULL, NULL,
328                               BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
329                 hash_redo_key(trans, desc, hash_info, k_iter, hash_k));
330         if (ret) {
331                 bch_err(c, "hash_redo_key err %i", ret);
332                 return ret;
333         }
334         return -EINTR;
335 fsck_err:
336         return ret;
337 }
338
339 static int check_inode(struct btree_trans *trans,
340                        struct btree_iter *iter,
341                        struct bkey_s_c_inode inode)
342 {
343         struct bch_fs *c = trans->c;
344         struct bch_inode_unpacked u;
345         bool do_update = false;
346         int ret = 0;
347
348         ret = bch2_inode_unpack(inode, &u);
349
350         if (bch2_fs_inconsistent_on(ret, c,
351                          "error unpacking inode %llu in fsck",
352                          inode.k->p.inode))
353                 return ret;
354
355         if (u.bi_flags & BCH_INODE_UNLINKED &&
356             (!c->sb.clean ||
357              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
358                       u.bi_inum))) {
359                 bch_verbose(c, "deleting inode %llu", u.bi_inum);
360
361                 bch2_trans_unlock(trans);
362                 bch2_fs_lazy_rw(c);
363
364                 ret = bch2_inode_rm(c, u.bi_inum, false);
365                 if (ret)
366                         bch_err(c, "error in fsck: error %i while deleting inode", ret);
367                 return ret;
368         }
369
370         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
371             (!c->sb.clean ||
372              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
373                       u.bi_inum))) {
374                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
375
376                 bch2_trans_unlock(trans);
377                 bch2_fs_lazy_rw(c);
378
379                 /*
380                  * XXX: need to truncate partial blocks too here - or ideally
381                  * just switch units to bytes and that issue goes away
382                  */
383                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
384                                 POS(u.bi_inum, round_up(u.bi_size, block_bytes(c))),
385                                 POS(u.bi_inum, U64_MAX),
386                                 NULL);
387                 if (ret) {
388                         bch_err(c, "error in fsck: error %i truncating inode", ret);
389                         return ret;
390                 }
391
392                 /*
393                  * We truncated without our normal sector accounting hook, just
394                  * make sure we recalculate it:
395                  */
396                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
397
398                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
399                 do_update = true;
400         }
401
402         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
403             (!c->sb.clean ||
404              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
405                       u.bi_inum))) {
406                 s64 sectors;
407
408                 bch_verbose(c, "recounting sectors for inode %llu",
409                             u.bi_inum);
410
411                 sectors = bch2_count_inode_sectors(trans, u.bi_inum);
412                 if (sectors < 0) {
413                         bch_err(c, "error in fsck: error %i recounting inode sectors",
414                                 (int) sectors);
415                         return sectors;
416                 }
417
418                 u.bi_sectors = sectors;
419                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
420                 do_update = true;
421         }
422
423         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
424                 u.bi_dir = 0;
425                 u.bi_dir_offset = 0;
426                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
427                 do_update = true;
428         }
429
430         if (do_update) {
431                 ret = __bch2_trans_do(trans, NULL, NULL,
432                                       BTREE_INSERT_NOFAIL|
433                                       BTREE_INSERT_LAZY_RW,
434                                 bch2_inode_write(trans, iter, &u));
435                 if (ret)
436                         bch_err(c, "error in fsck: error %i "
437                                 "updating inode", ret);
438         }
439 fsck_err:
440         return ret;
441 }
442
443 noinline_for_stack
444 static int check_inodes(struct bch_fs *c, bool full)
445 {
446         struct btree_trans trans;
447         struct btree_iter *iter;
448         struct bkey_s_c k;
449         struct bkey_s_c_inode inode;
450         int ret;
451
452         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
453
454         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
455                 if (k.k->type != KEY_TYPE_inode)
456                         continue;
457
458                 inode = bkey_s_c_to_inode(k);
459
460                 if (full ||
461                     (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
462                                           BCH_INODE_I_SECTORS_DIRTY|
463                                           BCH_INODE_UNLINKED))) {
464                         ret = check_inode(&trans, iter, inode);
465                         if (ret)
466                                 break;
467                 }
468         }
469         bch2_trans_iter_put(&trans, iter);
470
471         BUG_ON(ret == -EINTR);
472
473         return bch2_trans_exit(&trans) ?: ret;
474 }
475
476 static int fix_overlapping_extent(struct btree_trans *trans,
477                                        struct bkey_s_c k, struct bpos cut_at)
478 {
479         struct btree_iter *iter;
480         struct bkey_i *u;
481         int ret;
482
483         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
484         ret = PTR_ERR_OR_ZERO(u);
485         if (ret)
486                 return ret;
487
488         bkey_reassemble(u, k);
489         bch2_cut_front(cut_at, u);
490
491
492         /*
493          * We don't want to go through the extent_handle_overwrites path:
494          *
495          * XXX: this is going to screw up disk accounting, extent triggers
496          * assume things about extent overwrites - we should be running the
497          * triggers manually here
498          */
499         iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p,
500                                    BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
501
502         BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
503         bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN);
504         bch2_trans_iter_put(trans, iter);
505
506         return bch2_trans_commit(trans, NULL, NULL,
507                                  BTREE_INSERT_NOFAIL|
508                                  BTREE_INSERT_LAZY_RW);
509 }
510
511 static int inode_backpointer_exists(struct btree_trans *trans,
512                                     struct bch_inode_unpacked *inode)
513 {
514         struct btree_iter *iter;
515         struct bkey_s_c k;
516         int ret;
517
518         iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
519                                    POS(inode->bi_dir, inode->bi_dir_offset), 0);
520         k = bch2_btree_iter_peek_slot(iter);
521         ret = bkey_err(k);
522         if (ret)
523                 goto out;
524         if (k.k->type != KEY_TYPE_dirent)
525                 goto out;
526
527         ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
528 out:
529         bch2_trans_iter_free(trans, iter);
530         return ret;
531 }
532
533 static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
534                                       struct bch_inode_unpacked *inode)
535 {
536         return d.k->p.inode == inode->bi_dir &&
537                 d.k->p.offset == inode->bi_dir_offset;
538 }
539
540 /*
541  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
542  * that i_size an i_sectors are consistent
543  */
544 noinline_for_stack
545 static int check_extents(struct bch_fs *c)
546 {
547         struct inode_walker w = inode_walker_init();
548         struct btree_trans trans;
549         struct btree_iter *iter;
550         struct bkey_s_c k;
551         struct bkey_buf prev;
552         u64 i_sectors = 0;
553         int ret = 0;
554
555         bch2_bkey_buf_init(&prev);
556         prev.k->k = KEY(0, 0, 0);
557         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
558
559         bch_verbose(c, "checking extents");
560
561         iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
562                                    POS(BCACHEFS_ROOT_INO, 0),
563                                    BTREE_ITER_INTENT);
564 retry:
565         while ((k = bch2_btree_iter_peek(iter)).k &&
566                !(ret = bkey_err(k))) {
567                 if (w.have_inode &&
568                     w.cur_inum != k.k->p.inode &&
569                     !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
570                     fsck_err_on(w.inode.bi_sectors != i_sectors, c,
571                                 "inode %llu has incorrect i_sectors: got %llu, should be %llu",
572                                 w.inode.bi_inum,
573                                 w.inode.bi_sectors, i_sectors)) {
574                         w.inode.bi_sectors = i_sectors;
575
576                         ret = write_inode(&trans, &w.inode, w.snapshot);
577                         if (ret)
578                                 break;
579                 }
580
581                 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
582                         char buf1[200];
583                         char buf2[200];
584
585                         bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
586                         bch2_bkey_val_to_text(&PBUF(buf2), c, k);
587
588                         if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
589                                 return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
590                 }
591
592                 ret = walk_inode(&trans, &w, k.k->p.inode);
593                 if (ret)
594                         break;
595
596                 if (w.first_this_inode)
597                         i_sectors = 0;
598
599                 if (fsck_err_on(!w.have_inode, c,
600                                 "extent type %u for missing inode %llu",
601                                 k.k->type, k.k->p.inode) ||
602                     fsck_err_on(w.have_inode &&
603                                 !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
604                                 "extent type %u for non regular file, inode %llu mode %o",
605                                 k.k->type, k.k->p.inode, w.inode.bi_mode)) {
606                         bch2_fs_lazy_rw(c);
607                         return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
608                                                        POS(k.k->p.inode, 0),
609                                                        POS(k.k->p.inode, U64_MAX),
610                                                        NULL) ?: -EINTR;
611                 }
612
613                 if (fsck_err_on(w.have_inode &&
614                                 !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
615                                 k.k->type != KEY_TYPE_reservation &&
616                                 k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
617                                 "extent type %u offset %llu past end of inode %llu, i_size %llu",
618                                 k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
619                         bch2_fs_lazy_rw(c);
620                         return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
621                                         POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c))),
622                                         POS(k.k->p.inode, U64_MAX),
623                                         NULL) ?: -EINTR;
624                 }
625
626                 if (bkey_extent_is_allocation(k.k))
627                         i_sectors += k.k->size;
628                 bch2_bkey_buf_reassemble(&prev, c, k);
629
630                 bch2_btree_iter_advance(iter);
631         }
632 fsck_err:
633         if (ret == -EINTR)
634                 goto retry;
635         bch2_trans_iter_put(&trans, iter);
636         bch2_bkey_buf_exit(&prev, c);
637         return bch2_trans_exit(&trans) ?: ret;
638 }
639
640 /*
641  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
642  * validate d_type
643  */
644 noinline_for_stack
645 static int check_dirents(struct bch_fs *c)
646 {
647         struct inode_walker w = inode_walker_init();
648         struct bch_hash_info hash_info;
649         struct btree_trans trans;
650         struct btree_iter *iter;
651         struct bkey_s_c k;
652         char buf[200];
653         unsigned nr_subdirs = 0;
654         int ret = 0;
655
656         bch_verbose(c, "checking dirents");
657
658         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
659
660         iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
661                                    POS(BCACHEFS_ROOT_INO, 0), 0);
662 retry:
663         while ((k = bch2_btree_iter_peek(iter)).k &&
664                !(ret = bkey_err(k))) {
665                 struct bkey_s_c_dirent d;
666                 struct bch_inode_unpacked target;
667                 u32 target_snapshot;
668                 bool have_target;
669                 bool backpointer_exists = true;
670                 u64 d_inum;
671
672                 if (w.have_inode &&
673                     w.cur_inum != k.k->p.inode &&
674                     fsck_err_on(w.inode.bi_nlink != nr_subdirs, c,
675                                 "directory %llu with wrong i_nlink: got %u, should be %u",
676                                 w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) {
677                         w.inode.bi_nlink = nr_subdirs;
678                         ret = write_inode(&trans, &w.inode, w.snapshot);
679                         if (ret)
680                                 break;
681                 }
682
683                 ret = walk_inode(&trans, &w, k.k->p.inode);
684                 if (ret)
685                         break;
686
687                 if (w.first_this_inode)
688                         nr_subdirs = 0;
689
690                 if (fsck_err_on(!w.have_inode, c,
691                                 "dirent in nonexisting directory:\n%s",
692                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
693                                                        k), buf)) ||
694                     fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
695                                 "dirent in non directory inode type %u:\n%s",
696                                 mode_to_type(w.inode.bi_mode),
697                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
698                                                        k), buf))) {
699                         ret = lockrestart_do(&trans,
700                                         bch2_btree_delete_at(&trans, iter, 0));
701                         if (ret)
702                                 goto err;
703                         goto next;
704                 }
705
706                 if (!w.have_inode)
707                         goto next;
708
709                 if (w.first_this_inode)
710                         hash_info = bch2_hash_info_init(c, &w.inode);
711
712                 ret = hash_check_key(&trans, bch2_dirent_hash_desc,
713                                      &hash_info, iter, k);
714                 if (ret > 0) {
715                         ret = 0;
716                         goto next;
717                 }
718                 if (ret)
719                         goto fsck_err;
720
721                 if (k.k->type != KEY_TYPE_dirent)
722                         goto next;
723
724                 d = bkey_s_c_to_dirent(k);
725                 d_inum = le64_to_cpu(d.v->d_inum);
726
727                 ret = lookup_inode(&trans, d_inum, &target, &target_snapshot);
728                 if (ret && ret != -ENOENT)
729                         break;
730
731                 have_target = !ret;
732                 ret = 0;
733
734                 if (fsck_err_on(!have_target, c,
735                                 "dirent points to missing inode:\n%s",
736                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
737                                                        k), buf))) {
738                         ret = remove_dirent(&trans, d.k->p);
739                         if (ret)
740                                 goto err;
741                         goto next;
742                 }
743
744                 if (!have_target)
745                         goto next;
746
747                 if (!target.bi_dir &&
748                     !target.bi_dir_offset) {
749                         target.bi_dir           = k.k->p.inode;
750                         target.bi_dir_offset    = k.k->p.offset;
751
752                         ret = write_inode(&trans, &target, target_snapshot);
753                         if (ret)
754                                 goto err;
755                 }
756
757                 if (!inode_backpointer_matches(d, &target)) {
758                         ret = inode_backpointer_exists(&trans, &target);
759                         if (ret < 0)
760                                 goto err;
761
762                         backpointer_exists = ret;
763                         ret = 0;
764
765                         if (fsck_err_on(S_ISDIR(target.bi_mode) &&
766                                         backpointer_exists, c,
767                                         "directory %llu with multiple links",
768                                         target.bi_inum)) {
769                                 ret = remove_dirent(&trans, d.k->p);
770                                 if (ret)
771                                         goto err;
772                                 continue;
773                         }
774
775                         if (fsck_err_on(backpointer_exists &&
776                                         !target.bi_nlink, c,
777                                         "inode %llu has multiple links but i_nlink 0",
778                                         d_inum)) {
779                                 target.bi_nlink++;
780                                 target.bi_flags &= ~BCH_INODE_UNLINKED;
781
782                                 ret = write_inode(&trans, &target, target_snapshot);
783                                 if (ret)
784                                         goto err;
785                         }
786
787                         if (fsck_err_on(!backpointer_exists, c,
788                                         "inode %llu has wrong backpointer:\n"
789                                         "got       %llu:%llu\n"
790                                         "should be %llu:%llu",
791                                         d_inum,
792                                         target.bi_dir,
793                                         target.bi_dir_offset,
794                                         k.k->p.inode,
795                                         k.k->p.offset)) {
796                                 target.bi_dir           = k.k->p.inode;
797                                 target.bi_dir_offset    = k.k->p.offset;
798
799                                 ret = write_inode(&trans, &target, target_snapshot);
800                                 if (ret)
801                                         goto err;
802                         }
803                 }
804
805                 if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
806                                 "incorrect d_type: should be %u:\n%s",
807                                 mode_to_type(target.bi_mode),
808                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
809                                                        k), buf))) {
810                         struct bkey_i_dirent *n;
811
812                         n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
813                         if (!n) {
814                                 ret = -ENOMEM;
815                                 goto err;
816                         }
817
818                         bkey_reassemble(&n->k_i, d.s_c);
819                         n->v.d_type = mode_to_type(target.bi_mode);
820
821                         ret = __bch2_trans_do(&trans, NULL, NULL,
822                                               BTREE_INSERT_NOFAIL|
823                                               BTREE_INSERT_LAZY_RW,
824                                 (bch2_trans_update(&trans, iter, &n->k_i, 0), 0));
825                         kfree(n);
826                         if (ret)
827                                 goto err;
828
829                 }
830
831                 nr_subdirs += d.v->d_type == DT_DIR;
832 next:
833                 bch2_btree_iter_advance(iter);
834         }
835 err:
836 fsck_err:
837         if (ret == -EINTR)
838                 goto retry;
839
840         bch2_trans_iter_put(&trans, iter);
841         return bch2_trans_exit(&trans) ?: ret;
842 }
843
844 /*
845  * Walk xattrs: verify that they all have a corresponding inode
846  */
847 noinline_for_stack
848 static int check_xattrs(struct bch_fs *c)
849 {
850         struct inode_walker w = inode_walker_init();
851         struct bch_hash_info hash_info;
852         struct btree_trans trans;
853         struct btree_iter *iter;
854         struct bkey_s_c k;
855         int ret = 0;
856
857         bch_verbose(c, "checking xattrs");
858
859         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
860
861         iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
862                                    POS(BCACHEFS_ROOT_INO, 0), 0);
863 retry:
864         while ((k = bch2_btree_iter_peek(iter)).k &&
865                !(ret = bkey_err(k))) {
866                 ret = walk_inode(&trans, &w, k.k->p.inode);
867                 if (ret)
868                         break;
869
870                 if (fsck_err_on(!w.have_inode, c,
871                                 "xattr for missing inode %llu",
872                                 k.k->p.inode)) {
873                         ret = bch2_btree_delete_at(&trans, iter, 0);
874                         if (ret)
875                                 break;
876                         continue;
877                 }
878
879                 if (w.first_this_inode && w.have_inode)
880                         hash_info = bch2_hash_info_init(c, &w.inode);
881
882                 ret = hash_check_key(&trans, bch2_xattr_hash_desc,
883                                      &hash_info, iter, k);
884                 if (ret)
885                         break;
886
887                 bch2_btree_iter_advance(iter);
888         }
889 fsck_err:
890         if (ret == -EINTR)
891                 goto retry;
892
893         bch2_trans_iter_put(&trans, iter);
894         return bch2_trans_exit(&trans) ?: ret;
895 }
896
897 /* Get root directory, create if it doesn't exist: */
898 static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
899 {
900         struct bkey_inode_buf packed;
901         u32 snapshot;
902         int ret;
903
904         bch_verbose(c, "checking root directory");
905
906         ret = bch2_trans_do(c, NULL, NULL, 0,
907                 lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot));
908         if (ret && ret != -ENOENT)
909                 return ret;
910
911         if (fsck_err_on(ret, c, "root directory missing"))
912                 goto create_root;
913
914         if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
915                         "root inode not a directory"))
916                 goto create_root;
917
918         return 0;
919 fsck_err:
920         return ret;
921 create_root:
922         bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
923                         0, NULL);
924         root_inode->bi_inum = BCACHEFS_ROOT_INO;
925
926         bch2_inode_pack(c, &packed, root_inode);
927
928         return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
929                                  NULL, NULL,
930                                  BTREE_INSERT_NOFAIL|
931                                  BTREE_INSERT_LAZY_RW);
932 }
933
934 /* Get lost+found, create if it doesn't exist: */
935 static int check_lostfound(struct bch_fs *c,
936                            struct bch_inode_unpacked *root_inode,
937                            struct bch_inode_unpacked *lostfound_inode)
938 {
939         struct qstr lostfound = QSTR("lost+found");
940         struct bch_hash_info root_hash_info =
941                 bch2_hash_info_init(c, root_inode);
942         u64 inum;
943         u32 snapshot;
944         int ret;
945
946         bch_verbose(c, "checking lost+found");
947
948         inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
949                                  &lostfound);
950         if (!inum) {
951                 bch_notice(c, "creating lost+found");
952                 goto create_lostfound;
953         }
954
955         ret = bch2_trans_do(c, NULL, NULL, 0,
956                 lookup_inode(&trans, inum, lostfound_inode, &snapshot));
957         if (ret && ret != -ENOENT)
958                 return ret;
959
960         if (fsck_err_on(ret, c, "lost+found missing"))
961                 goto create_lostfound;
962
963         if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c,
964                         "lost+found inode not a directory"))
965                 goto create_lostfound;
966
967         return 0;
968 fsck_err:
969         return ret;
970 create_lostfound:
971         bch2_inode_init_early(c, lostfound_inode);
972
973         ret = bch2_trans_do(c, NULL, NULL,
974                             BTREE_INSERT_NOFAIL|
975                             BTREE_INSERT_LAZY_RW,
976                 bch2_create_trans(&trans,
977                                   BCACHEFS_ROOT_INO, root_inode,
978                                   lostfound_inode, &lostfound,
979                                   0, 0, S_IFDIR|0700, 0, NULL, NULL));
980         if (ret)
981                 bch_err(c, "error creating lost+found: %i", ret);
982
983         return ret;
984 }
985
986 struct pathbuf {
987         size_t          nr;
988         size_t          size;
989
990         struct pathbuf_entry {
991                 u64     inum;
992         }               *entries;
993 };
994
995 static int path_down(struct pathbuf *p, u64 inum)
996 {
997         if (p->nr == p->size) {
998                 size_t new_size = max_t(size_t, 256UL, p->size * 2);
999                 void *n = krealloc(p->entries,
1000                                    new_size * sizeof(p->entries[0]),
1001                                    GFP_KERNEL);
1002                 if (!n) {
1003                         return -ENOMEM;
1004                 }
1005
1006                 p->entries = n;
1007                 p->size = new_size;
1008         };
1009
1010         p->entries[p->nr++] = (struct pathbuf_entry) {
1011                 .inum = inum,
1012         };
1013         return 0;
1014 }
1015
1016 static int check_path(struct btree_trans *trans,
1017                       struct bch_inode_unpacked *lostfound,
1018                       struct pathbuf *p,
1019                       struct bch_inode_unpacked *inode)
1020 {
1021         struct bch_fs *c = trans->c;
1022         u32 snapshot;
1023         size_t i;
1024         int ret = 0;
1025
1026         p->nr = 0;
1027
1028         while (inode->bi_inum != BCACHEFS_ROOT_INO) {
1029                 ret = lockrestart_do(trans,
1030                         inode_backpointer_exists(trans, inode));
1031                 if (ret < 0)
1032                         break;
1033
1034                 if (!ret) {
1035                         if (fsck_err(c,  "unreachable inode %llu, type %u nlink %u backptr %llu:%llu",
1036                                      inode->bi_inum,
1037                                      mode_to_type(inode->bi_mode),
1038                                      inode->bi_nlink,
1039                                      inode->bi_dir,
1040                                      inode->bi_dir_offset))
1041                                 ret = reattach_inode(trans, lostfound, inode->bi_inum);
1042                         break;
1043                 }
1044                 ret = 0;
1045
1046                 if (!S_ISDIR(inode->bi_mode))
1047                         break;
1048
1049                 ret = path_down(p, inode->bi_inum);
1050                 if (ret) {
1051                         bch_err(c, "memory allocation failure");
1052                         return ret;
1053                 }
1054
1055                 for (i = 0; i < p->nr; i++) {
1056                         if (inode->bi_dir != p->entries[i].inum)
1057                                 continue;
1058
1059                         /* XXX print path */
1060                         if (!fsck_err(c, "directory structure loop"))
1061                                 return 0;
1062
1063                         ret = lockrestart_do(trans,
1064                                          remove_backpointer(trans, inode));
1065                         if (ret) {
1066                                 bch_err(c, "error removing dirent: %i", ret);
1067                                 break;
1068                         }
1069
1070                         ret = reattach_inode(trans, lostfound, inode->bi_inum);
1071                         break;
1072                 }
1073
1074                 ret = lockrestart_do(trans,
1075                                 lookup_inode(trans, inode->bi_dir, inode, &snapshot));
1076                 if (ret) {
1077                         /* Should have been caught in dirents pass */
1078                         bch_err(c, "error looking up parent directory: %i", ret);
1079                         break;
1080                 }
1081         }
1082 fsck_err:
1083         if (ret)
1084                 bch_err(c, "%s: err %i", __func__, ret);
1085         return ret;
1086 }
1087
1088 /*
1089  * Check for unreachable inodes, as well as loops in the directory structure:
1090  * After check_dirents(), if an inode backpointer doesn't exist that means it's
1091  * unreachable:
1092  */
1093 static int check_directory_structure(struct bch_fs *c,
1094                                      struct bch_inode_unpacked *lostfound)
1095 {
1096         struct btree_trans trans;
1097         struct btree_iter *iter;
1098         struct bkey_s_c k;
1099         struct bch_inode_unpacked u;
1100         struct pathbuf path = { 0, 0, NULL };
1101         int ret;
1102
1103         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1104
1105         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
1106                 if (k.k->type != KEY_TYPE_inode)
1107                         continue;
1108
1109                 ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
1110                 if (ret) {
1111                         /* Should have been caught earlier in fsck: */
1112                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
1113                         break;
1114                 }
1115
1116                 ret = check_path(&trans, lostfound, &path, &u);
1117                 if (ret)
1118                         break;
1119         }
1120         bch2_trans_iter_put(&trans, iter);
1121
1122         BUG_ON(ret == -EINTR);
1123
1124         kfree(path.entries);
1125
1126         return bch2_trans_exit(&trans) ?: ret;
1127 }
1128
1129 struct nlink {
1130         u32     count;
1131 };
1132
1133 typedef GENRADIX(struct nlink) nlink_table;
1134
1135 static void inc_link(struct bch_fs *c, nlink_table *links,
1136                      u64 range_start, u64 *range_end, u64 inum)
1137 {
1138         struct nlink *link;
1139
1140         if (inum < range_start || inum >= *range_end)
1141                 return;
1142
1143         if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) {
1144                 *range_end = inum;
1145                 return;
1146         }
1147
1148         link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
1149         if (!link) {
1150                 bch_verbose(c, "allocation failed during fsck - will need another pass");
1151                 *range_end = inum;
1152                 return;
1153         }
1154
1155         link->count++;
1156 }
1157
1158 noinline_for_stack
1159 static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
1160                                u64 range_start, u64 *range_end)
1161 {
1162         struct btree_trans trans;
1163         struct btree_iter *iter;
1164         struct bkey_s_c k;
1165         struct bkey_s_c_dirent d;
1166         int ret;
1167
1168         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1169
1170         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) {
1171                 switch (k.k->type) {
1172                 case KEY_TYPE_dirent:
1173                         d = bkey_s_c_to_dirent(k);
1174
1175                         if (d.v->d_type != DT_DIR)
1176                                 inc_link(c, links, range_start, range_end,
1177                                          le64_to_cpu(d.v->d_inum));
1178                         break;
1179                 }
1180
1181                 bch2_trans_cond_resched(&trans);
1182         }
1183         bch2_trans_iter_put(&trans, iter);
1184
1185         ret = bch2_trans_exit(&trans) ?: ret;
1186         if (ret)
1187                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
1188
1189         return ret;
1190 }
1191
1192 static int check_inode_nlink(struct btree_trans *trans,
1193                              struct bch_inode_unpacked *lostfound_inode,
1194                              struct btree_iter *iter,
1195                              struct bkey_s_c_inode inode,
1196                              unsigned nlink)
1197 {
1198         struct bch_fs *c = trans->c;
1199         struct bch_inode_unpacked u;
1200         int ret = 0;
1201
1202         /*
1203          * Backpointer and directory structure checks are sufficient for
1204          * directories, since they can't have hardlinks:
1205          */
1206         if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
1207                 return 0;
1208
1209         if (!nlink) {
1210                 bch_err(c, "no links found to inode %llu", inode.k->p.offset);
1211                 return -EINVAL;
1212         }
1213
1214         ret = bch2_inode_unpack(inode, &u);
1215
1216         /* Should never happen, checked by bch2_inode_invalid: */
1217         if (bch2_fs_inconsistent_on(ret, c,
1218                          "error unpacking inode %llu in fsck",
1219                          inode.k->p.inode))
1220                 return ret;
1221
1222         if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c,
1223                         "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
1224                         u.bi_inum, mode_to_type(u.bi_mode),
1225                         bch2_inode_nlink_get(&u), nlink)) {
1226                 bch2_inode_nlink_set(&u, nlink);
1227
1228                 ret = __bch2_trans_do(trans, NULL, NULL,
1229                                       BTREE_INSERT_NOFAIL|
1230                                       BTREE_INSERT_LAZY_RW,
1231                                 bch2_inode_write(trans, iter, &u));
1232                 if (ret)
1233                         bch_err(c, "error in fsck: error %i updating inode", ret);
1234         }
1235 fsck_err:
1236         return ret;
1237 }
1238
1239 noinline_for_stack
1240 static int bch2_gc_walk_inodes(struct bch_fs *c,
1241                                struct bch_inode_unpacked *lostfound_inode,
1242                                nlink_table *links,
1243                                u64 range_start, u64 range_end)
1244 {
1245         struct btree_trans trans;
1246         struct btree_iter *iter;
1247         struct bkey_s_c k;
1248         struct nlink *link;
1249         int ret = 0;
1250
1251         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1252
1253         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
1254                            POS(0, range_start), 0, k, ret) {
1255                 if (!k.k || k.k->p.offset >= range_end)
1256                         break;
1257
1258                 if (k.k->type != KEY_TYPE_inode)
1259                         continue;
1260
1261                 link = genradix_ptr(links, k.k->p.offset - range_start);
1262                 ret = check_inode_nlink(&trans, lostfound_inode, iter,
1263                                         bkey_s_c_to_inode(k), link ? link->count : 0);
1264                 if (ret)
1265                         break;
1266
1267         }
1268         bch2_trans_iter_put(&trans, iter);
1269         bch2_trans_exit(&trans);
1270
1271         if (ret)
1272                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
1273
1274         return ret;
1275 }
1276
1277 noinline_for_stack
1278 static int check_nlinks(struct bch_fs *c,
1279                               struct bch_inode_unpacked *lostfound_inode)
1280 {
1281         nlink_table links;
1282         u64 this_iter_range_start, next_iter_range_start = 0;
1283         int ret = 0;
1284
1285         bch_verbose(c, "checking inode nlinks");
1286
1287         genradix_init(&links);
1288
1289         do {
1290                 this_iter_range_start = next_iter_range_start;
1291                 next_iter_range_start = U64_MAX;
1292
1293                 ret = bch2_gc_walk_dirents(c, &links,
1294                                           this_iter_range_start,
1295                                           &next_iter_range_start);
1296                 if (ret)
1297                         break;
1298
1299                 ret = bch2_gc_walk_inodes(c, lostfound_inode, &links,
1300                                          this_iter_range_start,
1301                                          next_iter_range_start);
1302                 if (ret)
1303                         break;
1304
1305                 genradix_free(&links);
1306         } while (next_iter_range_start != U64_MAX);
1307
1308         genradix_free(&links);
1309
1310         return ret;
1311 }
1312
1313 /*
1314  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
1315  * Doesn't fix them yet, mainly because they haven't yet been observed:
1316  */
1317 int bch2_fsck_full(struct bch_fs *c)
1318 {
1319         struct bch_inode_unpacked root_inode, lostfound_inode;
1320
1321         return  check_inodes(c, true) ?:
1322                 check_extents(c) ?:
1323                 check_dirents(c) ?:
1324                 check_xattrs(c) ?:
1325                 check_root(c, &root_inode) ?:
1326                 check_lostfound(c, &root_inode, &lostfound_inode) ?:
1327                 check_directory_structure(c, &lostfound_inode) ?:
1328                 check_nlinks(c, &lostfound_inode);
1329 }
1330
1331 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
1332 {
1333         return check_inodes(c, false);
1334 }