]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
Update bcachefs sources to 8eca47e4d5 bcachefs: Improved check_directory_structure()
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_update.h"
6 #include "dirent.h"
7 #include "error.h"
8 #include "fs-common.h"
9 #include "fsck.h"
10 #include "inode.h"
11 #include "keylist.h"
12 #include "super.h"
13 #include "xattr.h"
14
15 #include <linux/dcache.h> /* struct qstr */
16 #include <linux/generic-radix-tree.h>
17
18 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
19
20 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
21 {
22         struct btree_iter *iter;
23         struct bkey_s_c k;
24         u64 sectors = 0;
25         int ret;
26
27         for_each_btree_key(trans, iter, BTREE_ID_extents,
28                            POS(inum, 0), 0, k, ret) {
29                 if (k.k->p.inode != inum)
30                         break;
31
32                 if (bkey_extent_is_allocation(k.k))
33                         sectors += k.k->size;
34         }
35
36         bch2_trans_iter_free(trans, iter);
37
38         return ret ?: sectors;
39 }
40
41 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
42                         struct bch_inode_unpacked *inode,
43                         u32 *snapshot)
44 {
45         struct btree_iter *iter;
46         struct bkey_s_c k;
47         int ret;
48
49         iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
50                         POS(0, inode_nr), 0);
51         k = bch2_btree_iter_peek_slot(iter);
52         ret = bkey_err(k);
53         if (ret)
54                 goto err;
55
56         if (snapshot)
57                 *snapshot = iter->pos.snapshot;
58         ret = k.k->type == KEY_TYPE_inode
59                 ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
60                 : -ENOENT;
61 err:
62         bch2_trans_iter_free(trans, iter);
63         return ret;
64 }
65
66 static int write_inode(struct btree_trans *trans,
67                        struct bch_inode_unpacked *inode,
68                        u32 snapshot)
69 {
70         struct btree_iter *inode_iter =
71                 bch2_trans_get_iter(trans, BTREE_ID_inodes,
72                                     SPOS(0, inode->bi_inum, snapshot),
73                                     BTREE_ITER_INTENT);
74         int ret = __bch2_trans_do(trans, NULL, NULL,
75                                   BTREE_INSERT_NOFAIL|
76                                   BTREE_INSERT_LAZY_RW,
77                                   bch2_inode_write(trans, inode_iter, inode));
78         bch2_trans_iter_put(trans, inode_iter);
79         if (ret)
80                 bch_err(trans->c, "error in fsck: error %i updating inode", ret);
81         return ret;
82 }
83
84 static int __remove_dirent(struct btree_trans *trans,
85                            struct bkey_s_c_dirent dirent)
86 {
87         struct bch_fs *c = trans->c;
88         struct qstr name;
89         struct bch_inode_unpacked dir_inode;
90         struct bch_hash_info dir_hash_info;
91         u64 dir_inum = dirent.k->p.inode;
92         int ret;
93         char *buf;
94
95         name.len = bch2_dirent_name_bytes(dirent);
96         buf = bch2_trans_kmalloc(trans, name.len + 1);
97         if (IS_ERR(buf))
98                 return PTR_ERR(buf);
99
100         memcpy(buf, dirent.v->d_name, name.len);
101         buf[name.len] = '\0';
102         name.name = buf;
103
104         ret = lookup_inode(trans, dir_inum, &dir_inode, NULL);
105         if (ret && ret != -EINTR)
106                 bch_err(c, "remove_dirent: err %i looking up directory inode", ret);
107         if (ret)
108                 return ret;
109
110         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
111
112         ret = bch2_hash_delete(trans, bch2_dirent_hash_desc,
113                                &dir_hash_info, dir_inum, &name);
114         if (ret && ret != -EINTR)
115                 bch_err(c, "remove_dirent: err %i deleting dirent", ret);
116         if (ret)
117                 return ret;
118
119         return 0;
120 }
121
122 static int remove_dirent(struct btree_trans *trans,
123                          struct bkey_s_c_dirent dirent)
124 {
125         return __bch2_trans_do(trans, NULL, NULL,
126                                BTREE_INSERT_NOFAIL|
127                                BTREE_INSERT_LAZY_RW,
128                                __remove_dirent(trans, dirent));
129 }
130
131 static int __reattach_inode(struct btree_trans *trans,
132                             struct bch_inode_unpacked *lostfound,
133                             u64 inum)
134 {
135         struct bch_hash_info dir_hash =
136                 bch2_hash_info_init(trans->c, lostfound);
137         struct bch_inode_unpacked inode_u;
138         char name_buf[20];
139         struct qstr name;
140         u64 dir_offset = 0;
141         u32 snapshot;
142         int ret;
143
144         snprintf(name_buf, sizeof(name_buf), "%llu", inum);
145         name = (struct qstr) QSTR(name_buf);
146
147         ret = lookup_inode(trans, inum, &inode_u, &snapshot);
148         if (ret)
149                 return ret;
150
151         if (S_ISDIR(inode_u.bi_mode)) {
152                 lostfound->bi_nlink++;
153
154                 ret = write_inode(trans, lostfound, U32_MAX);
155                 if (ret)
156                         return ret;
157         }
158
159         ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash,
160                                  mode_to_type(inode_u.bi_mode),
161                                  &name, inum, &dir_offset,
162                                  BCH_HASH_SET_MUST_CREATE);
163         if (ret)
164                 return ret;
165
166         inode_u.bi_dir          = lostfound->bi_inum;
167         inode_u.bi_dir_offset   = dir_offset;
168
169         return write_inode(trans, &inode_u, U32_MAX);
170 }
171
172 static int reattach_inode(struct btree_trans *trans,
173                           struct bch_inode_unpacked *lostfound,
174                           u64 inum)
175 {
176         struct bch_fs *c = trans->c;
177         int ret;
178
179         ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
180                               __reattach_inode(trans, lostfound, inum));
181         if (ret)
182                 bch_err(c, "error %i reattaching inode %llu", ret, inum);
183
184         return ret;
185 }
186
187 static int remove_backpointer(struct btree_trans *trans,
188                               struct bch_inode_unpacked *inode)
189 {
190         struct btree_iter *iter;
191         struct bkey_s_c k;
192         int ret;
193
194         iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
195                                    POS(inode->bi_dir, inode->bi_dir_offset), 0);
196         k = bch2_btree_iter_peek_slot(iter);
197         ret = bkey_err(k);
198         if (ret)
199                 goto out;
200         if (k.k->type != KEY_TYPE_dirent) {
201                 ret = -ENOENT;
202                 goto out;
203         }
204
205         ret = remove_dirent(trans, bkey_s_c_to_dirent(k));
206 out:
207         bch2_trans_iter_put(trans, iter);
208         return ret;
209 }
210
211 struct inode_walker {
212         bool                    first_this_inode;
213         bool                    have_inode;
214         u64                     cur_inum;
215         u32                     snapshot;
216         struct bch_inode_unpacked inode;
217 };
218
219 static struct inode_walker inode_walker_init(void)
220 {
221         return (struct inode_walker) {
222                 .cur_inum       = -1,
223                 .have_inode     = false,
224         };
225 }
226
227 static int walk_inode(struct btree_trans *trans,
228                       struct inode_walker *w, u64 inum)
229 {
230         if (inum != w->cur_inum) {
231                 int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot);
232
233                 if (ret && ret != -ENOENT)
234                         return ret;
235
236                 w->have_inode   = !ret;
237                 w->cur_inum     = inum;
238                 w->first_this_inode = true;
239         } else {
240                 w->first_this_inode = false;
241         }
242
243         return 0;
244 }
245
246 static int hash_redo_key(struct btree_trans *trans,
247                          const struct bch_hash_desc desc,
248                          struct bch_hash_info *hash_info,
249                          struct btree_iter *k_iter, struct bkey_s_c k)
250 {
251         struct bkey_i delete;
252         struct bkey_i *tmp;
253
254         tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
255         if (IS_ERR(tmp))
256                 return PTR_ERR(tmp);
257
258         bkey_reassemble(tmp, k);
259
260         bkey_init(&delete.k);
261         delete.k.p = k_iter->pos;
262         bch2_trans_update(trans, k_iter, &delete, 0);
263
264         return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode,
265                              tmp, 0);
266 }
267
268 static int fsck_hash_delete_at(struct btree_trans *trans,
269                                const struct bch_hash_desc desc,
270                                struct bch_hash_info *info,
271                                struct btree_iter *iter)
272 {
273         int ret;
274 retry:
275         ret   = bch2_hash_delete_at(trans, desc, info, iter) ?:
276                 bch2_trans_commit(trans, NULL, NULL,
277                                   BTREE_INSERT_NOFAIL|
278                                   BTREE_INSERT_LAZY_RW);
279         if (ret == -EINTR) {
280                 ret = bch2_btree_iter_traverse(iter);
281                 if (!ret)
282                         goto retry;
283         }
284
285         return ret;
286 }
287
288 static int hash_check_key(struct btree_trans *trans,
289                           const struct bch_hash_desc desc,
290                           struct bch_hash_info *hash_info,
291                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
292 {
293         struct bch_fs *c = trans->c;
294         struct btree_iter *iter = NULL;
295         char buf[200];
296         struct bkey_s_c k;
297         u64 hash;
298         int ret = 0;
299
300         if (hash_k.k->type != desc.key_type)
301                 return 0;
302
303         hash = desc.hash_bkey(hash_info, hash_k);
304
305         if (likely(hash == hash_k.k->p.offset))
306                 return 0;
307
308         if (hash_k.k->p.offset < hash)
309                 goto bad_hash;
310
311         for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
312                            BTREE_ITER_SLOTS, k, ret) {
313                 if (!bkey_cmp(k.k->p, hash_k.k->p))
314                         break;
315
316                 if (fsck_err_on(k.k->type == desc.key_type &&
317                                 !desc.cmp_bkey(k, hash_k), c,
318                                 "duplicate hash table keys:\n%s",
319                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
320                                                        hash_k), buf))) {
321                         ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter);
322                         if (ret)
323                                 return ret;
324                         ret = 1;
325                         break;
326                 }
327
328                 if (bkey_deleted(k.k)) {
329                         bch2_trans_iter_free(trans, iter);
330                         goto bad_hash;
331                 }
332
333         }
334         bch2_trans_iter_free(trans, iter);
335         return ret;
336 bad_hash:
337         if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
338                      "hashed to %llu should be at %llu\n%s",
339                      desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset,
340                      hash, iter->pos.offset,
341                      (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
342                 return 0;
343
344         ret = __bch2_trans_do(trans, NULL, NULL,
345                               BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
346                 hash_redo_key(trans, desc, hash_info, k_iter, hash_k));
347         if (ret) {
348                 bch_err(c, "hash_redo_key err %i", ret);
349                 return ret;
350         }
351         return -EINTR;
352 fsck_err:
353         return ret;
354 }
355
356 static int check_inode(struct btree_trans *trans,
357                        struct btree_iter *iter,
358                        struct bkey_s_c_inode inode)
359 {
360         struct bch_fs *c = trans->c;
361         struct bch_inode_unpacked u;
362         bool do_update = false;
363         int ret = 0;
364
365         ret = bch2_inode_unpack(inode, &u);
366
367         if (bch2_fs_inconsistent_on(ret, c,
368                          "error unpacking inode %llu in fsck",
369                          inode.k->p.inode))
370                 return ret;
371
372         if (u.bi_flags & BCH_INODE_UNLINKED &&
373             (!c->sb.clean ||
374              fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
375                       u.bi_inum))) {
376                 bch_verbose(c, "deleting inode %llu", u.bi_inum);
377
378                 bch2_trans_unlock(trans);
379                 bch2_fs_lazy_rw(c);
380
381                 ret = bch2_inode_rm(c, u.bi_inum, false);
382                 if (ret)
383                         bch_err(c, "error in fsck: error %i while deleting inode", ret);
384                 return ret;
385         }
386
387         if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
388             (!c->sb.clean ||
389              fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
390                       u.bi_inum))) {
391                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
392
393                 bch2_trans_unlock(trans);
394                 bch2_fs_lazy_rw(c);
395
396                 /*
397                  * XXX: need to truncate partial blocks too here - or ideally
398                  * just switch units to bytes and that issue goes away
399                  */
400                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
401                                 POS(u.bi_inum, round_up(u.bi_size, block_bytes(c))),
402                                 POS(u.bi_inum, U64_MAX),
403                                 NULL);
404                 if (ret) {
405                         bch_err(c, "error in fsck: error %i truncating inode", ret);
406                         return ret;
407                 }
408
409                 /*
410                  * We truncated without our normal sector accounting hook, just
411                  * make sure we recalculate it:
412                  */
413                 u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
414
415                 u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
416                 do_update = true;
417         }
418
419         if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
420             (!c->sb.clean ||
421              fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
422                       u.bi_inum))) {
423                 s64 sectors;
424
425                 bch_verbose(c, "recounting sectors for inode %llu",
426                             u.bi_inum);
427
428                 sectors = bch2_count_inode_sectors(trans, u.bi_inum);
429                 if (sectors < 0) {
430                         bch_err(c, "error in fsck: error %i recounting inode sectors",
431                                 (int) sectors);
432                         return sectors;
433                 }
434
435                 u.bi_sectors = sectors;
436                 u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
437                 do_update = true;
438         }
439
440         if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
441                 u.bi_dir = 0;
442                 u.bi_dir_offset = 0;
443                 u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
444                 do_update = true;
445         }
446
447         if (do_update) {
448                 ret = __bch2_trans_do(trans, NULL, NULL,
449                                       BTREE_INSERT_NOFAIL|
450                                       BTREE_INSERT_LAZY_RW,
451                                 bch2_inode_write(trans, iter, &u));
452                 if (ret)
453                         bch_err(c, "error in fsck: error %i "
454                                 "updating inode", ret);
455         }
456 fsck_err:
457         return ret;
458 }
459
460 noinline_for_stack
461 static int check_inodes(struct bch_fs *c, bool full)
462 {
463         struct btree_trans trans;
464         struct btree_iter *iter;
465         struct bkey_s_c k;
466         struct bkey_s_c_inode inode;
467         int ret;
468
469         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
470
471         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
472                 if (k.k->type != KEY_TYPE_inode)
473                         continue;
474
475                 inode = bkey_s_c_to_inode(k);
476
477                 if (full ||
478                     (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
479                                           BCH_INODE_I_SECTORS_DIRTY|
480                                           BCH_INODE_UNLINKED))) {
481                         ret = check_inode(&trans, iter, inode);
482                         if (ret)
483                                 break;
484                 }
485         }
486         bch2_trans_iter_put(&trans, iter);
487
488         BUG_ON(ret == -EINTR);
489
490         return bch2_trans_exit(&trans) ?: ret;
491 }
492
493 static int fix_overlapping_extent(struct btree_trans *trans,
494                                        struct bkey_s_c k, struct bpos cut_at)
495 {
496         struct btree_iter *iter;
497         struct bkey_i *u;
498         int ret;
499
500         u = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
501         ret = PTR_ERR_OR_ZERO(u);
502         if (ret)
503                 return ret;
504
505         bkey_reassemble(u, k);
506         bch2_cut_front(cut_at, u);
507
508
509         /*
510          * We don't want to go through the extent_handle_overwrites path:
511          *
512          * XXX: this is going to screw up disk accounting, extent triggers
513          * assume things about extent overwrites - we should be running the
514          * triggers manually here
515          */
516         iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p,
517                                    BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
518
519         BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
520         bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN);
521         bch2_trans_iter_put(trans, iter);
522
523         return bch2_trans_commit(trans, NULL, NULL,
524                                  BTREE_INSERT_NOFAIL|
525                                  BTREE_INSERT_LAZY_RW);
526 }
527
528 static int inode_backpointer_exists(struct btree_trans *trans,
529                                     struct bch_inode_unpacked *inode)
530 {
531         struct btree_iter *iter;
532         struct bkey_s_c k;
533         int ret;
534
535         iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
536                                    POS(inode->bi_dir, inode->bi_dir_offset), 0);
537         k = bch2_btree_iter_peek_slot(iter);
538         ret = bkey_err(k);
539         if (ret)
540                 goto out;
541         if (k.k->type != KEY_TYPE_dirent)
542                 goto out;
543
544         ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
545 out:
546         bch2_trans_iter_free(trans, iter);
547         return ret;
548 }
549
550 static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
551                                       struct bch_inode_unpacked *inode)
552 {
553         return d.k->p.inode == inode->bi_dir &&
554                 d.k->p.offset == inode->bi_dir_offset;
555 }
556
557 /*
558  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
559  * that i_size an i_sectors are consistent
560  */
561 noinline_for_stack
562 static int check_extents(struct bch_fs *c)
563 {
564         struct inode_walker w = inode_walker_init();
565         struct btree_trans trans;
566         struct btree_iter *iter;
567         struct bkey_s_c k;
568         struct bkey_buf prev;
569         u64 i_sectors = 0;
570         int ret = 0;
571
572         bch2_bkey_buf_init(&prev);
573         prev.k->k = KEY(0, 0, 0);
574         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
575
576         bch_verbose(c, "checking extents");
577
578         iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
579                                    POS(BCACHEFS_ROOT_INO, 0),
580                                    BTREE_ITER_INTENT);
581 retry:
582         while ((k = bch2_btree_iter_peek(iter)).k &&
583                !(ret = bkey_err(k))) {
584                 if (w.have_inode &&
585                     w.cur_inum != k.k->p.inode &&
586                     !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
587                     fsck_err_on(w.inode.bi_sectors != i_sectors, c,
588                                 "inode %llu has incorrect i_sectors: got %llu, should be %llu",
589                                 w.inode.bi_inum,
590                                 w.inode.bi_sectors, i_sectors)) {
591                         w.inode.bi_sectors = i_sectors;
592
593                         ret = write_inode(&trans, &w.inode, w.snapshot);
594                         if (ret)
595                                 break;
596                 }
597
598                 if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
599                         char buf1[200];
600                         char buf2[200];
601
602                         bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
603                         bch2_bkey_val_to_text(&PBUF(buf2), c, k);
604
605                         if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
606                                 return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
607                 }
608
609                 ret = walk_inode(&trans, &w, k.k->p.inode);
610                 if (ret)
611                         break;
612
613                 if (w.first_this_inode)
614                         i_sectors = 0;
615
616                 if (fsck_err_on(!w.have_inode, c,
617                                 "extent type %u for missing inode %llu",
618                                 k.k->type, k.k->p.inode) ||
619                     fsck_err_on(w.have_inode &&
620                                 !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
621                                 "extent type %u for non regular file, inode %llu mode %o",
622                                 k.k->type, k.k->p.inode, w.inode.bi_mode)) {
623                         bch2_fs_lazy_rw(c);
624                         return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
625                                                        POS(k.k->p.inode, 0),
626                                                        POS(k.k->p.inode, U64_MAX),
627                                                        NULL) ?: -EINTR;
628                 }
629
630                 if (fsck_err_on(w.have_inode &&
631                                 !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
632                                 k.k->type != KEY_TYPE_reservation &&
633                                 k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
634                                 "extent type %u offset %llu past end of inode %llu, i_size %llu",
635                                 k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
636                         bch2_fs_lazy_rw(c);
637                         return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
638                                         POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c))),
639                                         POS(k.k->p.inode, U64_MAX),
640                                         NULL) ?: -EINTR;
641                 }
642
643                 if (bkey_extent_is_allocation(k.k))
644                         i_sectors += k.k->size;
645                 bch2_bkey_buf_reassemble(&prev, c, k);
646
647                 bch2_btree_iter_advance(iter);
648         }
649 fsck_err:
650         if (ret == -EINTR)
651                 goto retry;
652         bch2_trans_iter_put(&trans, iter);
653         bch2_bkey_buf_exit(&prev, c);
654         return bch2_trans_exit(&trans) ?: ret;
655 }
656
657 /*
658  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
659  * validate d_type
660  */
661 noinline_for_stack
662 static int check_dirents(struct bch_fs *c)
663 {
664         struct inode_walker w = inode_walker_init();
665         struct bch_hash_info hash_info;
666         struct btree_trans trans;
667         struct btree_iter *iter;
668         struct bkey_s_c k;
669         char buf[200];
670         unsigned nr_subdirs = 0;
671         int ret = 0;
672
673         bch_verbose(c, "checking dirents");
674
675         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
676
677         iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
678                                    POS(BCACHEFS_ROOT_INO, 0), 0);
679 retry:
680         while ((k = bch2_btree_iter_peek(iter)).k &&
681                !(ret = bkey_err(k))) {
682                 struct bkey_s_c_dirent d;
683                 struct bch_inode_unpacked target;
684                 u32 target_snapshot;
685                 bool have_target;
686                 bool backpointer_exists = true;
687                 u64 d_inum;
688
689                 if (w.have_inode &&
690                     w.cur_inum != k.k->p.inode &&
691                     fsck_err_on(w.inode.bi_nlink != nr_subdirs, c,
692                                 "directory %llu with wrong i_nlink: got %u, should be %u",
693                                 w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) {
694                         w.inode.bi_nlink = nr_subdirs;
695                         ret = write_inode(&trans, &w.inode, w.snapshot);
696                         if (ret)
697                                 break;
698                 }
699
700                 ret = walk_inode(&trans, &w, k.k->p.inode);
701                 if (ret)
702                         break;
703
704                 if (w.first_this_inode)
705                         nr_subdirs = 0;
706
707                 if (fsck_err_on(!w.have_inode, c,
708                                 "dirent in nonexisting directory:\n%s",
709                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
710                                                        k), buf)) ||
711                     fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
712                                 "dirent in non directory inode type %u:\n%s",
713                                 mode_to_type(w.inode.bi_mode),
714                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
715                                                        k), buf))) {
716                         ret = lockrestart_do(&trans,
717                                         bch2_btree_delete_at(&trans, iter, 0));
718                         if (ret)
719                                 goto err;
720                         goto next;
721                 }
722
723                 if (!w.have_inode)
724                         goto next;
725
726                 if (w.first_this_inode)
727                         hash_info = bch2_hash_info_init(c, &w.inode);
728
729                 ret = hash_check_key(&trans, bch2_dirent_hash_desc,
730                                      &hash_info, iter, k);
731                 if (ret > 0) {
732                         ret = 0;
733                         goto next;
734                 }
735                 if (ret)
736                         goto fsck_err;
737
738                 if (k.k->type != KEY_TYPE_dirent)
739                         goto next;
740
741                 d = bkey_s_c_to_dirent(k);
742                 d_inum = le64_to_cpu(d.v->d_inum);
743
744                 ret = lookup_inode(&trans, d_inum, &target, &target_snapshot);
745                 if (ret && ret != -ENOENT)
746                         break;
747
748                 have_target = !ret;
749                 ret = 0;
750
751                 if (fsck_err_on(!have_target, c,
752                                 "dirent points to missing inode:\n%s",
753                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
754                                                        k), buf))) {
755                         ret = remove_dirent(&trans, d);
756                         if (ret)
757                                 goto err;
758                         goto next;
759                 }
760
761                 if (!have_target)
762                         goto next;
763
764                 if (!target.bi_dir &&
765                     !target.bi_dir_offset) {
766                         target.bi_dir           = k.k->p.inode;
767                         target.bi_dir_offset    = k.k->p.offset;
768
769                         ret = write_inode(&trans, &target, target_snapshot);
770                         if (ret)
771                                 goto err;
772                 }
773
774                 if (!inode_backpointer_matches(d, &target)) {
775                         ret = inode_backpointer_exists(&trans, &target);
776                         if (ret < 0)
777                                 goto err;
778
779                         backpointer_exists = ret;
780                         ret = 0;
781
782                         if (fsck_err_on(S_ISDIR(target.bi_mode) &&
783                                         backpointer_exists, c,
784                                         "directory %llu with multiple links",
785                                         target.bi_inum)) {
786                                 ret = remove_dirent(&trans, d);
787                                 if (ret)
788                                         goto err;
789                                 continue;
790                         }
791
792                         if (fsck_err_on(backpointer_exists &&
793                                         !target.bi_nlink, c,
794                                         "inode %llu has multiple links but i_nlink 0",
795                                         d_inum)) {
796                                 target.bi_nlink++;
797                                 target.bi_flags &= ~BCH_INODE_UNLINKED;
798
799                                 ret = write_inode(&trans, &target, target_snapshot);
800                                 if (ret)
801                                         goto err;
802                         }
803
804                         if (fsck_err_on(!backpointer_exists, c,
805                                         "inode %llu has wrong backpointer:\n"
806                                         "got       %llu:%llu\n"
807                                         "should be %llu:%llu",
808                                         d_inum,
809                                         target.bi_dir,
810                                         target.bi_dir_offset,
811                                         k.k->p.inode,
812                                         k.k->p.offset)) {
813                                 target.bi_dir           = k.k->p.inode;
814                                 target.bi_dir_offset    = k.k->p.offset;
815
816                                 ret = write_inode(&trans, &target, target_snapshot);
817                                 if (ret)
818                                         goto err;
819                         }
820                 }
821
822                 if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
823                                 "incorrect d_type: should be %u:\n%s",
824                                 mode_to_type(target.bi_mode),
825                                 (bch2_bkey_val_to_text(&PBUF(buf), c,
826                                                        k), buf))) {
827                         struct bkey_i_dirent *n;
828
829                         n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
830                         if (!n) {
831                                 ret = -ENOMEM;
832                                 goto err;
833                         }
834
835                         bkey_reassemble(&n->k_i, d.s_c);
836                         n->v.d_type = mode_to_type(target.bi_mode);
837
838                         ret = __bch2_trans_do(&trans, NULL, NULL,
839                                               BTREE_INSERT_NOFAIL|
840                                               BTREE_INSERT_LAZY_RW,
841                                 (bch2_trans_update(&trans, iter, &n->k_i, 0), 0));
842                         kfree(n);
843                         if (ret)
844                                 goto err;
845
846                 }
847
848                 nr_subdirs += d.v->d_type == DT_DIR;
849 next:
850                 bch2_btree_iter_advance(iter);
851         }
852 err:
853 fsck_err:
854         if (ret == -EINTR)
855                 goto retry;
856
857         bch2_trans_iter_put(&trans, iter);
858         return bch2_trans_exit(&trans) ?: ret;
859 }
860
861 /*
862  * Walk xattrs: verify that they all have a corresponding inode
863  */
864 noinline_for_stack
865 static int check_xattrs(struct bch_fs *c)
866 {
867         struct inode_walker w = inode_walker_init();
868         struct bch_hash_info hash_info;
869         struct btree_trans trans;
870         struct btree_iter *iter;
871         struct bkey_s_c k;
872         int ret = 0;
873
874         bch_verbose(c, "checking xattrs");
875
876         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
877
878         iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
879                                    POS(BCACHEFS_ROOT_INO, 0), 0);
880 retry:
881         while ((k = bch2_btree_iter_peek(iter)).k &&
882                !(ret = bkey_err(k))) {
883                 ret = walk_inode(&trans, &w, k.k->p.inode);
884                 if (ret)
885                         break;
886
887                 if (fsck_err_on(!w.have_inode, c,
888                                 "xattr for missing inode %llu",
889                                 k.k->p.inode)) {
890                         ret = bch2_btree_delete_at(&trans, iter, 0);
891                         if (ret)
892                                 break;
893                         continue;
894                 }
895
896                 if (w.first_this_inode && w.have_inode)
897                         hash_info = bch2_hash_info_init(c, &w.inode);
898
899                 ret = hash_check_key(&trans, bch2_xattr_hash_desc,
900                                      &hash_info, iter, k);
901                 if (ret)
902                         break;
903
904                 bch2_btree_iter_advance(iter);
905         }
906 fsck_err:
907         if (ret == -EINTR)
908                 goto retry;
909
910         bch2_trans_iter_put(&trans, iter);
911         return bch2_trans_exit(&trans) ?: ret;
912 }
913
914 /* Get root directory, create if it doesn't exist: */
915 static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
916 {
917         struct bkey_inode_buf packed;
918         u32 snapshot;
919         int ret;
920
921         bch_verbose(c, "checking root directory");
922
923         ret = bch2_trans_do(c, NULL, NULL, 0,
924                 lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot));
925         if (ret && ret != -ENOENT)
926                 return ret;
927
928         if (fsck_err_on(ret, c, "root directory missing"))
929                 goto create_root;
930
931         if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
932                         "root inode not a directory"))
933                 goto create_root;
934
935         return 0;
936 fsck_err:
937         return ret;
938 create_root:
939         bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
940                         0, NULL);
941         root_inode->bi_inum = BCACHEFS_ROOT_INO;
942
943         bch2_inode_pack(c, &packed, root_inode);
944
945         return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
946                                  NULL, NULL,
947                                  BTREE_INSERT_NOFAIL|
948                                  BTREE_INSERT_LAZY_RW);
949 }
950
951 /* Get lost+found, create if it doesn't exist: */
952 static int check_lostfound(struct bch_fs *c,
953                            struct bch_inode_unpacked *root_inode,
954                            struct bch_inode_unpacked *lostfound_inode)
955 {
956         struct qstr lostfound = QSTR("lost+found");
957         struct bch_hash_info root_hash_info =
958                 bch2_hash_info_init(c, root_inode);
959         u64 inum;
960         u32 snapshot;
961         int ret;
962
963         bch_verbose(c, "checking lost+found");
964
965         inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
966                                  &lostfound);
967         if (!inum) {
968                 bch_notice(c, "creating lost+found");
969                 goto create_lostfound;
970         }
971
972         ret = bch2_trans_do(c, NULL, NULL, 0,
973                 lookup_inode(&trans, inum, lostfound_inode, &snapshot));
974         if (ret && ret != -ENOENT)
975                 return ret;
976
977         if (fsck_err_on(ret, c, "lost+found missing"))
978                 goto create_lostfound;
979
980         if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c,
981                         "lost+found inode not a directory"))
982                 goto create_lostfound;
983
984         return 0;
985 fsck_err:
986         return ret;
987 create_lostfound:
988         bch2_inode_init_early(c, lostfound_inode);
989
990         ret = bch2_trans_do(c, NULL, NULL,
991                             BTREE_INSERT_NOFAIL|
992                             BTREE_INSERT_LAZY_RW,
993                 bch2_create_trans(&trans,
994                                   BCACHEFS_ROOT_INO, root_inode,
995                                   lostfound_inode, &lostfound,
996                                   0, 0, S_IFDIR|0700, 0, NULL, NULL));
997         if (ret)
998                 bch_err(c, "error creating lost+found: %i", ret);
999
1000         return ret;
1001 }
1002
1003 struct pathbuf {
1004         size_t          nr;
1005         size_t          size;
1006
1007         struct pathbuf_entry {
1008                 u64     inum;
1009         }               *entries;
1010 };
1011
1012 static int path_down(struct pathbuf *p, u64 inum)
1013 {
1014         if (p->nr == p->size) {
1015                 size_t new_size = max_t(size_t, 256UL, p->size * 2);
1016                 void *n = krealloc(p->entries,
1017                                    new_size * sizeof(p->entries[0]),
1018                                    GFP_KERNEL);
1019                 if (!n) {
1020                         return -ENOMEM;
1021                 }
1022
1023                 p->entries = n;
1024                 p->size = new_size;
1025         };
1026
1027         p->entries[p->nr++] = (struct pathbuf_entry) {
1028                 .inum = inum,
1029         };
1030         return 0;
1031 }
1032
1033 static int check_path(struct btree_trans *trans,
1034                       struct bch_inode_unpacked *lostfound,
1035                       struct pathbuf *p,
1036                       struct bch_inode_unpacked *inode)
1037 {
1038         struct bch_fs *c = trans->c;
1039         u32 snapshot;
1040         size_t i;
1041         int ret = 0;
1042
1043         p->nr = 0;
1044
1045         while (inode->bi_inum != BCACHEFS_ROOT_INO) {
1046                 ret = lockrestart_do(trans,
1047                         inode_backpointer_exists(trans, inode));
1048                 if (ret < 0)
1049                         break;
1050
1051                 if (!ret) {
1052                         if (fsck_err(c,  "unreachable inode %llu, type %u nlink %u backptr %llu:%llu",
1053                                      inode->bi_inum,
1054                                      mode_to_type(inode->bi_mode),
1055                                      inode->bi_nlink,
1056                                      inode->bi_dir,
1057                                      inode->bi_dir_offset))
1058                                 ret = reattach_inode(trans, lostfound, inode->bi_inum);
1059                         break;
1060                 }
1061                 ret = 0;
1062
1063                 if (!S_ISDIR(inode->bi_mode))
1064                         break;
1065
1066                 ret = path_down(p, inode->bi_inum);
1067                 if (ret) {
1068                         bch_err(c, "memory allocation failure");
1069                         return ret;
1070                 }
1071
1072                 for (i = 0; i < p->nr; i++) {
1073                         if (inode->bi_dir != p->entries[i].inum)
1074                                 continue;
1075
1076                         /* XXX print path */
1077                         if (!fsck_err(c, "directory structure loop"))
1078                                 return 0;
1079
1080                         ret = lockrestart_do(trans,
1081                                          remove_backpointer(trans, inode));
1082                         if (ret) {
1083                                 bch_err(c, "error removing dirent: %i", ret);
1084                                 break;
1085                         }
1086
1087                         ret = reattach_inode(trans, lostfound, inode->bi_inum);
1088                         break;
1089                 }
1090
1091                 ret = lockrestart_do(trans,
1092                                 lookup_inode(trans, inode->bi_dir, inode, &snapshot));
1093                 if (ret) {
1094                         /* Should have been caught in dirents pass */
1095                         bch_err(c, "error looking up parent directory: %i", ret);
1096                         break;
1097                 }
1098         }
1099 fsck_err:
1100         if (ret)
1101                 bch_err(c, "%s: err %i", __func__, ret);
1102         return ret;
1103 }
1104
1105 /*
1106  * Check for unreachable inodes, as well as loops in the directory structure:
1107  * After check_dirents(), if an inode backpointer doesn't exist that means it's
1108  * unreachable:
1109  */
1110 static int check_directory_structure(struct bch_fs *c,
1111                                      struct bch_inode_unpacked *lostfound)
1112 {
1113         struct btree_trans trans;
1114         struct btree_iter *iter;
1115         struct bkey_s_c k;
1116         struct bch_inode_unpacked u;
1117         struct pathbuf path = { 0, 0, NULL };
1118         int ret;
1119
1120         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1121
1122         for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
1123                 if (k.k->type != KEY_TYPE_inode)
1124                         continue;
1125
1126                 ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
1127                 if (ret) {
1128                         /* Should have been caught earlier in fsck: */
1129                         bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
1130                         break;
1131                 }
1132
1133                 ret = check_path(&trans, lostfound, &path, &u);
1134                 if (ret)
1135                         break;
1136         }
1137         bch2_trans_iter_put(&trans, iter);
1138
1139         BUG_ON(ret == -EINTR);
1140
1141         return bch2_trans_exit(&trans) ?: ret;
1142 }
1143
1144 struct nlink {
1145         u32     count;
1146 };
1147
1148 typedef GENRADIX(struct nlink) nlink_table;
1149
1150 static void inc_link(struct bch_fs *c, nlink_table *links,
1151                      u64 range_start, u64 *range_end, u64 inum)
1152 {
1153         struct nlink *link;
1154
1155         if (inum < range_start || inum >= *range_end)
1156                 return;
1157
1158         if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) {
1159                 *range_end = inum;
1160                 return;
1161         }
1162
1163         link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
1164         if (!link) {
1165                 bch_verbose(c, "allocation failed during fsck - will need another pass");
1166                 *range_end = inum;
1167                 return;
1168         }
1169
1170         link->count++;
1171 }
1172
1173 noinline_for_stack
1174 static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
1175                                u64 range_start, u64 *range_end)
1176 {
1177         struct btree_trans trans;
1178         struct btree_iter *iter;
1179         struct bkey_s_c k;
1180         struct bkey_s_c_dirent d;
1181         int ret;
1182
1183         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1184
1185         for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) {
1186                 switch (k.k->type) {
1187                 case KEY_TYPE_dirent:
1188                         d = bkey_s_c_to_dirent(k);
1189
1190                         if (d.v->d_type != DT_DIR)
1191                                 inc_link(c, links, range_start, range_end,
1192                                          le64_to_cpu(d.v->d_inum));
1193                         break;
1194                 }
1195
1196                 bch2_trans_cond_resched(&trans);
1197         }
1198         bch2_trans_iter_put(&trans, iter);
1199
1200         ret = bch2_trans_exit(&trans) ?: ret;
1201         if (ret)
1202                 bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
1203
1204         return ret;
1205 }
1206
1207 static int check_inode_nlink(struct btree_trans *trans,
1208                              struct bch_inode_unpacked *lostfound_inode,
1209                              struct btree_iter *iter,
1210                              struct bkey_s_c_inode inode,
1211                              unsigned nlink)
1212 {
1213         struct bch_fs *c = trans->c;
1214         struct bch_inode_unpacked u;
1215         int ret = 0;
1216
1217         /*
1218          * Backpointer and directory structure checks are sufficient for
1219          * directories, since they can't have hardlinks:
1220          */
1221         if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
1222                 return 0;
1223
1224         if (!nlink) {
1225                 bch_err(c, "no links found to inode %llu", inode.k->p.offset);
1226                 return -EINVAL;
1227         }
1228
1229         ret = bch2_inode_unpack(inode, &u);
1230
1231         /* Should never happen, checked by bch2_inode_invalid: */
1232         if (bch2_fs_inconsistent_on(ret, c,
1233                          "error unpacking inode %llu in fsck",
1234                          inode.k->p.inode))
1235                 return ret;
1236
1237         if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c,
1238                         "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
1239                         u.bi_inum, mode_to_type(u.bi_mode),
1240                         bch2_inode_nlink_get(&u), nlink)) {
1241                 bch2_inode_nlink_set(&u, nlink);
1242
1243                 ret = __bch2_trans_do(trans, NULL, NULL,
1244                                       BTREE_INSERT_NOFAIL|
1245                                       BTREE_INSERT_LAZY_RW,
1246                                 bch2_inode_write(trans, iter, &u));
1247                 if (ret)
1248                         bch_err(c, "error in fsck: error %i updating inode", ret);
1249         }
1250 fsck_err:
1251         return ret;
1252 }
1253
1254 noinline_for_stack
1255 static int bch2_gc_walk_inodes(struct bch_fs *c,
1256                                struct bch_inode_unpacked *lostfound_inode,
1257                                nlink_table *links,
1258                                u64 range_start, u64 range_end)
1259 {
1260         struct btree_trans trans;
1261         struct btree_iter *iter;
1262         struct bkey_s_c k;
1263         struct nlink *link;
1264         int ret = 0;
1265
1266         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
1267
1268         for_each_btree_key(&trans, iter, BTREE_ID_inodes,
1269                            POS(0, range_start), 0, k, ret) {
1270                 if (!k.k || k.k->p.offset >= range_end)
1271                         break;
1272
1273                 if (k.k->type != KEY_TYPE_inode)
1274                         continue;
1275
1276                 link = genradix_ptr(links, k.k->p.offset - range_start);
1277                 ret = check_inode_nlink(&trans, lostfound_inode, iter,
1278                                         bkey_s_c_to_inode(k), link ? link->count : 0);
1279                 if (ret)
1280                         break;
1281
1282         }
1283         bch2_trans_iter_put(&trans, iter);
1284         bch2_trans_exit(&trans);
1285
1286         if (ret)
1287                 bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
1288
1289         return ret;
1290 }
1291
1292 noinline_for_stack
1293 static int check_nlinks(struct bch_fs *c,
1294                               struct bch_inode_unpacked *lostfound_inode)
1295 {
1296         nlink_table links;
1297         u64 this_iter_range_start, next_iter_range_start = 0;
1298         int ret = 0;
1299
1300         bch_verbose(c, "checking inode nlinks");
1301
1302         genradix_init(&links);
1303
1304         do {
1305                 this_iter_range_start = next_iter_range_start;
1306                 next_iter_range_start = U64_MAX;
1307
1308                 ret = bch2_gc_walk_dirents(c, &links,
1309                                           this_iter_range_start,
1310                                           &next_iter_range_start);
1311                 if (ret)
1312                         break;
1313
1314                 ret = bch2_gc_walk_inodes(c, lostfound_inode, &links,
1315                                          this_iter_range_start,
1316                                          next_iter_range_start);
1317                 if (ret)
1318                         break;
1319
1320                 genradix_free(&links);
1321         } while (next_iter_range_start != U64_MAX);
1322
1323         genradix_free(&links);
1324
1325         return ret;
1326 }
1327
1328 /*
1329  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
1330  * Doesn't fix them yet, mainly because they haven't yet been observed:
1331  */
1332 int bch2_fsck_full(struct bch_fs *c)
1333 {
1334         struct bch_inode_unpacked root_inode, lostfound_inode;
1335
1336         return  check_inodes(c, true) ?:
1337                 check_extents(c) ?:
1338                 check_dirents(c) ?:
1339                 check_xattrs(c) ?:
1340                 check_root(c, &root_inode) ?:
1341                 check_lostfound(c, &root_inode, &lostfound_inode) ?:
1342                 check_directory_structure(c, &lostfound_inode) ?:
1343                 check_nlinks(c, &lostfound_inode);
1344 }
1345
1346 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
1347 {
1348         return check_inodes(c, false);
1349 }