]> git.sesse.net Git - bcachefs-tools-debian/blob - libbcachefs/fsck.c
3f74b6769a38381468c8e7eaaba1c646b54069aa
[bcachefs-tools-debian] / libbcachefs / fsck.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "dirent.h"
9 #include "error.h"
10 #include "fs-common.h"
11 #include "fsck.h"
12 #include "inode.h"
13 #include "keylist.h"
14 #include "recovery.h"
15 #include "snapshot.h"
16 #include "super.h"
17 #include "xattr.h"
18
19 #include <linux/bsearch.h>
20 #include <linux/darray.h>
21 #include <linux/dcache.h> /* struct qstr */
22
23 /*
24  * XXX: this is handling transaction restarts without returning
25  * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
26  */
27 static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
28                                     u32 snapshot)
29 {
30         u64 sectors = 0;
31
32         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
33                                 SPOS(inum, 0, snapshot),
34                                 POS(inum, U64_MAX),
35                                 0, k, ({
36                 if (bkey_extent_is_allocation(k.k))
37                         sectors += k.k->size;
38                 0;
39         }));
40
41         return ret ?: sectors;
42 }
43
44 static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
45                                     u32 snapshot)
46 {
47         u64 subdirs = 0;
48
49         int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
50                                     SPOS(inum, 0, snapshot),
51                                     POS(inum, U64_MAX),
52                                     0, k, ({
53                 if (k.k->type == KEY_TYPE_dirent &&
54                     bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
55                         subdirs++;
56                 0;
57         }));
58
59         return ret ?: subdirs;
60 }
61
62 static int subvol_lookup(struct btree_trans *trans, u32 subvol,
63                          u32 *snapshot, u64 *inum)
64 {
65         struct bch_subvolume s;
66         int ret;
67
68         ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
69
70         *snapshot = le32_to_cpu(s.snapshot);
71         *inum = le64_to_cpu(s.inode);
72         return ret;
73 }
74
75 static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
76                               struct bch_inode_unpacked *inode)
77 {
78         struct btree_iter iter;
79         struct bkey_s_c k;
80         int ret;
81
82         bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
83                              POS(0, inode_nr),
84                              BTREE_ITER_ALL_SNAPSHOTS);
85         k = bch2_btree_iter_peek(&iter);
86         ret = bkey_err(k);
87         if (ret)
88                 goto err;
89
90         if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
91                 ret = -BCH_ERR_ENOENT_inode;
92                 goto err;
93         }
94
95         ret = bch2_inode_unpack(k, inode);
96 err:
97         bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
98         bch2_trans_iter_exit(trans, &iter);
99         return ret;
100 }
101
102 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
103                         struct bch_inode_unpacked *inode,
104                         u32 *snapshot)
105 {
106         struct btree_iter iter;
107         struct bkey_s_c k;
108         int ret;
109
110         k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
111                                SPOS(0, inode_nr, *snapshot), 0);
112         ret = bkey_err(k);
113         if (ret)
114                 goto err;
115
116         ret = bkey_is_inode(k.k)
117                 ? bch2_inode_unpack(k, inode)
118                 : -BCH_ERR_ENOENT_inode;
119         if (!ret)
120                 *snapshot = iter.pos.snapshot;
121 err:
122         bch2_trans_iter_exit(trans, &iter);
123         return ret;
124 }
125
126 static int lookup_dirent_in_snapshot(struct btree_trans *trans,
127                            struct bch_hash_info hash_info,
128                            subvol_inum dir, struct qstr *name,
129                            u64 *target, unsigned *type, u32 snapshot)
130 {
131         struct btree_iter iter;
132         struct bkey_s_c_dirent d;
133         int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
134                                &hash_info, dir, name, 0, snapshot);
135         if (ret)
136                 return ret;
137
138         d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
139         *target = le64_to_cpu(d.v->d_inum);
140         *type = d.v->d_type;
141         bch2_trans_iter_exit(trans, &iter);
142         return 0;
143 }
144
145 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
146 {
147         struct bch_fs *c = trans->c;
148         struct btree_iter iter;
149         struct bch_inode_unpacked dir_inode;
150         struct bch_hash_info dir_hash_info;
151         int ret;
152
153         ret = lookup_first_inode(trans, pos.inode, &dir_inode);
154         if (ret)
155                 goto err;
156
157         dir_hash_info = bch2_hash_info_init(c, &dir_inode);
158
159         bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
160
161         ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
162                                   &dir_hash_info, &iter,
163                                   BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
164         bch2_trans_iter_exit(trans, &iter);
165 err:
166         bch_err_fn(c, ret);
167         return ret;
168 }
169
170 /* Get lost+found, create if it doesn't exist: */
171 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
172                             struct bch_inode_unpacked *lostfound)
173 {
174         struct bch_fs *c = trans->c;
175         struct qstr lostfound_str = QSTR("lost+found");
176         u64 inum = 0;
177         unsigned d_type = 0;
178         int ret;
179
180         struct bch_snapshot_tree st;
181         ret = bch2_snapshot_tree_lookup(trans,
182                         bch2_snapshot_tree(c, snapshot), &st);
183         if (ret)
184                 return ret;
185
186         subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
187         u32 subvol_snapshot;
188
189         ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
190                             &subvol_snapshot, &root_inum.inum);
191         bch_err_msg(c, ret, "looking up root subvol");
192         if (ret)
193                 return ret;
194
195         struct bch_inode_unpacked root_inode;
196         struct bch_hash_info root_hash_info;
197         u32 root_inode_snapshot = snapshot;
198         ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
199         bch_err_msg(c, ret, "looking up root inode");
200         if (ret)
201                 return ret;
202
203         root_hash_info = bch2_hash_info_init(c, &root_inode);
204
205         ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
206                               &lostfound_str, &inum, &d_type, snapshot);
207         if (bch2_err_matches(ret, ENOENT))
208                 goto create_lostfound;
209
210         bch_err_fn(c, ret);
211         if (ret)
212                 return ret;
213
214         if (d_type != DT_DIR) {
215                 bch_err(c, "error looking up lost+found: not a directory");
216                 return -BCH_ERR_ENOENT_not_directory;
217         }
218
219         /*
220          * The bch2_check_dirents pass has already run, dangling dirents
221          * shouldn't exist here:
222          */
223         ret = lookup_inode(trans, inum, lostfound, &snapshot);
224         bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
225                     inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
226         return ret;
227
228 create_lostfound:
229         /*
230          * XXX: we could have a nicer log message here  if we had a nice way to
231          * walk backpointers to print a path
232          */
233         bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
234
235         u64 now = bch2_current_time(c);
236         struct btree_iter lostfound_iter = { NULL };
237         u64 cpu = raw_smp_processor_id();
238
239         bch2_inode_init_early(c, lostfound);
240         bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
241         lostfound->bi_dir = root_inode.bi_inum;
242
243         root_inode.bi_nlink++;
244
245         ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
246         if (ret)
247                 goto err;
248
249         bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
250         ret = bch2_btree_iter_traverse(&lostfound_iter);
251         if (ret)
252                 goto err;
253
254         ret =   bch2_dirent_create_snapshot(trans,
255                                 0, root_inode.bi_inum, snapshot, &root_hash_info,
256                                 mode_to_type(lostfound->bi_mode),
257                                 &lostfound_str,
258                                 lostfound->bi_inum,
259                                 &lostfound->bi_dir_offset,
260                                 BCH_HASH_SET_MUST_CREATE) ?:
261                 bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
262                                        BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
263 err:
264         bch_err_msg(c, ret, "creating lost+found");
265         bch2_trans_iter_exit(trans, &lostfound_iter);
266         return ret;
267 }
268
269 static int reattach_inode(struct btree_trans *trans,
270                           struct bch_inode_unpacked *inode,
271                           u32 inode_snapshot)
272 {
273         struct bch_hash_info dir_hash;
274         struct bch_inode_unpacked lostfound;
275         char name_buf[20];
276         struct qstr name;
277         u64 dir_offset = 0;
278         u32 dirent_snapshot = inode_snapshot;
279         int ret;
280
281         if (inode->bi_subvol) {
282                 inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
283
284                 u64 root_inum;
285                 ret = subvol_lookup(trans, inode->bi_parent_subvol,
286                                     &dirent_snapshot, &root_inum);
287                 if (ret)
288                         return ret;
289
290                 snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
291         } else {
292                 snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
293         }
294
295         ret = lookup_lostfound(trans, dirent_snapshot, &lostfound);
296         if (ret)
297                 return ret;
298
299         if (S_ISDIR(inode->bi_mode)) {
300                 lostfound.bi_nlink++;
301
302                 ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
303                 if (ret)
304                         return ret;
305         }
306
307         dir_hash = bch2_hash_info_init(trans->c, &lostfound);
308
309         name = (struct qstr) QSTR(name_buf);
310
311         ret = bch2_dirent_create_snapshot(trans,
312                                 inode->bi_parent_subvol, lostfound.bi_inum,
313                                 dirent_snapshot,
314                                 &dir_hash,
315                                 inode_d_type(inode),
316                                 &name,
317                                 inode->bi_subvol ?: inode->bi_inum,
318                                 &dir_offset,
319                                 BCH_HASH_SET_MUST_CREATE);
320         if (ret)
321                 return ret;
322
323         inode->bi_dir           = lostfound.bi_inum;
324         inode->bi_dir_offset    = dir_offset;
325
326         return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
327 }
328
329 static int remove_backpointer(struct btree_trans *trans,
330                               struct bch_inode_unpacked *inode)
331 {
332         struct btree_iter iter;
333         struct bkey_s_c_dirent d;
334         int ret;
335
336         d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
337                                      POS(inode->bi_dir, inode->bi_dir_offset), 0,
338                                      dirent);
339         ret =   bkey_err(d) ?:
340                 __remove_dirent(trans, d.k->p);
341         bch2_trans_iter_exit(trans, &iter);
342         return ret;
343 }
344
345 struct snapshots_seen_entry {
346         u32                             id;
347         u32                             equiv;
348 };
349
350 struct snapshots_seen {
351         struct bpos                     pos;
352         DARRAY(struct snapshots_seen_entry) ids;
353 };
354
355 static inline void snapshots_seen_exit(struct snapshots_seen *s)
356 {
357         darray_exit(&s->ids);
358 }
359
360 static inline void snapshots_seen_init(struct snapshots_seen *s)
361 {
362         memset(s, 0, sizeof(*s));
363 }
364
365 static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
366 {
367         struct snapshots_seen_entry *i, n = {
368                 .id     = id,
369                 .equiv  = bch2_snapshot_equiv(c, id),
370         };
371         int ret = 0;
372
373         __darray_for_each(s->ids, i) {
374                 if (i->id == id)
375                         return 0;
376                 if (i->id > id)
377                         break;
378         }
379
380         ret = darray_insert_item(&s->ids, i - s->ids.data, n);
381         if (ret)
382                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
383                         s->ids.size);
384         return ret;
385 }
386
387 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
388                                  enum btree_id btree_id, struct bpos pos)
389 {
390         struct snapshots_seen_entry n = {
391                 .id     = pos.snapshot,
392                 .equiv  = bch2_snapshot_equiv(c, pos.snapshot),
393         };
394         int ret = 0;
395
396         if (!bkey_eq(s->pos, pos))
397                 s->ids.nr = 0;
398
399         s->pos = pos;
400         s->pos.snapshot = n.equiv;
401
402         darray_for_each(s->ids, i) {
403                 if (i->id == n.id)
404                         return 0;
405
406                 /*
407                  * We currently don't rigorously track for snapshot cleanup
408                  * needing to be run, so it shouldn't be a fsck error yet:
409                  */
410                 if (i->equiv == n.equiv) {
411                         bch_err(c, "snapshot deletion did not finish:\n"
412                                 "  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
413                                 bch2_btree_id_str(btree_id),
414                                 pos.inode, pos.offset,
415                                 i->id, n.id, n.equiv);
416                         set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
417                         return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
418                 }
419         }
420
421         ret = darray_push(&s->ids, n);
422         if (ret)
423                 bch_err(c, "error reallocating snapshots_seen table (size %zu)",
424                         s->ids.size);
425         return ret;
426 }
427
428 /**
429  * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor,
430  * and @ancestor hasn't been overwritten in @seen
431  *
432  * @c:          filesystem handle
433  * @seen:       list of snapshot ids already seen at current position
434  * @id:         descendent snapshot id
435  * @ancestor:   ancestor snapshot id
436  *
437  * Returns:     whether key in @ancestor snapshot is visible in @id snapshot
438  */
439 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
440                                     u32 id, u32 ancestor)
441 {
442         ssize_t i;
443
444         EBUG_ON(id > ancestor);
445         EBUG_ON(!bch2_snapshot_is_equiv(c, id));
446         EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor));
447
448         /* @ancestor should be the snapshot most recently added to @seen */
449         EBUG_ON(ancestor != seen->pos.snapshot);
450         EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv);
451
452         if (id == ancestor)
453                 return true;
454
455         if (!bch2_snapshot_is_ancestor(c, id, ancestor))
456                 return false;
457
458         /*
459          * We know that @id is a descendant of @ancestor, we're checking if
460          * we've seen a key that overwrote @ancestor - i.e. also a descendent of
461          * @ascestor and with @id as a descendent.
462          *
463          * But we already know that we're scanning IDs between @id and @ancestor
464          * numerically, since snapshot ID lists are kept sorted, so if we find
465          * an id that's an ancestor of @id we're done:
466          */
467
468         for (i = seen->ids.nr - 2;
469              i >= 0 && seen->ids.data[i].equiv >= id;
470              --i)
471                 if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv))
472                         return false;
473
474         return true;
475 }
476
477 /**
478  * ref_visible - given a key with snapshot id @src that points to a key with
479  * snapshot id @dst, test whether there is some snapshot in which @dst is
480  * visible.
481  *
482  * @c:          filesystem handle
483  * @s:          list of snapshot IDs already seen at @src
484  * @src:        snapshot ID of src key
485  * @dst:        snapshot ID of dst key
486  * Returns:     true if there is some snapshot in which @dst is visible
487  *
488  * Assumes we're visiting @src keys in natural key order
489  */
490 static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
491                         u32 src, u32 dst)
492 {
493         return dst <= src
494                 ? key_visible_in_snapshot(c, s, dst, src)
495                 : bch2_snapshot_is_ancestor(c, src, dst);
496 }
497
498 static int ref_visible2(struct bch_fs *c,
499                         u32 src, struct snapshots_seen *src_seen,
500                         u32 dst, struct snapshots_seen *dst_seen)
501 {
502         src = bch2_snapshot_equiv(c, src);
503         dst = bch2_snapshot_equiv(c, dst);
504
505         if (dst > src) {
506                 swap(dst, src);
507                 swap(dst_seen, src_seen);
508         }
509         return key_visible_in_snapshot(c, src_seen, dst, src);
510 }
511
512 #define for_each_visible_inode(_c, _s, _w, _snapshot, _i)                               \
513         for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr &&        \
514              (_i)->snapshot <= (_snapshot); _i++)                                       \
515                 if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
516
517 struct inode_walker_entry {
518         struct bch_inode_unpacked inode;
519         u32                     snapshot;
520         bool                    seen_this_pos;
521         u64                     count;
522 };
523
524 struct inode_walker {
525         bool                            first_this_inode;
526         bool                            recalculate_sums;
527         struct bpos                     last_pos;
528
529         DARRAY(struct inode_walker_entry) inodes;
530 };
531
532 static void inode_walker_exit(struct inode_walker *w)
533 {
534         darray_exit(&w->inodes);
535 }
536
537 static struct inode_walker inode_walker_init(void)
538 {
539         return (struct inode_walker) { 0, };
540 }
541
542 static int add_inode(struct bch_fs *c, struct inode_walker *w,
543                      struct bkey_s_c inode)
544 {
545         struct bch_inode_unpacked u;
546
547         BUG_ON(bch2_inode_unpack(inode, &u));
548
549         return darray_push(&w->inodes, ((struct inode_walker_entry) {
550                 .inode          = u,
551                 .snapshot       = bch2_snapshot_equiv(c, inode.k->p.snapshot),
552         }));
553 }
554
555 static int get_inodes_all_snapshots(struct btree_trans *trans,
556                                     struct inode_walker *w, u64 inum)
557 {
558         struct bch_fs *c = trans->c;
559         struct btree_iter iter;
560         struct bkey_s_c k;
561         int ret;
562
563         w->recalculate_sums = false;
564         w->inodes.nr = 0;
565
566         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
567                                      BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
568                 if (k.k->p.offset != inum)
569                         break;
570
571                 if (bkey_is_inode(k.k))
572                         add_inode(c, w, k);
573         }
574         bch2_trans_iter_exit(trans, &iter);
575
576         if (ret)
577                 return ret;
578
579         w->first_this_inode = true;
580         return 0;
581 }
582
583 static struct inode_walker_entry *
584 lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
585 {
586         bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
587         u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
588
589         struct inode_walker_entry *i;
590         __darray_for_each(w->inodes, i)
591                 if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
592                         goto found;
593
594         return NULL;
595 found:
596         BUG_ON(snapshot > i->snapshot);
597
598         if (snapshot != i->snapshot && !is_whiteout) {
599                 struct inode_walker_entry new = *i;
600
601                 new.snapshot = snapshot;
602                 new.count = 0;
603
604                 struct printbuf buf = PRINTBUF;
605                 bch2_bkey_val_to_text(&buf, c, k);
606
607                 bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
608                          "unexpected because we should always update the inode when we update a key in that inode\n"
609                          "%s",
610                          w->last_pos.inode, snapshot, i->snapshot, buf.buf);
611                 printbuf_exit(&buf);
612
613                 while (i > w->inodes.data && i[-1].snapshot > snapshot)
614                         --i;
615
616                 size_t pos = i - w->inodes.data;
617                 int ret = darray_insert_item(&w->inodes, pos, new);
618                 if (ret)
619                         return ERR_PTR(ret);
620
621                 i = w->inodes.data + pos;
622         }
623
624         return i;
625 }
626
627 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
628                                              struct inode_walker *w,
629                                              struct bkey_s_c k)
630 {
631         if (w->last_pos.inode != k.k->p.inode) {
632                 int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
633                 if (ret)
634                         return ERR_PTR(ret);
635         } else if (bkey_cmp(w->last_pos, k.k->p)) {
636                 darray_for_each(w->inodes, i)
637                         i->seen_this_pos = false;
638         }
639
640         w->last_pos = k.k->p;
641
642         return lookup_inode_for_snapshot(trans->c, w, k);
643 }
644
645 static int __get_visible_inodes(struct btree_trans *trans,
646                                 struct inode_walker *w,
647                                 struct snapshots_seen *s,
648                                 u64 inum)
649 {
650         struct bch_fs *c = trans->c;
651         struct btree_iter iter;
652         struct bkey_s_c k;
653         int ret;
654
655         w->inodes.nr = 0;
656
657         for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
658                            BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
659                 u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
660
661                 if (k.k->p.offset != inum)
662                         break;
663
664                 if (!ref_visible(c, s, s->pos.snapshot, equiv))
665                         continue;
666
667                 if (bkey_is_inode(k.k))
668                         add_inode(c, w, k);
669
670                 if (equiv >= s->pos.snapshot)
671                         break;
672         }
673         bch2_trans_iter_exit(trans, &iter);
674
675         return ret;
676 }
677
678 static int check_key_has_snapshot(struct btree_trans *trans,
679                                   struct btree_iter *iter,
680                                   struct bkey_s_c k)
681 {
682         struct bch_fs *c = trans->c;
683         struct printbuf buf = PRINTBUF;
684         int ret = 0;
685
686         if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
687                                 bkey_in_missing_snapshot,
688                                 "key in missing snapshot: %s",
689                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
690                 ret = bch2_btree_delete_at(trans, iter,
691                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
692 fsck_err:
693         printbuf_exit(&buf);
694         return ret;
695 }
696
697 static int hash_redo_key(struct btree_trans *trans,
698                          const struct bch_hash_desc desc,
699                          struct bch_hash_info *hash_info,
700                          struct btree_iter *k_iter, struct bkey_s_c k)
701 {
702         struct bkey_i *delete;
703         struct bkey_i *tmp;
704
705         delete = bch2_trans_kmalloc(trans, sizeof(*delete));
706         if (IS_ERR(delete))
707                 return PTR_ERR(delete);
708
709         tmp = bch2_bkey_make_mut_noupdate(trans, k);
710         if (IS_ERR(tmp))
711                 return PTR_ERR(tmp);
712
713         bkey_init(&delete->k);
714         delete->k.p = k_iter->pos;
715         return  bch2_btree_iter_traverse(k_iter) ?:
716                 bch2_trans_update(trans, k_iter, delete, 0) ?:
717                 bch2_hash_set_in_snapshot(trans, desc, hash_info,
718                                        (subvol_inum) { 0, k.k->p.inode },
719                                        k.k->p.snapshot, tmp,
720                                        BCH_HASH_SET_MUST_CREATE,
721                                        BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
722                 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
723 }
724
725 static int hash_check_key(struct btree_trans *trans,
726                           const struct bch_hash_desc desc,
727                           struct bch_hash_info *hash_info,
728                           struct btree_iter *k_iter, struct bkey_s_c hash_k)
729 {
730         struct bch_fs *c = trans->c;
731         struct btree_iter iter = { NULL };
732         struct printbuf buf = PRINTBUF;
733         struct bkey_s_c k;
734         u64 hash;
735         int ret = 0;
736
737         if (hash_k.k->type != desc.key_type)
738                 return 0;
739
740         hash = desc.hash_bkey(hash_info, hash_k);
741
742         if (likely(hash == hash_k.k->p.offset))
743                 return 0;
744
745         if (hash_k.k->p.offset < hash)
746                 goto bad_hash;
747
748         for_each_btree_key_norestart(trans, iter, desc.btree_id,
749                                      SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
750                                      BTREE_ITER_SLOTS, k, ret) {
751                 if (bkey_eq(k.k->p, hash_k.k->p))
752                         break;
753
754                 if (fsck_err_on(k.k->type == desc.key_type &&
755                                 !desc.cmp_bkey(k, hash_k), c,
756                                 hash_table_key_duplicate,
757                                 "duplicate hash table keys:\n%s",
758                                 (printbuf_reset(&buf),
759                                  bch2_bkey_val_to_text(&buf, c, hash_k),
760                                  buf.buf))) {
761                         ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
762                         break;
763                 }
764
765                 if (bkey_deleted(k.k)) {
766                         bch2_trans_iter_exit(trans, &iter);
767                         goto bad_hash;
768                 }
769         }
770 out:
771         bch2_trans_iter_exit(trans, &iter);
772         printbuf_exit(&buf);
773         return ret;
774 bad_hash:
775         if (fsck_err(c, hash_table_key_wrong_offset,
776                      "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
777                      bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
778                      (printbuf_reset(&buf),
779                       bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
780                 ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
781                 bch_err_fn(c, ret);
782                 if (ret)
783                         return ret;
784                 ret = -BCH_ERR_transaction_restart_nested;
785         }
786 fsck_err:
787         goto out;
788 }
789
790 static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
791                                                 struct btree_iter *iter,
792                                                 struct bpos pos)
793 {
794         return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
795 }
796
797 static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
798                                                struct btree_iter *iter,
799                                                struct bch_inode_unpacked *inode,
800                                                u32 *snapshot)
801 {
802         if (inode->bi_subvol) {
803                 u64 inum;
804                 int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
805                 if (ret)
806                         return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
807         }
808
809         return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
810 }
811
812 static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
813                                    struct bkey_s_c_dirent d)
814 {
815         return  inode->bi_dir           == d.k->p.inode &&
816                 inode->bi_dir_offset    == d.k->p.offset;
817 }
818
819 static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
820                                    struct bch_inode_unpacked *inode)
821 {
822         return d.v->d_type == DT_SUBVOL
823                 ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
824                 : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
825 }
826
827 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
828 {
829         struct btree_iter iter;
830         struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
831         int ret = bkey_err(k);
832         if (ret)
833                 return ret;
834
835         bch2_trans_iter_exit(trans, &iter);
836         return k.k->type == KEY_TYPE_set;
837 }
838
839 static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
840                                     struct bch_inode_unpacked *inode,
841                                     u32 inode_snapshot, bool *write_inode)
842 {
843         struct bch_fs *c = trans->c;
844         struct printbuf buf = PRINTBUF;
845
846         struct btree_iter dirent_iter = {};
847         struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
848         int ret = bkey_err(d);
849         if (ret && !bch2_err_matches(ret, ENOENT))
850                 return ret;
851
852         if (fsck_err_on(ret,
853                         c, inode_points_to_missing_dirent,
854                         "inode points to missing dirent\n%s",
855                         (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
856             fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
857                         c, inode_points_to_wrong_dirent,
858                         "inode points to dirent that does not point back:\n%s",
859                         (bch2_bkey_val_to_text(&buf, c, inode_k),
860                          prt_newline(&buf),
861                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
862                 /*
863                  * We just clear the backpointer fields for now. If we find a
864                  * dirent that points to this inode in check_dirents(), we'll
865                  * update it then; then when we get to check_path() if the
866                  * backpointer is still 0 we'll reattach it.
867                  */
868                 inode->bi_dir = 0;
869                 inode->bi_dir_offset = 0;
870                 inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
871                 *write_inode = true;
872         }
873
874         ret = 0;
875 fsck_err:
876         bch2_trans_iter_exit(trans, &dirent_iter);
877         printbuf_exit(&buf);
878         bch_err_fn(c, ret);
879         return ret;
880 }
881
882 static int check_inode(struct btree_trans *trans,
883                        struct btree_iter *iter,
884                        struct bkey_s_c k,
885                        struct bch_inode_unpacked *prev,
886                        struct snapshots_seen *s,
887                        bool full)
888 {
889         struct bch_fs *c = trans->c;
890         struct bch_inode_unpacked u;
891         bool do_update = false;
892         int ret;
893
894         ret = check_key_has_snapshot(trans, iter, k);
895         if (ret < 0)
896                 goto err;
897         if (ret)
898                 return 0;
899
900         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
901         if (ret)
902                 goto err;
903
904         if (!bkey_is_inode(k.k))
905                 return 0;
906
907         BUG_ON(bch2_inode_unpack(k, &u));
908
909         if (!full &&
910             !(u.bi_flags & (BCH_INODE_i_size_dirty|
911                             BCH_INODE_i_sectors_dirty|
912                             BCH_INODE_unlinked)))
913                 return 0;
914
915         if (prev->bi_inum != u.bi_inum)
916                 *prev = u;
917
918         if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
919                         inode_d_type(prev)      != inode_d_type(&u),
920                         c, inode_snapshot_mismatch,
921                         "inodes in different snapshots don't match")) {
922                 bch_err(c, "repair not implemented yet");
923                 return -BCH_ERR_fsck_repair_unimplemented;
924         }
925
926         if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
927             bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
928                 struct bpos new_min_pos;
929
930                 ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
931                 if (ret)
932                         goto err;
933
934                 u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
935
936                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
937
938                 bch_err_msg(c, ret, "in fsck updating inode");
939                 if (ret)
940                         return ret;
941
942                 if (!bpos_eq(new_min_pos, POS_MIN))
943                         bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
944                 return 0;
945         }
946
947         if (u.bi_flags & BCH_INODE_unlinked) {
948                 ret = check_inode_deleted_list(trans, k.k->p);
949                 if (ret < 0)
950                         return ret;
951
952                 fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
953                             "inode %llu:%u unlinked, but not on deleted list",
954                             u.bi_inum, k.k->p.snapshot);
955                 ret = 0;
956         }
957
958         if (u.bi_flags & BCH_INODE_unlinked &&
959             (!c->sb.clean ||
960              fsck_err(c, inode_unlinked_but_clean,
961                       "filesystem marked clean, but inode %llu unlinked",
962                       u.bi_inum))) {
963                 ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
964                 bch_err_msg(c, ret, "in fsck deleting inode");
965                 return ret;
966         }
967
968         if (u.bi_flags & BCH_INODE_i_size_dirty &&
969             (!c->sb.clean ||
970              fsck_err(c, inode_i_size_dirty_but_clean,
971                       "filesystem marked clean, but inode %llu has i_size dirty",
972                       u.bi_inum))) {
973                 bch_verbose(c, "truncating inode %llu", u.bi_inum);
974
975                 /*
976                  * XXX: need to truncate partial blocks too here - or ideally
977                  * just switch units to bytes and that issue goes away
978                  */
979                 ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
980                                 SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
981                                      iter->pos.snapshot),
982                                 POS(u.bi_inum, U64_MAX),
983                                 0, NULL);
984                 bch_err_msg(c, ret, "in fsck truncating inode");
985                 if (ret)
986                         return ret;
987
988                 /*
989                  * We truncated without our normal sector accounting hook, just
990                  * make sure we recalculate it:
991                  */
992                 u.bi_flags |= BCH_INODE_i_sectors_dirty;
993
994                 u.bi_flags &= ~BCH_INODE_i_size_dirty;
995                 do_update = true;
996         }
997
998         if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
999             (!c->sb.clean ||
1000              fsck_err(c, inode_i_sectors_dirty_but_clean,
1001                       "filesystem marked clean, but inode %llu has i_sectors dirty",
1002                       u.bi_inum))) {
1003                 s64 sectors;
1004
1005                 bch_verbose(c, "recounting sectors for inode %llu",
1006                             u.bi_inum);
1007
1008                 sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
1009                 if (sectors < 0) {
1010                         bch_err_msg(c, sectors, "in fsck recounting inode sectors");
1011                         return sectors;
1012                 }
1013
1014                 u.bi_sectors = sectors;
1015                 u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
1016                 do_update = true;
1017         }
1018
1019         if (u.bi_flags & BCH_INODE_backptr_untrusted) {
1020                 u.bi_dir = 0;
1021                 u.bi_dir_offset = 0;
1022                 u.bi_flags &= ~BCH_INODE_backptr_untrusted;
1023                 do_update = true;
1024         }
1025
1026         if (u.bi_dir || u.bi_dir_offset) {
1027                 ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
1028                 if (ret)
1029                         goto err;
1030         }
1031
1032         if (fsck_err_on(u.bi_parent_subvol &&
1033                         (u.bi_subvol == 0 ||
1034                          u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
1035                         c, inode_bi_parent_nonzero,
1036                         "inode %llu:%u has subvol %u but nonzero parent subvol %u",
1037                         u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
1038                 u.bi_parent_subvol = 0;
1039                 do_update = true;
1040         }
1041
1042         if (u.bi_subvol) {
1043                 struct bch_subvolume s;
1044
1045                 ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
1046                 if (ret && !bch2_err_matches(ret, ENOENT))
1047                         goto err;
1048
1049                 if (fsck_err_on(ret,
1050                                 c, inode_bi_subvol_missing,
1051                                 "inode %llu:%u bi_subvol points to missing subvolume %u",
1052                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
1053                     fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
1054                                 !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
1055                                                            k.k->p.snapshot),
1056                                 c, inode_bi_subvol_wrong,
1057                                 "inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
1058                                 u.bi_inum, k.k->p.snapshot, u.bi_subvol,
1059                                 le64_to_cpu(s.inode),
1060                                 le32_to_cpu(s.snapshot))) {
1061                         u.bi_subvol = 0;
1062                         u.bi_parent_subvol = 0;
1063                         do_update = true;
1064                 }
1065         }
1066
1067         if (do_update) {
1068                 ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
1069                 bch_err_msg(c, ret, "in fsck updating inode");
1070                 if (ret)
1071                         return ret;
1072         }
1073 err:
1074 fsck_err:
1075         bch_err_fn(c, ret);
1076         return ret;
1077 }
1078
1079 int bch2_check_inodes(struct bch_fs *c)
1080 {
1081         bool full = c->opts.fsck;
1082         struct bch_inode_unpacked prev = { 0 };
1083         struct snapshots_seen s;
1084
1085         snapshots_seen_init(&s);
1086
1087         int ret = bch2_trans_run(c,
1088                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
1089                                 POS_MIN,
1090                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1091                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1092                         check_inode(trans, &iter, k, &prev, &s, full)));
1093
1094         snapshots_seen_exit(&s);
1095         bch_err_fn(c, ret);
1096         return ret;
1097 }
1098
1099 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
1100 {
1101         struct bch_fs *c = trans->c;
1102         u32 restart_count = trans->restart_count;
1103         int ret = 0;
1104         s64 count2;
1105
1106         darray_for_each(w->inodes, i) {
1107                 if (i->inode.bi_sectors == i->count)
1108                         continue;
1109
1110                 count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
1111
1112                 if (w->recalculate_sums)
1113                         i->count = count2;
1114
1115                 if (i->count != count2) {
1116                         bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
1117                                 w->last_pos.inode, i->snapshot, i->count, count2);
1118                         return -BCH_ERR_internal_fsck_err;
1119                 }
1120
1121                 if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
1122                                 c, inode_i_sectors_wrong,
1123                                 "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
1124                                 w->last_pos.inode, i->snapshot,
1125                                 i->inode.bi_sectors, i->count)) {
1126                         i->inode.bi_sectors = i->count;
1127                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1128                         if (ret)
1129                                 break;
1130                 }
1131         }
1132 fsck_err:
1133         bch_err_fn(c, ret);
1134         return ret ?: trans_was_restarted(trans, restart_count);
1135 }
1136
1137 struct extent_end {
1138         u32                     snapshot;
1139         u64                     offset;
1140         struct snapshots_seen   seen;
1141 };
1142
1143 struct extent_ends {
1144         struct bpos                     last_pos;
1145         DARRAY(struct extent_end)       e;
1146 };
1147
1148 static void extent_ends_reset(struct extent_ends *extent_ends)
1149 {
1150         darray_for_each(extent_ends->e, i)
1151                 snapshots_seen_exit(&i->seen);
1152         extent_ends->e.nr = 0;
1153 }
1154
1155 static void extent_ends_exit(struct extent_ends *extent_ends)
1156 {
1157         extent_ends_reset(extent_ends);
1158         darray_exit(&extent_ends->e);
1159 }
1160
1161 static void extent_ends_init(struct extent_ends *extent_ends)
1162 {
1163         memset(extent_ends, 0, sizeof(*extent_ends));
1164 }
1165
1166 static int extent_ends_at(struct bch_fs *c,
1167                           struct extent_ends *extent_ends,
1168                           struct snapshots_seen *seen,
1169                           struct bkey_s_c k)
1170 {
1171         struct extent_end *i, n = (struct extent_end) {
1172                 .offset         = k.k->p.offset,
1173                 .snapshot       = k.k->p.snapshot,
1174                 .seen           = *seen,
1175         };
1176
1177         n.seen.ids.data = kmemdup(seen->ids.data,
1178                               sizeof(seen->ids.data[0]) * seen->ids.size,
1179                               GFP_KERNEL);
1180         if (!n.seen.ids.data)
1181                 return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
1182
1183         __darray_for_each(extent_ends->e, i) {
1184                 if (i->snapshot == k.k->p.snapshot) {
1185                         snapshots_seen_exit(&i->seen);
1186                         *i = n;
1187                         return 0;
1188                 }
1189
1190                 if (i->snapshot >= k.k->p.snapshot)
1191                         break;
1192         }
1193
1194         return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n);
1195 }
1196
1197 static int overlapping_extents_found(struct btree_trans *trans,
1198                                      enum btree_id btree,
1199                                      struct bpos pos1, struct snapshots_seen *pos1_seen,
1200                                      struct bkey pos2,
1201                                      bool *fixed,
1202                                      struct extent_end *extent_end)
1203 {
1204         struct bch_fs *c = trans->c;
1205         struct printbuf buf = PRINTBUF;
1206         struct btree_iter iter1, iter2 = { NULL };
1207         struct bkey_s_c k1, k2;
1208         int ret;
1209
1210         BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2)));
1211
1212         bch2_trans_iter_init(trans, &iter1, btree, pos1,
1213                              BTREE_ITER_ALL_SNAPSHOTS|
1214                              BTREE_ITER_NOT_EXTENTS);
1215         k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX));
1216         ret = bkey_err(k1);
1217         if (ret)
1218                 goto err;
1219
1220         prt_str(&buf, "\n  ");
1221         bch2_bkey_val_to_text(&buf, c, k1);
1222
1223         if (!bpos_eq(pos1, k1.k->p)) {
1224                 prt_str(&buf, "\n  wanted\n  ");
1225                 bch2_bpos_to_text(&buf, pos1);
1226                 prt_str(&buf, "\n  ");
1227                 bch2_bkey_to_text(&buf, &pos2);
1228
1229                 bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
1230                         __func__, buf.buf);
1231                 ret = -BCH_ERR_internal_fsck_err;
1232                 goto err;
1233         }
1234
1235         bch2_trans_copy_iter(&iter2, &iter1);
1236
1237         while (1) {
1238                 bch2_btree_iter_advance(&iter2);
1239
1240                 k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX));
1241                 ret = bkey_err(k2);
1242                 if (ret)
1243                         goto err;
1244
1245                 if (bpos_ge(k2.k->p, pos2.p))
1246                         break;
1247         }
1248
1249         prt_str(&buf, "\n  ");
1250         bch2_bkey_val_to_text(&buf, c, k2);
1251
1252         if (bpos_gt(k2.k->p, pos2.p) ||
1253             pos2.size != k2.k->size) {
1254                 bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
1255                         __func__, buf.buf);
1256                 ret = -BCH_ERR_internal_fsck_err;
1257                 goto err;
1258         }
1259
1260         prt_printf(&buf, "\n  overwriting %s extent",
1261                    pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
1262
1263         if (fsck_err(c, extent_overlapping,
1264                      "overlapping extents%s", buf.buf)) {
1265                 struct btree_iter *old_iter = &iter1;
1266                 struct disk_reservation res = { 0 };
1267
1268                 if (pos1.snapshot < pos2.p.snapshot) {
1269                         old_iter = &iter2;
1270                         swap(k1, k2);
1271                 }
1272
1273                 trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
1274
1275                 ret =   bch2_trans_update_extent_overwrite(trans, old_iter,
1276                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
1277                                 k1, k2) ?:
1278                         bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
1279                 bch2_disk_reservation_put(c, &res);
1280
1281                 if (ret)
1282                         goto err;
1283
1284                 *fixed = true;
1285
1286                 if (pos1.snapshot == pos2.p.snapshot) {
1287                         /*
1288                          * We overwrote the first extent, and did the overwrite
1289                          * in the same snapshot:
1290                          */
1291                         extent_end->offset = bkey_start_offset(&pos2);
1292                 } else if (pos1.snapshot > pos2.p.snapshot) {
1293                         /*
1294                          * We overwrote the first extent in pos2's snapshot:
1295                          */
1296                         ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
1297                 } else {
1298                         /*
1299                          * We overwrote the second extent - restart
1300                          * check_extent() from the top:
1301                          */
1302                         ret = -BCH_ERR_transaction_restart_nested;
1303                 }
1304         }
1305 fsck_err:
1306 err:
1307         bch2_trans_iter_exit(trans, &iter2);
1308         bch2_trans_iter_exit(trans, &iter1);
1309         printbuf_exit(&buf);
1310         return ret;
1311 }
1312
1313 static int check_overlapping_extents(struct btree_trans *trans,
1314                               struct snapshots_seen *seen,
1315                               struct extent_ends *extent_ends,
1316                               struct bkey_s_c k,
1317                               u32 equiv,
1318                               struct btree_iter *iter,
1319                               bool *fixed)
1320 {
1321         struct bch_fs *c = trans->c;
1322         int ret = 0;
1323
1324         /* transaction restart, running again */
1325         if (bpos_eq(extent_ends->last_pos, k.k->p))
1326                 return 0;
1327
1328         if (extent_ends->last_pos.inode != k.k->p.inode)
1329                 extent_ends_reset(extent_ends);
1330
1331         darray_for_each(extent_ends->e, i) {
1332                 if (i->offset <= bkey_start_offset(k.k))
1333                         continue;
1334
1335                 if (!ref_visible2(c,
1336                                   k.k->p.snapshot, seen,
1337                                   i->snapshot, &i->seen))
1338                         continue;
1339
1340                 ret = overlapping_extents_found(trans, iter->btree_id,
1341                                                 SPOS(iter->pos.inode,
1342                                                      i->offset,
1343                                                      i->snapshot),
1344                                                 &i->seen,
1345                                                 *k.k, fixed, i);
1346                 if (ret)
1347                         goto err;
1348         }
1349
1350         ret = extent_ends_at(c, extent_ends, seen, k);
1351         if (ret)
1352                 goto err;
1353
1354         extent_ends->last_pos = k.k->p;
1355 err:
1356         return ret;
1357 }
1358
1359 static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
1360                                 struct bkey_s_c k)
1361 {
1362         struct bch_fs *c = trans->c;
1363         struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1364         struct bch_extent_crc_unpacked crc;
1365         const union bch_extent_entry *i;
1366         unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
1367
1368         bkey_for_each_crc(k.k, ptrs, crc, i)
1369                 if (crc_is_encoded(crc) &&
1370                     crc.uncompressed_size > encoded_extent_max_sectors) {
1371                         struct printbuf buf = PRINTBUF;
1372
1373                         bch2_bkey_val_to_text(&buf, c, k);
1374                         bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
1375                         printbuf_exit(&buf);
1376                 }
1377
1378         return 0;
1379 }
1380
1381 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
1382                         struct bkey_s_c k,
1383                         struct inode_walker *inode,
1384                         struct snapshots_seen *s,
1385                         struct extent_ends *extent_ends)
1386 {
1387         struct bch_fs *c = trans->c;
1388         struct inode_walker_entry *i;
1389         struct printbuf buf = PRINTBUF;
1390         struct bpos equiv = k.k->p;
1391         int ret = 0;
1392
1393         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1394
1395         ret = check_key_has_snapshot(trans, iter, k);
1396         if (ret) {
1397                 ret = ret < 0 ? ret : 0;
1398                 goto out;
1399         }
1400
1401         if (inode->last_pos.inode != k.k->p.inode) {
1402                 ret = check_i_sectors(trans, inode);
1403                 if (ret)
1404                         goto err;
1405         }
1406
1407         i = walk_inode(trans, inode, k);
1408         ret = PTR_ERR_OR_ZERO(i);
1409         if (ret)
1410                 goto err;
1411
1412         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1413         if (ret)
1414                 goto err;
1415
1416         if (k.k->type != KEY_TYPE_whiteout) {
1417                 if (fsck_err_on(!i, c, extent_in_missing_inode,
1418                                 "extent in missing inode:\n  %s",
1419                                 (printbuf_reset(&buf),
1420                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1421                         goto delete;
1422
1423                 if (fsck_err_on(i &&
1424                                 !S_ISREG(i->inode.bi_mode) &&
1425                                 !S_ISLNK(i->inode.bi_mode),
1426                                 c, extent_in_non_reg_inode,
1427                                 "extent in non regular inode mode %o:\n  %s",
1428                                 i->inode.bi_mode,
1429                                 (printbuf_reset(&buf),
1430                                  bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
1431                         goto delete;
1432
1433                 ret = check_overlapping_extents(trans, s, extent_ends, k,
1434                                                 equiv.snapshot, iter,
1435                                                 &inode->recalculate_sums);
1436                 if (ret)
1437                         goto err;
1438         }
1439
1440         /*
1441          * Check inodes in reverse order, from oldest snapshots to newest,
1442          * starting from the inode that matches this extent's snapshot. If we
1443          * didn't have one, iterate over all inodes:
1444          */
1445         if (!i)
1446                 i = inode->inodes.data + inode->inodes.nr - 1;
1447
1448         for (;
1449              inode->inodes.data && i >= inode->inodes.data;
1450              --i) {
1451                 if (i->snapshot > equiv.snapshot ||
1452                     !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot))
1453                         continue;
1454
1455                 if (k.k->type != KEY_TYPE_whiteout) {
1456                         if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
1457                                         k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
1458                                         !bkey_extent_is_reservation(k),
1459                                         c, extent_past_end_of_inode,
1460                                         "extent type past end of inode %llu:%u, i_size %llu\n  %s",
1461                                         i->inode.bi_inum, i->snapshot, i->inode.bi_size,
1462                                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1463                                 struct btree_iter iter2;
1464
1465                                 bch2_trans_copy_iter(&iter2, iter);
1466                                 bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
1467                                 ret =   bch2_btree_iter_traverse(&iter2) ?:
1468                                         bch2_btree_delete_at(trans, &iter2,
1469                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1470                                 bch2_trans_iter_exit(trans, &iter2);
1471                                 if (ret)
1472                                         goto err;
1473
1474                                 iter->k.type = KEY_TYPE_whiteout;
1475                         }
1476
1477                         if (bkey_extent_is_allocation(k.k))
1478                                 i->count += k.k->size;
1479                 }
1480
1481                 i->seen_this_pos = true;
1482         }
1483 out:
1484 err:
1485 fsck_err:
1486         printbuf_exit(&buf);
1487         bch_err_fn(c, ret);
1488         return ret;
1489 delete:
1490         ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1491         goto out;
1492 }
1493
1494 /*
1495  * Walk extents: verify that extents have a corresponding S_ISREG inode, and
1496  * that i_size an i_sectors are consistent
1497  */
1498 int bch2_check_extents(struct bch_fs *c)
1499 {
1500         struct inode_walker w = inode_walker_init();
1501         struct snapshots_seen s;
1502         struct extent_ends extent_ends;
1503         struct disk_reservation res = { 0 };
1504
1505         snapshots_seen_init(&s);
1506         extent_ends_init(&extent_ends);
1507
1508         int ret = bch2_trans_run(c,
1509                 for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
1510                                 POS(BCACHEFS_ROOT_INO, 0),
1511                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1512                                 &res, NULL,
1513                                 BCH_TRANS_COMMIT_no_enospc, ({
1514                         bch2_disk_reservation_put(c, &res);
1515                         check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
1516                         check_extent_overbig(trans, &iter, k);
1517                 })) ?:
1518                 check_i_sectors(trans, &w));
1519
1520         bch2_disk_reservation_put(c, &res);
1521         extent_ends_exit(&extent_ends);
1522         inode_walker_exit(&w);
1523         snapshots_seen_exit(&s);
1524
1525         bch_err_fn(c, ret);
1526         return ret;
1527 }
1528
1529 int bch2_check_indirect_extents(struct bch_fs *c)
1530 {
1531         struct disk_reservation res = { 0 };
1532
1533         int ret = bch2_trans_run(c,
1534                 for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
1535                                 POS_MIN,
1536                                 BTREE_ITER_PREFETCH, k,
1537                                 &res, NULL,
1538                                 BCH_TRANS_COMMIT_no_enospc, ({
1539                         bch2_disk_reservation_put(c, &res);
1540                         check_extent_overbig(trans, &iter, k);
1541                 })));
1542
1543         bch2_disk_reservation_put(c, &res);
1544         bch_err_fn(c, ret);
1545         return ret;
1546 }
1547
1548 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
1549 {
1550         struct bch_fs *c = trans->c;
1551         u32 restart_count = trans->restart_count;
1552         int ret = 0;
1553         s64 count2;
1554
1555         darray_for_each(w->inodes, i) {
1556                 if (i->inode.bi_nlink == i->count)
1557                         continue;
1558
1559                 count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
1560                 if (count2 < 0)
1561                         return count2;
1562
1563                 if (i->count != count2) {
1564                         bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
1565                                 i->count, count2);
1566                         i->count = count2;
1567                         if (i->inode.bi_nlink == i->count)
1568                                 continue;
1569                 }
1570
1571                 if (fsck_err_on(i->inode.bi_nlink != i->count,
1572                                 c, inode_dir_wrong_nlink,
1573                                 "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
1574                                 w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
1575                         i->inode.bi_nlink = i->count;
1576                         ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
1577                         if (ret)
1578                                 break;
1579                 }
1580         }
1581 fsck_err:
1582         bch_err_fn(c, ret);
1583         return ret ?: trans_was_restarted(trans, restart_count);
1584 }
1585
1586 static int check_dirent_inode_dirent(struct btree_trans *trans,
1587                                    struct btree_iter *iter,
1588                                    struct bkey_s_c_dirent d,
1589                                    struct bch_inode_unpacked *target,
1590                                    u32 target_snapshot)
1591 {
1592         struct bch_fs *c = trans->c;
1593         struct printbuf buf = PRINTBUF;
1594         int ret = 0;
1595
1596         if (inode_points_to_dirent(target, d))
1597                 return 0;
1598
1599         if (!target->bi_dir &&
1600             !target->bi_dir_offset) {
1601                 target->bi_dir          = d.k->p.inode;
1602                 target->bi_dir_offset   = d.k->p.offset;
1603                 return __bch2_fsck_write_inode(trans, target, target_snapshot);
1604         }
1605
1606         struct btree_iter bp_iter = { NULL };
1607         struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
1608                               SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
1609         ret = bkey_err(bp_dirent);
1610         if (ret && !bch2_err_matches(ret, ENOENT))
1611                 goto err;
1612
1613         bool backpointer_exists = !ret;
1614         ret = 0;
1615
1616         if (fsck_err_on(!backpointer_exists,
1617                         c, inode_wrong_backpointer,
1618                         "inode %llu:%u has wrong backpointer:\n"
1619                         "got       %llu:%llu\n"
1620                         "should be %llu:%llu",
1621                         target->bi_inum, target_snapshot,
1622                         target->bi_dir,
1623                         target->bi_dir_offset,
1624                         d.k->p.inode,
1625                         d.k->p.offset)) {
1626                 target->bi_dir          = d.k->p.inode;
1627                 target->bi_dir_offset   = d.k->p.offset;
1628                 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1629                 goto out;
1630         }
1631
1632         bch2_bkey_val_to_text(&buf, c, d.s_c);
1633         prt_newline(&buf);
1634         if (backpointer_exists)
1635                 bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
1636
1637         if (fsck_err_on(backpointer_exists &&
1638                         (S_ISDIR(target->bi_mode) ||
1639                          target->bi_subvol),
1640                         c, inode_dir_multiple_links,
1641                         "%s %llu:%u with multiple links\n%s",
1642                         S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
1643                         target->bi_inum, target_snapshot, buf.buf)) {
1644                 ret = __remove_dirent(trans, d.k->p);
1645                 goto out;
1646         }
1647
1648         /*
1649          * hardlinked file with nlink 0:
1650          * We're just adjusting nlink here so check_nlinks() will pick
1651          * it up, it ignores inodes with nlink 0
1652          */
1653         if (fsck_err_on(backpointer_exists && !target->bi_nlink,
1654                         c, inode_multiple_links_but_nlink_0,
1655                         "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
1656                         target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
1657                 target->bi_nlink++;
1658                 target->bi_flags &= ~BCH_INODE_unlinked;
1659                 ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
1660                 if (ret)
1661                         goto err;
1662         }
1663 out:
1664 err:
1665 fsck_err:
1666         bch2_trans_iter_exit(trans, &bp_iter);
1667         printbuf_exit(&buf);
1668         bch_err_fn(c, ret);
1669         return ret;
1670 }
1671
1672 static int check_dirent_target(struct btree_trans *trans,
1673                                struct btree_iter *iter,
1674                                struct bkey_s_c_dirent d,
1675                                struct bch_inode_unpacked *target,
1676                                u32 target_snapshot)
1677 {
1678         struct bch_fs *c = trans->c;
1679         struct bkey_i_dirent *n;
1680         struct printbuf buf = PRINTBUF;
1681         int ret = 0;
1682
1683         ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
1684         if (ret)
1685                 goto err;
1686
1687         if (fsck_err_on(d.v->d_type != inode_d_type(target),
1688                         c, dirent_d_type_wrong,
1689                         "incorrect d_type: got %s, should be %s:\n%s",
1690                         bch2_d_type_str(d.v->d_type),
1691                         bch2_d_type_str(inode_d_type(target)),
1692                         (printbuf_reset(&buf),
1693                          bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1694                 n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
1695                 ret = PTR_ERR_OR_ZERO(n);
1696                 if (ret)
1697                         goto err;
1698
1699                 bkey_reassemble(&n->k_i, d.s_c);
1700                 n->v.d_type = inode_d_type(target);
1701                 if (n->v.d_type == DT_SUBVOL) {
1702                         n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
1703                         n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
1704                 } else {
1705                         n->v.d_inum = cpu_to_le64(target->bi_inum);
1706                 }
1707
1708                 ret = bch2_trans_update(trans, iter, &n->k_i, 0);
1709                 if (ret)
1710                         goto err;
1711
1712                 d = dirent_i_to_s_c(n);
1713         }
1714 err:
1715 fsck_err:
1716         printbuf_exit(&buf);
1717         bch_err_fn(c, ret);
1718         return ret;
1719 }
1720
1721 /* find a subvolume that's a descendent of @snapshot: */
1722 static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
1723 {
1724         struct btree_iter iter;
1725         struct bkey_s_c k;
1726         int ret;
1727
1728         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
1729                 if (k.k->type != KEY_TYPE_subvolume)
1730                         continue;
1731
1732                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
1733                 if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
1734                         bch2_trans_iter_exit(trans, &iter);
1735                         *subvolid = k.k->p.offset;
1736                         goto found;
1737                 }
1738         }
1739         if (!ret)
1740                 ret = -ENOENT;
1741 found:
1742         bch2_trans_iter_exit(trans, &iter);
1743         return ret;
1744 }
1745
1746 static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
1747                                   struct bkey_s_c_dirent d)
1748 {
1749         struct bch_fs *c = trans->c;
1750         struct btree_iter subvol_iter = {};
1751         struct bch_inode_unpacked subvol_root;
1752         u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
1753         u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
1754         u32 parent_snapshot;
1755         u64 parent_inum;
1756         struct printbuf buf = PRINTBUF;
1757         int ret = 0;
1758
1759         ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
1760         if (ret && !bch2_err_matches(ret, ENOENT))
1761                 return ret;
1762
1763         if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
1764                         "dirent parent_subvol points to missing subvolume\n%s",
1765                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
1766             fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
1767                         c, dirent_not_visible_in_parent_subvol,
1768                         "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
1769                         parent_snapshot,
1770                         (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
1771                 u32 new_parent_subvol;
1772                 ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
1773                 if (ret)
1774                         goto err;
1775
1776                 struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
1777                 ret = PTR_ERR_OR_ZERO(new_dirent);
1778                 if (ret)
1779                         goto err;
1780
1781                 new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
1782         }
1783
1784         struct bkey_s_c_subvolume s =
1785                 bch2_bkey_get_iter_typed(trans, &subvol_iter,
1786                                          BTREE_ID_subvolumes, POS(0, target_subvol),
1787                                          0, subvolume);
1788         ret = bkey_err(s.s_c);
1789         if (ret && !bch2_err_matches(ret, ENOENT))
1790                 return ret;
1791
1792         if (ret) {
1793                 if (fsck_err(c, dirent_to_missing_subvol,
1794                              "dirent points to missing subvolume\n%s",
1795                              (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
1796                         return __remove_dirent(trans, d.k->p);
1797                 ret = 0;
1798                 goto out;
1799         }
1800
1801         if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
1802                         c, subvol_fs_path_parent_wrong,
1803                         "subvol with wrong fs_path_parent, should be be %u\n%s",
1804                         parent_subvol,
1805                         (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
1806                 struct bkey_i_subvolume *n =
1807                         bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
1808                 ret = PTR_ERR_OR_ZERO(n);
1809                 if (ret)
1810                         goto err;
1811
1812                 n->v.fs_path_parent = cpu_to_le32(parent_subvol);
1813         }
1814
1815         u64 target_inum = le64_to_cpu(s.v->inode);
1816         u32 target_snapshot = le32_to_cpu(s.v->snapshot);
1817
1818         ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
1819         if (ret && !bch2_err_matches(ret, ENOENT))
1820                 return ret;
1821
1822         if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol,
1823                         c, inode_bi_parent_wrong,
1824                         "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
1825                         target_inum,
1826                         subvol_root.bi_parent_subvol, parent_subvol)) {
1827                 subvol_root.bi_parent_subvol = parent_subvol;
1828                 ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
1829                 if (ret)
1830                         return ret;
1831         }
1832
1833         ret = check_dirent_target(trans, iter, d, &subvol_root,
1834                                   target_snapshot);
1835         if (ret)
1836                 return ret;
1837 out:
1838 err:
1839 fsck_err:
1840         bch2_trans_iter_exit(trans, &subvol_iter);
1841         printbuf_exit(&buf);
1842         return ret;
1843 }
1844
1845 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
1846                         struct bkey_s_c k,
1847                         struct bch_hash_info *hash_info,
1848                         struct inode_walker *dir,
1849                         struct inode_walker *target,
1850                         struct snapshots_seen *s)
1851 {
1852         struct bch_fs *c = trans->c;
1853         struct bkey_s_c_dirent d;
1854         struct inode_walker_entry *i;
1855         struct printbuf buf = PRINTBUF;
1856         struct bpos equiv;
1857         int ret = 0;
1858
1859         ret = check_key_has_snapshot(trans, iter, k);
1860         if (ret) {
1861                 ret = ret < 0 ? ret : 0;
1862                 goto out;
1863         }
1864
1865         equiv = k.k->p;
1866         equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
1867
1868         ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p);
1869         if (ret)
1870                 goto err;
1871
1872         if (k.k->type == KEY_TYPE_whiteout)
1873                 goto out;
1874
1875         if (dir->last_pos.inode != k.k->p.inode) {
1876                 ret = check_subdir_count(trans, dir);
1877                 if (ret)
1878                         goto err;
1879         }
1880
1881         BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
1882
1883         i = walk_inode(trans, dir, k);
1884         ret = PTR_ERR_OR_ZERO(i);
1885         if (ret < 0)
1886                 goto err;
1887
1888         if (dir->first_this_inode && dir->inodes.nr)
1889                 *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
1890         dir->first_this_inode = false;
1891
1892         if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
1893                         "dirent in nonexisting directory:\n%s",
1894                         (printbuf_reset(&buf),
1895                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1896                 ret = bch2_btree_delete_at(trans, iter,
1897                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1898                 goto out;
1899         }
1900
1901         if (!i)
1902                 goto out;
1903
1904         if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
1905                         c, dirent_in_non_dir_inode,
1906                         "dirent in non directory inode type %s:\n%s",
1907                         bch2_d_type_str(inode_d_type(&i->inode)),
1908                         (printbuf_reset(&buf),
1909                          bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
1910                 ret = bch2_btree_delete_at(trans, iter, 0);
1911                 goto out;
1912         }
1913
1914         ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
1915         if (ret < 0)
1916                 goto err;
1917         if (ret) {
1918                 /* dirent has been deleted */
1919                 ret = 0;
1920                 goto out;
1921         }
1922
1923         if (k.k->type != KEY_TYPE_dirent)
1924                 goto out;
1925
1926         d = bkey_s_c_to_dirent(k);
1927
1928         if (d.v->d_type == DT_SUBVOL) {
1929                 ret = check_dirent_to_subvol(trans, iter, d);
1930                 if (ret)
1931                         goto err;
1932         } else {
1933                 ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
1934                 if (ret)
1935                         goto err;
1936
1937                 if (fsck_err_on(!target->inodes.nr,
1938                                 c, dirent_to_missing_inode,
1939                                 "dirent points to missing inode: (equiv %u)\n%s",
1940                                 equiv.snapshot,
1941                                 (printbuf_reset(&buf),
1942                                  bch2_bkey_val_to_text(&buf, c, k),
1943                                  buf.buf))) {
1944                         ret = __remove_dirent(trans, d.k->p);
1945                         if (ret)
1946                                 goto err;
1947                 }
1948
1949                 darray_for_each(target->inodes, i) {
1950                         ret = check_dirent_target(trans, iter, d,
1951                                                   &i->inode, i->snapshot);
1952                         if (ret)
1953                                 goto err;
1954                 }
1955
1956                 if (d.v->d_type == DT_DIR)
1957                         for_each_visible_inode(c, s, dir, equiv.snapshot, i)
1958                                 i->count++;
1959         }
1960 out:
1961 err:
1962 fsck_err:
1963         printbuf_exit(&buf);
1964         bch_err_fn(c, ret);
1965         return ret;
1966 }
1967
1968 /*
1969  * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
1970  * validate d_type
1971  */
1972 int bch2_check_dirents(struct bch_fs *c)
1973 {
1974         struct inode_walker dir = inode_walker_init();
1975         struct inode_walker target = inode_walker_init();
1976         struct snapshots_seen s;
1977         struct bch_hash_info hash_info;
1978
1979         snapshots_seen_init(&s);
1980
1981         int ret = bch2_trans_run(c,
1982                 for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
1983                                 POS(BCACHEFS_ROOT_INO, 0),
1984                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
1985                                 k,
1986                                 NULL, NULL,
1987                                 BCH_TRANS_COMMIT_no_enospc,
1988                         check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
1989
1990         snapshots_seen_exit(&s);
1991         inode_walker_exit(&dir);
1992         inode_walker_exit(&target);
1993         bch_err_fn(c, ret);
1994         return ret;
1995 }
1996
1997 static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
1998                        struct bkey_s_c k,
1999                        struct bch_hash_info *hash_info,
2000                        struct inode_walker *inode)
2001 {
2002         struct bch_fs *c = trans->c;
2003         struct inode_walker_entry *i;
2004         int ret;
2005
2006         ret = check_key_has_snapshot(trans, iter, k);
2007         if (ret)
2008                 return ret;
2009
2010         i = walk_inode(trans, inode, k);
2011         ret = PTR_ERR_OR_ZERO(i);
2012         if (ret)
2013                 return ret;
2014
2015         if (inode->first_this_inode && inode->inodes.nr)
2016                 *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
2017         inode->first_this_inode = false;
2018
2019         if (fsck_err_on(!i, c, xattr_in_missing_inode,
2020                         "xattr for missing inode %llu",
2021                         k.k->p.inode))
2022                 return bch2_btree_delete_at(trans, iter, 0);
2023
2024         if (!i)
2025                 return 0;
2026
2027         ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
2028 fsck_err:
2029         bch_err_fn(c, ret);
2030         return ret;
2031 }
2032
2033 /*
2034  * Walk xattrs: verify that they all have a corresponding inode
2035  */
2036 int bch2_check_xattrs(struct bch_fs *c)
2037 {
2038         struct inode_walker inode = inode_walker_init();
2039         struct bch_hash_info hash_info;
2040         int ret = 0;
2041
2042         ret = bch2_trans_run(c,
2043                 for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
2044                         POS(BCACHEFS_ROOT_INO, 0),
2045                         BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
2046                         k,
2047                         NULL, NULL,
2048                         BCH_TRANS_COMMIT_no_enospc,
2049                 check_xattr(trans, &iter, k, &hash_info, &inode)));
2050         bch_err_fn(c, ret);
2051         return ret;
2052 }
2053
2054 static int check_root_trans(struct btree_trans *trans)
2055 {
2056         struct bch_fs *c = trans->c;
2057         struct bch_inode_unpacked root_inode;
2058         u32 snapshot;
2059         u64 inum;
2060         int ret;
2061
2062         ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
2063         if (ret && !bch2_err_matches(ret, ENOENT))
2064                 return ret;
2065
2066         if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
2067                                 "root subvol missing")) {
2068                 struct bkey_i_subvolume root_subvol;
2069
2070                 snapshot        = U32_MAX;
2071                 inum            = BCACHEFS_ROOT_INO;
2072
2073                 bkey_subvolume_init(&root_subvol.k_i);
2074                 root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
2075                 root_subvol.v.flags     = 0;
2076                 root_subvol.v.snapshot  = cpu_to_le32(snapshot);
2077                 root_subvol.v.inode     = cpu_to_le64(inum);
2078                 ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
2079                 bch_err_msg(c, ret, "writing root subvol");
2080                 if (ret)
2081                         goto err;
2082         }
2083
2084         ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
2085         if (ret && !bch2_err_matches(ret, ENOENT))
2086                 return ret;
2087
2088         if (mustfix_fsck_err_on(ret, c, root_dir_missing,
2089                                 "root directory missing") ||
2090             mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
2091                                 c, root_inode_not_dir,
2092                                 "root inode not a directory")) {
2093                 bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
2094                                 0, NULL);
2095                 root_inode.bi_inum = inum;
2096
2097                 ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
2098                 bch_err_msg(c, ret, "writing root inode");
2099         }
2100 err:
2101 fsck_err:
2102         return ret;
2103 }
2104
2105 /* Get root directory, create if it doesn't exist: */
2106 int bch2_check_root(struct bch_fs *c)
2107 {
2108         int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2109                 check_root_trans(trans));
2110         bch_err_fn(c, ret);
2111         return ret;
2112 }
2113
2114 struct pathbuf_entry {
2115         u64     inum;
2116         u32     snapshot;
2117 };
2118
2119 typedef DARRAY(struct pathbuf_entry) pathbuf;
2120
2121 static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
2122 {
2123         darray_for_each(*p, i)
2124                 if (i->inum     == inum &&
2125                     i->snapshot == snapshot)
2126                         return true;
2127         return false;
2128 }
2129
2130 static int path_down(struct bch_fs *c, pathbuf *p,
2131                      u64 inum, u32 snapshot)
2132 {
2133         int ret = darray_push(p, ((struct pathbuf_entry) {
2134                 .inum           = inum,
2135                 .snapshot       = snapshot,
2136         }));
2137
2138         if (ret)
2139                 bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
2140                         p->size);
2141         return ret;
2142 }
2143
2144 /*
2145  * Check that a given inode is reachable from the root:
2146  *
2147  * XXX: we should also be verifying that inodes are in the right subvolumes
2148  */
2149 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
2150 {
2151         struct bch_fs *c = trans->c;
2152         struct btree_iter inode_iter = {};
2153         struct bch_inode_unpacked inode;
2154         struct printbuf buf = PRINTBUF;
2155         u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
2156         int ret = 0;
2157
2158         p->nr = 0;
2159
2160         BUG_ON(bch2_inode_unpack(inode_k, &inode));
2161
2162         while (!(inode.bi_inum == BCACHEFS_ROOT_INO &&
2163                  inode.bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
2164                 struct btree_iter dirent_iter;
2165                 struct bkey_s_c_dirent d;
2166                 u32 parent_snapshot = snapshot;
2167
2168                 d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
2169                 ret = bkey_err(d.s_c);
2170                 if (ret && !bch2_err_matches(ret, ENOENT))
2171                         break;
2172
2173                 if (!ret && !dirent_points_to_inode(d, &inode)) {
2174                         bch2_trans_iter_exit(trans, &dirent_iter);
2175                         ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
2176                 }
2177
2178                 if (bch2_err_matches(ret, ENOENT)) {
2179                         ret = 0;
2180                         if (fsck_err(c, inode_unreachable,
2181                                      "unreachable inode\n%s",
2182                                      (printbuf_reset(&buf),
2183                                       bch2_bkey_val_to_text(&buf, c, inode_k),
2184                                       buf.buf)))
2185                                 ret = reattach_inode(trans, &inode, snapshot);
2186                         goto out;
2187                 }
2188
2189                 bch2_trans_iter_exit(trans, &dirent_iter);
2190
2191                 if (!S_ISDIR(inode.bi_mode))
2192                         break;
2193
2194                 ret = path_down(c, p, inode.bi_inum, snapshot);
2195                 if (ret) {
2196                         bch_err(c, "memory allocation failure");
2197                         return ret;
2198                 }
2199
2200                 snapshot = parent_snapshot;
2201
2202                 bch2_trans_iter_exit(trans, &inode_iter);
2203                 inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
2204                                              SPOS(0, inode.bi_dir, snapshot), 0);
2205                 ret = bkey_err(inode_k) ?:
2206                         !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
2207                         : bch2_inode_unpack(inode_k, &inode);
2208                 if (ret) {
2209                         /* Should have been caught in dirents pass */
2210                         if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
2211                                 bch_err(c, "error looking up parent directory: %i", ret);
2212                         break;
2213                 }
2214
2215                 snapshot = inode_k.k->p.snapshot;
2216
2217                 if (path_is_dup(p, inode.bi_inum, snapshot)) {
2218                         /* XXX print path */
2219                         bch_err(c, "directory structure loop");
2220
2221                         darray_for_each(*p, i)
2222                                 pr_err("%llu:%u", i->inum, i->snapshot);
2223                         pr_err("%llu:%u", inode.bi_inum, snapshot);
2224
2225                         if (!fsck_err(c, dir_loop, "directory structure loop"))
2226                                 return 0;
2227
2228                         ret = remove_backpointer(trans, &inode);
2229                         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
2230                                 bch_err_msg(c, ret, "removing dirent");
2231                         if (ret)
2232                                 break;
2233
2234                         ret = reattach_inode(trans, &inode, snapshot);
2235                         if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
2236                                 bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
2237                         break;
2238                 }
2239         }
2240 out:
2241 fsck_err:
2242         bch2_trans_iter_exit(trans, &inode_iter);
2243         printbuf_exit(&buf);
2244         bch_err_fn(c, ret);
2245         return ret;
2246 }
2247
2248 /*
2249  * Check for unreachable inodes, as well as loops in the directory structure:
2250  * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
2251  * unreachable:
2252  */
2253 int bch2_check_directory_structure(struct bch_fs *c)
2254 {
2255         pathbuf path = { 0, };
2256         int ret;
2257
2258         ret = bch2_trans_run(c,
2259                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
2260                                           BTREE_ITER_INTENT|
2261                                           BTREE_ITER_PREFETCH|
2262                                           BTREE_ITER_ALL_SNAPSHOTS, k,
2263                                           NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
2264                         if (!bkey_is_inode(k.k))
2265                                 continue;
2266
2267                         if (bch2_inode_flags(k) & BCH_INODE_unlinked)
2268                                 continue;
2269
2270                         check_path(trans, &path, k);
2271                 })));
2272         darray_exit(&path);
2273
2274         bch_err_fn(c, ret);
2275         return ret;
2276 }
2277
2278 struct nlink_table {
2279         size_t          nr;
2280         size_t          size;
2281
2282         struct nlink {
2283                 u64     inum;
2284                 u32     snapshot;
2285                 u32     count;
2286         }               *d;
2287 };
2288
2289 static int add_nlink(struct bch_fs *c, struct nlink_table *t,
2290                      u64 inum, u32 snapshot)
2291 {
2292         if (t->nr == t->size) {
2293                 size_t new_size = max_t(size_t, 128UL, t->size * 2);
2294                 void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL);
2295
2296                 if (!d) {
2297                         bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
2298                                 new_size);
2299                         return -BCH_ERR_ENOMEM_fsck_add_nlink;
2300                 }
2301
2302                 if (t->d)
2303                         memcpy(d, t->d, t->size * sizeof(t->d[0]));
2304                 kvfree(t->d);
2305
2306                 t->d = d;
2307                 t->size = new_size;
2308         }
2309
2310
2311         t->d[t->nr++] = (struct nlink) {
2312                 .inum           = inum,
2313                 .snapshot       = snapshot,
2314         };
2315
2316         return 0;
2317 }
2318
2319 static int nlink_cmp(const void *_l, const void *_r)
2320 {
2321         const struct nlink *l = _l;
2322         const struct nlink *r = _r;
2323
2324         return cmp_int(l->inum, r->inum);
2325 }
2326
2327 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
2328                      struct nlink_table *links,
2329                      u64 range_start, u64 range_end, u64 inum, u32 snapshot)
2330 {
2331         struct nlink *link, key = {
2332                 .inum = inum, .snapshot = U32_MAX,
2333         };
2334
2335         if (inum < range_start || inum >= range_end)
2336                 return;
2337
2338         link = __inline_bsearch(&key, links->d, links->nr,
2339                                 sizeof(links->d[0]), nlink_cmp);
2340         if (!link)
2341                 return;
2342
2343         while (link > links->d && link[0].inum == link[-1].inum)
2344                 --link;
2345
2346         for (; link < links->d + links->nr && link->inum == inum; link++)
2347                 if (ref_visible(c, s, snapshot, link->snapshot)) {
2348                         link->count++;
2349                         if (link->snapshot >= snapshot)
2350                                 break;
2351                 }
2352 }
2353
2354 noinline_for_stack
2355 static int check_nlinks_find_hardlinks(struct bch_fs *c,
2356                                        struct nlink_table *t,
2357                                        u64 start, u64 *end)
2358 {
2359         int ret = bch2_trans_run(c,
2360                 for_each_btree_key(trans, iter, BTREE_ID_inodes,
2361                                    POS(0, start),
2362                                    BTREE_ITER_INTENT|
2363                                    BTREE_ITER_PREFETCH|
2364                                    BTREE_ITER_ALL_SNAPSHOTS, k, ({
2365                         if (!bkey_is_inode(k.k))
2366                                 continue;
2367
2368                         /* Should never fail, checked by bch2_inode_invalid: */
2369                         struct bch_inode_unpacked u;
2370                         BUG_ON(bch2_inode_unpack(k, &u));
2371
2372                         /*
2373                          * Backpointer and directory structure checks are sufficient for
2374                          * directories, since they can't have hardlinks:
2375                          */
2376                         if (S_ISDIR(u.bi_mode))
2377                                 continue;
2378
2379                         if (!u.bi_nlink)
2380                                 continue;
2381
2382                         ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
2383                         if (ret) {
2384                                 *end = k.k->p.offset;
2385                                 ret = 0;
2386                                 break;
2387                         }
2388                         0;
2389                 })));
2390
2391         bch_err_fn(c, ret);
2392         return ret;
2393 }
2394
2395 noinline_for_stack
2396 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
2397                                      u64 range_start, u64 range_end)
2398 {
2399         struct snapshots_seen s;
2400
2401         snapshots_seen_init(&s);
2402
2403         int ret = bch2_trans_run(c,
2404                 for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
2405                                    BTREE_ITER_INTENT|
2406                                    BTREE_ITER_PREFETCH|
2407                                    BTREE_ITER_ALL_SNAPSHOTS, k, ({
2408                         ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
2409                         if (ret)
2410                                 break;
2411
2412                         if (k.k->type == KEY_TYPE_dirent) {
2413                                 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
2414
2415                                 if (d.v->d_type != DT_DIR &&
2416                                     d.v->d_type != DT_SUBVOL)
2417                                         inc_link(c, &s, links, range_start, range_end,
2418                                                  le64_to_cpu(d.v->d_inum),
2419                                                  bch2_snapshot_equiv(c, d.k->p.snapshot));
2420                         }
2421                         0;
2422                 })));
2423
2424         snapshots_seen_exit(&s);
2425
2426         bch_err_fn(c, ret);
2427         return ret;
2428 }
2429
2430 static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter,
2431                                      struct bkey_s_c k,
2432                                      struct nlink_table *links,
2433                                      size_t *idx, u64 range_end)
2434 {
2435         struct bch_fs *c = trans->c;
2436         struct bch_inode_unpacked u;
2437         struct nlink *link = &links->d[*idx];
2438         int ret = 0;
2439
2440         if (k.k->p.offset >= range_end)
2441                 return 1;
2442
2443         if (!bkey_is_inode(k.k))
2444                 return 0;
2445
2446         BUG_ON(bch2_inode_unpack(k, &u));
2447
2448         if (S_ISDIR(u.bi_mode))
2449                 return 0;
2450
2451         if (!u.bi_nlink)
2452                 return 0;
2453
2454         while ((cmp_int(link->inum, k.k->p.offset) ?:
2455                 cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
2456                 BUG_ON(*idx == links->nr);
2457                 link = &links->d[++*idx];
2458         }
2459
2460         if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
2461                         c, inode_wrong_nlink,
2462                         "inode %llu type %s has wrong i_nlink (%u, should be %u)",
2463                         u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
2464                         bch2_inode_nlink_get(&u), link->count)) {
2465                 bch2_inode_nlink_set(&u, link->count);
2466                 ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
2467         }
2468 fsck_err:
2469         return ret;
2470 }
2471
2472 noinline_for_stack
2473 static int check_nlinks_update_hardlinks(struct bch_fs *c,
2474                                struct nlink_table *links,
2475                                u64 range_start, u64 range_end)
2476 {
2477         size_t idx = 0;
2478
2479         int ret = bch2_trans_run(c,
2480                 for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
2481                                 POS(0, range_start),
2482                                 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
2483                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2484                         check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
2485         if (ret < 0) {
2486                 bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
2487                 return ret;
2488         }
2489
2490         return 0;
2491 }
2492
2493 int bch2_check_nlinks(struct bch_fs *c)
2494 {
2495         struct nlink_table links = { 0 };
2496         u64 this_iter_range_start, next_iter_range_start = 0;
2497         int ret = 0;
2498
2499         do {
2500                 this_iter_range_start = next_iter_range_start;
2501                 next_iter_range_start = U64_MAX;
2502
2503                 ret = check_nlinks_find_hardlinks(c, &links,
2504                                                   this_iter_range_start,
2505                                                   &next_iter_range_start);
2506
2507                 ret = check_nlinks_walk_dirents(c, &links,
2508                                           this_iter_range_start,
2509                                           next_iter_range_start);
2510                 if (ret)
2511                         break;
2512
2513                 ret = check_nlinks_update_hardlinks(c, &links,
2514                                          this_iter_range_start,
2515                                          next_iter_range_start);
2516                 if (ret)
2517                         break;
2518
2519                 links.nr = 0;
2520         } while (next_iter_range_start != U64_MAX);
2521
2522         kvfree(links.d);
2523         bch_err_fn(c, ret);
2524         return ret;
2525 }
2526
2527 static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
2528                              struct bkey_s_c k)
2529 {
2530         struct bkey_s_c_reflink_p p;
2531         struct bkey_i_reflink_p *u;
2532
2533         if (k.k->type != KEY_TYPE_reflink_p)
2534                 return 0;
2535
2536         p = bkey_s_c_to_reflink_p(k);
2537
2538         if (!p.v->front_pad && !p.v->back_pad)
2539                 return 0;
2540
2541         u = bch2_trans_kmalloc(trans, sizeof(*u));
2542         int ret = PTR_ERR_OR_ZERO(u);
2543         if (ret)
2544                 return ret;
2545
2546         bkey_reassemble(&u->k_i, k);
2547         u->v.front_pad  = 0;
2548         u->v.back_pad   = 0;
2549
2550         return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
2551 }
2552
2553 int bch2_fix_reflink_p(struct bch_fs *c)
2554 {
2555         if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
2556                 return 0;
2557
2558         int ret = bch2_trans_run(c,
2559                 for_each_btree_key_commit(trans, iter,
2560                                 BTREE_ID_extents, POS_MIN,
2561                                 BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
2562                                 BTREE_ITER_ALL_SNAPSHOTS, k,
2563                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
2564                         fix_reflink_p_key(trans, &iter, k)));
2565         bch_err_fn(c, ret);
2566         return ret;
2567 }