+}
+
+/*
+ * Topology repair: rewrite a btree node's key so its range starts at
+ * @new_min.
+ *
+ * The node's pointer key is converted to btree_ptr_v2 (the variant that
+ * carries min_key), updated, flagged with RANGE_UPDATED, and reinserted
+ * via the journal-keys mechanism; the in-memory node header is updated
+ * to match and keys now falling outside the node's range are dropped.
+ *
+ * Returns 0 on success, -ENOMEM, or a journal-key insert error.
+ */
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
+{
+ struct bkey_i_btree_ptr_v2 *new;
+ int ret;
+
+ /* Allocate the largest possible btree pointer key: */
+ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ btree_ptr_to_v2(b, new);
+ b->data->min_key = new_min;
+ new->v.min_key = new_min;
+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+ /*
+  * NOTE(review): @new is only freed on the error path, so on success
+  * ownership presumably passes to the journal-keys code — confirm
+  * against bch2_journal_key_insert().
+  */
+ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
+ return ret;
+ }
+
+ bch2_btree_node_drop_keys_outside_node(b);
+
+ return 0;
+}
+
+/*
+ * Topology repair: rewrite a btree node's key so its range ends at
+ * @new_max.
+ *
+ * Unlike set_node_min(), changing max_key changes the position (k.p) of
+ * the node's key itself, so the old key is deleted from the journal keys
+ * first, then a new btree_ptr_v2 key is inserted at the new position.
+ * The node must also be rehashed in the btree node cache, since its
+ * cached identity is derived from its key.
+ *
+ * Returns 0 on success, -ENOMEM, or a journal-key delete/insert error.
+ */
+static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
+{
+ struct bkey_i_btree_ptr_v2 *new;
+ int ret;
+
+ /* Remove the key at the old max before reinserting at the new one: */
+ ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
+ if (ret)
+ return ret;
+
+ new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ btree_ptr_to_v2(b, new);
+ b->data->max_key = new_max;
+ new->k.p = new_max;
+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+ /* On success @new is owned by the journal keys; freed only on error. */
+ ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+ if (ret) {
+ kfree(new);
+ return ret;
+ }
+
+ bch2_btree_node_drop_keys_outside_node(b);
+
+ /*
+  * Re-hash the node under btree_cache.lock: its key (and thus its hash
+  * identity) just changed. Re-insert is expected to succeed since we
+  * just removed the node.
+  */
+ mutex_lock(&c->btree_cache.lock);
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ bkey_copy(&b->key, &new->k_i);
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+ BUG_ON(ret);
+ mutex_unlock(&c->btree_cache.lock);
+ return 0;
+}
+
+/*
+ * Check and repair the boundary between two adjacent children (@prev,
+ * @cur) of interior node @b; @prev may be NULL for the first child.
+ *
+ * @cur's range should start exactly at bpos_successor(prev->key.k.p)
+ * (or at b's min_key when there is no prev). When the ranges overlap,
+ * BTREE_NODE_SEQ decides which node is newer and therefore wins.
+ *
+ * Returns 0, a negative error, or one of the positive sentinels
+ * DROP_PREV_NODE / DROP_THIS_NODE telling the caller to evict a node.
+ * The mustfix_fsck_err_on() macros jump to the fsck_err label on a
+ * non-repairable/abort condition.
+ */
+static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
+ struct btree *prev, struct btree *cur)
+{
+ struct bpos expected_start = !prev
+ ? b->data->min_key
+ : bpos_successor(prev->key.k.p);
+ char buf1[200], buf2[200];
+ int ret = 0;
+
+ /* Format both sides of the boundary for the fsck messages below: */
+ if (!prev) {
+ struct printbuf out = PBUF(buf1);
+ pr_buf(&out, "start of node: ");
+ bch2_bpos_to_text(&out, b->data->min_key);
+ } else {
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
+ }
+
+ bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key));
+
+ if (prev &&
+ bpos_cmp(expected_start, cur->data->min_key) > 0 &&
+ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
+ /* cur overwrites prev: */
+
+ /* cur's range swallows all of prev: drop prev entirely */
+ if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key,
+ cur->data->min_key) >= 0, c,
+ "btree node overwritten by next node at btree %s level %u:\n"
+ " node %s\n"
+ " next %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1, buf2))
+ return DROP_PREV_NODE;
+
+ /* partial overlap: shrink prev so it ends just before cur */
+ if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p,
+ bpos_predecessor(cur->data->min_key)), c,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " node %s\n"
+ " next %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1, buf2))
+ ret = set_node_max(c, prev,
+ bpos_predecessor(cur->data->min_key));
+ } else {
+ /* prev overwrites cur: */
+
+ /* prev's range swallows all of cur: drop cur entirely */
+ if (mustfix_fsck_err_on(bpos_cmp(expected_start,
+ cur->data->max_key) >= 0, c,
+ "btree node overwritten by prev node at btree %s level %u:\n"
+ " prev %s\n"
+ " node %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1, buf2))
+ return DROP_THIS_NODE;
+
+ /* gap or partial overlap: move cur's start to expected_start */
+ if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
+ "btree node with incorrect min_key at btree %s level %u:\n"
+ " prev %s\n"
+ " node %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ buf1, buf2))
+ ret = set_node_min(c, cur, expected_start);
+ }
+fsck_err:
+ return ret;
+}
+
+/*
+ * Check that interior node @b's last child ends exactly at b's max_key;
+ * if not, extend @child's range to match via set_node_max().
+ *
+ * Returns 0 or a negative error; mustfix_fsck_err_on() jumps to the
+ * fsck_err label when the error can't / shouldn't be fixed.
+ */
+static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
+ struct btree *child)
+{
+ char buf1[200], buf2[200];
+ int ret = 0;
+
+ /* non-zero bpos_cmp() means child's max differs from parent's */
+ if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c,
+ "btree node with incorrect max_key at btree %s level %u:\n"
+ " %s\n"
+ " expected %s",
+ bch2_btree_ids[b->c.btree_id], b->c.level,
+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1),
+ (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) {
+ ret = set_node_max(c, child, b->key.k.p);
+ if (ret)
+ return ret;
+ }
+fsck_err:
+ return ret;
+}
+
+/*
+ * Recursively repair the topology under interior node @b (caller holds
+ * a read lock on @b).
+ *
+ * Two passes over @b's children:
+ *
+ *  1. Walk adjacent child pairs, evicting unreadable nodes and fixing
+ *     boundary mismatches via btree_repair_node_boundaries() /
+ *     btree_repair_node_end(). Dropping the *previous* node invalidates
+ *     the iteration, so that case restarts from 'again'.
+ *
+ *  2. Recurse into each (now boundary-consistent) child; a child that
+ *     comes back empty (DROP_THIS_NODE) is evicted, and the walk
+ *     restarts since @b's child list changed.
+ *
+ * Returns 0, a negative error, or DROP_THIS_NODE if @b itself ended up
+ * with no children and should be dropped by the caller.
+ */
+static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
+{
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
+ struct bkey_buf prev_k, cur_k;
+ struct btree *prev = NULL, *cur = NULL;
+ bool have_child, dropped_children = false;
+ char buf[200];
+ int ret = 0;
+
+ /* Leaf nodes have no children to repair: */
+ if (!b->c.level)
+ return 0;
+again:
+ prev = NULL;
+ have_child = dropped_children = false;
+ bch2_bkey_buf_init(&prev_k);
+ bch2_bkey_buf_init(&cur_k);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ /* Pass 1: fix boundaries between adjacent children. */
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
+ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
+
+ bch2_btree_and_journal_iter_advance(&iter);
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
+
+ cur = bch2_btree_node_get_noiter(c, cur_k.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(cur);
+
+ /* Unreadable child: evict it and delete its key, then continue. */
+ if (mustfix_fsck_err_on(ret == -EIO, c,
+ "Unreadable btree node at btree %s level %u:\n"
+ " %s",
+ bch2_btree_ids[b->c.btree_id],
+ b->c.level - 1,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur_k.k)), buf))) {
+ bch2_btree_node_evict(c, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ if (ret)
+ break;
+ continue;
+ }
+
+ if (ret) {
+ bch_err(c, "%s: error %i getting btree node",
+ __func__, ret);
+ break;
+ }
+
+ ret = btree_repair_node_boundaries(c, b, prev, cur);
+
+ if (ret == DROP_THIS_NODE) {
+ /* cur is fully shadowed by prev: drop it */
+ six_unlock_read(&cur->c.lock);
+ bch2_btree_node_evict(c, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ if (ret)
+ break;
+ continue;
+ }
+
+ if (prev)
+ six_unlock_read(&prev->c.lock);
+ prev = NULL;
+
+ if (ret == DROP_PREV_NODE) {
+ /*
+  * prev is fully shadowed by cur: evict it, then restart
+  * the whole pass since we iterated past it already.
+  */
+ bch2_btree_node_evict(c, prev_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, prev_k.k->k.p);
+ if (ret)
+ break;
+
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
+ goto again;
+ } else if (ret)
+ break;
+
+ /* Hand cur over to prev (still read-locked) for the next pair: */
+ prev = cur;
+ cur = NULL;
+ bch2_bkey_buf_copy(&prev_k, c, cur_k.k);
+ }
+
+ /* The last child must end exactly at b's max_key: */
+ if (!ret && !IS_ERR_OR_NULL(prev)) {
+ BUG_ON(cur);
+ ret = btree_repair_node_end(c, b, prev);
+ }
+
+ if (!IS_ERR_OR_NULL(prev))
+ six_unlock_read(&prev->c.lock);
+ prev = NULL;
+ if (!IS_ERR_OR_NULL(cur))
+ six_unlock_read(&cur->c.lock);
+ cur = NULL;
+
+ if (ret)
+ goto err;
+
+ /* Pass 2: recurse into each child. */
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_bkey_buf_reassemble(&cur_k, c, k);
+ bch2_btree_and_journal_iter_advance(&iter);
+
+ cur = bch2_btree_node_get_noiter(c, cur_k.k,
+ b->c.btree_id, b->c.level - 1,
+ false);
+ ret = PTR_ERR_OR_ZERO(cur);
+
+ if (ret) {
+ bch_err(c, "%s: error %i getting btree node",
+ __func__, ret);
+ goto err;
+ }
+
+ ret = bch2_btree_repair_topology_recurse(c, cur);
+ six_unlock_read(&cur->c.lock);
+ cur = NULL;
+
+ if (ret == DROP_THIS_NODE) {
+ /* Child ended up empty: evict it and remember to restart. */
+ bch2_btree_node_evict(c, cur_k.k);
+ ret = bch2_journal_key_delete(c, b->c.btree_id,
+ b->c.level, cur_k.k->k.p);
+ dropped_children = true;
+ }
+
+ if (ret)
+ goto err;
+
+ have_child = true;
+ }
+
+ /* If every child was dropped, tell the caller to drop @b too: */
+ if (mustfix_fsck_err_on(!have_child, c,
+ "empty interior btree node at btree %s level %u\n"
+ " %s",
+ bch2_btree_ids[b->c.btree_id],
+ b->c.level,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf)))
+ ret = DROP_THIS_NODE;
+err:
+fsck_err:
+ if (!IS_ERR_OR_NULL(prev))
+ six_unlock_read(&prev->c.lock);
+ if (!IS_ERR_OR_NULL(cur))
+ six_unlock_read(&cur->c.lock);
+
+ bch2_btree_and_journal_iter_exit(&iter);
+ bch2_bkey_buf_exit(&prev_k, c);
+ bch2_bkey_buf_exit(&cur_k, c);
+
+ /* Children were dropped in pass 2: b's child list changed, redo. */
+ if (!ret && dropped_children)
+ goto again;
+
+ return ret;
+}
+
+/*
+ * Entry point for btree topology repair: walk every btree root and
+ * recursively repair the tree beneath it.
+ *
+ * Returns 0, a negative error, or FSCK_ERR_EXIT if a root came back
+ * empty (repairing that is not implemented).
+ */
+static int bch2_repair_topology(struct bch_fs *c)
+{
+ struct btree *b;
+ unsigned i;
+ int ret = 0;
+
+ for (i = 0; i < BTREE_ID_NR && !ret; i++) {
+ b = c->btree_roots[i].b;
+ /* Fake roots are placeholders with nothing underneath: */
+ if (btree_node_fake(b))
+ continue;
+
+ six_lock_read(&b->c.lock, NULL, NULL);
+ ret = bch2_btree_repair_topology_recurse(c, b);
+ six_unlock_read(&b->c.lock);
+
+ if (ret == DROP_THIS_NODE) {
+ bch_err(c, "empty btree root - repair unimplemented");
+ ret = FSCK_ERR_EXIT;
+ }
+ }
+
+ return ret;
+}
+
+static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
+ unsigned level, bool is_root,
+ struct bkey_s_c *k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ char buf[200];
+ int ret = 0;
+
+ /*
+ * XXX
+ * use check_bucket_ref here
+ */
+ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
+ struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
+
+ if (fsck_err_on(!g->gen_valid, c,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ if (!p.ptr.cached) {
+ g2->_mark.gen = g->_mark.gen = p.ptr.gen;
+ g2->gen_valid = g->gen_valid = true;
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (fsck_err_on(data_type == BCH_DATA_btree &&
+ g->mark.gen != p.ptr.gen, c,
+ "bucket %u:%zu data type %s has metadata but wrong gen: %u != %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->mark.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ g2->_mark.data_type = g->_mark.data_type = data_type;
+ g2->gen_valid = g->gen_valid = true;
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ }
+
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->mark.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ if (!p.ptr.cached) {
+ g2->_mark.gen = g->_mark.gen = p.ptr.gen;
+ g2->gen_valid = g->gen_valid = true;
+ g2->_mark.data_type = 0;
+ g2->_mark.dirty_sectors = 0;
+ g2->_mark.cached_sectors = 0;
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ do_update = true;
+
+ if (fsck_err_on(!p.ptr.cached &&
+ gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ p.ptr.gen, g->mark.gen,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ do_update = true;
+
+ if (p.ptr.gen != g->mark.gen)
+ continue;
+
+ if (fsck_err_on(g->mark.data_type &&
+ g->mark.data_type != data_type, c,
+ "bucket %u:%zu different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_types[g->mark.data_type],
+ bch2_data_types[data_type],
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g2->_mark.data_type = g->_mark.data_type = data_type;
+ g2->gen_valid = g->gen_valid = true;
+ set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+ } else {
+ do_update = true;
+ }
+ }
+
+ if (p.has_ec) {
+ struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx);
+
+ if (fsck_err_on(!m || !m->alive, c,
+ "pointer to nonexistent stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ do_update = true;
+
+ if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c,
+ "pointer does not match stripe %llu\n"
+ "while marking %s",
+ (u64) p.ec.idx,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+ do_update = true;
+ }
+ }
+
+ if (do_update) {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *new;
+
+ if (is_root) {
+ bch_err(c, "cannot update btree roots yet");
+ return -EINVAL;
+ }
+
+ new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+ if (!new) {
+ bch_err(c, "%s: error allocating new key", __func__);
+ return -ENOMEM;
+ }
+
+ bkey_reassemble(new, *k);
+
+ if (level) {
+ /*
+ * We don't want to drop btree node pointers - if the
+ * btree node isn't there anymore, the read path will
+ * sort it out:
+ */
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+ ptr->gen = g->mark.gen;
+ }
+ } else {
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
+
+ (ptr->cached &&
+ (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+ (!ptr->cached &&
+ gen_cmp(ptr->gen, g->mark.gen) < 0) ||
+ gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
+ (g->mark.data_type &&
+ g->mark.data_type != data_type);
+ }));
+again:
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ bkey_extent_entry_for_each(ptrs, entry) {
+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
+ struct stripe *m = genradix_ptr(&c->stripes[true],
+ entry->stripe_ptr.idx);
+ union bch_extent_entry *next_ptr;
+
+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
+ goto found;
+ next_ptr = NULL;
+found:
+ if (!next_ptr) {
+ bch_err(c, "aieee, found stripe ptr with no data ptr");
+ continue;
+ }
+
+ if (!m || !m->alive ||
+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
+ &next_ptr->ptr,
+ m->sectors)) {
+ bch2_bkey_extent_entry_drop(new, entry);
+ goto again;
+ }
+ }
+ }
+ }
+
+ ret = bch2_journal_key_insert(c, btree_id, level, new);
+ if (ret)
+ kfree(new);
+ else
+ *k = bkey_i_to_s_c(new);
+ }