From 7740db24f7b169dc09938ad67d2c15771fa070da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Jan 2021 16:16:51 -0500 Subject: [PATCH] Update bcachefs sources to bee34d805c bcachefs: Repair bad data pointers --- .bcachefs_revision | 2 +- libbcachefs/bcachefs.h | 6 +- libbcachefs/bcachefs_format.h | 5 +- libbcachefs/btree_cache.c | 43 +++- libbcachefs/btree_cache.h | 2 +- libbcachefs/btree_gc.c | 370 ++++++++++++++++++++-------- libbcachefs/btree_gc.h | 3 +- libbcachefs/btree_io.c | 5 + libbcachefs/btree_update_interior.c | 1 - libbcachefs/ec.c | 5 +- libbcachefs/extents.c | 8 +- libbcachefs/journal.c | 8 +- libbcachefs/journal_io.c | 110 +++++++-- libbcachefs/journal_io.h | 4 +- libbcachefs/recovery.c | 214 +++++++++++----- libbcachefs/recovery.h | 17 +- libbcachefs/super-io.c | 8 +- libbcachefs/super.c | 1 + libbcachefs/sysfs.c | 2 +- 19 files changed, 584 insertions(+), 230 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 4902641..69ea54b 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -ffc900d5936ae538e34d18a6ce739d0a5a9178cf +bee34d805cf75e57f9380e0ee91771b9d90b2b2d diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 505777b..91b9375 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -509,7 +509,8 @@ enum { BCH_FS_ERRORS_FIXED, /* misc: */ - BCH_FS_FIXED_GENS, + BCH_FS_NEED_ANOTHER_GC, + BCH_FS_DELETED_NODES, BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, @@ -539,11 +540,13 @@ struct journal_keys { struct journal_key { enum btree_id btree_id:8; unsigned level:8; + bool allocated; struct bkey_i *k; u32 journal_seq; u32 journal_offset; } *d; size_t nr; + size_t size; u64 journal_seq_base; }; @@ -840,6 +843,7 @@ struct bch_fs { struct journal journal; struct list_head journal_entries; struct journal_keys journal_keys; + struct list_head journal_iters; u64 last_bucket_seq_cleanup; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 307d552..6dc150c 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -603,13 +603,14 @@ struct bch_btree_ptr_v2 { __u64 mem_ptr; __le64 seq; __le16 sectors_written; - /* In case we ever decide to do variable size btree nodes: */ - __le16 sectors; + __le16 flags; struct bpos min_key; struct bch_extent_ptr start[0]; __u64 _data[0]; } __attribute__((packed, aligned(8))); +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + struct bch_extent { struct bch_val v; diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index bebf9fb..4fa3f80 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -7,6 +7,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "debug.h" +#include "error.h" #include #include @@ -812,9 +813,12 @@ lock_node: return ERR_PTR(-EIO); } - EBUG_ON(b->c.btree_id != iter->btree_id || - BTREE_NODE_LEVEL(b->data) != level || - bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bkey_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); return b; } @@ -822,7 +826,8 @@ lock_node: struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, const struct bkey_i *k, enum btree_id btree_id, - unsigned level) + unsigned level, + bool nofill) { struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -837,6 +842,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { + if (nofill) + return NULL; + b = bch2_btree_node_fill(c, NULL, k, btree_id, level, SIX_LOCK_read, true); @@ -883,9 +891,12 @@ lock_node: return ERR_PTR(-EIO); } - EBUG_ON(b->c.btree_id != btree_id || - BTREE_NODE_LEVEL(b->data) != level || - bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->c.btree_id != btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + EBUG_ON(bkey_cmp(b->data->max_key, k->k.p)); + EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + bkey_cmp(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)); return b; } @@ -995,8 +1006,22 @@ out: if (sib != btree_prev_sib) swap(n1, n2); - BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), - n2->data->min_key)); + if (bkey_cmp(bkey_successor(n1->key.k.p), + n2->data->min_key)) { + char buf1[200], buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key)); + bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key)); + + bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n" + "prev: %s\n" + "next: %s\n", + bch2_btree_ids[iter->btree_id], level, + buf1, buf2); + + six_unlock_intent(&ret->c.lock); + ret = NULL; + } } bch2_btree_trans_verify_locks(trans); diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 0eeca0b..5fffae9 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, enum six_lock_type, unsigned long); struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, - enum btree_id, unsigned); + enum btree_id, unsigned, bool); struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, struct btree *, enum btree_node_sibling); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index efeaec3..bab5ebd 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -50,39 +50,199 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } +/* + * Missing: if an interior btree node is empty, we need to do something - + * perhaps just kill it + */ static int bch2_gc_check_topology(struct bch_fs *c, - struct bkey_s_c k, - struct bpos *expected_start, - struct bpos expected_end, + struct btree *b, + struct bkey_buf *prev, + struct bkey_buf cur, bool is_last) { + struct bpos node_start = b->data->min_key; + struct bpos node_end = b->data->max_key; + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? node_start + : bkey_successor(prev->k->k.p); + char buf1[200], buf2[200]; + bool update_min = false; + bool update_max = false; int ret = 0; - if (k.k->type == KEY_TYPE_btree_ptr_v2) { - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); + + if (bkey_deleted(&prev->k->k)) + scnprintf(buf1, sizeof(buf1), "start of node: %llu:%llu", + node_start.inode, + node_start.offset); + else + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k)); + + if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1, + (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) + update_min = true; + } + + if (fsck_err_on(is_last && + bkey_cmp(cur.k->k.p, node_end), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1), + (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) + update_max = true; + + bch2_bkey_buf_copy(prev, c, cur.k); + + if (update_min || update_max) { + struct bkey_i *new; + struct bkey_i_btree_ptr_v2 *bp = NULL; + struct btree *n; + + if (update_max) { + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur.k->k.p); + if (ret) + return ret; + } + + new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL); + if (!new) + return -ENOMEM; + + bkey_copy(new, cur.k); + + if (new->k.type == KEY_TYPE_btree_ptr_v2) + bp = bkey_i_to_btree_ptr_v2(new); + + if (update_min) + bp->v.min_key = expected_start; + if (update_max) + new->k.p = node_end; + if (bp) + SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true); - if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, - "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", - bp.v->min_key.inode, - bp.v->min_key.offset, - expected_start->inode, - expected_start->offset)) { - BUG(); + ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new); + if (ret) { + kfree(new); + return ret; + } + + n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id, + b->c.level - 1, true); + if (n) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, n); + + bkey_copy(&n->key, new); + if (update_min) + n->data->min_key = expected_start; + if (update_max) + n->data->max_key = node_end; + + ret = __bch2_btree_node_hash_insert(&c->btree_cache, n); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&n->c.lock); } } +fsck_err: + return ret; +} - *expected_start = bkey_cmp(k.k->p, POS_MAX) - ? bkey_successor(k.k->p) - : k.k->p; +static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); + const struct bch_extent_ptr *ptr; + bool do_update = false; + int ret = 0; - if (fsck_err_on(is_last && - bkey_cmp(k.k->p, expected_end), c, - "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", - k.k->p.inode, - k.k->p.offset, - expected_end.inode, - expected_end.offset)) { - BUG(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g2 = PTR_BUCKET(ca, ptr, false); + + if (fsck_err_on(!g->gen_valid, c, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k->k, ptr)], + ptr->gen)) { + if (!ptr->cached) { + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->gen_valid = g->gen_valid = true; + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + + if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k->k, ptr)], + ptr->gen, g->mark.gen)) { + if (!ptr->cached) { + g2->_mark.gen = g->_mark.gen = ptr->gen; + g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + } else { + do_update = true; + } + } + + if (fsck_err_on(!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0, c, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k->k, ptr)], + ptr->gen, g->mark.gen)) + do_update = true; + } + + if (do_update) { + struct bch_extent_ptr *ptr; + struct bkey_i *new; + + if (is_root) { + bch_err(c, "cannot update btree roots yet"); + return -EINVAL; + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!new) + return -ENOMEM; + + bkey_reassemble(new, *k); + + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, true); + + (ptr->cached && + (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || + (!ptr->cached && + gen_cmp(ptr->gen, g->mark.gen) < 0); + })); + + ret = bch2_journal_key_insert(c, btree_id, level, new); + if (ret) + kfree(new); + else + *k = bkey_i_to_s_c(new); } fsck_err: return ret; @@ -90,7 +250,9 @@ fsck_err: /* marking of btree keys/nodes: */ -static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, +static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c k, u8 *max_stale, bool initial) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -104,7 +266,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, BUG_ON(bch2_journal_seq_verify && k.k->version.lo > journal_cur_seq(&c->journal)); - /* XXX change to fsck check */ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, "key version number higher than recorded: %llu > %llu", k.k->version.lo, @@ -120,35 +281,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, return ret; } - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); - struct bucket *g2 = PTR_BUCKET(ca, ptr, false); - - if (mustfix_fsck_err_on(!g->gen_valid, c, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k.k, ptr)], - ptr->gen)) { - g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); - } - - if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u", - ptr->dev, PTR_BUCKET_NR(ca, ptr), - bch2_data_types[ptr_data_type(k.k, ptr)], - ptr->gen, g->mark.gen)) { - g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->gen_valid = g->gen_valid = true; - g2->_mark.data_type = 0; - g2->_mark.dirty_sectors = 0; - g2->_mark.cached_sectors = 0; - set_bit(BCH_FS_FIXED_GENS, &c->flags); - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); - } - } + ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k); } bkey_for_each_ptr(ptrs, ptr) { @@ -169,10 +302,10 @@ fsck_err: static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bool initial) { - struct bpos next_node_start = b->data->min_key; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; + struct bkey_buf prev, cur; int ret = 0; *max_stale = 0; @@ -181,26 +314,32 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, return 0; bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { bch2_bkey_debugcheck(c, b, k); - ret = bch2_gc_mark_key(c, k, max_stale, initial); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + k, max_stale, initial); if (ret) break; bch2_btree_node_iter_advance(&iter, b); if (b->c.level) { - ret = bch2_gc_check_topology(c, k, - &next_node_start, - b->data->max_key, + bch2_bkey_buf_reassemble(&cur, c, k); + + ret = bch2_gc_check_topology(c, b, &prev, cur, bch2_btree_node_iter_end(&iter)); if (ret) break; } } + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); return ret; } @@ -253,7 +392,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; if (!btree_node_fake(b)) - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + bkey_i_to_s_c(&b->key), &max_stale, initial); gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); @@ -262,18 +402,18 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, } static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, - struct journal_keys *journal_keys, unsigned target_depth) { struct btree_and_journal_iter iter; struct bkey_s_c k; - struct bpos next_node_start = b->data->min_key; - struct bkey_buf tmp; + struct bkey_buf cur, prev; u8 max_stale = 0; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); - bch2_bkey_buf_init(&tmp); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_debugcheck(c, b, k); @@ -281,50 +421,72 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); - ret = bch2_gc_mark_key(c, k, &max_stale, true); + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, + k, &max_stale, true); if (ret) break; if (b->c.level) { - struct btree *child; - - bch2_bkey_buf_reassemble(&tmp, c, k); - k = bkey_i_to_s_c(tmp.k); + bch2_bkey_buf_reassemble(&cur, c, k); + k = bkey_i_to_s_c(cur.k); bch2_btree_and_journal_iter_advance(&iter); - ret = bch2_gc_check_topology(c, k, - &next_node_start, - b->data->max_key, + ret = bch2_gc_check_topology(c, b, + &prev, cur, !bch2_btree_and_journal_iter_peek(&iter).k); if (ret) break; + } else { + bch2_btree_and_journal_iter_advance(&iter); + } + } - if (b->c.level > target_depth) { - child = bch2_btree_node_get_noiter(c, tmp.k, - b->c.btree_id, b->c.level - 1); - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; + if (b->c.level > target_depth) { + bch2_btree_and_journal_iter_exit(&iter); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + struct btree *child; + + bch2_bkey_buf_reassemble(&cur, c, k); + bch2_btree_and_journal_iter_advance(&iter); - ret = bch2_gc_btree_init_recurse(c, child, - journal_keys, target_depth); - six_unlock_read(&child->c.lock); + child = bch2_btree_node_get_noiter(c, cur.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(child); + if (fsck_err_on(ret == -EIO, c, + "unreadable btree node")) { + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur.k->k.p); if (ret) - break; + return ret; + + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + continue; } - } else { - bch2_btree_and_journal_iter_advance(&iter); + + if (ret) + break; + + ret = bch2_gc_btree_init_recurse(c, child, + target_depth); + six_unlock_read(&child->c.lock); + + if (ret) + break; } } - - bch2_bkey_buf_exit(&tmp, c); +fsck_err: + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); return ret; } static int bch2_gc_btree_init(struct bch_fs *c, - struct journal_keys *journal_keys, enum btree_id btree_id) { struct btree *b; @@ -355,11 +517,11 @@ static int bch2_gc_btree_init(struct bch_fs *c, } if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(c, b, - journal_keys, target_depth); + ret = bch2_gc_btree_init_recurse(c, b, target_depth); if (!ret) - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, + bkey_i_to_s_c(&b->key), &max_stale, true); fsck_err: six_unlock_read(&b->c.lock); @@ -373,8 +535,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) (int) btree_id_to_gc_phase(r); } -static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial) +static int bch2_gc_btrees(struct bch_fs *c, bool initial) { enum btree_id ids[BTREE_ID_NR]; unsigned i; @@ -386,8 +547,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; int ret = initial - ? bch2_gc_btree_init(c, journal_keys, - id) + ? bch2_gc_btree_init(c, id) : bch2_gc_btree(c, id, initial); if (ret) return ret; @@ -775,8 +935,7 @@ static int bch2_gc_start(struct bch_fs *c) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ -int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, - bool initial) +int bch2_gc(struct bch_fs *c, bool initial) { struct bch_dev *ca; u64 start_time = local_clock(); @@ -798,7 +957,7 @@ again: bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, journal_keys, initial); + ret = bch2_gc_btrees(c, initial); if (ret) goto out; @@ -808,16 +967,15 @@ again: bch2_mark_allocator_buckets(c); c->gc_count++; -out: - if (!ret && - (test_bit(BCH_FS_FIXED_GENS, &c->flags) || - (!iter && bch2_test_restart_gc))) { + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + (!iter && bch2_test_restart_gc)) { /* * XXX: make sure gens we fixed got saved */ if (iter++ <= 2) { - bch_info(c, "Fixed gens, restarting mark and sweep:"); - clear_bit(BCH_FS_FIXED_GENS, &c->flags); + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); percpu_down_write(&c->mark_lock); @@ -832,7 +990,7 @@ out: bch_info(c, "Unable to fix bucket gens, looping"); ret = -EINVAL; } - +out: if (!ret) { bch2_journal_block(&c->journal); @@ -1371,7 +1529,7 @@ static int bch2_gc_thread(void *arg) * Full gc is currently incompatible with btree key cache: */ #if 0 - ret = bch2_gc(c, NULL, false, false); + ret = bch2_gc(c, false, false); #else ret = bch2_gc_gens(c); #endif diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index f0435a5..fa604ef 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -6,8 +6,7 @@ void bch2_coalesce(struct bch_fs *); -struct journal_keys; -int bch2_gc(struct bch_fs *, struct journal_keys *, bool); +int bch2_gc(struct bch_fs *, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 65f7e36..91e578b 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -753,6 +753,11 @@ static int validate_bset(struct bch_fs *c, struct btree *b, struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; + if (BTREE_PTR_RANGE_UPDATED(bp)) { + b->data->min_key = bp->min_key; + b->data->max_key = b->key.k.p; + } + btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, b, NULL, "incorrect min_key: got %llu:%llu should be %llu:%llu", diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 5bb6532..8919ea6 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -297,7 +297,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev bp->v.mem_ptr = 0; bp->v.seq = b->data->keys.seq; bp->v.sectors_written = 0; - bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); } if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 9c7cc78..086897c 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -744,7 +744,6 @@ err: static int ec_stripe_bkey_update(struct btree_trans *trans, struct bkey_i_stripe *new) { - struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_s_c k; const struct bch_stripe *existing; @@ -759,7 +758,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, goto err; if (!k.k || k.k->type != KEY_TYPE_stripe) { - bch_err(c, "error updating stripe: not found"); + bch_err(trans->c, "error updating stripe: not found"); ret = -ENOENT; goto err; } @@ -767,7 +766,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, existing = bkey_s_c_to_stripe(k).v; if (existing->nr_blocks != new->v.nr_blocks) { - bch_err(c, "error updating stripe: nr_blocks does not match"); + bch_err(trans->c, "error updating stripe: nr_blocks does not match"); ret = -EINVAL; goto err; } diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index c0ae312..67ba2c2 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -215,9 +215,8 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - pr_buf(out, "seq %llx sectors %u written %u min_key ", + pr_buf(out, "seq %llx written %u min_key ", le64_to_cpu(bp.v->seq), - le16_to_cpu(bp.v->sectors), le16_to_cpu(bp.v->sectors_written)); bch2_bpos_to_text(out, bp.v->min_key); @@ -1082,10 +1081,9 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) unsigned nonce = UINT_MAX; unsigned i; - if (k.k->type == KEY_TYPE_btree_ptr) + if (k.k->type == KEY_TYPE_btree_ptr || + k.k->type == KEY_TYPE_btree_ptr_v2) size_ondisk = c->opts.btree_node_size; - if (k.k->type == KEY_TYPE_btree_ptr_v2) - size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index d6273c8..a7c5f5f 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -1011,13 +1011,19 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, } list_for_each_entry(i, journal_entries, list) { + unsigned ptr; + seq = le64_to_cpu(i->j.seq); BUG_ON(seq >= cur_seq); if (seq < last_seq) continue; - journal_seq_pin(j, seq)->devs = i->devs; + p = journal_seq_pin(j, seq); + + p->devs.nr = 0; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); } spin_lock(&j->lock); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 750f6fa..f5264d1 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -46,15 +46,16 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct bch_extent_ptr entry_ptr, struct journal_list *jlist, struct jset *j, bool bad) { - struct journal_replay *i, *pos; - struct bch_devs_list devs = { .nr = 0 }; + struct journal_replay *i, *pos, *dup = NULL; + struct bch_extent_ptr *ptr; struct list_head *where; size_t bytes = vstruct_bytes(j); u64 last_seq = 0; - int ret; + int ret = JOURNAL_ENTRY_ADD_OK; list_for_each_entry_reverse(i, jlist->head, list) { if (!JSET_NO_FLUSH(&i->j)) { @@ -88,28 +89,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, where = jlist->head; add: - i = where->next != jlist->head + dup = where->next != jlist->head ? container_of(where->next, struct journal_replay, list) : NULL; + if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq)) + dup = NULL; + /* * Duplicate journal entries? If so we want the one that didn't have a * checksum error: */ - if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { - if (i->bad) { - devs = i->devs; - __journal_replay_free(i); + if (dup) { + if (dup->bad) { + /* we'll replace @dup: */ } else if (bad) { goto found; } else { - fsck_err_on(bytes != vstruct_bytes(&i->j) || - memcmp(j, &i->j, bytes), c, + fsck_err_on(bytes != vstruct_bytes(&dup->j) || + memcmp(j, &dup->j, bytes), c, "found duplicate but non identical journal entries (seq %llu)", le64_to_cpu(j->seq)); goto found; } - } i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); @@ -118,17 +120,34 @@ add: goto out; } - list_add(&i->list, where); - i->devs = devs; - i->bad = bad; - i->ignore = false; + i->nr_ptrs = 0; + i->bad = bad; + i->ignore = false; memcpy(&i->j, j, bytes); + + if (dup) { + i->nr_ptrs = dup->nr_ptrs; + memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); + __journal_replay_free(dup); + } + + list_add(&i->list, where); found: - if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) - bch2_dev_list_add_dev(&i->devs, ca->dev_idx); - else - fsck_err_on(1, c, "duplicate journal entries on same device"); - ret = JOURNAL_ENTRY_ADD_OK; + for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; i++) { + if (ptr->dev == ca->dev_idx) { + bch_err(c, "duplicate journal entry %llu on same device", + le64_to_cpu(i->j.seq)); + goto out; + } + } + + if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + goto out; + } + + i->ptrs[i->nr_ptrs++] = entry_ptr; out: fsck_err: return ret; @@ -654,7 +673,10 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, jlist, j, ret != 0); + ret = journal_entry_add(c, ca, (struct bch_extent_ptr) { + .dev = ca->dev_idx, + .offset = offset, + }, jlist, j, ret != 0); mutex_unlock(&jlist->lock); switch (ret) { @@ -742,6 +764,23 @@ err: goto out; } +static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + unsigned i; + + for (i = 0; i < j->nr_ptrs; i++) { + struct bch_dev *ca = c->devs[j->ptrs[i].dev]; + + if (i) + pr_buf(out, " "); + pr_buf(out, "%u:%llu (offset %llu)", + j->ptrs[i].dev, + (u64) j->ptrs[i].offset, + (u64) j->ptrs[i].offset % ca->mi.bucket_size); + } +} + int bch2_journal_read(struct bch_fs *c, struct list_head *list, u64 *blacklist_seq, u64 *start_seq) { @@ -839,6 +878,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, while (seq < le64_to_cpu(i->j.seq)) { u64 missing_start, missing_end; + char buf1[200], buf2[200]; while (seq < le64_to_cpu(i->j.seq) && bch2_journal_seq_is_blacklisted(c, seq, false)) @@ -853,10 +893,23 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, !bch2_journal_seq_is_blacklisted(c, seq, false)) seq++; + if (i->list.prev != list) { + struct printbuf out = PBUF(buf1); + struct journal_replay *p = list_prev_entry(i, list); + + bch2_journal_ptrs_to_text(&out, c, p); + pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits)); + } else + sprintf(buf1, "(none)"); + bch2_journal_ptrs_to_text(&PBUF(buf2), c, i); + missing_end = seq - 1; - fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)", + fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" + " prev at %s\n" + " next at %s", missing_start, missing_end, - last_seq, *blacklist_seq - 1); + last_seq, *blacklist_seq - 1, + buf1, buf2); } seq++; @@ -865,7 +918,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, list_for_each_entry(i, list, list) { struct jset_entry *entry; struct bkey_i *k, *_n; - struct bch_replicas_padded replicas; + struct bch_replicas_padded replicas = { + .e.data_type = BCH_DATA_journal, + .e.nr_required = 1, + }; + unsigned ptr; char buf[80]; if (i->ignore) @@ -875,13 +932,14 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list, if (ret) goto fsck_err; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + /* * If we're mounting in degraded mode - if we didn't read all * the devices - this is wrong: */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); - if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 6b4c809..a4931ab 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -8,7 +8,9 @@ */ struct journal_replay { struct list_head list; - struct bch_devs_list devs; + struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + /* checksum error, but we may want to try using it anyways: */ bool bad; bool ignore; diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 422f2fb..f470e0e 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -40,78 +40,169 @@ static void drop_alloc_keys(struct journal_keys *keys) /* iterate over keys read from the journal: */ -static struct journal_key *journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, unsigned level, - struct bpos pos) +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bkey_cmp(l_pos, r->k->k.p)); +} + +static int journal_key_cmp(struct journal_key *l, struct journal_key *r) +{ + return (cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->level, r->level) ?: + bkey_cmp(l->k->k.p, r->k->k.p)); +} + +static size_t journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { size_t l = 0, r = journal_keys->nr, m; while (l < r) { m = l + ((r - l) >> 1); - if ((cmp_int(id, journal_keys->d[m].btree_id) ?: - cmp_int(level, journal_keys->d[m].level) ?: - bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) + if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0) l = m + 1; else r = m; } BUG_ON(l < journal_keys->nr && - (cmp_int(id, journal_keys->d[l].btree_id) ?: - cmp_int(level, journal_keys->d[l].level) ?: - bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); + __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0); BUG_ON(l && - (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: - cmp_int(level, journal_keys->d[l - 1].level) ?: - bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); + __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0); - return l < journal_keys->nr ? journal_keys->d + l : NULL; + return l; +} + +static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx) +{ + struct bkey_i *n = iter->keys->d[idx].k; + struct btree_and_journal_iter *biter = + container_of(iter, struct btree_and_journal_iter, journal); + + if (iter->idx > idx || + (iter->idx == idx && + biter->last && + bkey_cmp(n->k.p, biter->unpacked.p) <= 0)) + iter->idx++; +} + +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true + }; + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + unsigned idx = journal_key_search(keys, id, level, k->k.p); + + if (idx < keys->nr && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } + + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, + .size = keys->size * 2, + .journal_seq_base = keys->journal_seq_base, + }; + + new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL); + if (!new_keys.d) + return -ENOMEM; + + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + *keys = new_keys; + } + + array_insert_item(keys->d, keys->nr, idx, n); + + list_for_each_entry(iter, &c->journal_iters, list) + journal_iter_fix(c, iter, idx); + + return 0; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i *whiteout = + kmalloc(sizeof(struct bkey), GFP_KERNEL); + int ret; + + if (!whiteout) + return -ENOMEM; + + bkey_init(&whiteout->k); + whiteout->k.p = pos; + + ret = bch2_journal_key_insert(c, id, level, whiteout); + if (ret) + kfree(whiteout); + return ret; } static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { - if (iter->k && - iter->k < iter->keys->d + iter->keys->nr && - iter->k->btree_id == iter->btree_id && - iter->k->level == iter->level) - return iter->k->k; + struct journal_key *k = iter->idx - iter->keys->nr + ? iter->keys->d + iter->idx : NULL; + + if (k && + k->btree_id == iter->btree_id && + k->level == iter->level) + return k->k; - iter->k = NULL; + iter->idx = iter->keys->nr; return NULL; } static void bch2_journal_iter_advance(struct journal_iter *iter) { - if (iter->k) - iter->k++; + if (iter->idx < iter->keys->nr) + iter->idx++; +} + +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); } -static void bch2_journal_iter_init(struct journal_iter *iter, - struct journal_keys *journal_keys, +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, enum btree_id id, unsigned level, struct bpos pos) { iter->btree_id = id; iter->level = level; - iter->keys = journal_keys; - iter->k = journal_key_search(journal_keys, id, level, pos); + iter->keys = &c->journal_keys; + iter->idx = journal_key_search(&c->journal_keys, id, level, pos); + list_add(&iter->list, &c->journal_iters); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) { - return iter->btree - ? bch2_btree_iter_peek(iter->btree) - : bch2_btree_node_iter_peek_unpack(&iter->node_iter, - iter->b, &iter->unpacked); + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); } static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) { - if (iter->btree) - bch2_btree_iter_next(iter->btree); - else - bch2_btree_node_iter_advance(&iter->node_iter, iter->b); + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); } void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) @@ -160,7 +251,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * if (iter->b && bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { - iter->journal.k = NULL; + iter->journal.idx = iter->journal.keys->nr; iter->last = none; return bkey_s_c_null; } @@ -181,26 +272,20 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter * return bch2_btree_and_journal_iter_peek(iter); } -void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, - struct btree_trans *trans, - struct journal_keys *journal_keys, - enum btree_id id, struct bpos pos) +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) { - memset(iter, 0, sizeof(*iter)); - - iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH); - bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); + bch2_journal_iter_exit(&iter->journal); } void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct journal_keys *journal_keys, + struct bch_fs *c, struct btree *b) { memset(iter, 0, sizeof(*iter)); iter->b = b; bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); - bch2_journal_iter_init(&iter->journal, journal_keys, + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, b->data->min_key); } @@ -244,7 +329,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b int ret = 0; bch2_bkey_buf_init(&tmp); - bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ret = key_fn(c, btree_id, b->c.level, k); @@ -257,7 +342,8 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b bch2_btree_and_journal_iter_advance(&iter); child = bch2_btree_node_get_noiter(c, tmp.k, - b->c.btree_id, b->c.level - 1); + b->c.btree_id, b->c.level - 1, + false); ret = PTR_ERR_OR_ZERO(child); if (ret) @@ -277,6 +363,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b } } + bch2_btree_and_journal_iter_exit(&iter); bch2_bkey_buf_exit(&tmp, c); return ret; } @@ -333,6 +420,12 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) void bch2_journal_keys_free(struct journal_keys *keys) { + struct journal_key *i; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); + kvfree(keys->d); keys->d = NULL; keys->nr = 0; @@ -361,7 +454,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries) nr_keys++; } - keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); + keys.size = roundup_pow_of_two(nr_keys); + + keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL); if (!keys.d) goto err; @@ -545,14 +640,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans, return ret; } -static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) +static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) { - return bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY, - __bch2_journal_replay_key(&trans, id, level, k)); + unsigned commit_flags = BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW; + + if (!k->allocated) + commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; + + return bch2_trans_do(c, NULL, NULL, commit_flags, + __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); } static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) @@ -628,7 +725,7 @@ static int bch2_journal_replay(struct bch_fs *c, if (i->level) { j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + ret = bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -658,7 +755,7 @@ static int bch2_journal_replay(struct bch_fs *c, ret = i->k->k.size ? bch2_extent_replay_key(c, i->btree_id, i->k) - : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); + : bch2_journal_replay_key(c, i); if (ret) goto err; } @@ -670,7 +767,8 @@ static int bch2_journal_replay(struct bch_fs *c, bch2_journal_flush_all_pins(j); return bch2_journal_error(j); err: - bch_err(c, "journal replay: error %d while replaying key", ret); + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", + ret, bch2_btree_ids[i->btree_id], i->level); return ret; } @@ -1105,7 +1203,7 @@ use_clean: test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; - ret = bch2_gc(c, &c->journal_keys, true); + ret = bch2_gc(c, true); if (ret) goto err; bch_verbose(c, "mark and sweep done"); diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index a66827c..fa91851 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -6,10 +6,11 @@ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) struct journal_iter { + struct list_head list; enum btree_id btree_id; unsigned level; + size_t idx; struct journal_keys *keys; - struct journal_key *k; }; /* @@ -17,8 +18,6 @@ struct journal_iter { */ struct btree_and_journal_iter { - struct btree_iter *btree; - struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -32,16 +31,18 @@ struct btree_and_journal_iter { } last; }; +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); -void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, - struct btree_trans *, - struct journal_keys *, - enum btree_id, struct bpos); +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct journal_keys *, + struct bch_fs *, struct btree *); typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 78835bd..751efd2 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -276,19 +276,19 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) return "Bad number of member devices"; if (!BCH_SB_META_REPLICAS_WANT(sb) || - BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; if (!BCH_SB_META_REPLICAS_REQ(sb) || - BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) + BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; if (!BCH_SB_DATA_REPLICAS_WANT(sb) || - BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) + BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) return "Invalid number of data replicas"; if (!BCH_SB_DATA_REPLICAS_REQ(sb) || - BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) + BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) return "Invalid number of data replicas"; if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 9f7a6f1..f3c12d8 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -684,6 +684,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_blacklist_entries_gc); INIT_LIST_HEAD(&c->journal_entries); + INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 4fc5777..80964bd 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -475,7 +475,7 @@ STORE(bch2_fs) */ #if 0 down_read(&c->state_lock); - bch2_gc(c, NULL, false, false); + bch2_gc(c, false, false); up_read(&c->state_lock); #else bch2_gc_gens(c); -- 2.39.5