X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_gc.c;h=146f2428fe04ced98b545b545518d7c14903dc42;hb=34c9be19b376a007041555a7c9a47dfef3d0b1e6;hp=9f0de5cd25ab31093f5fc321a8f04ef561110ab9;hpb=b485aae1bac95e3b5be235116e2bc43da85906c5;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 9f0de5c..146f242 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet * Copyright (C) 2014 Datera Inc. @@ -46,65 +47,42 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) __gc_pos_set(c, new_pos); } -/* range_checks - for validating min/max pos of each btree node: */ - -struct range_checks { - struct range_level { - struct bpos min; - struct bpos max; - } l[BTREE_MAX_DEPTH]; - unsigned depth; -}; - -static void btree_node_range_checks_init(struct range_checks *r, unsigned depth) -{ - unsigned i; - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - r->l[i].min = r->l[i].max = POS_MIN; - r->depth = depth; -} - -static void btree_node_range_checks(struct bch_fs *c, struct btree *b, - struct range_checks *r) +static int bch2_gc_check_topology(struct bch_fs *c, + struct bkey_s_c k, + struct bpos *expected_start, + struct bpos expected_end, + bool is_last) { - struct range_level *l = &r->l[b->level]; - - struct bpos expected_min = bkey_cmp(l->min, l->max) - ? btree_type_successor(b->btree_id, l->max) - : l->max; - - bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c, - "btree node has incorrect min key: %llu:%llu != %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - expected_min.inode, - expected_min.offset); - - l->max = b->data->max_key; + int ret = 0; - if (b->level > r->depth) { - l = &r->l[b->level - 1]; + if (k.k->type == KEY_TYPE_btree_ptr_v2) { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c, - "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu", - b->data->min_key.inode, - b->data->min_key.offset, - l->min.inode, - l->min.offset); + if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, + "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", + bp.v->min_key.inode, + bp.v->min_key.offset, + expected_start->inode, + expected_start->offset)) { + BUG(); + } + } - bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c, - "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu", - b->data->max_key.inode, - b->data->max_key.offset, - l->max.inode, - l->max.offset); - - if (bkey_cmp(b->data->max_key, POS_MAX)) - l->min = l->max = - btree_type_successor(b->btree_id, - b->data->max_key); + *expected_start = bkey_cmp(k.k->p, POS_MAX) + ? bkey_successor(k.k->p) + : k.k->p; + + if (fsck_err_on(is_last && + bkey_cmp(k.k->p, expected_end), c, + "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", + k.k->p.inode, + k.k->p.offset, + expected_end.inode, + expected_end.offset)) { + BUG(); } +fsck_err: + return ret; } /* marking of btree keys/nodes: */ @@ -115,15 +93,19 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; unsigned flags = - BCH_BUCKET_MARK_GC| - (initial ? BCH_BUCKET_MARK_NOATOMIC : 0); + BTREE_TRIGGER_GC| + (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); int ret = 0; if (initial) { BUG_ON(journal_seq_verify(c) && k.k->version.lo > journal_cur_seq(&c->journal)); - if (k.k->version.lo > atomic64_read(&c->key_version)) + /* XXX change to fsck check */ + if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, + "key version number higher than recorded: %llu > %llu", + k.k->version.lo, + atomic64_read(&c->key_version))) atomic64_set(&c->key_version, k.k->version.lo); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || @@ -141,20 +123,24 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, struct bucket *g2 = PTR_BUCKET(ca, ptr, false); if (mustfix_fsck_err_on(!g->gen_valid, c, - "found ptr with missing gen in alloc btree,\n" - "type %u gen %u", - k.k->type, ptr->gen)) { + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k.k, ptr)], + ptr->gen)) { g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->_mark.dirty = g->_mark.dirty = true; g2->gen_valid = g->gen_valid = true; } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, - "%u ptr gen in the future: %u > %u", - k.k->type, ptr->gen, g->mark.gen)) { + "bucket %u:%zu data type %s ptr gen in the future: %u > %u", + ptr->dev, PTR_BUCKET_NR(ca, ptr), + bch2_data_types[ptr_data_type(k.k, ptr)], + ptr->gen, g->mark.gen)) { g2->_mark.gen = g->_mark.gen = ptr->gen; - g2->_mark.dirty = g->_mark.dirty = true; g2->gen_valid = g->gen_valid = true; + g2->_mark.data_type = 0; + g2->_mark.dirty_sectors = 0; + g2->_mark.cached_sectors = 0; set_bit(BCH_FS_FIXED_GENS, &c->flags); } } @@ -170,14 +156,15 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, *max_stale = max(*max_stale, ptr_stale(ca, ptr)); } - bch2_mark_key(c, k, true, k.k->size, NULL, 0, flags); + bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); fsck_err: return ret; } -static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, - u8 *max_stale, bool initial) +static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, + bool initial) { + struct bpos next_node_start = b->data->min_key; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; @@ -188,13 +175,25 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; - for_each_btree_node_key_unpack(b, k, &iter, - &unpacked) { + bch2_btree_node_iter_init_from_start(&iter, b); + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { bch2_bkey_debugcheck(c, b, k); ret = bch2_gc_mark_key(c, k, max_stale, initial); if (ret) break; + + bch2_btree_node_iter_advance(&iter, b); + + if (b->level) { + ret = bch2_gc_check_topology(c, k, + &next_node_start, + b->data->max_key, + bch2_btree_node_iter_end(&iter)); + if (ret) + break; + } } return ret; @@ -206,24 +205,19 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, struct btree_trans trans; struct btree_iter *iter; struct btree *b; - struct range_checks r; unsigned depth = metadata_only ? 1 : expensive_debug_checks(c) ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; - u8 max_stale; + u8 max_stale = 0; int ret = 0; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - btree_node_range_checks_init(&r, depth); - __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, depth, BTREE_ITER_PREFETCH, b) { - btree_node_range_checks(c, b, &r); - bch2_verify_btree_nr_keys(b); gc_pos_set(c, gc_pos_btree_node(b)); @@ -264,40 +258,116 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, return ret; } -static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, + struct journal_keys *journal_keys, + unsigned target_depth) { - return (int) btree_id_to_gc_phase(l) - - (int) btree_id_to_gc_phase(r); + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bpos next_node_start = b->data->min_key; + u8 max_stale = 0; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_debugcheck(c, b, k); + + BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); + + ret = bch2_gc_mark_key(c, k, &max_stale, true); + if (ret) + break; + + if (b->level) { + struct btree *child; + BKEY_PADDED(k) tmp; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + + bch2_btree_and_journal_iter_advance(&iter); + + ret = bch2_gc_check_topology(c, k, + &next_node_start, + b->data->max_key, + !bch2_btree_and_journal_iter_peek(&iter).k); + if (ret) + break; + + if (b->level > target_depth) { + child = bch2_btree_node_get_noiter(c, &tmp.k, + b->btree_id, b->level - 1); + ret = PTR_ERR_OR_ZERO(child); + if (ret) + break; + + ret = bch2_gc_btree_init_recurse(c, child, + journal_keys, target_depth); + six_unlock_read(&child->lock); + + if (ret) + break; + } + } else { + bch2_btree_and_journal_iter_advance(&iter); + } + } + + return ret; } -static int mark_journal_key(struct bch_fs *c, enum btree_id id, - struct bkey_i *insert) +static int bch2_gc_btree_init(struct bch_fs *c, + struct journal_keys *journal_keys, + enum btree_id btree_id, + bool metadata_only) { - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - u8 max_stale; + struct btree *b; + unsigned target_depth = metadata_only ? 1 + : expensive_debug_checks(c) ? 0 + : !btree_node_type_needs_gc(btree_id) ? 
1 + : 0; + u8 max_stale = 0; int ret = 0; - ret = bch2_gc_mark_key(c, bkey_i_to_s_c(insert), &max_stale, true); - if (ret) - return ret; + b = c->btree_roots[btree_id].b; - bch2_trans_init(&trans, c); + if (btree_node_fake(b)) + return 0; - for_each_btree_key(&trans, iter, id, bkey_start_pos(&insert->k), - BTREE_ITER_SLOTS, k, ret) { - percpu_down_read_preempt_disable(&c->mark_lock); - ret = bch2_mark_overwrite(&trans, iter, k, insert, NULL, - BCH_BUCKET_MARK_GC| - BCH_BUCKET_MARK_NOATOMIC); - percpu_up_read_preempt_enable(&c->mark_lock); + six_lock_read(&b->lock); + if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %llu:%llu", + b->data->min_key.inode, + b->data->min_key.offset)) { + BUG(); + } - if (!ret) - break; + if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, + "btree root with incorrect min_key: %llu:%llu", + b->data->max_key.inode, + b->data->max_key.offset)) { + BUG(); } - return bch2_trans_exit(&trans) ?: ret; + if (b->level >= target_depth) + ret = bch2_gc_btree_init_recurse(c, b, + journal_keys, target_depth); + + if (!ret) + ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), + &max_stale, true); +fsck_err: + six_unlock_read(&b->lock); + + return ret; +} + +static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +{ + return (int) btree_id_to_gc_phase(l) - + (int) btree_id_to_gc_phase(r); } static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, @@ -312,24 +382,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, for (i = 0; i < BTREE_ID_NR; i++) { enum btree_id id = ids[i]; - enum btree_node_type type = __btree_node_type(0, id); - - int ret = bch2_gc_btree(c, id, initial, metadata_only); + int ret = initial + ? bch2_gc_btree_init(c, journal_keys, + id, metadata_only) + : bch2_gc_btree(c, id, initial, metadata_only); if (ret) return ret; - - if (journal_keys && !metadata_only && - btree_node_type_needs_gc(type)) { - struct journal_key *j; - int ret; - - for_each_journal_key(*journal_keys, j) - if (j->btree_id == id) { - ret = mark_journal_key(c, id, j->k); - if (ret) - return ret; - } - } } return 0; @@ -367,9 +425,7 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, */ if (c) { lockdep_assert_held(&c->sb_lock); - percpu_down_read_preempt_disable(&c->mark_lock); - } else { - preempt_disable(); + percpu_down_read(&c->mark_lock); } for (i = 0; i < layout->nr_superblocks; i++) { @@ -391,11 +447,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, gc_phase(GC_PHASE_SB), flags); } - if (c) { - percpu_up_read_preempt_enable(&c->mark_lock); - } else { - preempt_enable(); - } + if (c) + percpu_up_read(&c->mark_lock); } static void bch2_mark_superblocks(struct bch_fs *c) @@ -407,7 +460,7 @@ static void bch2_mark_superblocks(struct bch_fs *c) gc_pos_set(c, gc_phase(GC_PHASE_SB)); for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC); + bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); mutex_unlock(&c->sb_lock); } @@ -423,8 +476,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) bch2_mark_key(c, bkey_i_to_s_c(&d->key), - true, 0, NULL, 0, - BCH_BUCKET_MARK_GC); + 0, 0, NULL, 0, + BTREE_TRIGGER_GC); mutex_unlock(&c->btree_interior_update_lock); } @@ -436,7 +489,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) size_t i, j, iter; unsigned ci; - percpu_down_read_preempt_disable(&c->mark_lock); + 
percpu_down_read(&c->mark_lock); spin_lock(&c->freelist_lock); gc_pos_set(c, gc_pos_alloc(c, NULL)); @@ -445,7 +498,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free_inc, iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); @@ -453,7 +506,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free[j], iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); } spin_unlock(&c->freelist_lock); @@ -467,12 +520,12 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) ca = bch_dev_bkey_exists(c, ob->ptr.dev); bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, gc_pos_alloc(c, ob), - BCH_BUCKET_MARK_GC); + BTREE_TRIGGER_GC); } spin_unlock(&ob->lock); } - percpu_up_read_preempt_enable(&c->mark_lock); + percpu_up_read(&c->mark_lock); } static void bch2_gc_free(struct bch_fs *c) @@ -530,7 +583,6 @@ static int bch2_gc_done(struct bch_fs *c, ": got %u, should be %u", i, b, \ dst->b[b].mark._f, src->b[b].mark._f); \ dst->b[b]._mark._f = src->b[b].mark._f; \ - dst->b[b]._mark.dirty = true; \ } #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) @@ -582,10 +634,7 @@ static int bch2_gc_done(struct bch_fs *c, copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); - if (dst->b[b].oldest_gen != src->b[b].oldest_gen) { - dst->b[b].oldest_gen = src->b[b].oldest_gen; - dst->b[b]._mark.dirty = true; - } + dst->b[b].oldest_gen = src->b[b].oldest_gen; } }; @@ -644,19 +693,16 @@ static int bch2_gc_start(struct bch_fs *c, { struct bch_dev *ca; unsigned i; - - /* - * indicate to stripe code that we need to allocate for the gc stripes - * radix tree, too - */ - gc_pos_set(c, gc_phase(GC_PHASE_START)); + int ret; BUG_ON(c->usage_gc); c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), sizeof(u64), GFP_KERNEL); - if (!c->usage_gc) + if (!c->usage_gc) { + bch_err(c, "error allocating c->usage_gc"); return -ENOMEM; + } for_each_member_device(ca, c, i) { BUG_ON(ca->buckets[1]); @@ -667,16 +713,32 @@ static int bch2_gc_start(struct bch_fs *c, GFP_KERNEL|__GFP_ZERO); if (!ca->buckets[1]) { percpu_ref_put(&ca->ref); + bch_err(c, "error allocating ca->buckets[gc]"); return -ENOMEM; } ca->usage[1] = alloc_percpu(struct bch_dev_usage); if (!ca->usage[1]) { + bch_err(c, "error allocating ca->usage[gc]"); percpu_ref_put(&ca->ref); return -ENOMEM; } } + ret = bch2_ec_mem_alloc(c, true); + if (ret) { + bch_err(c, "error allocating ec gc mem"); + return ret; + } + + percpu_down_write(&c->mark_lock); + + /* + * indicate to stripe code that we need to allocate for the gc stripes + * radix tree, too + */ + gc_pos_set(c, gc_phase(GC_PHASE_START)); + for_each_member_device(ca, c, i) { struct bucket_array *dst = __bucket_array(ca, 1); struct bucket_array *src = __bucket_array(ca, 0); @@ -701,7 +763,9 @@ static int bch2_gc_start(struct bch_fs *c, } }; - return bch2_ec_mem_alloc(c, true); + percpu_up_write(&c->mark_lock); + + return 0; } /** @@ -734,10 +798,7 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, down_write(&c->gc_lock); again: - percpu_down_write(&c->mark_lock); ret = bch2_gc_start(c, metadata_only); - percpu_up_write(&c->mark_lock); - if (ret) goto out; @@ -766,6 +827,8 @@ out: percpu_down_write(&c->mark_lock); bch2_gc_free(c); percpu_up_write(&c->mark_lock); + /* flush fsck errors, reset counters */ + 
bch2_flush_fsck_errs(c); goto again; } @@ -876,7 +939,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, return; } - as = bch2_btree_update_start(c, iter->btree_id, + as = bch2_btree_update_start(iter->trans, iter->btree_id, btree_update_reserve_required(c, parent) + nr_old_nodes, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE, @@ -918,7 +981,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, k < vstruct_last(s2) && vstruct_blocks_plus(n1->data, c->block_bits, u64s + k->u64s) <= blocks; - k = bkey_next(k)) { + k = bkey_next_skip_noops(k, vstruct_last(s2))) { last = k; u64s += k->u64s; } @@ -947,9 +1010,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, n1->key.k.p = n1->data->max_key = bkey_unpack_pos(n1, last); - n2->data->min_key = - btree_type_successor(iter->btree_id, - n1->data->max_key); + n2->data->min_key = bkey_successor(n1->data->max_key); memcpy_u64s(vstruct_last(s1), s2->start, u64s); @@ -1036,11 +1097,12 @@ next: old_nodes[i] = new_nodes[i]; } else { old_nodes[i] = NULL; - if (new_nodes[i]) - six_unlock_intent(&new_nodes[i]->lock); } } + for (i = 0; i < nr_new_nodes; i++) + six_unlock_intent(&new_nodes[i]->lock); + bch2_btree_update_done(as); bch2_keylist_free(&keylist, NULL); } @@ -1057,7 +1119,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) struct btree *merge[GC_MERGE_NODES]; u32 lock_seq[GC_MERGE_NODES]; - bch2_trans_init(&trans, c); + bch2_trans_init(&trans, c, 0, 0); /* * XXX: We don't have a good way of positively matching on sibling nodes
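
The key structural change in this diff is the replacement of the old range_checks machinery with bch2_gc_check_topology(): as GC walks an interior node, each child's min_key must pick up exactly where the previous child's max_key left off, and the last child must end at the parent's max_key. Below is a minimal standalone sketch of that invariant; struct pos, pos_successor() and check_topology() are hypothetical simplifications for illustration only, not the actual bcachefs types or API.

/*
 * Simplified sketch of the interior-node topology invariant checked by
 * bch2_gc_check_topology() in the diff above. All names here are
 * illustrative stand-ins, not the bcachefs implementation.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pos { uint64_t inode, offset; };

static int pos_cmp(struct pos a, struct pos b)
{
	if (a.inode != b.inode)
		return a.inode < b.inode ? -1 : 1;
	if (a.offset != b.offset)
		return a.offset < b.offset ? -1 : 1;
	return 0;
}

/* Next position after p in (inode, offset) order. */
static struct pos pos_successor(struct pos p)
{
	if (++p.offset == 0)
		p.inode++;
	return p;
}

struct child_range { struct pos min_key, max_key; };

/*
 * Returns true if children[0..nr) exactly tile [parent_min, parent_max]:
 * the first child starts at parent_min, each child starts right after the
 * previous one ends, and the last child ends at parent_max.
 */
static bool check_topology(struct pos parent_min, struct pos parent_max,
			   const struct child_range *children, unsigned nr)
{
	struct pos expected_start = parent_min;
	unsigned i;

	for (i = 0; i < nr; i++) {
		if (pos_cmp(children[i].min_key, expected_start)) {
			fprintf(stderr, "child %u has incorrect min_key\n", i);
			return false;
		}
		expected_start = pos_successor(children[i].max_key);
	}

	return nr && !pos_cmp(children[nr - 1].max_key, parent_max);
}

int main(void)
{
	struct child_range children[] = {
		{ { 0, 0 },   { 1, 99 } },
		{ { 1, 100 }, { 4, 0 } },	/* starts right after 1:99 */
		{ { 4, 1 },   { UINT64_MAX, UINT64_MAX } },
	};

	printf("topology %s\n",
	       check_topology((struct pos){ 0, 0 },
			      (struct pos){ UINT64_MAX, UINT64_MAX },
			      children, 3) ? "ok" : "broken");
	return 0;
}

In the diff itself the same walk is driven by bch2_btree_node_iter / bch2_btree_and_journal_iter, with expected_start threaded through bch2_gc_check_topology() and the expected end taken from the parent's max_key; mismatches are reported via fsck_err_on(), which for now falls through to BUG() rather than repairing the node.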