From 209695dedf49425ad9e91ba2b2239c3a040ea159 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Sun, 4 Apr 2021 22:12:56 -0400
Subject: [PATCH] Update bcachefs sources to f26267fc82 bcachefs: kill
 bset_tree->max_key

---
 .bcachefs_revision                  |   2 +-
 libbcachefs/bcachefs.h              |   6 +-
 libbcachefs/bcachefs_format.h       |  14 +-
 libbcachefs/bset.c                  |  36 +--
 libbcachefs/btree_cache.c           | 134 +--------
 libbcachefs/btree_cache.h           |   5 +-
 libbcachefs/btree_gc.c              |   7 +-
 libbcachefs/btree_io.c              |   3 +
 libbcachefs/btree_iter.c            | 201 +++++++-------
 libbcachefs/btree_iter.h            |  25 +-
 libbcachefs/btree_key_cache.c       |  36 ++-
 libbcachefs/btree_key_cache.h       |  12 +-
 libbcachefs/btree_locking.h         |  24 +-
 libbcachefs/btree_types.h           |  13 +-
 libbcachefs/btree_update_interior.c | 415 +++++++++++-----------------
 libbcachefs/btree_update_interior.h |  28 +-
 libbcachefs/btree_update_leaf.c     | 160 +++++++----
 libbcachefs/buckets.c               | 277 +++++++------------
 libbcachefs/buckets.h               |  22 +-
 libbcachefs/buckets_types.h         |  28 +-
 libbcachefs/chardev.c               |   6 +-
 libbcachefs/journal.c               |  57 ++--
 libbcachefs/journal.h               |  46 +--
 libbcachefs/journal_reclaim.c       | 123 ++++++---
 libbcachefs/journal_types.h         |  10 +-
 libbcachefs/migrate.c               |   9 +-
 libbcachefs/move.c                  |   7 +-
 libbcachefs/opts.c                  |   5 +
 libbcachefs/opts.h                  |   1 +
 libbcachefs/recovery.c              |  30 +-
 libbcachefs/replicas.c              |  54 +++-
 libbcachefs/replicas.h              |  25 ++
 libbcachefs/super-io.c              |   9 +-
 libbcachefs/sysfs.c                 |  16 +-
 linux/six.c                         |   3 +-
 35 files changed, 861 insertions(+), 988 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 2e71c6c..7a002c2 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-9922afc8b6d6227f4193feef6442f8c3d881f78c
+f26267fc82539ef3390cf2bb2bc818436dd504c7
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 4133651..549cded 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -690,10 +690,11 @@ struct bch_fs {
 	struct bch_fs_usage		*usage_base;
 	struct bch_fs_usage __percpu	*usage[JOURNAL_BUF_NR];
 	struct bch_fs_usage __percpu	*usage_gc;
+	u64 __percpu			*online_reserved;
 
 	/* single element mempool: */
 	struct mutex			usage_scratch_lock;
-	struct bch_fs_usage		*usage_scratch;
+	struct bch_fs_usage_online	*usage_scratch;
 
 	struct io_clock			io_clock[2];
 
@@ -804,6 +805,9 @@ struct bch_fs {
 	struct bio_set			dio_write_bioset;
 	struct bio_set			dio_read_bioset;
+
+	atomic64_t			btree_writes_nr;
+	atomic64_t			btree_writes_sectors;
 
 	struct bio_list			btree_write_error_list;
 	struct work_struct		btree_write_error_work;
 	spinlock_t			btree_write_error_lock;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index cb22595..ead7268 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1398,11 +1398,17 @@ enum bch_sb_feature {
 	BCH_FEATURE_NR,
 };
 
+#define BCH_SB_COMPAT()					\
+	x(alloc_info,					0)	\
+	x(alloc_metadata,				1)	\
+	x(extents_above_btree_updates_done,		2)	\
+	x(bformat_overflow_done,			3)
+
 enum bch_sb_compat {
-	BCH_COMPAT_FEAT_ALLOC_INFO		= 0,
-	BCH_COMPAT_FEAT_ALLOC_METADATA		= 1,
-	BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE = 2,
-	BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE	= 3,
+#define x(f, n) BCH_COMPAT_##f,
+	BCH_SB_COMPAT()
+#undef x
+	BCH_COMPAT_NR,
 };
 
 /* options: */
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 3fb9a9e..f92a757 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -698,7 +698,7 @@ static void make_bfloat(struct btree *b, struct bset_tree *t,
 	if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
 		k = (void *) max_key;
 		bkey_init(&k->k);
-		k->k.p = t->max_key;
+		k->k.p = 
b->data->max_key; } } @@ -782,8 +782,6 @@ retry: while (k != btree_bkey_last(b, t)) prev = k, k = bkey_next(k); - t->max_key = bkey_unpack_pos(b, prev); - if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { bkey_init(&min_key.k); min_key.k.p = b->data->min_key; @@ -791,7 +789,7 @@ retry: if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { bkey_init(&max_key.k); - max_key.k.p = t->max_key; + max_key.k.p = b->data->max_key; } /* Then we build the tree */ @@ -970,8 +968,6 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b, min_key.u64s = max_key.u64s = 0; if (bkey_next(k) == btree_bkey_last(b, t)) { - t->max_key = bkey_unpack_pos(b, k); - for (j = 1; j < t->size; j = j * 2 + 1) make_bfloat(b, t, j, &min_key, &max_key); } @@ -1311,16 +1307,6 @@ struct bkey_packed *__bch2_bset_search(struct btree *b, case BSET_RW_AUX_TREE: return bset_search_write_set(b, t, search); case BSET_RO_AUX_TREE: - /* - * Each node in the auxiliary search tree covers a certain range - * of bits, and keys above and below the set it covers might - * differ outside those bits - so we have to special case the - * start and end - handle that here: - */ - - if (bpos_cmp(*search, t->max_key) > 0) - return btree_bkey_last(b, t); - return bset_search_tree(b, t, search, lossy_packed_search); default: unreachable(); @@ -1357,23 +1343,6 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, return m; } -/* - * Returns the first key greater than or equal to @search - */ -static __always_inline __flatten -struct bkey_packed *bch2_bset_search(struct btree *b, - struct bset_tree *t, - struct bpos *search, - struct bkey_packed *packed_search, - const struct bkey_packed *lossy_packed_search) -{ - struct bkey_packed *m = __bch2_bset_search(b, t, search, - lossy_packed_search); - - return bch2_bset_search_linear(b, t, search, - packed_search, lossy_packed_search, m); -} - /* Btree node iterator */ static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, @@ -1469,6 +1438,7 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, unsigned i; EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); + EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0); bset_aux_tree_verify(b); memset(iter, 0, sizeof(*iter)); diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 8a4667b..1abc50f 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -906,136 +906,6 @@ out: return b; } -struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, - struct btree_iter *iter, - struct btree *b, - enum btree_node_sibling sib) -{ - struct btree_trans *trans = iter->trans; - struct btree *parent; - struct btree_node_iter node_iter; - struct bkey_packed *k; - struct bkey_buf tmp; - struct btree *ret = NULL; - unsigned level = b->c.level; - - bch2_bkey_buf_init(&tmp); - - parent = btree_iter_node(iter, level + 1); - if (!parent) - return NULL; - - /* - * There's a corner case where a btree_iter might have a node locked - * that is just outside its current pos - when - * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. 
- * - * But the lock ordering checks in __bch2_btree_node_lock() go off of - * iter->pos, not the node's key: so if the iterator is marked as - * needing to be traversed, we risk deadlock if we don't bail out here: - */ - if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return ERR_PTR(-EINTR); - - if (!bch2_btree_node_relock(iter, level + 1)) { - ret = ERR_PTR(-EINTR); - goto out; - } - - node_iter = iter->l[parent->c.level].iter; - - k = bch2_btree_node_iter_peek_all(&node_iter, parent); - BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); - - k = sib == btree_prev_sib - ? bch2_btree_node_iter_prev(&node_iter, parent) - : (bch2_btree_node_iter_advance(&node_iter, parent), - bch2_btree_node_iter_peek(&node_iter, parent)); - if (!k) - goto out; - - bch2_bkey_buf_unpack(&tmp, c, parent, k); - - ret = bch2_btree_node_get(c, iter, tmp.k, level, - SIX_LOCK_intent, _THIS_IP_); - - if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { - struct btree_iter *linked; - - if (!bch2_btree_node_relock(iter, level + 1)) - goto out; - - /* - * We might have got -EINTR because trylock failed, and we're - * holding other locks that would cause us to deadlock: - */ - trans_for_each_iter(trans, linked) - if (btree_iter_lock_cmp(iter, linked) < 0) - __bch2_btree_iter_unlock(linked); - - if (sib == btree_prev_sib) - btree_node_unlock(iter, level); - - ret = bch2_btree_node_get(c, iter, tmp.k, level, - SIX_LOCK_intent, _THIS_IP_); - - /* - * before btree_iter_relock() calls btree_iter_verify_locks(): - */ - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); - - if (!bch2_btree_node_relock(iter, level)) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); - - if (!IS_ERR(ret)) { - six_unlock_intent(&ret->c.lock); - ret = ERR_PTR(-EINTR); - } - } - - bch2_trans_relock(trans); - } -out: - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); - - if (PTR_ERR_OR_ZERO(ret) == -EINTR) - bch2_btree_iter_upgrade(iter, level + 2); - - BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); - - if (!IS_ERR_OR_NULL(ret)) { - struct btree *n1 = ret, *n2 = b; - - if (sib != btree_prev_sib) - swap(n1, n2); - - if (bpos_cmp(bpos_successor(n1->key.k.p), - n2->data->min_key)) { - char buf1[200], buf2[200]; - - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key)); - bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key)); - - bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n" - "prev: %s\n" - "next: %s\n", - bch2_btree_ids[iter->btree_id], level, - buf1, buf2); - - six_unlock_intent(&ret->c.lock); - ret = NULL; - } - } - - bch2_btree_trans_verify_locks(trans); - - bch2_bkey_buf_exit(&tmp, c); - - return ret; -} - void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, enum btree_id btree_id, unsigned level) @@ -1075,7 +945,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " format: u64s %u fields %u %u %u %u %u\n" " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %zu)\n" + " sib u64s: %u, %u (merge threshold %u)\n" " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" @@ -1092,7 +962,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], - BTREE_FOREGROUND_MERGE_THRESHOLD(c), + c->btree_foreground_merge_threshold, b->nr.packed_keys, b->nr.unpacked_keys, stats.floats, diff --git 
a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 2179886..4791c3b 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, enum btree_id, unsigned, bool); -struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, - struct btree *, enum btree_node_sibling); - void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, const struct bkey_i *, enum btree_id, unsigned); @@ -92,7 +89,7 @@ static inline unsigned btree_blocks(struct bch_fs *c) #define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) #define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ - (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) #define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 88c549c..268e007 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -779,7 +779,7 @@ static int bch2_gc_done(struct bch_fs *c, { struct bch_dev *ca; bool verify = (!initial || - (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); + (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); unsigned i, dev; int ret = 0; @@ -1297,11 +1297,10 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, return; } - as = bch2_btree_update_start(iter->trans, iter->btree_id, + as = bch2_btree_update_start(iter, old_nodes[0]->c.level, btree_update_reserve_required(c, parent) + nr_old_nodes, BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - NULL); + BTREE_INSERT_USE_RESERVE); if (IS_ERR(as)) { trace_btree_gc_coalesce_fail(c, BTREE_GC_COALESCE_FAIL_RESERVE_GET); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 7fbacd9..ec1290f 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1547,6 +1547,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, b->written += sectors_to_write; + atomic64_inc(&c->btree_writes_nr); + atomic64_add(sectors_to_write, &c->btree_writes_sectors); + /* XXX: submitting IO with btree locks held: */ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k); bch2_bkey_buf_exit(&k, c); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 8190e73..425c9ad 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -12,6 +12,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "replicas.h" #include #include @@ -238,6 +239,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, struct btree_iter *linked, *deadlock_iter = NULL; u64 start_time = local_clock(); unsigned reason = 9; + bool ret; /* Check if it's safe to block: */ trans_for_each_iter(trans, linked) { @@ -258,17 +260,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - if (!(trans->nounlock)) { - linked->locks_want = max_t(unsigned, - linked->locks_want, - __fls(linked->nodes_locked) + 1); - if (!btree_iter_get_locks(linked, true, false)) { - deadlock_iter = linked; - reason = 1; - } - } else { + linked->locks_want = max_t(unsigned, + linked->locks_want, + __fls(linked->nodes_locked) + 1); + if (!btree_iter_get_locks(linked, true, false)) { deadlock_iter = linked; - reason = 2; + reason = 1; } } @@ -298,18 +295,13 @@ bool 
__bch2_btree_node_lock(struct btree *b, struct bpos pos, * we're about to lock, it must have the ancestors locked too: */ if (level > __fls(linked->nodes_locked)) { - if (!(trans->nounlock)) { - linked->locks_want = - max(level + 1, max_t(unsigned, - linked->locks_want, - iter->locks_want)); - if (!btree_iter_get_locks(linked, true, false)) { - deadlock_iter = linked; - reason = 5; - } - } else { + linked->locks_want = + max(level + 1, max_t(unsigned, + linked->locks_want, + iter->locks_want)); + if (!btree_iter_get_locks(linked, true, false)) { deadlock_iter = linked; - reason = 6; + reason = 5; } } @@ -346,12 +338,23 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, if (six_trylock_type(&b->c.lock, type)) return true; - if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) - return false; +#ifdef CONFIG_BCACHEFS_DEBUG + trans->locking_iter_idx = iter->idx; + trans->locking_pos = pos; + trans->locking_btree_id = iter->btree_id; + trans->locking_level = level; + trans->locking = b; +#endif - bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], - start_time); - return true; + ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans->locking = NULL; +#endif + if (ret) + bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], + start_time); + return ret; } /* Btree iterator locking: */ @@ -421,50 +424,25 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, return false; } -bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, - unsigned new_locks_want) +void __bch2_btree_iter_downgrade(struct btree_iter *iter, + unsigned new_locks_want) { - unsigned l = iter->level; + unsigned l; - EBUG_ON(iter->locks_want >= new_locks_want); + EBUG_ON(iter->locks_want < new_locks_want); iter->locks_want = new_locks_want; - do { - if (!btree_iter_node(iter, l)) - break; - - if (!bch2_btree_node_upgrade(iter, l)) { - iter->locks_want = l; - return false; - } - - l++; - } while (l < iter->locks_want); - - return true; -} - -void __bch2_btree_iter_downgrade(struct btree_iter *iter, - unsigned downgrade_to) -{ - unsigned l, new_locks_want = downgrade_to ?: - (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); - - if (iter->locks_want < downgrade_to) { - iter->locks_want = new_locks_want; - - while (iter->nodes_locked && - (l = __fls(iter->nodes_locked)) >= iter->locks_want) { - if (l > iter->level) { - btree_node_unlock(iter, l); - } else { - if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->l[l].b->c.lock); - iter->nodes_intent_locked ^= 1 << l; - } - break; + while (iter->nodes_locked && + (l = __fls(iter->nodes_locked)) >= iter->locks_want) { + if (l > iter->level) { + btree_node_unlock(iter, l); + } else { + if (btree_node_intent_locked(iter, l)) { + six_lock_downgrade(&iter->l[l].b->c.lock); + iter->nodes_intent_locked ^= 1 << l; } + break; } } @@ -484,13 +462,12 @@ void bch2_trans_downgrade(struct btree_trans *trans) bool bch2_trans_relock(struct btree_trans *trans) { struct btree_iter *iter; - bool ret = true; trans_for_each_iter(trans, iter) - if (iter->uptodate == BTREE_ITER_NEED_RELOCK) - ret &= bch2_btree_iter_relock(iter, true); - - return ret; + if (btree_iter_keep(trans, iter) && + !bch2_btree_iter_relock(iter, true)) + return false; + return true; } void bch2_trans_unlock(struct btree_trans *trans) @@ -1027,7 +1004,7 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) trans_for_each_iter(iter->trans, linked) if (linked->l[level].b == b) { - __btree_node_unlock(linked, level); + btree_node_unlock(linked, level); linked->l[level].b = BTREE_ITER_NO_NODE_DROP; } } @@ -2008,6 +1985,8 @@ static inline void btree_iter_copy(struct btree_iter *dst, struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, unsigned flags) { struct btree_iter *iter, *best = NULL; @@ -2020,10 +1999,6 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, pos.snapshot = btree_type_has_snapshots(btree_id) ? 
U32_MAX : 0; - /* We always want a fresh iterator for node iterators: */ - if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_NODES) - goto alloc_iter; - trans_for_each_iter(trans, iter) { if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) continue; @@ -2038,7 +2013,7 @@ struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, best = iter; } -alloc_iter: + if (!best) { iter = btree_trans_iter_alloc(trans); bch2_btree_iter_init(trans, iter, btree_id); @@ -2062,10 +2037,25 @@ alloc_iter: iter->snapshot = pos.snapshot; - if (!(iter->flags & BTREE_ITER_INTENT)) - bch2_btree_iter_downgrade(iter); - else if (!iter->locks_want) - __bch2_btree_iter_upgrade_nounlock(iter, 1); + locks_want = min(locks_want, BTREE_MAX_DEPTH); + + if (locks_want > iter->locks_want) { + iter->locks_want = locks_want; + btree_iter_get_locks(iter, true, false); + } else if (locks_want < iter->locks_want) { + __bch2_btree_iter_downgrade(iter, locks_want); + } + + while (iter->level < depth) { + btree_node_unlock(iter, iter->level); + iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + iter->level++; + } + + while (iter->level > depth) + iter->l[--iter->level].b = BTREE_ITER_NO_NODE_INIT; + + iter->min_depth = depth; bch2_btree_iter_set_pos(iter, pos); btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); @@ -2082,21 +2072,16 @@ struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, { struct btree_iter *iter = __bch2_trans_get_iter(trans, btree_id, pos, - BTREE_ITER_NODES| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - flags); - unsigned i; + locks_want, depth, + BTREE_ITER_NODES| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + flags); BUG_ON(bkey_cmp(iter->pos, pos)); - - iter->locks_want = locks_want; - iter->level = depth; - iter->min_depth = depth; - - for (i = 0; i < ARRAY_SIZE(iter->l); i++) - iter->l[i].b = NULL; - iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; + BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->level != depth); + BUG_ON(iter->min_depth != depth); iter->ip_allocated = _RET_IP_; return iter; @@ -2304,11 +2289,24 @@ bch2_btree_iter_node_to_text(struct printbuf *out, struct btree_bkey_cached_common *_b, enum btree_iter_type type) { - pr_buf(out, " %px l=%u %s:", - _b, _b->level, bch2_btree_ids[_b->btree_id]); + pr_buf(out, " l=%u %s:", + _b->level, bch2_btree_ids[_b->btree_id]); bch2_bpos_to_text(out, btree_node_pos(_b, type)); } +#ifdef CONFIG_BCACHEFS_DEBUG +static bool trans_has_btree_nodes_locked(struct btree_trans *trans) +{ + struct btree_iter *iter; + + trans_for_each_iter(trans, iter) + if (btree_iter_type(iter) != BTREE_ITER_CACHED && + iter->nodes_locked) + return true; + return false; +} +#endif + void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -2319,14 +2317,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); + if (!trans_has_btree_nodes_locked(trans)) + continue; + + pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); trans_for_each_iter(trans, iter) { if (!iter->nodes_locked) continue; - pr_buf(out, " iter %u %s:", + pr_buf(out, " iter %u %c %s:", iter->idx, + btree_iter_type(iter) == BTREE_ITER_CACHED ? 
'c' : 'b', bch2_btree_ids[iter->btree_id]); bch2_bpos_to_text(out, iter->pos); pr_buf(out, "\n"); @@ -2345,17 +2347,18 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { - pr_buf(out, " locking iter %u l=%u %s:", + iter = &trans->iters[trans->locking_iter_idx]; + pr_buf(out, " locking iter %u %c l=%u %s:", trans->locking_iter_idx, + btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', trans->locking_level, bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); - pr_buf(out, " node "); bch2_btree_iter_node_to_text(out, (void *) b, - btree_iter_type(&trans->iters[trans->locking_iter_idx])); + btree_iter_type(iter)); pr_buf(out, "\n"); } } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 7585f98..07d9b6d 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -116,7 +116,6 @@ bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); -bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, unsigned new_locks_want) @@ -124,9 +123,7 @@ static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); return iter->locks_want < new_locks_want - ? (!iter->trans->nounlock - ? __bch2_btree_iter_upgrade(iter, new_locks_want) - : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) + ? __bch2_btree_iter_upgrade(iter, new_locks_want) : iter->uptodate <= BTREE_ITER_NEED_PEEK; } @@ -134,8 +131,10 @@ void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) { - if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) - __bch2_btree_iter_downgrade(iter, 0); + unsigned new_locks_want = (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); + + if (iter->locks_want > new_locks_want) + __bch2_btree_iter_downgrade(iter, new_locks_want); } void bch2_trans_downgrade(struct btree_trans *); @@ -175,8 +174,11 @@ static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) new_pos.snapshot = iter->snapshot; - bkey_init(&iter->k); - iter->k.p = iter->pos = new_pos; + iter->k.type = KEY_TYPE_deleted; + iter->k.p.inode = iter->pos.inode = new_pos.inode; + iter->k.p.offset = iter->pos.offset = new_pos.offset; + iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; + iter->k.size = 0; } /* Sort order for locking btree iterators: */ @@ -261,14 +263,17 @@ int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); void bch2_trans_unlink_iters(struct btree_trans *); struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned); + struct bpos, unsigned, + unsigned, unsigned); static inline struct btree_iter * bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, struct bpos pos, unsigned flags) { struct btree_iter *iter = - __bch2_trans_get_iter(trans, btree_id, pos, flags); + __bch2_trans_get_iter(trans, btree_id, pos, + (flags & BTREE_ITER_INTENT) != 0, 0, + flags); iter->ip_allocated = _THIS_IP_; return iter; } diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 04354f5..0d3c0a4 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -352,6 +352,7 @@ err: static int btree_key_cache_flush_pos(struct btree_trans *trans, struct bkey_cached_key key, u64 journal_seq, + unsigned commit_flags, bool evict) { struct bch_fs *c = trans->c; @@ -390,12 +391,17 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - BTREE_INSERT_JOURNAL_RECLAIM); + (ck->journal.seq == journal_last_seq(j) + ? 
BTREE_INSERT_JOURNAL_RESERVED + : 0)| + commit_flags); err: if (ret == -EINTR) goto retry; + if (ret == -EAGAIN) + goto out; + if (ret) { bch2_fs_fatal_err_on(!bch2_journal_error(j), c, "error flushing key cache: %i", ret); @@ -438,15 +444,15 @@ out: return ret; } -static void btree_key_cache_journal_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) +int bch2_btree_key_cache_journal_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bkey_cached *ck = container_of(pin, struct bkey_cached, journal); struct bkey_cached_key key; struct btree_trans trans; + int ret = 0; int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -461,10 +467,13 @@ static void btree_key_cache_journal_flush(struct journal *j, six_unlock_read(&ck->c.lock); bch2_trans_init(&trans, c, 0, 0); - btree_key_cache_flush_pos(&trans, key, seq, false); + ret = btree_key_cache_flush_pos(&trans, key, seq, + BTREE_INSERT_JOURNAL_RECLAIM, false); bch2_trans_exit(&trans); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + + return ret; } /* @@ -480,7 +489,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, if (!bch2_btree_key_cache_find(c, id, pos)) return 0; - return btree_key_cache_flush_pos(trans, key, 0, true); + return btree_key_cache_flush_pos(trans, key, 0, 0, true); } bool bch2_btree_insert_key_cached(struct btree_trans *trans, @@ -517,7 +526,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, } bch2_journal_pin_update(&c->journal, trans->journal_res.seq, - &ck->journal, btree_key_cache_journal_flush); + &ck->journal, bch2_btree_key_cache_journal_flush); if (kick_reclaim) journal_reclaim_kick(&c->journal); @@ -581,9 +590,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, do { struct rhash_head *pos, *next; - rht_for_each_entry_safe(ck, pos, next, tbl, bc->shrink_iter, hash) { + pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + + while (!rht_is_a_nulls(pos)) { + next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); + ck = container_of(pos, struct bkey_cached, hash); + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) - continue; + goto next; if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); @@ -595,6 +609,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, scanned++; if (scanned >= nr) break; +next: + pos = next; } bc->shrink_iter++; diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index 4e1e5a9..7e2b0a0 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -1,15 +1,6 @@ #ifndef _BCACHEFS_BTREE_KEY_CACHE_H #define _BCACHEFS_BTREE_KEY_CACHE_H -static inline size_t bch2_nr_btree_keys_want_flush(struct bch_fs *c) -{ - size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); - size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); - size_t max_dirty = nr_keys / 4; - - return max_t(ssize_t, 0, nr_dirty - max_dirty); -} - static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) { size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); @@ -29,6 +20,9 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); } +int bch2_btree_key_cache_journal_flush(struct journal *, + struct journal_entry_pin *, u64); + struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); diff --git 
a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 38323e3..7532bcd 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -95,7 +95,7 @@ btree_lock_want(struct btree_iter *iter, int level) return BTREE_NODE_UNLOCKED; } -static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) +static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) { int lock_type = btree_node_locked_type(iter, level); @@ -106,13 +106,6 @@ static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) mark_btree_node_unlocked(iter, level); } -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) -{ - EBUG_ON(!level && iter->trans->nounlock); - - __btree_node_unlock(iter, level); -} - static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) { btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); @@ -187,27 +180,14 @@ static inline bool btree_node_lock(struct btree *b, unsigned long ip) { struct btree_trans *trans = iter->trans; - bool ret; EBUG_ON(level >= BTREE_MAX_DEPTH); EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); -#ifdef CONFIG_BCACHEFS_DEBUG - trans->locking = b; - trans->locking_iter_idx = iter->idx; - trans->locking_pos = pos; - trans->locking_btree_id = iter->btree_id; - trans->locking_level = level; -#endif - ret = likely(six_trylock_type(&b->c.lock, type)) || + return likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || __bch2_btree_node_lock(b, pos, level, iter, type, should_sleep_fn, p, ip); - -#ifdef CONFIG_BCACHEFS_DEBUG - trans->locking = NULL; -#endif - return ret; } bool __bch2_btree_node_relock(struct btree_iter *, unsigned); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 1941616..39e93da 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -47,8 +47,6 @@ struct bset_tree { u16 data_offset; u16 aux_data_offset; u16 end_offset; - - struct bpos max_key; }; struct btree_write { @@ -98,6 +96,11 @@ struct btree { u8 byte_order; u8 unpack_fn_len; + struct btree_write writes[2]; + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + /* * XXX: add a delete sequence number, so when bch2_btree_node_relock() * fails because the lock sequence number has changed - i.e. 
the @@ -128,11 +131,6 @@ struct btree { /* lru list */ struct list_head list; - - struct btree_write writes[2]; - - /* Key/pointer for this btree node */ - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); }; struct btree_cache { @@ -372,7 +370,6 @@ struct btree_trans { u8 nr_updates2; unsigned used_mempool:1; unsigned error:1; - unsigned nounlock:1; unsigned in_traverse_all:1; u64 iters_linked; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 19dfc32..0014470 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -437,10 +437,6 @@ static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, goto err_free; } - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); - if (ret) - goto err_free; - as->prealloc_nodes[as->nr_prealloc_nodes++] = b; } @@ -458,6 +454,10 @@ static void bch2_btree_update_free(struct btree_update *as) { struct bch_fs *c = as->c; + if (as->took_gc_lock) + up_read(&c->gc_lock); + as->took_gc_lock = false; + bch2_journal_preres_put(&c->journal, &as->journal_preres); bch2_journal_pin_drop(&c->journal, &as->journal); @@ -893,24 +893,33 @@ void bch2_btree_update_done(struct btree_update *as) { BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + if (as->took_gc_lock) + up_read(&as->c->gc_lock); + as->took_gc_lock = false; + bch2_btree_reserve_put(as); continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); } struct btree_update * -bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, - unsigned nr_nodes, unsigned flags, - struct closure *cl) +bch2_btree_update_start(struct btree_iter *iter, unsigned level, + unsigned nr_nodes, unsigned flags) { + struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct btree_update *as; + struct closure cl; int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0; - int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) - ? 
JOURNAL_RES_GET_RECLAIM : 0; + int journal_flags = 0; int ret = 0; + if (flags & BTREE_INSERT_JOURNAL_RESERVED) + journal_flags |= JOURNAL_RES_GET_RESERVED; + + closure_init_stack(&cl); +retry: /* * This check isn't necessary for correctness - it's just to potentially * prevent us from doing a lot of work that'll end up being wasted: @@ -919,12 +928,36 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, if (ret) return ERR_PTR(ret); + /* + * XXX: figure out how far we might need to split, + * instead of locking/reserving all the way to the root: + */ + if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->ip); + return ERR_PTR(-EINTR); + } + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) { + if (flags & BTREE_INSERT_NOUNLOCK) + return ERR_PTR(-EINTR); + + bch2_trans_unlock(trans); + down_read(&c->gc_lock); + if (!bch2_trans_relock(trans)) { + up_read(&c->gc_lock); + return ERR_PTR(-EINTR); + } + } + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); as->c = c; as->mode = BTREE_INTERIOR_NO_UPDATE; - as->btree_id = id; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = iter->btree_id; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -936,16 +969,25 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); if (ret == -EAGAIN) { - if (flags & BTREE_INSERT_NOUNLOCK) - return ERR_PTR(-EINTR); + /* + * this would be cleaner if bch2_journal_preres_get() took a + * closure argument + */ + if (flags & BTREE_INSERT_NOUNLOCK) { + ret = -EINTR; + goto err; + } bch2_trans_unlock(trans); + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + goto err; + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, journal_flags); if (ret) - return ERR_PTR(ret); + goto err; if (!bch2_trans_relock(trans)) { ret = -EINTR; @@ -960,7 +1002,8 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); + ret = bch2_btree_reserve_get(as, nr_nodes, flags, + !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); if (ret) goto err; @@ -975,6 +1018,18 @@ bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, return as; err: bch2_btree_update_free(as); + + if (ret == -EAGAIN) { + BUG_ON(flags & BTREE_INSERT_NOUNLOCK); + + bch2_trans_unlock(trans); + closure_sync(&cl); + ret = -EINTR; + } + + if (ret == -EINTR && bch2_trans_relock(trans)) + goto retry; + return ERR_PTR(ret); } @@ -1419,6 +1474,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; + lockdep_assert_held(&c->gc_lock); BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); @@ -1450,14 +1506,6 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b, bch2_btree_node_unlock_write(b, iter); btree_node_interior_verify(c, b); - - /* - * when called from the btree_split path the new nodes aren't added to - * the btree iterator yet, so the merge path's unlock/wait/relock dance - * won't work: - */ - bch2_foreground_maybe_merge(c, iter, b->c.level, - flags|BTREE_INSERT_NOUNLOCK); return; split: btree_split(as, b, iter, keys, flags); @@ -1466,109 +1514,73 @@ split: int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, unsigned flags) { - struct btree_trans *trans = iter->trans; struct btree *b = iter_l(iter)->b; struct btree_update *as; - struct closure cl; + unsigned l; int ret = 0; - closure_init_stack(&cl); - - /* Hack, because gc and splitting nodes doesn't mix yet: */ - if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && - !down_read_trylock(&c->gc_lock)) { - if (flags & BTREE_INSERT_NOUNLOCK) { - trace_transaction_restart_ip(trans->ip, _THIS_IP_); - return -EINTR; - } - - bch2_trans_unlock(trans); - down_read(&c->gc_lock); - - if (!bch2_trans_relock(trans)) - ret = -EINTR; - } - - /* - * XXX: figure out how far we might need to split, - * instead of locking/reserving all the way to the root: - */ - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans->ip); - ret = -EINTR; - goto out; - } - - as = bch2_btree_update_start(trans, iter->btree_id, - btree_update_reserve_required(c, b), flags, - !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); - if (IS_ERR(as)) { - ret = PTR_ERR(as); - if (ret == -EAGAIN) { - BUG_ON(flags & BTREE_INSERT_NOUNLOCK); - bch2_trans_unlock(trans); - ret = -EINTR; - - trace_transaction_restart_ip(trans->ip, _THIS_IP_); - } - goto out; - } + as = bch2_btree_update_start(iter, iter->level, + btree_update_reserve_required(c, b), flags); + if (IS_ERR(as)) + return PTR_ERR(as); btree_split(as, b, iter, NULL, flags); bch2_btree_update_done(as); - /* - * We haven't successfully inserted yet, so don't downgrade all the way - * back to read locks; - */ - __bch2_btree_iter_downgrade(iter, 1); -out: - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); - closure_sync(&cl); + for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) + ret = bch2_foreground_maybe_merge(c, iter, l, flags); + return ret; } -void __bch2_foreground_maybe_merge(struct bch_fs *c, - struct btree_iter *iter, - unsigned level, - unsigned flags, - enum btree_node_sibling sib) +int __bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) { struct btree_trans *trans = iter->trans; + struct btree_iter *sib_iter = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; struct bkey_i delete; struct btree *b, *m, *n, *prev, *next, *parent; - struct closure cl; + struct bpos sib_pos; size_t sib_u64s; - int ret = 0; + int ret = 0, ret2 = 0; BUG_ON(!btree_node_locked(iter, level)); - - closure_init_stack(&cl); retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + BUG_ON(!btree_node_locked(iter, level)); b = iter->l[level].b; - parent = btree_node_parent(iter, b); - if (!parent) + if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) { + b->sib_u64s[sib] = U16_MAX; goto out; + } - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - goto out; + sib_pos = sib == btree_prev_sib + ? 
bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); - /* XXX: can't be holding read locks */ - m = bch2_btree_node_get_sibling(c, iter, b, sib); - if (IS_ERR(m)) { - ret = PTR_ERR(m); + sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id, + sib_pos, U8_MAX, level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(sib_iter); + if (ret) goto err; - } - /* NULL means no sibling: */ - if (!m) { + m = sib_iter->l[level].b; + + if (btree_node_parent(iter, b) != + btree_node_parent(sib_iter, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1581,6 +1593,8 @@ retry: next = m; } + BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)); + bch2_bkey_format_init(&new_s); bch2_bkey_format_add_pos(&new_s, prev->data->min_key); __bch2_btree_calc_format(&new_s, prev); @@ -1598,33 +1612,21 @@ retry: } sib_u64s = min(sib_u64s, btree_max_u64s(c)); + sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); b->sib_u64s[sib] = sib_u64s; - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { - six_unlock_intent(&m->c.lock); + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - } - - /* We're changing btree topology, doesn't mix with gc: */ - if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && - !down_read_trylock(&c->gc_lock)) - goto err_cycle_gc_lock; - - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { - ret = -EINTR; - goto err_unlock; - } - as = bch2_btree_update_start(trans, iter->btree_id, + parent = btree_node_parent(iter, b); + as = bch2_btree_update_start(iter, level, btree_update_reserve_required(c, parent) + 1, flags| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); - if (IS_ERR(as)) { - ret = PTR_ERR(as); - goto err_unlock; - } + BTREE_INSERT_USE_RESERVE); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto err; trace_btree_merge(c, b); @@ -1658,6 +1660,7 @@ retry: bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); + six_lock_increment(&m->c.lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_drop(iter, m); @@ -1671,11 +1674,9 @@ retry: six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); - - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); out: bch2_btree_trans_verify_locks(trans); + bch2_trans_iter_free(trans, sib_iter); /* * Don't downgrade locks here: we're called after successful insert, @@ -1686,58 +1687,56 @@ out: * split path, and downgrading to read locks in there is potentially * confusing: */ - closure_sync(&cl); - return; - -err_cycle_gc_lock: - six_unlock_intent(&m->c.lock); - - if (flags & BTREE_INSERT_NOUNLOCK) - goto out; - - bch2_trans_unlock(trans); - - down_read(&c->gc_lock); - up_read(&c->gc_lock); - ret = -EINTR; - goto err; - -err_unlock: - six_unlock_intent(&m->c.lock); - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); + return ret ?: ret2; err: - BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); - - if ((ret == -EAGAIN || ret == -EINTR) && - !(flags & BTREE_INSERT_NOUNLOCK)) { - bch2_trans_unlock(trans); - closure_sync(&cl); - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto out; + bch2_trans_iter_put(trans, sib_iter); + sib_iter = NULL; + if (ret == -EINTR && bch2_trans_relock(trans)) goto retry; + + if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { + ret2 = ret; + ret = bch2_btree_iter_traverse_all(trans); + if (!ret) + goto retry; } goto out; } -static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, - struct btree *b, 
unsigned flags, - struct closure *cl) +/** + * bch_btree_node_rewrite - Rewrite/move a btree node + */ +int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, + __le64 seq, unsigned flags) { - struct btree *n, *parent = btree_node_parent(iter, b); + struct btree *b, *n, *parent; struct btree_update *as; + int ret; + + flags |= BTREE_INSERT_NOFAIL; +retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; + + b = bch2_btree_iter_peek_node(iter); + if (!b || b->data->keys.seq != seq) + goto out; - as = bch2_btree_update_start(iter->trans, iter->btree_id, + parent = btree_node_parent(iter, b); + as = bch2_btree_update_start(iter, b->c.level, (parent ? btree_update_reserve_required(c, parent) : 0) + 1, - flags, cl); - if (IS_ERR(as)) { + flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret == -EINTR) + goto retry; + if (ret) { trace_btree_gc_rewrite_node_fail(c, b); - return PTR_ERR(as); + goto out; } bch2_btree_interior_update_will_free_node(as, b); @@ -1768,60 +1767,8 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); - return 0; -} - -/** - * bch_btree_node_rewrite - Rewrite/move a btree node - * - * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. - * btree_check_reserve() has to wait) - */ -int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, - __le64 seq, unsigned flags) -{ - struct btree_trans *trans = iter->trans; - struct closure cl; - struct btree *b; - int ret; - - flags |= BTREE_INSERT_NOFAIL; - - closure_init_stack(&cl); - - bch2_btree_iter_upgrade(iter, U8_MAX); - - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { - if (!down_read_trylock(&c->gc_lock)) { - bch2_trans_unlock(trans); - down_read(&c->gc_lock); - } - } - - while (1) { - ret = bch2_btree_iter_traverse(iter); - if (ret) - break; - - b = bch2_btree_iter_peek_node(iter); - if (!b || b->data->keys.seq != seq) - break; - - ret = __btree_node_rewrite(c, iter, b, flags, &cl); - if (ret != -EAGAIN && - ret != -EINTR) - break; - - bch2_trans_unlock(trans); - closure_sync(&cl); - } - +out: bch2_btree_iter_downgrade(iter); - - if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) - up_read(&c->gc_lock); - - closure_sync(&cl); return ret; } @@ -1892,71 +1839,34 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, struct btree_update *as = NULL; struct btree *new_hash = NULL; struct closure cl; - int ret; + int ret = 0; closure_init_stack(&cl); - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) - return -EINTR; - - if (!down_read_trylock(&c->gc_lock)) { - bch2_trans_unlock(iter->trans); - down_read(&c->gc_lock); - - if (!bch2_trans_relock(iter->trans)) { - ret = -EINTR; - goto err; - } - } - /* * check btree_ptr_hash_val() after @b is locked by * btree_iter_traverse(): */ if (btree_ptr_hash_val(new_key) != b->hash_val) { - /* bch2_btree_reserve_get will unlock */ ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { bch2_trans_unlock(iter->trans); - up_read(&c->gc_lock); closure_sync(&cl); - down_read(&c->gc_lock); - - if (!bch2_trans_relock(iter->trans)) { - ret = -EINTR; - goto err; - } + if (!bch2_trans_relock(iter->trans)) + return -EINTR; } new_hash = bch2_btree_node_mem_alloc(c); } -retry: - as = bch2_btree_update_start(iter->trans, iter->btree_id, - parent ? btree_update_reserve_required(c, parent) : 0, - BTREE_INSERT_NOFAIL, &cl); + as = bch2_btree_update_start(iter, b->c.level, + parent ? 
btree_update_reserve_required(c, parent) : 0, + BTREE_INSERT_NOFAIL); if (IS_ERR(as)) { ret = PTR_ERR(as); - if (ret == -EAGAIN) - ret = -EINTR; - - if (ret == -EINTR) { - bch2_trans_unlock(iter->trans); - up_read(&c->gc_lock); - closure_sync(&cl); - down_read(&c->gc_lock); - - if (bch2_trans_relock(iter->trans)) - goto retry; - } - goto err; } - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); - if (ret) - goto err_free_update; - __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); bch2_btree_iter_downgrade(iter); @@ -1969,12 +1879,9 @@ err: six_unlock_write(&new_hash->c.lock); six_unlock_intent(&new_hash->c.lock); } - up_read(&c->gc_lock); closure_sync(&cl); + bch2_btree_cache_cannibalize_unlock(c); return ret; -err_free_update: - bch2_btree_update_free(as); - goto err; } /* Init code: */ diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 45d2127..f2925b0 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -48,6 +48,7 @@ struct btree_update { } mode; unsigned nodes_written:1; + unsigned took_gc_lock:1; enum btree_id btree_id; @@ -120,8 +121,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, void bch2_btree_update_done(struct btree_update *); struct btree_update * -bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, - unsigned, struct closure *); +bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned); void bch2_btree_interior_update_will_free_node(struct btree_update *, struct btree *); @@ -132,10 +132,10 @@ void bch2_btree_insert_node(struct btree_update *, struct btree *, unsigned); int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); -void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, - unsigned, unsigned, enum btree_node_sibling); +int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, + unsigned, unsigned, enum btree_node_sibling); -static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, +static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c, struct btree_iter *iter, unsigned level, unsigned flags, enum btree_node_sibling sib) @@ -143,27 +143,27 @@ static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, struct btree *b; if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return; + return 0; if (!bch2_btree_node_relock(iter, level)) - return; + return 0; b = iter->l[level].b; if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) - return; + return 0; - __bch2_foreground_maybe_merge(c, iter, level, flags, sib); + return __bch2_foreground_maybe_merge(c, iter, level, flags, sib); } -static inline void bch2_foreground_maybe_merge(struct bch_fs *c, +static inline int bch2_foreground_maybe_merge(struct bch_fs *c, struct btree_iter *iter, unsigned level, unsigned flags) { - bch2_foreground_maybe_merge_sibling(c, iter, level, flags, - btree_prev_sib); - bch2_foreground_maybe_merge_sibling(c, iter, level, flags, - btree_next_sib); + return bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_prev_sib) ?: + bch2_foreground_maybe_merge_sibling(c, iter, level, flags, + btree_next_sib); } void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 221a600..e258cf8 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -134,7 +134,7 @@ fix_iter: return true; } -static void 
__btree_node_flush(struct journal *j, struct journal_entry_pin *pin, +static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, unsigned i, u64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); @@ -145,14 +145,15 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, bch2_btree_node_write_cond(c, b, (btree_current_write(b) == w && w->journal.seq == seq)); six_unlock_read(&b->c.lock); + return 0; } -static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 0, seq); } -static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 1, seq); } @@ -375,7 +376,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { struct bch_fs *c = trans->c; - struct bch_fs_usage *fs_usage = NULL; struct btree_insert_entry *i; struct btree_trans_commit_hook *h; unsigned u64s = 0; @@ -423,7 +423,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (marking) { percpu_down_read(&c->mark_lock); - fs_usage = bch2_fs_usage_scratch_get(c); + } + + /* Must be called under mark_lock: */ + if (marking && trans->fs_usage_deltas && + !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { + ret = BTREE_INSERT_NEED_MARK_REPLICAS; + goto err; } /* @@ -462,21 +468,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - /* Must be called under mark_lock: */ - if (marking && trans->fs_usage_deltas && - bch2_replicas_delta_list_apply(c, fs_usage, - trans->fs_usage_deltas)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto err; - } - trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) bch2_mark_update(trans, i->iter, i->k, - fs_usage, i->trigger_flags); + NULL, i->trigger_flags); - if (marking) - bch2_trans_fs_usage_apply(trans, fs_usage); + if (marking && trans->fs_usage_deltas) + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); if (unlikely(c->gc_pos.phase)) bch2_trans_mark_gc(trans); @@ -485,31 +483,85 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, do_btree_insert_one(trans, i->iter, i->k); err: if (marking) { - bch2_fs_usage_scratch_put(c, fs_usage); percpu_up_read(&c->mark_lock); } return ret; } +static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter) +{ + struct btree_insert_entry *i; + struct btree *b = iter_l(iter)->b; + struct bkey_s_c old; + int u64s_delta = 0; + int ret; + + /* + * Inserting directly into interior nodes is an uncommon operation with + * various weird edge cases: also, a lot of things about + * BTREE_ITER_NODES iters need to be audited + */ + if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS)) + return 0; + + BUG_ON(iter->level); + + trans_for_each_update2(trans, i) { + if (iter_l(i->iter)->b != b) + continue; + + old = bch2_btree_iter_peek_slot(i->iter); + ret = bkey_err(old); + if (ret) + return ret; + + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; + u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + } + + return u64s_delta <= 0 + ? 
(bch2_foreground_maybe_merge(trans->c, iter, iter->level, + trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) + : 0; +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ static inline int do_bch2_trans_commit(struct btree_trans *trans, struct btree_insert_entry **stopped_at) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i; struct btree_iter *iter; int ret; + trans_for_each_update2(trans, i) { + struct btree *b; + + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + + if (btree_iter_type(i->iter) == BTREE_ITER_CACHED) + continue; + + b = iter_l(i->iter)->b; + if (b->sib_u64s[0] < c->btree_foreground_merge_threshold || + b->sib_u64s[1] < c->btree_foreground_merge_threshold) { + ret = maybe_do_btree_merge(trans, i->iter); + if (unlikely(ret)) + return ret; + } + } + trans_for_each_update2(trans, i) - BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); - ret = bch2_journal_preres_get(&trans->c->journal, + ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| - ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) - ? JOURNAL_RES_GET_RECLAIM : 0)); + ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) + ? JOURNAL_RES_GET_RESERVED : 0)); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s); @@ -547,7 +599,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, trans_for_each_update2(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(trans->c, + bch2_btree_node_lock_for_insert(c, iter_l(i->iter)->b, i->iter); ret = bch2_trans_commit_write_locked(trans, stopped_at); @@ -558,33 +610,43 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, i->iter); if (!ret && trans->journal_pin) - bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, trans->journal_pin, NULL); /* * Drop journal reservation after dropping write locks, since dropping * the journal reservation may kick off a journal write: */ - bch2_journal_res_put(&trans->c->journal, &trans->journal_res); + bch2_journal_res_put(&c->journal, &trans->journal_res); if (unlikely(ret)) return ret; - if (trans->flags & BTREE_INSERT_NOUNLOCK) - trans->nounlock = true; + bch2_trans_downgrade(trans); - if (!(trans->flags & BTREE_INSERT_NOUNLOCK)) - trans_for_each_update2(trans, i) - if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && - !same_leaf_as_prev(trans, i)) - bch2_foreground_maybe_merge(trans->c, i->iter, - 0, trans->flags); + return 0; +} - trans->nounlock = false; +static int journal_reclaim_wait_done(struct bch_fs *c) +{ + int ret; - bch2_trans_downgrade(trans); + ret = bch2_journal_error(&c->journal); + if (ret) + return ret; - return 0; + ret = !bch2_btree_key_cache_must_wait(c); + if (ret) + return ret; + + if (mutex_trylock(&c->journal.reclaim_lock)) { + ret = bch2_journal_reclaim(&c->journal); + mutex_unlock(&c->journal.reclaim_lock); + } + + if (!ret) + ret = !bch2_btree_key_cache_must_wait(c); + return ret; } static noinline @@ -641,11 +703,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); - trans_for_each_update(trans, i) { - ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); - if (ret) - return ret; - } + ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); + if (ret) + return ret; if 
(bch2_trans_relock(trans)) return 0; @@ -656,6 +716,10 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_JOURNAL_RES: bch2_trans_unlock(trans); + if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && + !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) + return -EAGAIN; + ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); if (ret) return ret; @@ -669,11 +733,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - do { - mutex_lock(&c->journal.reclaim_lock); - ret = bch2_journal_reclaim(&c->journal); - mutex_unlock(&c->journal.reclaim_lock); - } while (!ret && bch2_btree_key_cache_must_wait(c)); + wait_event(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); if (!ret && bch2_trans_relock(trans)) return 0; @@ -920,17 +981,14 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out; } - /* - * We're not using bch2_btree_iter_upgrade here because - * we know trans->nounlock can't be set: - */ - if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) && - !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) { + if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { trace_trans_restart_upgrade(trans->ip); ret = -EINTR; goto out; } + BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + u64s = jset_u64s(i->k->k.u64s); if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index e6e7523..31f7617 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -167,37 +167,6 @@ void bch2_fs_usage_initialize(struct bch_fs *c) percpu_up_write(&c->mark_lock); } -void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) -{ - if (fs_usage == c->usage_scratch) - mutex_unlock(&c->usage_scratch_lock); - else - kfree(fs_usage); -} - -struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) -{ - struct bch_fs_usage *ret; - unsigned bytes = fs_usage_u64s(c) * sizeof(u64); - - ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); - if (ret) - return ret; - - if (mutex_trylock(&c->usage_scratch_lock)) - goto out_pool; - - ret = kzalloc(bytes, GFP_NOFS); - if (ret) - return ret; - - mutex_lock(&c->usage_scratch_lock); -out_pool: - ret = c->usage_scratch; - memset(ret, 0, bytes); - return ret; -} - static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, unsigned journal_seq, bool gc) @@ -252,30 +221,28 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) return ret; } -struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) { - struct bch_fs_usage *ret; - unsigned seq, i, v, u64s = fs_usage_u64s(c); -retry: - ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); - if (unlikely(!ret)) - return NULL; + struct bch_fs_usage_online *ret; + unsigned seq, i, u64s; percpu_down_read(&c->mark_lock); - v = fs_usage_u64s(c); - if (unlikely(u64s != v)) { - u64s = v; + ret = kmalloc(sizeof(struct bch_fs_usage_online) + + sizeof(u64) + c->replicas.nr, GFP_NOFS); + if (unlikely(!ret)) { percpu_up_read(&c->mark_lock); - kfree(ret); - goto retry; + return NULL; } + ret->online_reserved = percpu_u64_get(c->online_reserved); + + u64s = fs_usage_u64s(c); do { seq = read_seqcount_begin(&c->usage_lock); - memcpy(ret, c->usage_base, u64s * sizeof(u64)); + memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); for (i = 0; i < ARRAY_SIZE(c->usage); i++) - acc_u64s_percpu((u64 
*) ret, (u64 __percpu *) c->usage[i], u64s); + acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; @@ -311,31 +278,31 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) void bch2_fs_usage_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_fs_usage *fs_usage) + struct bch_fs_usage_online *fs_usage) { unsigned i; pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); pr_buf(out, "hidden:\t\t\t\t%llu\n", - fs_usage->hidden); + fs_usage->u.hidden); pr_buf(out, "data:\t\t\t\t%llu\n", - fs_usage->data); + fs_usage->u.data); pr_buf(out, "cached:\t\t\t\t%llu\n", - fs_usage->cached); + fs_usage->u.cached); pr_buf(out, "reserved:\t\t\t%llu\n", - fs_usage->reserved); + fs_usage->u.reserved); pr_buf(out, "nr_inodes:\t\t\t%llu\n", - fs_usage->nr_inodes); + fs_usage->u.nr_inodes); pr_buf(out, "online reserved:\t\t%llu\n", fs_usage->online_reserved); for (i = 0; - i < ARRAY_SIZE(fs_usage->persistent_reserved); + i < ARRAY_SIZE(fs_usage->u.persistent_reserved); i++) { pr_buf(out, "%u replicas:\n", i + 1); pr_buf(out, "\treserved:\t\t%llu\n", - fs_usage->persistent_reserved[i]); + fs_usage->u.persistent_reserved[i]); } for (i = 0; i < c->replicas.nr; i++) { @@ -344,7 +311,7 @@ void bch2_fs_usage_to_text(struct printbuf *out, pr_buf(out, "\t"); bch2_replicas_entry_to_text(out, e); - pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); + pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]); } } @@ -360,12 +327,12 @@ static u64 avail_factor(u64 r) return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) { - return min(fs_usage->hidden + - fs_usage->btree + - fs_usage->data + - reserve_factor(fs_usage->reserved + + return min(fs_usage->u.hidden + + fs_usage->u.btree + + fs_usage->u.data + + reserve_factor(fs_usage->u.reserved + fs_usage->online_reserved), c->capacity); } @@ -382,7 +349,7 @@ __bch2_fs_usage_read_short(struct bch_fs *c) data = bch2_fs_usage_read_one(c, &c->usage_base->data) + bch2_fs_usage_read_one(c, &c->usage_base->btree); reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + - bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); + percpu_u64_get(c->online_reserved); ret.used = min(ret.capacity, data + reserve_factor(reserved)); ret.free = ret.capacity - ret.used; @@ -436,43 +403,6 @@ static bool bucket_became_unavailable(struct bucket_mark old, !is_available_bucket(new); } -int bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct disk_reservation *disk_res, - unsigned journal_seq) -{ - s64 added = fs_usage->data + fs_usage->reserved; - s64 should_not_have_added; - int ret = 0; - - percpu_rwsem_assert_held(&c->mark_lock); - - /* - * Not allowed to reduce sectors_available except by getting a - * reservation: - */ - should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0); - if (WARN_ONCE(should_not_have_added > 0, - "disk usage increased by %lli more than reservation of %llu", - added, disk_res ? 
disk_res->sectors : 0)) { - atomic64_sub(should_not_have_added, &c->sectors_available); - added -= should_not_have_added; - ret = -1; - } - - if (added > 0) { - disk_res->sectors -= added; - fs_usage->online_reserved -= added; - } - - preempt_disable(); - acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), - (u64 *) fs_usage, fs_usage_u64s(c)); - preempt_enable(); - - return ret; -} - static inline void account_bucket(struct bch_fs_usage *fs_usage, struct bch_dev_usage *dev_usage, enum bch_data_type type, @@ -494,6 +424,8 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, percpu_rwsem_assert_held(&c->mark_lock); preempt_disable(); + if (!fs_usage) + fs_usage = fs_usage_ptr(c, journal_seq, gc); u = dev_usage_ptr(ca, journal_seq, gc); if (bucket_type(old)) @@ -504,8 +436,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, account_bucket(fs_usage, u, bucket_type(new), 1, ca->mi.bucket_size); - u->buckets_alloc += - (int) new.owned_by_allocator - (int) old.owned_by_allocator; u->buckets_ec += (int) new.stripe - (int) old.stripe; u->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); @@ -524,22 +454,17 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_wake_allocator(ca); } -static inline int update_replicas(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct bch_replicas_entry *r, - s64 sectors) +static inline void update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) { int idx = bch2_replicas_entry_idx(c, r); - if (idx < 0) - return -1; - - if (!fs_usage) - return 0; + BUG_ON(idx < 0); fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; - return 0; } static inline void update_cached_sectors(struct bch_fs *c, @@ -586,6 +511,7 @@ static inline void update_replicas_list(struct btree_trans *trans, n = (void *) d->d + d->used; n->delta = sectors; memcpy(&n->r, r, replicas_entry_bytes(r)); + bch2_replicas_entry_sort(&n->r); d->used += b; } @@ -599,43 +525,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -static inline struct replicas_delta * -replicas_delta_next(struct replicas_delta *d) -{ - return (void *) d + replicas_entry_bytes(&d->r) + 8; -} - -int bch2_replicas_delta_list_apply(struct bch_fs *c, - struct bch_fs_usage *fs_usage, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - unsigned i; - - for (d = r->d; d != top; d = replicas_delta_next(d)) - if (update_replicas(c, fs_usage, &d->r, d->delta)) { - top = d; - goto unwind; - } - - if (!fs_usage) - return 0; - - fs_usage->nr_inodes += r->nr_inodes; - - for (i = 0; i < BCH_REPLICAS_MAX; i++) { - fs_usage->reserved += r->persistent_reserved[i]; - fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; - } - - return 0; -unwind: - for (d = r->d; d != top; d = replicas_delta_next(d)) - update_replicas(c, fs_usage, &d->r, -d->delta); - return -1; -} - #define do_mark_fn(fn, c, pos, flags, ...) 
\ ({ \ int gc, ret = 0; \ @@ -653,7 +542,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, bool gc) { - struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; @@ -661,13 +549,6 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, new.owned_by_allocator = owned_by_allocator; })); - /* - * XXX: this is wrong, this means we'll be doing updates to the percpu - * buckets_alloc counter that don't have an open journal buffer and - * we'll race with the machinery that accumulates that to ca->usage_base - */ - bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc); - BUG_ON(!gc && !owned_by_allocator && !old.owned_by_allocator); @@ -1416,22 +1297,15 @@ int bch2_mark_update(struct btree_trans *trans, return ret; } -void bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct bch_fs_usage *fs_usage) +static noinline __cold +void fs_usage_apply_warn(struct btree_trans *trans, + unsigned disk_res_sectors) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - static int warned_disk_usage = 0; - u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; char buf[200]; - if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, - trans->journal_res.seq) || - warned_disk_usage || - xchg(&warned_disk_usage, 1)) - return; - - bch_err(c, "disk usage increased more than %llu sectors reserved", + bch_err(c, "disk usage increased more than %u sectors reserved", disk_res_sectors); trans_for_each_update(trans, i) { @@ -1466,6 +1340,65 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } } +void bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + static int warned_disk_usage = 0; + bool warn = false; + unsigned disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; + struct replicas_delta *d = deltas->d; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + s64 added = 0, should_not_have_added; + unsigned i; + + percpu_rwsem_assert_held(&c->mark_lock); + + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) { + switch (d->r.data_type) { + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + added += d->delta; + } + + update_replicas(c, dst, &d->r, d->delta); + } + + dst->nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + added += deltas->persistent_reserved[i]; + dst->reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + /* + * Not allowed to reduce sectors_available except by getting a + * reservation: + */ + should_not_have_added = added - (s64) disk_res_sectors; + if (unlikely(should_not_have_added > 0)) { + atomic64_sub(should_not_have_added, &c->sectors_available); + added -= should_not_have_added; + warn = true; + } + + if (added > 0) { + trans->disk_res->sectors -= added; + this_cpu_sub(*c->online_reserved, added); + } + + preempt_enable(); + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + fs_usage_apply_warn(trans, disk_res_sectors); +} + /* trans_mark: */ static struct btree_iter *trans_get_update(struct btree_trans *trans, @@ -2197,16 +2130,6 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, /* Disk reservations: */ -void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) -{ - percpu_down_read(&c->mark_lock); - this_cpu_sub(c->usage[0]->online_reserved, - res->sectors); - percpu_up_read(&c->mark_lock); - - res->sectors = 0; -} - #define SECTORS_CACHE 1024 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, @@ -2240,7 +2163,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, out: pcpu->sectors_available -= sectors; - this_cpu_add(c->usage[0]->online_reserved, sectors); + this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; preempt_enable(); @@ -2257,7 +2180,7 @@ recalculate: (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); - this_cpu_add(c->usage[0]->online_reserved, sectors); + this_cpu_add(*c->online_reserved, sectors); res->sectors += sectors; ret = 0; } else { diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 6d15c45..54dcc82 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -210,19 +210,16 @@ static inline unsigned dev_usage_u64s(void) return sizeof(struct bch_dev_usage) / sizeof(u64); } -void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); -struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); - u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); -struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); void bch2_fs_usage_to_text(struct printbuf *, - struct bch_fs *, struct bch_fs_usage *); + struct bch_fs *, struct bch_fs_usage_online *); -u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); +u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); @@ -240,20 +237,15 @@ void bch2_mark_metadata_bucket(struct bch_fs *, 
struct bch_dev *, int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, struct bch_fs_usage *, u64, unsigned); -int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, unsigned); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); -int bch2_replicas_delta_list_apply(struct bch_fs *, - struct bch_fs_usage *, - struct replicas_delta_list *); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned, s64, unsigned); int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, struct bkey_i *insert, unsigned); -void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); +void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct disk_reservation *, struct bch_dev *, @@ -263,13 +255,11 @@ int bch2_trans_mark_dev_sb(struct bch_fs *, struct disk_reservation *, /* disk reservations: */ -void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); - static inline void bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - if (res->sectors) - __bch2_disk_reservation_put(c, res); + this_cpu_sub(*c->online_reserved, res->sectors); + res->sectors = 0; } #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 404c89a..588b1a7 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -53,7 +53,6 @@ struct bucket_array { }; struct bch_dev_usage { - u64 buckets_alloc; u64 buckets_ec; u64 buckets_unavailable; @@ -66,12 +65,6 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - - u64 online_reserved; - - /* fields after online_reserved are cleared/recalculated by gc: */ - u64 gc_start[0]; - u64 hidden; u64 btree; u64 data; @@ -91,6 +84,11 @@ struct bch_fs_usage { u64 replicas[]; }; +struct bch_fs_usage_online { + u64 online_reserved; + struct bch_fs_usage u; +}; + struct bch_fs_usage_short { u64 capacity; u64 used; @@ -98,22 +96,6 @@ struct bch_fs_usage_short { u64 nr_inodes; }; -struct replicas_delta { - s64 delta; - struct bch_replicas_entry r; -} __packed; - -struct replicas_delta_list { - unsigned size; - unsigned used; - - struct {} memset_start; - u64 nr_inodes; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - struct {} memset_end; - struct replicas_delta d[0]; -}; - /* * A reservation for space on disk: */ diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 49842ec..c616014 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -379,7 +379,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, { struct bch_ioctl_fs_usage *arg = NULL; struct bch_replicas_usage *dst_e, *dst_end; - struct bch_fs_usage *src; + struct bch_fs_usage_online *src; u32 replica_entries_bytes; unsigned i; int ret = 0; @@ -405,7 +405,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, arg->online_reserved = src->online_reserved; for (i = 0; i < BCH_REPLICAS_MAX; i++) - arg->persistent_reserved[i] = src->persistent_reserved[i]; + arg->persistent_reserved[i] = src->u.persistent_reserved[i]; dst_e = arg->replicas; dst_end = (void *) arg->replicas + replica_entries_bytes; @@ -419,7 +419,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, break; } - dst_e->sectors = src->replicas[i]; + dst_e->sectors = src->u.replicas[i]; dst_e->r = *src_e; /* recheck after setting nr_devs: */ 
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 69c553a..b901be5 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -11,6 +11,7 @@ #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" +#include "error.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -59,21 +60,23 @@ journal_seq_to_buf(struct journal *j, u64 seq) return buf; } -static void journal_pin_new_entry(struct journal *j, int count) +static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - struct journal_entry_pin_list *p; + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->key_cache_list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, count); + p->devs.nr = 0; +} +static void journal_pin_new_entry(struct journal *j) +{ /* * The fifo_push() needs to happen at the same time as j->seq is * incremented for journal_last_seq() to be calculated correctly */ atomic64_inc(&j->seq); - p = fifo_push_ref(&j->pin); - - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, count); - p->devs.nr = 0; + journal_pin_list_init(fifo_push_ref(&j->pin), 1); } static void bch2_journal_buf_init(struct journal *j) @@ -192,7 +195,7 @@ static bool __journal_entry_close(struct journal *j) __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); /* Initialize new buffer: */ - journal_pin_new_entry(j, 1); + journal_pin_new_entry(j); bch2_journal_buf_init(j); @@ -450,6 +453,27 @@ unlock: if (!ret) goto retry; + if ((ret == cur_entry_journal_full || + ret == cur_entry_journal_pin_full) && + !can_discard && + j->reservations.idx == j->reservations.unwritten_idx && + (flags & JOURNAL_RES_GET_RESERVED)) { + char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC); + + bch_err(c, "Journal stuck!"); + if (journal_debug_buf) { + bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j); + bch_err(c, "%s", journal_debug_buf); + + bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j); + bch_err(c, "Journal pins:\n%s", journal_debug_buf); + kfree(journal_debug_buf); + } + + bch2_fatal_error(c); + dump_stack(); + } + /* * Journal is full - can't rely on reclaim from work item due to * freezing: @@ -499,7 +523,7 @@ static bool journal_preres_available(struct journal *j, unsigned new_u64s, unsigned flags) { - bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); if (!ret && mutex_trylock(&j->reclaim_lock)) { bch2_journal_reclaim(j); @@ -1009,12 +1033,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 1); - p->devs.nr = 0; - } + fifo_for_each_entry_ptr(p, &j->pin, seq) + journal_pin_list_init(p, 1); list_for_each_entry(i, journal_entries, list) { unsigned ptr; @@ -1037,7 +1057,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, set_bit(JOURNAL_STARTED, &j->flags); j->last_flush_write = jiffies; - journal_pin_new_entry(j, 1); + journal_pin_new_entry(j); j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); @@ -1114,6 +1134,7 @@ int bch2_fs_journal_init(struct journal *j) spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); + init_waitqueue_head(&j->reclaim_wait); init_waitqueue_head(&j->pin_flush_wait); mutex_init(&j->reclaim_lock); mutex_init(&j->discard_lock); 
@@ -1166,6 +1187,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) "last_seq_ondisk:\t%llu\n" "flushed_seq_ondisk:\t%llu\n" "prereserved:\t\t%u/%u\n" + "each entry reserved:\t%u\n" "nr flush writes:\t%llu\n" "nr noflush writes:\t%llu\n" "nr direct reclaim:\t%llu\n" @@ -1180,6 +1202,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) j->flushed_seq_ondisk, j->prereserved.reserved, j->prereserved.remaining, + j->entry_u64s_reserved, j->nr_flush_writes, j->nr_noflush_writes, j->nr_direct_reclaim, diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index bda8cb9..cc49712 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -213,11 +213,13 @@ static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type enum btree_id id, unsigned level, const void *data, unsigned u64s) { - memset(entry, 0, sizeof(*entry)); entry->u64s = cpu_to_le16(u64s); - entry->type = type; entry->btree_id = id; entry->level = level; + entry->type = type; + entry->pad[0] = 0; + entry->pad[1] = 0; + entry->pad[2] = 0; memcpy_u64s_small(entry->_data, data, u64s); return jset_u64s(u64s); @@ -306,7 +308,6 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, #define JOURNAL_RES_GET_NONBLOCK (1 << 0) #define JOURNAL_RES_GET_CHECK (1 << 1) #define JOURNAL_RES_GET_RESERVED (1 << 2) -#define JOURNAL_RES_GET_RECLAIM (1 << 3) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -410,7 +411,12 @@ static inline void bch2_journal_preres_put(struct journal *j, s.v = atomic64_sub_return(s.v, &j->prereserved.counter); res->u64s = 0; - closure_wake_up(&j->preres_wait); + + if (unlikely(s.waiting)) { + clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), + (unsigned long *) &j->prereserved.v); + closure_wake_up(&j->preres_wait); + } if (s.reserved <= s.remaining && !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { @@ -426,32 +432,32 @@ int __bch2_journal_preres_get(struct journal *, static inline int bch2_journal_preres_get_fast(struct journal *j, struct journal_preres *res, unsigned new_u64s, - unsigned flags) + unsigned flags, + bool set_waiting) { int d = new_u64s - res->u64s; union journal_preres_state old, new; u64 v = atomic64_read(&j->prereserved.counter); + int ret; do { old.v = new.v = v; - - new.reserved += d; - - /* - * If we're being called from the journal reclaim path, we have - * to unconditionally give out the pre-reservation, there's - * nothing else sensible we can do - otherwise we'd recurse back - * into the reclaim path and deadlock: - */ - - if (!(flags & JOURNAL_RES_GET_RECLAIM) && - new.reserved > new.remaining) + ret = 0; + + if ((flags & JOURNAL_RES_GET_RESERVED) || + new.reserved + d < new.remaining) { + new.reserved += d; + ret = 1; + } else if (set_waiting && !new.waiting) + new.waiting = true; + else return 0; } while ((v = atomic64_cmpxchg(&j->prereserved.counter, old.v, new.v)) != old.v); - res->u64s += d; - return 1; + if (ret) + res->u64s += d; + return ret; } static inline int bch2_journal_preres_get(struct journal *j, @@ -462,7 +468,7 @@ static inline int bch2_journal_preres_get(struct journal *j, if (new_u64s <= res->u64s) return 0; - if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) + if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) return 0; if (flags & JOURNAL_RES_GET_NONBLOCK) diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 93b5e07..7be6c65 100644 --- 
a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -239,7 +239,7 @@ void bch2_journal_space_available(struct journal *j) u64s_remaining = (u64) clean << 6; u64s_remaining -= (u64) total << 3; u64s_remaining = max(0LL, u64s_remaining); - u64s_remaining /= 2; + u64s_remaining /= 4; u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; @@ -353,6 +353,9 @@ static inline void __journal_pin_drop(struct journal *j, if (!journal_pin_active(pin)) return; + if (j->flush_in_progress == pin) + j->flush_in_progress_dropped = true; + pin_list = journal_seq_pin(j, pin->seq); pin->seq = 0; list_del_init(&pin->list); @@ -404,7 +407,12 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, pin->seq = seq; pin->flush = flush_fn; - list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); + if (flush_fn == bch2_btree_key_cache_journal_flush) + list_add(&pin->list, &pin_list->key_cache_list); + else if (flush_fn) + list_add(&pin->list, &pin_list->list); + else + list_add(&pin->list, &pin_list->flushed); spin_unlock(&j->lock); /* @@ -434,39 +442,49 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) */ static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) +journal_get_next_pin(struct journal *j, + bool get_any, + bool get_key_cache, + u64 max_seq, u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; - if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) - return NULL; - - spin_lock(&j->lock); - - fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) - if (*seq > max_seq || - (ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list))) + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { + if (*seq > max_seq && !get_any && !get_key_cache) break; - if (ret) { - list_move(&ret->list, &pin_list->flushed); - BUG_ON(j->flush_in_progress); - j->flush_in_progress = ret; - } + if (*seq <= max_seq || get_any) { + ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list); + if (ret) + return ret; + } - spin_unlock(&j->lock); + if (*seq <= max_seq || get_any || get_key_cache) { + ret = list_first_entry_or_null(&pin_list->key_cache_list, + struct journal_entry_pin, list); + if (ret) + return ret; + } + } - return ret; + return NULL; } /* returns true if we did work */ -static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, - unsigned min_nr) +static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, + unsigned min_any, + unsigned min_key_cache) { struct journal_entry_pin *pin; - u64 seq, ret = 0; + size_t nr_flushed = 0; + journal_pin_flush_fn flush_fn; + u64 seq; + int err; + + if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) + return 0; lockdep_assert_held(&j->reclaim_lock); @@ -475,23 +493,47 @@ static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, j->last_flushed = jiffies; - pin = journal_get_next_pin(j, min_nr - ? 
U64_MAX : seq_to_flush, &seq); + spin_lock(&j->lock); + pin = journal_get_next_pin(j, + min_any != 0, + min_key_cache != 0, + seq_to_flush, &seq); + if (pin) { + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; + j->flush_in_progress_dropped = false; + flush_fn = pin->flush; + } + spin_unlock(&j->lock); + if (!pin) break; - if (min_nr) - min_nr--; + if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) + min_key_cache--; + + if (min_any) + min_any--; - pin->flush(j, pin, seq); + err = flush_fn(j, pin, seq); - BUG_ON(j->flush_in_progress != pin); + spin_lock(&j->lock); + /* Pin might have been dropped or rearmed: */ + if (likely(!err && !j->flush_in_progress_dropped)) + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); j->flush_in_progress = NULL; + j->flush_in_progress_dropped = false; + spin_unlock(&j->lock); + wake_up(&j->pin_flush_wait); - ret++; + + if (err) + break; + + nr_flushed++; } - return ret; + return nr_flushed; } static u64 journal_seq_to_flush(struct journal *j) @@ -556,8 +598,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool kthread = (current->flags & PF_KTHREAD) != 0; - u64 seq_to_flush, nr_flushed = 0; - size_t min_nr; + u64 seq_to_flush; + size_t min_nr, nr_flushed; unsigned flags; int ret = 0; @@ -595,15 +637,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (j->prereserved.reserved * 2 > j->prereserved.remaining) min_nr = 1; - if (atomic_read(&c->btree_cache.dirty) * 4 > - c->btree_cache.used * 3) - min_nr = 1; - if (fifo_free(&j->pin) <= 32) min_nr = 1; - min_nr = max(min_nr, bch2_nr_btree_keys_want_flush(c)); - trace_journal_reclaim_start(c, min_nr, j->prereserved.reserved, @@ -613,14 +649,19 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); - nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); + nr_flushed = journal_flush_pins(j, seq_to_flush, + min_nr, + min(bch2_nr_btree_keys_need_flush(c), 128UL)); if (direct) j->nr_direct_reclaim += nr_flushed; else j->nr_background_reclaim += nr_flushed; trace_journal_reclaim_finish(c, nr_flushed); - } while (min_nr && nr_flushed); + + if (nr_flushed) + wake_up(&j->reclaim_wait); + } while (min_nr && nr_flushed && !direct); memalloc_noreclaim_restore(flags); @@ -713,7 +754,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - *did_work = journal_flush_pins(j, seq_to_flush, 0) != 0; + *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0; spin_lock(&j->lock); /* diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index d17a1ff..c24bc4a 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -43,6 +43,7 @@ struct journal_buf { struct journal_entry_pin_list { struct list_head list; + struct list_head key_cache_list; struct list_head flushed; atomic_t count; struct bch_devs_list devs; @@ -50,7 +51,7 @@ struct journal_entry_pin_list { struct journal; struct journal_entry_pin; -typedef void (*journal_pin_flush_fn)(struct journal *j, +typedef int (*journal_pin_flush_fn)(struct journal *j, struct journal_entry_pin *, u64); struct journal_entry_pin { @@ -105,8 +106,9 @@ union journal_preres_state { }; struct { - u32 reserved; - u32 remaining; + u64 waiting:1, + reserved:31, + remaining:32; }; }; @@ -243,6 +245,7 @@ struct journal { spinlock_t err_lock; struct mutex 
reclaim_lock; + wait_queue_head_t reclaim_wait; struct task_struct *reclaim_thread; bool reclaim_kicked; u64 nr_direct_reclaim; @@ -250,6 +253,7 @@ struct journal { unsigned long last_flushed; struct journal_entry_pin *flush_in_progress; + bool flush_in_progress_dropped; wait_queue_head_t pin_flush_wait; /* protects advancing ja->discard_idx: */ diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 1403616..ef69a19 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -88,6 +88,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags if (ret) break; } + bch2_trans_iter_put(&trans, iter); ret = bch2_trans_exit(&trans) ?: ret; bch2_bkey_buf_exit(&sk, c); @@ -135,20 +136,24 @@ retry: dev_idx, flags, true); if (ret) { bch_err(c, "Cannot drop device without losing data"); - goto err; + break; } ret = bch2_btree_node_update_key(c, iter, b, k.k); if (ret == -EINTR) { b = bch2_btree_iter_peek_node(iter); + ret = 0; goto retry; } if (ret) { bch_err(c, "Error updating btree node key: %i", ret); - goto err; + break; } } bch2_trans_iter_free(&trans, iter); + + if (ret) + goto err; } /* flush relevant btree updates */ diff --git a/libbcachefs/move.c b/libbcachefs/move.c index c9e1849..5b10849 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -793,6 +793,9 @@ next: out: bch2_trans_exit(&trans); + if (ret) + bch_err(c, "error %i in bch2_move_btree", ret); + return ret; } @@ -916,8 +919,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) rewrite_old_nodes_pred, c, stats); if (!ret) { mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; c->disk_sb.sb->version_min = c->disk_sb.sb->version; bch2_write_super(c); mutex_unlock(&c->sb_lock); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index f9312f0..0cfbb56 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -21,6 +21,11 @@ const char * const bch2_sb_features[] = { NULL }; +const char * const bch2_sb_compat[] = { + BCH_SB_COMPAT() + NULL +}; + const char * const bch2_btree_ids[] = { BCH_BTREE_IDS() NULL diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 4ae58b6..001e865 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -10,6 +10,7 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; extern const char * const bch2_btree_ids[]; extern const char * const bch2_csum_opts[]; extern const char * const bch2_compression_opts[]; diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 3d1bf87..86593e9 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -935,7 +935,7 @@ static int read_btree_roots(struct bch_fs *c) if (i == BTREE_ID_alloc && c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); continue; } @@ -945,7 +945,7 @@ static int read_btree_roots(struct bch_fs *c) "invalid btree root %s", bch2_btree_ids[i]); if (i == BTREE_ID_alloc) - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } ret = bch2_btree_root_read(c, i, &r->key, r->level); @@ -955,7 +955,7 @@ static int read_btree_roots(struct bch_fs *c) 
"error reading btree root %s", bch2_btree_ids[i]); if (i == BTREE_ID_alloc) - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } } @@ -998,7 +998,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) { + if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); ret = -EINVAL; goto err; @@ -1041,7 +1041,7 @@ int bch2_fs_recovery(struct bch_fs *c) last_journal_entry && !journal_entry_empty(last_journal_entry), c, "filesystem marked clean but journal not empty")) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); c->sb.clean = false; } @@ -1075,7 +1075,7 @@ use_clean: } if (c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); } @@ -1128,8 +1128,8 @@ use_clean: set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); if (c->opts.fsck || - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { bch_info(c, "starting mark and sweep"); err = "error in mark and sweep"; @@ -1215,11 +1215,11 @@ use_clean: bch_verbose(c, "quotas done"); } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE))) { + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { struct bch_move_stats stats = { 0 }; - bch_verbose(c, "scanning for old btree nodes"); + bch_info(c, "scanning for old btree nodes"); ret = bch2_fs_read_write(c); if (ret) goto err; @@ -1227,7 +1227,7 @@ use_clean: ret = bch2_scan_old_btree_nodes(c, &stats); if (ret) goto err; - bch_verbose(c, "scanning for old btree nodes done"); + bch_info(c, "scanning for old btree nodes done"); } mutex_lock(&c->sb_lock); @@ -1238,7 +1238,7 @@ use_clean: } if (!test_bit(BCH_FS_ERROR, &c->flags)) { - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; write_sb = true; } @@ -1289,8 +1289,8 @@ int bch2_fs_initialize(struct bch_fs *c) bch_notice(c, "initializing new filesystem"); mutex_lock(&c->sb_lock); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_EXTENTS_ABOVE_BTREE_UPDATES_DONE; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_BFORMAT_OVERFLOW_DONE; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done; if (c->opts.version_upgrade) { c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index be73b45..1e29717 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -271,11 +271,13 @@ static int replicas_table_update(struct bch_fs *c, struct bch_replicas_cpu *new_r) { struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; - struct bch_fs_usage *new_scratch = NULL; + struct bch_fs_usage_online *new_scratch = NULL; struct bch_fs_usage __percpu 
*new_gc = NULL; struct bch_fs_usage *new_base = NULL; unsigned i, bytes = sizeof(struct bch_fs_usage) + sizeof(u64) * new_r->nr; + unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + + sizeof(u64) * new_r->nr; int ret = 0; memset(new_usage, 0, sizeof(new_usage)); @@ -286,7 +288,7 @@ static int replicas_table_update(struct bch_fs *c, goto err; if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || - !(new_scratch = kmalloc(bytes, GFP_KERNEL)) || + !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || (c->usage_gc && !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) goto err; @@ -462,6 +464,36 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, return 0; } +/* replicas delta list: */ + +bool bch2_replicas_delta_list_marked(struct bch_fs *c, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + + percpu_rwsem_assert_held(&c->mark_lock); + + for (d = r->d; d != top; d = replicas_delta_next(d)) + if (bch2_replicas_entry_idx(c, &d->r) < 0) + return false; + return true; +} + +int bch2_replicas_delta_list_mark(struct bch_fs *c, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + int ret = 0; + + for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) + ret = bch2_mark_replicas(c, &d->r); + return ret; +} + +/* bkey replicas: */ + bool bch2_bkey_replicas_marked(struct bch_fs *c, struct bkey_s_c k) { @@ -473,6 +505,11 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return __bch2_mark_bkey_replicas(c, k, false); } +/* + * Old replicas_gc mechanism: only used for journal replicas entries now, should + * die at some point: + */ + int bch2_replicas_gc_end(struct bch_fs *c, int ret) { unsigned i; @@ -566,6 +603,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) return 0; } +/* New much simpler mechanism for clearing out unneeded replicas entries: */ + int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; @@ -966,11 +1005,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, percpu_down_read(&c->mark_lock); for_each_cpu_replicas_entry(&c->replicas, e) { - unsigned i, nr_online = 0, dflags = 0; + unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; - for (i = 0; i < e->nr_devs; i++) + for (i = 0; i < e->nr_devs; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); + nr_online += test_bit(e->devs[i], devs.d); + nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; + } + + if (nr_failed == e->nr_devs) + continue; if (nr_online < e->nr_required) dflags |= metadata diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index 9c8fd3d..c77e873 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -26,6 +26,31 @@ bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + unsigned size; + unsigned used; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; + struct replicas_delta d[0]; +}; + +static inline struct replicas_delta * +replicas_delta_next(struct replicas_delta *d) +{ + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + +bool bch2_replicas_delta_list_marked(struct bch_fs *, 
struct replicas_delta_list *); +int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); + void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 6f5d539..1793697 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -377,7 +377,6 @@ static void bch2_sb_update(struct bch_fs *c) ca->mi = bch2_mi_to_cpu(mi->members + i); } -/* doesn't copy member info */ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) { struct bch_sb_field *src_f, *dst_f; @@ -996,7 +995,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct bch_dev *ca; unsigned i, dev; - percpu_down_write(&c->mark_lock); + percpu_down_read(&c->mark_lock); if (!journal_seq) { for (i = 0; i < ARRAY_SIZE(c->usage); i++) @@ -1067,7 +1066,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, } } - percpu_up_write(&c->mark_lock); + percpu_up_read(&c->mark_lock); for (i = 0; i < 2; i++) { struct jset_entry_clock *clock = @@ -1093,8 +1092,8 @@ void bch2_fs_mark_clean(struct bch_fs *c) SET_BCH_SB_CLEAN(c->disk_sb.sb, true); - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; - c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info; + c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata; c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index c57ebff..2d00897 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -153,6 +153,8 @@ read_attribute(io_latency_stats_read); read_attribute(io_latency_stats_write); read_attribute(congested); +read_attribute(btree_avg_write_size); + read_attribute(bucket_quantiles_last_read); read_attribute(bucket_quantiles_last_write); read_attribute(bucket_quantiles_fragmentation); @@ -230,9 +232,17 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } +static size_t bch2_btree_avg_write_size(struct bch_fs *c) +{ + u64 nr = atomic64_read(&c->btree_writes_nr); + u64 sectors = atomic64_read(&c->btree_writes_sectors); + + return nr ? 
div64_u64(sectors, nr) : 0; +} + static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) { - struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); + struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); if (!fs_usage) return -ENOMEM; @@ -318,6 +328,7 @@ SHOW(bch2_fs) sysfs_print(block_size, block_bytes(c)); sysfs_print(btree_node_size, btree_bytes(c)); sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); sysfs_print(read_realloc_races, atomic_long_read(&c->read_realloc_races)); @@ -513,6 +524,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_block_size, &sysfs_btree_node_size, &sysfs_btree_cache_size, + &sysfs_btree_avg_write_size, &sysfs_journal_write_delay_ms, &sysfs_journal_reclaim_delay_ms, @@ -800,7 +812,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) pr_buf(out, "ec\t%16llu\n" "available%15llu\n" - "alloc\t%16llu\n" "\n" "free_inc\t\t%zu/%zu\n" "free[RESERVE_MOVINGGC]\t%zu/%zu\n" @@ -813,7 +824,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) "btree reserve cache\t%u\n", stats.buckets_ec, __dev_buckets_available(ca, stats), - stats.buckets_alloc, fifo_used(&ca->free_inc), ca->free_inc.size, fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, diff --git a/linux/six.c b/linux/six.c index fe72189..fca1208 100644 --- a/linux/six.c +++ b/linux/six.c @@ -252,12 +252,13 @@ retry: old.v, new.v)) != old.v); ret = !(old.v & l[type].lock_fail); + + EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); } if (ret) six_set_owner(lock, type, old); - EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking)); return ret; -- 2.39.2
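
For context on the accounting rework in this patch: the old commit path allocated a scratch bch_fs_usage (bch2_fs_usage_scratch_get(), GFP_NOWAIT) after taking mark_lock and applied the replicas delta list into it, unwinding on failure. The new flow is two-phase: bch2_trans_commit_write_locked() only checks, under mark_lock, that every entry in trans->fs_usage_deltas is already present in the replicas table (bch2_replicas_delta_list_marked()); if not, it returns BTREE_INSERT_NEED_MARK_REPLICAS, and bch2_trans_commit_error() drops the locks, adds the missing entries with bch2_replicas_delta_list_mark(), and the transaction retries. Once everything is marked, bch2_trans_fs_usage_apply() folds the deltas straight into the percpu usage counters. Below is a minimal userspace sketch of that check-then-mark-and-retry pattern; the structures, locking, and function names are simplified stand-ins for illustration, not the bcachefs implementation.

/*
 * Simplified model: a read/write lock standing in for c->mark_lock, a
 * boolean table standing in for the replicas table, and a small "delta
 * list" of entry indices a transaction wants to account against.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NEED_MARK_REPLICAS 1

static pthread_rwlock_t mark_lock = PTHREAD_RWLOCK_INITIALIZER;

static bool table[8];                       /* "replicas table" */

struct delta_list { int entries[4]; int nr; };

/* under mark_lock (read): cheap check, no allocation, no modification */
static bool delta_list_marked(const struct delta_list *d)
{
	for (int i = 0; i < d->nr; i++)
		if (!table[d->entries[i]])
			return false;
	return true;
}

/* outside the commit path's read lock: slow path that adds missing entries */
static void delta_list_mark(const struct delta_list *d)
{
	pthread_rwlock_wrlock(&mark_lock);
	for (int i = 0; i < d->nr; i++)
		table[d->entries[i]] = true;
	pthread_rwlock_unlock(&mark_lock);
}

/* commit path: only applies the deltas once every entry is already marked */
static int commit_write_locked(const struct delta_list *d)
{
	int ret = 0;

	pthread_rwlock_rdlock(&mark_lock);
	if (!delta_list_marked(d))
		ret = NEED_MARK_REPLICAS;   /* caller unlocks, marks, retries */
	else
		puts("deltas applied to in-memory usage");
	pthread_rwlock_unlock(&mark_lock);
	return ret;
}

int main(void)
{
	struct delta_list d = { .entries = { 1, 3 }, .nr = 2 };
	int ret;

	while ((ret = commit_write_locked(&d)) == NEED_MARK_REPLICAS)
		delta_list_mark(&d);        /* mirrors the NEED_MARK_REPLICAS retry */

	return ret;
}

The point of the split, as far as the diff shows, is that the commit fast path no longer allocates or modifies the replicas table while holding mark_lock; allocation and table growth happen only on the unlocked retry path, and the usage deltas themselves are accumulated per transaction and applied in one shot by bch2_trans_fs_usage_apply().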