From b6740e5392a0e0a5fddc5ad4ffac0567078e114a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Dec 2023 21:57:44 -0500 Subject: [PATCH] Update bcachefs sources to 841a95c29f4c bcachefs: fix userspace build errors Signed-off-by: Kent Overstreet --- .bcachefs_revision | 2 +- Makefile | 2 +- include/linux/compiler.h | 1 + include/linux/printk.h | 1 + include/linux/rcupdate.h | 1 + include/linux/refcount.h | 352 ++++++++++++++ libbcachefs/alloc_background.c | 42 +- libbcachefs/alloc_foreground.c | 33 +- libbcachefs/backpointers.c | 8 +- libbcachefs/bcachefs.h | 48 +- libbcachefs/bcachefs_format.h | 5 +- libbcachefs/bcachefs_ioctl.h | 5 +- libbcachefs/btree_gc.c | 41 +- libbcachefs/btree_io.c | 35 +- libbcachefs/btree_iter.c | 644 ++++++++++++++----------- libbcachefs/btree_iter.h | 274 +++++------ libbcachefs/btree_key_cache.c | 32 +- libbcachefs/btree_key_cache.h | 2 - libbcachefs/btree_locking.c | 73 ++- libbcachefs/btree_locking.h | 7 +- libbcachefs/btree_trans_commit.c | 149 ++---- libbcachefs/btree_types.h | 97 ++-- libbcachefs/btree_update.c | 188 +++----- libbcachefs/btree_update.h | 53 +- libbcachefs/btree_update_interior.c | 201 ++++---- libbcachefs/btree_update_interior.h | 11 +- libbcachefs/btree_write_buffer.c | 523 ++++++++++++++------ libbcachefs/btree_write_buffer.h | 52 +- libbcachefs/btree_write_buffer_types.h | 63 ++- libbcachefs/chardev.c | 106 +++- libbcachefs/clock.c | 3 +- libbcachefs/data_update.c | 3 +- libbcachefs/debug.c | 39 +- libbcachefs/dirent.c | 19 +- libbcachefs/dirent.h | 1 + libbcachefs/ec.c | 48 +- libbcachefs/errcode.h | 1 - libbcachefs/error.h | 2 +- libbcachefs/extents.c | 3 +- libbcachefs/fs-io.c | 14 +- libbcachefs/fs-ioctl.c | 6 +- libbcachefs/fs.c | 22 +- libbcachefs/fsck.c | 144 +++--- libbcachefs/inode.c | 53 +- libbcachefs/journal.c | 53 +- libbcachefs/journal.h | 2 + libbcachefs/journal_io.c | 85 ++-- libbcachefs/journal_reclaim.c | 19 +- libbcachefs/journal_reclaim.h | 1 + libbcachefs/journal_types.h | 8 + 
libbcachefs/logged_ops.c | 5 +- libbcachefs/move.c | 26 +- libbcachefs/opts.h | 7 +- libbcachefs/quota.c | 8 +- libbcachefs/rebalance.c | 2 +- libbcachefs/recovery.c | 63 ++- libbcachefs/recovery.h | 1 + libbcachefs/recovery_types.h | 73 +-- libbcachefs/sb-errors.h | 2 +- libbcachefs/six.h | 6 +- libbcachefs/snapshot.c | 45 +- libbcachefs/super.c | 34 +- libbcachefs/sysfs.c | 4 +- libbcachefs/trace.h | 10 +- 64 files changed, 2429 insertions(+), 1434 deletions(-) create mode 100644 include/linux/refcount.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 393a80e..a675c09 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -6d44812757ddf81fad087d6abe662355e6712e02 +841a95c29f4caefb9c3875466024f3549f45f842 diff --git a/Makefile b/Makefile index 5693c0e..a9feb67 100644 --- a/Makefile +++ b/Makefile @@ -270,7 +270,7 @@ update-bcachefs-sources: git add include/linux/kmemleak.h cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/ git add linux/int_sqrt.c - git rm libbcachefs/mean_and_variance_test.c + git rm -f libbcachefs/mean_and_variance_test.c # cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/ # git add linux/mean_and_variance.c # cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5778690..02fc334 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -47,6 +47,7 @@ #define __builtin_warning(x, y...) 
(1) #define __must_hold(x) #define __acquires(x) +#define __cond_acquires(x) #define __releases(x) #define __acquire(x) (void)0 #define __release(x) (void)0 diff --git a/include/linux/printk.h b/include/linux/printk.h index df9c192..cdafb9a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -19,6 +19,7 @@ #define KERN_DEBUG "" #define KERN_DEFAULT "" #define KERN_CONT "" +#define KERN_SOH "\001" static inline int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ec5f478..f526027 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -12,6 +12,7 @@ #define rcu_access_pointer(p) READ_ONCE(p) #define kfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */ +#define kfree_rcu_mightsleep(ptr) kfree(ptr) /* XXX */ #define kvfree_rcu_mightsleep(ptr) kfree(ptr) /* XXX */ #define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v) diff --git a/include/linux/refcount.h b/include/linux/refcount.h new file mode 100644 index 0000000..ddeec98 --- /dev/null +++ b/include/linux/refcount.h @@ -0,0 +1,352 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Variant of atomic_t specialized for reference counts. + * + * The interface matches the atomic_t interface (to aid in porting) but only + * provides the few functions one should use for reference counting. + * + * Saturation semantics + * ==================== + * + * refcount_t differs from atomic_t in that the counter saturates at + * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the + * counter and causing 'spurious' use-after-free issues. In order to avoid the + * cost associated with introducing cmpxchg() loops into all of the saturating + * operations, we temporarily allow the counter to take on an unchecked value + * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow + * or overflow has occurred. 
Although this is racy when multiple threads + * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly + * equidistant from 0 and INT_MAX we minimise the scope for error: + * + * INT_MAX REFCOUNT_SATURATED UINT_MAX + * 0 (0x7fff_ffff) (0xc000_0000) (0xffff_ffff) + * +--------------------------------+----------------+----------------+ + * <---------- bad value! ----------> + * + * (in a signed view of the world, the "bad value" range corresponds to + * a negative counter value). + * + * As an example, consider a refcount_inc() operation that causes the counter + * to overflow: + * + * int old = atomic_fetch_add_relaxed(r); + * // old is INT_MAX, refcount now INT_MIN (0x8000_0000) + * if (old < 0) + * atomic_set(r, REFCOUNT_SATURATED); + * + * If another thread also performs a refcount_inc() operation between the two + * atomic operations, then the count will continue to edge closer to 0. If it + * reaches a value of 1 before /any/ of the threads reset it to the saturated + * value, then a concurrent refcount_dec_and_test() may erroneously free the + * underlying object. + * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently + * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK). 
+ * With the current PID limit, if no batched refcounting operations are used and + * the attacker can't repeatedly trigger kernel oopses in the middle of refcount + * operations, this makes it impossible for a saturated refcount to leave the + * saturation range, even if it is possible for multiple uses of the same + * refcount to nest in the context of a single task: + * + * (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT = + * 0x40000000 / 0x400000 = 0x100 = 256 + * + * If hundreds of references are added/removed with a single refcounting + * operation, it may potentially be possible to leave the saturation range; but + * given the precise timing details involved with the round-robin scheduling of + * each thread manipulating the refcount and the need to hit the race multiple + * times in succession, there doesn't appear to be a practical avenue of attack + * even if using refcount_add() operations with larger increments. + * + * Memory ordering + * =============== + * + * Memory ordering rules are slightly relaxed wrt regular atomic_t functions + * and provide only what is strictly required for refcounts. + * + * The increments are fully relaxed; these will not provide ordering. The + * rationale is that whatever is used to obtain the object we're increasing the + * reference count on will provide the ordering. For locked data structures, + * its the lock acquire, for RCU/lockless data structures its the dependent + * load. + * + * Do note that inc_not_zero() provides a control dependency which will order + * future stores against the inc, this ensures we'll never modify the object + * if we did not in fact acquire a reference. + * + * The decrements will provide release order, such that all the prior loads and + * stores will be issued before, it also provides a control dependency, which + * will order us against the subsequent free(). + * + * The control dependency is against the load of the cmpxchg (ll/sc) that + * succeeded. 
This means the stores aren't fully ordered, but this is fine + * because the 1->0 transition indicates no concurrency. + * + * Note that the allocator is responsible for ordering things between free() + * and alloc(). + * + * The decrements dec_and_test() and sub_and_test() also provide acquire + * ordering on success. + * + */ + +#ifndef _LINUX_REFCOUNT_H +#define _LINUX_REFCOUNT_H + +#include +#include +#include +#include + +struct mutex; + +/** + * typedef refcount_t - variant of atomic_t specialized for reference counts + * @refs: atomic_t counter field + * + * The counter saturates at REFCOUNT_SATURATED and will not move once + * there. This avoids wrapping the counter and causing 'spurious' + * use-after-free bugs. + */ +typedef struct refcount_struct { + atomic_t refs; +} refcount_t; + +#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } +#define REFCOUNT_MAX INT_MAX +#define REFCOUNT_SATURATED (INT_MIN / 2) + +enum refcount_saturation_type { + REFCOUNT_ADD_NOT_ZERO_OVF, + REFCOUNT_ADD_OVF, + REFCOUNT_ADD_UAF, + REFCOUNT_SUB_UAF, + REFCOUNT_DEC_LEAK, +}; + +/** + * refcount_set - set a refcount's value + * @r: the refcount + * @n: value to which the refcount will be set + */ +static inline void refcount_set(refcount_t *r, int n) +{ + atomic_set(&r->refs, n); +} + +/** + * refcount_read - get a refcount's value + * @r: the refcount + * + * Return: the refcount's value + */ +static inline unsigned int refcount_read(const refcount_t *r) +{ + return atomic_read(&r->refs); +} + +static inline __must_check bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp) +{ + int old = refcount_read(r); + + do { + if (!old) + break; + } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); + + if (oldp) + *oldp = old; + + return old; +} + +/** + * refcount_add_not_zero - add a value to a refcount unless it is 0 + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at REFCOUNT_SATURATED and WARN. 
+ * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + * + * Return: false if the passed refcount is 0, true otherwise + */ +static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) +{ + return __refcount_add_not_zero(i, r, NULL); +} + +static inline void __refcount_add(int i, refcount_t *r, int *oldp) +{ + int old = atomic_add_return(i, &r->refs) - i; /* add_return yields the new value; subtract i to report the pre-add count */ + + if (oldp) + *oldp = old; +} + +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + */ +static inline void refcount_add(int i, refcount_t *r) +{ + __refcount_add(i, r, NULL); +} + +static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero(1, r, oldp); +} + +/** + * refcount_inc_not_zero - increment a refcount unless it is 0 + * @r: the refcount to increment + * + * Similar to atomic_inc_not_zero(), but will saturate at REFCOUNT_SATURATED + * and WARN.
+ * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise + */ +static inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + return __refcount_inc_not_zero(r, NULL); +} + +static inline void __refcount_inc(refcount_t *r, int *oldp) +{ + __refcount_add(1, r, oldp); +} + +/** + * refcount_inc - increment a refcount + * @r: the refcount to increment + * + * Similar to atomic_inc(), but will saturate at REFCOUNT_SATURATED and WARN. + * + * Provides no memory ordering, it is assumed the caller already has a + * reference on the object. + * + * Will WARN if the refcount is 0, as this represents a possible use-after-free + * condition. + */ +static inline void refcount_inc(refcount_t *r) +{ + __refcount_inc(r, NULL); +} + +static inline __must_check bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp) +{ + int old = atomic_sub_return_release(i, &r->refs) + i; /* sub_return yields the new value; add i back so old is the pre-decrement count */ + + if (oldp) + *oldp = old; + + if (old == i) { + smp_acquire__after_ctrl_dep(); + return true; + } + + return false; +} + +/** + * refcount_sub_and_test - subtract from a refcount and test if it is 0 + * @i: amount to subtract from the refcount + * @r: the refcount + * + * Similar to atomic_dec_and_test(), but it will WARN, return false and + * ultimately leak on underflow and will fail to decrement when saturated + * at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time.
In these + * cases, refcount_dec(), or one of its variants, should instead be used to + * decrement a reference count. + * + * Return: true if the resulting refcount is 0, false otherwise + */ +static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r) +{ + return __refcount_sub_and_test(i, r, NULL); +} + +static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp) +{ + return __refcount_sub_and_test(1, r, oldp); +} + +/** + * refcount_dec_and_test - decrement a refcount and test if it is 0 + * @r: the refcount + * + * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to + * decrement when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Return: true if the resulting refcount is 0, false otherwise + */ +static inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + return __refcount_dec_and_test(r, NULL); +} + +static inline void __refcount_dec(refcount_t *r, int *oldp) +{ + int old = atomic_sub_return_release(1, &r->refs) + 1; /* sub_return yields the new value; add 1 back to report the pre-decrement count */ + + if (oldp) + *oldp = old; +} + +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * + * Similar to atomic_dec(), it will WARN on underflow and fail to decrement + * when saturated at REFCOUNT_SATURATED. + * + * Provides release memory ordering, such that prior loads and stores are done + * before.
+ */ +static inline void refcount_dec(refcount_t *r) +{ + __refcount_dec(r, NULL); +} + +extern __must_check bool refcount_dec_if_one(refcount_t *r); +extern __must_check bool refcount_dec_not_one(refcount_t *r); +extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock); +extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock); +extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, + spinlock_t *lock, + unsigned long *flags) __cond_acquires(lock); +#endif /* _LINUX_REFCOUNT_H */ diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index ad4ad79..769c37f 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -544,8 +544,8 @@ int bch2_bucket_gens_init(struct bch_fs *c) u8 gen; int ret; - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -572,8 +572,8 @@ int bch2_bucket_gens_init(struct bch_fs *c) } g.v.gens[offset] = gen; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); if (have_bucket_gens_key && !ret) ret = commit_do(trans, NULL, NULL, @@ -582,8 +582,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) bch2_trans_put(trans); - if (ret) - bch_err_fn(c, ret); + bch_err_fn(c, ret); return ret; } @@ -601,8 +600,8 @@ int bch2_alloc_read(struct bch_fs *c) const struct bch_bucket_gens *g; u64 b; - for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; @@ -624,13 +623,13 @@ int bch2_alloc_read(struct 
bch_fs *c) b < min_t(u64, ca->mi.nbuckets, end); b++) *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } else { struct bch_alloc_v4 a; - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ /* * Not a fsck error because this is checked/repaired by * bch2_check_alloc_key() which runs later: @@ -641,16 +640,14 @@ int bch2_alloc_read(struct bch_fs *c) ca = bch_dev_bkey_exists(c, k.k->p.inode); *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } bch2_trans_put(trans); up_read(&c->gc_lock); - if (ret) - bch_err_fn(c, ret); - + bch_err_fn(c, ret); return ret; } @@ -876,8 +873,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos bch2_trans_copy_iter(&iter2, iter); - if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) - end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + struct btree_path *path = btree_iter_path(iter->trans, iter); + if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); @@ -1430,7 +1428,7 @@ bkey_err: if (ret < 0) goto err; - ret = for_each_btree_key2(trans, iter, + ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, bch2_check_discard_freespace_key(trans, &iter)); @@ -1696,8 +1694,8 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, - BTREE_ID_need_discard, POS_MIN, 0, k, + for_each_btree_key(trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &seen, &open, diff --git a/libbcachefs/alloc_foreground.c 
b/libbcachefs/alloc_foreground.c index a961df7..986b914 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -239,9 +239,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - if (!c->blocked_allocate_open_bucket) - c->blocked_allocate_open_bucket = local_clock(); - + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -267,19 +266,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - if (c->blocked_allocate_open_bucket) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate_open_bucket], - c->blocked_allocate_open_bucket); - c->blocked_allocate_open_bucket = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], + &c->blocked_allocate_open_bucket, false); - if (c->blocked_allocate) { - bch2_time_stats_update( - &c->times[BCH_TIME_blocked_allocate], - c->blocked_allocate); - c->blocked_allocate = 0; - } + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, false); spin_unlock(&c->freelist_lock); return ob; @@ -377,9 +368,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); err: - if (iter.trans && iter.path) + if (iter.path) set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -447,7 +438,7 @@ again: ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); next: - citer.path->preserve = false; + set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -502,7 +493,7 @@ again: ob = try_alloc_bucket(trans, ca, 
watermark, alloc_cursor, s, k, cl); if (ob) { - iter.path->preserve = false; + set_btree_iter_dontneed(&iter); break; } } @@ -567,8 +558,8 @@ again: goto again; } - if (!c->blocked_allocate) - c->blocked_allocate = local_clock(); + track_event_change(&c->times[BCH_TIME_blocked_allocate], + &c->blocked_allocate, true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 9b5f580..d029936 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -417,8 +417,11 @@ static int check_bp_exists(struct btree_trans *trans, struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; + struct bkey_buf tmp; int ret; + bch2_bkey_buf_init(&tmp); + if (bpos_lt(bucket, bucket_start) || bpos_gt(bucket, bucket_end)) return 0; @@ -438,6 +441,8 @@ static int check_bp_exists(struct btree_trans *trans, if (!bpos_eq(orig_k.k->p, last_flushed->k->k.p) || bkey_bytes(orig_k.k) != bkey_bytes(&last_flushed->k->k) || memcmp(orig_k.v, &last_flushed->k->v, bkey_val_bytes(orig_k.k))) { + bch2_bkey_buf_reassemble(&tmp, c, orig_k); + if (bp.level) { bch2_trans_unlock(trans); bch2_btree_interior_updates_flush(c); @@ -447,7 +452,7 @@ static int check_bp_exists(struct btree_trans *trans, if (ret) goto err; - bch2_bkey_buf_reassemble(last_flushed, c, orig_k); + bch2_bkey_buf_copy(last_flushed, c, tmp.k); ret = -BCH_ERR_transaction_restart_write_buffer_flush; goto out; } @@ -457,6 +462,7 @@ out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); + bch2_bkey_buf_exit(&tmp, c); printbuf_exit(&buf); return ret; missing: diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 66de8c0..e8bee13 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -193,6 +193,7 @@ #include #include #include +#include #include #include #include @@ -264,6 +265,7 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") +__printf(2, 3) void __bch2_print(struct bch_fs *c, const 
char *fmt, ...); #define maybe_dev_to_fs(_c) _Generic((_c), \ @@ -426,6 +428,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_max_in_flight) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ + x(blocked_write_buffer_full) \ x(nocow_lock_contended) enum bch_time_stats { @@ -636,7 +639,7 @@ struct btree_transaction_stats { struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; - unsigned wb_updates_size; + unsigned journal_entries_size; unsigned max_mem; char *max_paths_text; }; @@ -725,6 +728,7 @@ struct bch_fs { dev_t dev; char name[40]; struct log_output *output; + struct task_struct *output_filter; /* ro/rw, add/remove/resize devices: */ struct rw_semaphore state_lock; @@ -735,6 +739,13 @@ struct bch_fs { #else struct percpu_ref writes; #endif + /* + * Analogous to c->writes, for asynchronous ops that don't necessarily + * need fs to be read-write + */ + refcount_t ro_ref; + wait_queue_head_t ro_ref_wait; + struct work_struct read_only_work; struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; @@ -1037,10 +1048,21 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; + /* + * Two different uses: + * "Has this fsck pass run?" - i.e. should this type of error be an + * emergency read-only + * And, in certain situations fsck will rewind to an earlier pass: used + * for signaling to the toplevel code which pass we want to run now.
+ */ enum bch_recovery_pass curr_recovery_pass; /* bitmap of explicitly enabled recovery passes: */ u64 recovery_passes_explicit; + /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; + /* never rewinds version of curr_recovery_pass */ + enum bch_recovery_pass recovery_pass_done; + struct semaphore online_fsck_mutex; /* DEBUG JUNK */ struct dentry *fs_debug_dir; @@ -1100,6 +1122,16 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) #endif } +static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_going_ro, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget(&c->writes); +#endif +} + static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) { #ifdef BCH_WRITE_REF_DEBUG @@ -1129,6 +1161,20 @@ static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) #endif } +static inline bool bch2_ro_ref_tryget(struct bch_fs *c) +{ + if (test_bit(BCH_FS_stopping, &c->flags)) + return false; + + return refcount_inc_not_zero(&c->ro_ref); +} + +static inline void bch2_ro_ref_put(struct bch_fs *c) +{ + if (refcount_dec_and_test(&c->ro_ref)) + wake_up(&c->ro_ref_wait); +} + static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) { #ifndef NO_BCACHEFS_FS diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index be0367f..ebd4f25 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -2137,7 +2137,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(clock, 7) \ x(dev_usage, 8) \ x(log, 9) \ - x(overwrite, 10) + x(overwrite, 10) \ + x(write_buffer_keys, 11) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -2223,7 +2224,7 @@ static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage struct jset_entry_log { struct jset_entry entry; u8 d[]; -} __packed; +} 
__packed __aligned(8); /* * On disk format for a journal entry: diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index 2ac6272..21f81b1 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -83,9 +83,8 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2) -#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) - -#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) +#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) +#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) /* ioctl below act on a particular file, not the filesystem as a whole: */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 5a7b72a..ae88066 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -1538,8 +1538,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) rcu_assign_pointer(ca->buckets_gc, buckets); } - ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ({ + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ ca = bch_dev_bkey_exists(c, k.k->p.inode); g = gc_bucket(ca, k.k->p.offset); @@ -1665,7 +1665,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) static int bch2_gc_reflink_start(struct bch_fs *c, bool metadata_only) { - struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; @@ -1674,30 +1673,30 @@ static int bch2_gc_reflink_start(struct bch_fs *c, if (metadata_only) return 0; - trans = bch2_trans_get(c); c->reflink_gc_nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - const __le64 *refcount = bkey_refcount_c(k); + ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + const __le64 *refcount = 
bkey_refcount_c(k); - if (!refcount) - continue; + if (!refcount) + continue; - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; - break; - } + r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, + GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - } - bch2_trans_iter_exit(trans, &iter); + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + 0; + }))); - bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index a6ac68f..f0cbc91 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -968,12 +968,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bch_btree_ptr_v2 *bp = &bkey_i_to_btree_ptr_v2(&b->key)->v; + bch2_bpos_to_text(&buf, b->data->min_key); + prt_str(&buf, "-"); + bch2_bpos_to_text(&buf, b->data->max_key); + btree_err_on(b->data->keys.seq != bp->seq, -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, btree_node_bad_seq, - "got wrong btree node (seq %llx want %llx)", - b->data->keys.seq, bp->seq); + "got wrong btree node (want %llx got %llx)\n" + "got btree %s level %llu pos %s", + bp->seq, b->data->keys.seq, + bch2_btree_id_str(BTREE_NODE_ID(b->data)), + BTREE_NODE_LEVEL(b->data), + buf.buf); } else { btree_err_on(!b->data->keys.seq, -BCH_ERR_btree_node_read_err_must_retry, @@ -2007,6 +2015,29 @@ do_write: /* buffer must be a multiple of the block size */ bytes = round_up(bytes, block_bytes(c)); + if (bytes > btree_bytes(c)) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "btree node write bounce buffer overrun: %u > %zu\n", + bytes, btree_bytes(c)); + + prt_printf(&buf, "header: %zu\n", b->written + ? 
sizeof(struct btree_node) + : sizeof(struct btree_node_entry)); + prt_printf(&buf, "unwritten: %zu\n", b->whiteout_u64s * sizeof(u64)); + + for_each_bset(b, t) { + i = bset(b, t); + + if (bset_written(b, i)) + continue; + prt_printf(&buf, "bset %zu: %zu\n", t - b->set, le16_to_cpu(i->u64s) * sizeof(u64)); + } + + panic("%s", buf.buf); + printbuf_exit(&buf); + } + data = btree_bounce_alloc(c, bytes, &used_mempool); if (!b->written) { diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 929f33d..2bd712e 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -13,6 +13,7 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "journal_io.h" #include "replicas.h" #include "snapshot.h" #include "trace.h" @@ -21,8 +22,8 @@ #include static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); -static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, - struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, + btree_path_idx_t, btree_path_idx_t); static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) { @@ -33,7 +34,8 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) #endif } -static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); +static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t); +static void bch2_trans_srcu_lock(struct btree_trans *); static inline int __btree_path_cmp(const struct btree_path *l, enum btree_id r_btree_id, @@ -239,8 +241,9 @@ static void bch2_btree_path_verify(struct btree_trans *trans, void bch2_trans_verify_paths(struct btree_trans *trans) { struct btree_path *path; + unsigned iter; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, iter) bch2_btree_path_verify(trans, path); } @@ -250,7 +253,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) BUG_ON(iter->btree_id >= BTREE_ID_NR); - 
BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); @@ -260,8 +263,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) - bch2_btree_path_verify(trans, iter->update_path); - bch2_btree_path_verify(trans, iter->path); + bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); + bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) @@ -330,12 +333,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos, bool key_cache) { struct btree_path *path; - unsigned idx; + struct trans_for_each_path_inorder_iter iter; struct printbuf buf = PRINTBUF; btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) { + trans_for_each_path_inorder(trans, path, iter) { int cmp = cmp_int(path->btree_id, id) ?: cmp_int(path->cached, key_cache); @@ -415,8 +418,9 @@ void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct bkey_packed *where) { struct btree_path *path; + unsigned i; - trans_for_each_path_with_node(trans, b, path) { + trans_for_each_path_with_node(trans, b, path, i) { __bch2_btree_path_fix_key_modified(path, b, where); bch2_btree_path_verify_level(trans, path, b->c.level); } @@ -523,6 +527,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, { struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); struct btree_path *linked; + unsigned i; if (node_iter != &path->l[b->c.level].iter) { __bch2_btree_node_iter_fix(path, b, node_iter, t, @@ -532,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, bch2_btree_node_iter_verify(node_iter, b); } - trans_for_each_path_with_node(trans, b, linked) { + 
trans_for_each_path_with_node(trans, b, linked, i) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); @@ -655,7 +660,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str i->btree_id == b->c.btree_id && bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { - i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v; if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = @@ -674,14 +679,22 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str * A btree node is being replaced - update the iterator to point to the new * node: */ -void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) +void bch2_trans_node_add(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { - struct btree_path *path; + struct btree_path *prev; - trans_for_each_path(trans, path) - if (path->uptodate == BTREE_ITER_UPTODATE && - !path->cached && - btree_path_pos_in_node(path, b)) { + BUG_ON(!btree_path_pos_in_node(path, b)); + + while ((prev = prev_btree_path(trans, path)) && + btree_path_pos_in_node(prev, b)) + path = prev; + + for (; + path && btree_path_pos_in_node(path, b); + path = next_btree_path(trans, path)) + if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) { enum btree_node_locked_type t = btree_lock_want(path, b->c.level); @@ -704,8 +717,9 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) { struct btree_path *path; + unsigned i; - trans_for_each_path_with_node(trans, b, path) + trans_for_each_path_with_node(trans, b, path, i) __btree_path_level_init(path, b->c.level); bch2_trans_revalidate_updates_in_node(trans, b); @@ -953,7 +967,8 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) 
struct bch_fs *c = trans->c; struct btree_path *path; unsigned long trace_ip = _RET_IP_; - int i, ret = 0; + unsigned i; + int ret = 0; if (trans->in_traverse_all) return -BCH_ERR_transaction_restart_in_traverse_all; @@ -963,7 +978,7 @@ retry_all: trans->restarted = 0; trans->last_restarted_ip = 0; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) path->should_be_locked = false; btree_trans_sort_paths(trans); @@ -985,16 +1000,16 @@ retry_all: /* Now, redo traversals in correct order: */ i = 0; while (i < trans->nr_sorted) { - path = trans->paths + trans->sorted[i]; + btree_path_idx_t idx = trans->sorted[i]; /* * Traversing a path can cause another path to be added at about * the same position: */ - if (path->uptodate) { - __btree_path_get(path, false); - ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); - __btree_path_put(path, false); + if (trans->paths[idx].uptodate) { + __btree_path_get(&trans->paths[idx], false); + ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_); + __btree_path_put(&trans->paths[idx], false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, ENOMEM)) @@ -1099,10 +1114,11 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, * stashed in the iterator and returned from bch2_trans_exit(). 
*/ int bch2_btree_path_traverse_one(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, unsigned flags, unsigned long trace_ip) { + struct btree_path *path = &trans->paths[path_idx]; unsigned depth_want = path->level; int ret = -((int) trans->restarted); @@ -1126,6 +1142,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } + path = &trans->paths[path_idx]; + if (unlikely(path->level >= BTREE_MAX_DEPTH)) goto out; @@ -1188,37 +1206,39 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path } } -static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, - bool intent) +static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src, + bool intent) { - struct btree_path *new = btree_path_alloc(trans, src); + btree_path_idx_t new = btree_path_alloc(trans, src); - btree_path_copy(trans, new, src); - __btree_path_get(new, intent); + btree_path_copy(trans, trans->paths + new, trans->paths + src); + __btree_path_get(trans->paths + new, intent); return new; } __flatten -struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent, - unsigned long ip) +btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans, + btree_path_idx_t path, bool intent, unsigned long ip) { - __btree_path_put(path, intent); + __btree_path_put(trans->paths + path, intent); path = btree_path_clone(trans, path, intent); - path->preserve = false; + trans->paths[path].preserve = false; return path; } -struct btree_path * __must_check +btree_path_idx_t __must_check __bch2_btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool intent, unsigned long ip, int cmp) + btree_path_idx_t path_idx, struct bpos new_pos, + bool intent, unsigned long ip) { + int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); + bch2_trans_verify_not_in_restart(trans); - 
EBUG_ON(!path->ref); + EBUG_ON(!trans->paths[path_idx].ref); - path = bch2_btree_path_make_mut(trans, path, intent, ip); + path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip); + struct btree_path *path = trans->paths + path_idx; path->pos = new_pos; trans->paths_sorted = false; @@ -1259,7 +1279,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, } out: bch2_btree_path_verify(trans, path); - return path; + return path_idx; } /* Btree path: main interface: */ @@ -1294,19 +1314,16 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr return NULL; } -static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) +static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path) { - __bch2_btree_path_unlock(trans, path); - btree_path_list_remove(trans, path); - __clear_bit(path->idx, trans->paths_allocated); + __bch2_btree_path_unlock(trans, trans->paths + path); + btree_path_list_remove(trans, trans->paths + path); + __clear_bit(path, trans->paths_allocated); } -void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) +void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { - struct btree_path *dup; - - EBUG_ON(trans->paths + path->idx != path); - EBUG_ON(!path->ref); + struct btree_path *path = trans->paths + path_idx, *dup; if (!__btree_path_put(path, intent)) return; @@ -1328,16 +1345,13 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte dup->should_be_locked |= path->should_be_locked; } - __bch2_path_free(trans, path); + __bch2_path_free(trans, path_idx); } -static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, +static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, bool intent) { - EBUG_ON(trans->paths + path->idx != path); - EBUG_ON(!path->ref); - - if (!__btree_path_put(path, intent)) + if (!__btree_path_put(trans->paths 
+ path, intent)) return; __bch2_path_free(trans, path); @@ -1361,7 +1375,6 @@ noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; prt_printf(buf, "transaction updates for %s journal seq %llu", trans->fn, trans->journal_res.seq); @@ -1386,16 +1399,10 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } - trans_for_each_wb_update(trans, wb) { - prt_printf(buf, "update: btree=%s wb=1 %pS", - bch2_btree_id_str(wb->btree), - (void *) i->ip_allocated); - prt_newline(buf); - - prt_printf(buf, " new "); - bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); - prt_newline(buf); - } + for (struct jset_entry *e = trans->journal_entries; + e != btree_trans_journal_entries_top(trans); + e = vstruct_next(e)) + bch2_journal_entry_to_text(buf, trans->c, e); printbuf_indent_sub(buf, 2); } @@ -1410,11 +1417,12 @@ void bch2_dump_trans_updates(struct btree_trans *trans) printbuf_exit(&buf); } -noinline __cold -void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) +static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { + struct btree_path *path = trans->paths + path_idx; + prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", - path->idx, path->ref, path->intent_ref, + path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 
'S' : ' ', bch2_btree_id_str(path->btree_id), @@ -1432,14 +1440,13 @@ static noinline __cold void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, bool nosort) { - struct btree_path *path; - unsigned idx; + struct trans_for_each_path_inorder_iter iter; if (!nosort) btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, idx) - bch2_btree_path_to_text(out, path); + trans_for_each_path_idx_inorder(trans, iter) + bch2_btree_path_to_text(out, trans, iter.path_idx); } noinline __cold @@ -1471,7 +1478,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) { struct btree_transaction_stats *s = btree_trans_stats(trans); struct printbuf buf = PRINTBUF; - size_t nr = bitmap_weight(trans->paths_allocated, BTREE_ITER_MAX); + size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths); if (!s) return; @@ -1489,7 +1496,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) printbuf_exit(&buf); - trans->nr_max_paths = nr; + trans->nr_paths_max = nr; } noinline __cold @@ -1508,60 +1515,88 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *trans) return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } -static noinline void btree_path_overflow(struct btree_trans *trans) +static noinline void btree_paths_realloc(struct btree_trans *trans) { - bch2_dump_trans_paths_updates(trans); - panic("trans path overflow\n"); + unsigned nr = trans->nr_paths * 2; /* grow by doubling; one allocation backs bitmap, paths, sorted[] and updates[] */ + + void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + + nr + 8 + + sizeof(struct btree_trans_paths) + + nr * sizeof(struct btree_path) + + nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL); + + unsigned long *paths_allocated = p; + p += BITS_TO_LONGS(nr) * sizeof(unsigned long); + struct btree_path *paths = p; /* NOTE(review): sizeof(struct btree_trans_paths) is allocated above but p is never advanced past it - confirm trans_paths_nr(paths) has room before paths */ + p += nr * sizeof(struct btree_path); + u8 *sorted = p; + p += nr + 8; + struct btree_insert_entry *updates = p; + + *trans_paths_nr(paths) = nr; + + memcpy(paths_allocated, 
trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long)); + memcpy(sorted, trans->sorted, trans->nr_sorted); + memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path)); + memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry)); /* updates[] holds btree_insert_entry, so copy with that element size */ + + unsigned long *old = trans->paths_allocated; + + rcu_assign_pointer(trans->paths_allocated, paths_allocated); + rcu_assign_pointer(trans->sorted, sorted); + rcu_assign_pointer(trans->paths, paths); + rcu_assign_pointer(trans->updates, updates); + + trans->nr_paths = nr; + + if (old != trans->_paths_allocated) + kfree_rcu_mightsleep(old); /* free the superseded buffer, not the one just installed */ } -static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, - struct btree_path *pos) +static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, + btree_path_idx_t pos) { - struct btree_path *path; - size_t idx = find_first_zero_bit(trans->paths_allocated, BTREE_ITER_MAX); + btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths); - if (unlikely(idx == BTREE_ITER_MAX)) - btree_path_overflow(trans); - - BUG_ON(idx > BTREE_ITER_MAX); + if (unlikely(idx == trans->nr_paths)) + btree_paths_realloc(trans); /* * Do this before marking the new path as allocated, since it won't be * initialized yet: */ - if (unlikely(idx > trans->nr_max_paths)) + if (unlikely(idx > trans->nr_paths_max)) bch2_trans_update_max_paths(trans); __set_bit(idx, trans->paths_allocated); - path = &trans->paths[idx]; - path->idx = idx; + struct btree_path *path = &trans->paths[idx]; path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; - path->alloc_seq++; - btree_path_list_add(trans, pos, path); + btree_path_list_add(trans, pos, idx); trans->paths_sorted = false; - return path; + return idx; } -struct btree_path *bch2_path_get(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - unsigned locks_want, unsigned level, - unsigned flags, unsigned long ip) +btree_path_idx_t 
bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, + unsigned flags, unsigned long ip) { - struct btree_path *path, *path_pos = NULL; + struct btree_path *path; bool cached = flags & BTREE_ITER_CACHED; bool intent = flags & BTREE_ITER_INTENT; - int i; + struct trans_for_each_path_inorder_iter iter; + btree_path_idx_t path_pos = 0, path_idx; bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); - trans_for_each_path_inorder(trans, path, i) { + trans_for_each_path_inorder(trans, path, iter) { if (__btree_path_cmp(path, btree_id, cached, @@ -1569,18 +1604,19 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, level) > 0) break; - path_pos = path; + path_pos = iter.path_idx; } if (path_pos && - path_pos->cached == cached && - path_pos->btree_id == btree_id && - path_pos->level == level) { - __btree_path_get(path_pos, intent); - path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + trans->paths[path_pos].cached == cached && + trans->paths[path_pos].btree_id == btree_id && + trans->paths[path_pos].level == level) { + __btree_path_get(trans->paths + path_pos, intent); + path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + path = trans->paths + path_idx; } else { - path = btree_path_alloc(trans, path_pos); - path_pos = NULL; + path_idx = btree_path_alloc(trans, path_pos); + path = trans->paths + path_idx; __btree_path_get(path, intent); path->pos = pos; @@ -1591,7 +1627,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->level = level; path->locks_want = locks_want; path->nodes_locked = 0; - for (i = 0; i < ARRAY_SIZE(path->l); i++) + for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); #ifdef TRACK_PATH_ALLOCATED path->ip_allocated = ip; @@ -1617,7 +1653,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, if (locks_want > 
path->locks_want) bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); - return path; + return path_idx; } struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) @@ -1672,9 +1708,10 @@ __bch2_btree_iter_traverse(struct btree_iter *iter) int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; int ret; - iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); @@ -1683,7 +1720,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(trans->paths + iter->path); return 0; } @@ -1695,14 +1732,15 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) struct btree *b = NULL; int ret; - EBUG_ON(iter->path->cached); + EBUG_ON(trans->paths[iter->path].cached); bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) goto err; - b = btree_path_node(iter->path, iter->path->level); + struct btree_path *path = btree_iter_path(trans, iter); + b = btree_path_node(path, path->level); if (!b) goto out; @@ -1714,7 +1752,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1739,14 +1777,15 @@ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; - struct btree_path *path = iter->path; struct btree *b = NULL; 
int ret; + EBUG_ON(trans->paths[iter->path].cached); bch2_trans_verify_not_in_restart(trans); - EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); + struct btree_path *path = btree_iter_path(trans, iter); + /* already at end? */ if (!btree_path_node(path, path->level)) return NULL; @@ -1776,17 +1815,19 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) * Haven't gotten to the end of the parent node: go back down to * the next child node */ - path = iter->path = - bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), - iter->flags & BTREE_ITER_INTENT, - btree_iter_ip_allocated(iter)); + iter->path = bch2_btree_path_set_pos(trans, iter->path, + bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + path = btree_iter_path(trans, iter); btree_path_set_level_down(trans, path, iter->min_depth); - ret = bch2_btree_path_traverse(trans, path, iter->flags); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) goto err; + path = btree_iter_path(trans, iter); b = path->l[path->level].b; } @@ -1796,8 +1837,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); - BUG_ON(iter->path->uptodate); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); + EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -1839,15 +1880,16 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) static noinline struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct btree_insert_entry *i; struct bkey_i *ret = NULL; - trans_for_each_update(iter->trans, i) { + trans_for_each_update(trans, i) { if (i->btree_id < iter->btree_id) continue; if (i->btree_id > 
iter->btree_id) break; - if (bpos_lt(i->k->k.p, iter->path->pos)) + if (bpos_lt(i->k->k.p, btree_iter_path(trans, iter)->pos)) continue; if (i->key_cache_already_flushed) continue; @@ -1869,9 +1911,11 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, struct btree_iter *iter, struct bpos end_pos) { + struct btree_path *path = btree_iter_path(trans, iter); + return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, - iter->path->level, - iter->path->pos, + path->level, + path->pos, end_pos, &iter->journal_idx); } @@ -1880,7 +1924,8 @@ static noinline struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, struct btree_iter *iter) { - struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos); if (k) { iter->k = k->k; @@ -1895,9 +1940,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : path_l(iter->path)->b->key.k.p); + k.k ? 
k.k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; @@ -1940,13 +1986,13 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED) ?: - bch2_btree_path_relock(trans, iter->path, _THIS_IP_); + bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); - btree_path_set_should_be_locked(iter->key_cache_path); + btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); - k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); + k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (k.k && !bkey_err(k)) { iter->k = u; k.k = &iter->k; @@ -1961,7 +2007,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp struct bkey_s_c k, k2; int ret; - EBUG_ON(iter->path->cached); + EBUG_ON(btree_iter_path(trans, iter)->cached); bch2_btree_iter_verify(iter); while (1) { @@ -1979,7 +2025,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - l = path_l(iter->path); + struct btree_path *path = btree_iter_path(trans, iter); + l = path_l(path); if (unlikely(!l->b)) { /* No btree nodes at requested level: */ @@ -1988,7 +2035,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp goto out; } - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(path); k = btree_path_level_peek_all(trans->c, l, &iter->k); @@ -2068,7 +2115,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; } bch2_btree_iter_verify_entry_exit(iter); @@ -2096,10 +2143,10 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e goto end; if 
(iter->update_path && - !bkey_eq(iter->update_path->pos, k.k->p)) { + !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; } if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && @@ -2119,7 +2166,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e * advance, same as on exit for iter->path, but only up * to snapshot */ - __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT); iter->update_path = iter->path; iter->update_path = bch2_btree_path_set_pos(trans, @@ -2160,14 +2207,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out_no_locked: if (iter->update_path) { - ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); + ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_); if (unlikely(ret)) k = bkey_s_c_err(ret); else - btree_path_set_should_be_locked(iter->update_path); + btree_path_set_should_be_locked(trans->paths + iter->update_path); } if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) @@ -2214,13 +2261,14 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; - struct btree_path *saved_path = NULL; struct bkey_s_c k; struct bkey saved_k; const struct bch_val *saved_v; + btree_path_idx_t saved_path = 0; int ret; - EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(btree_iter_path(trans, iter)->cached || + btree_iter_path(trans, iter)->level); EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); if (iter->flags & BTREE_ITER_WITH_JOURNAL) @@ -2245,14 +2293,14 @@ struct 
bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) goto out_no_locked; } - k = btree_path_level_peek(trans, iter->path, - &iter->path->l[0], &iter->k); + struct btree_path *path = btree_iter_path(trans, iter); + + k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) ? bpos_ge(bkey_start_pos(k.k), search_key) : bpos_gt(k.k->p, search_key))) - k = btree_path_level_prev(trans, iter->path, - &iter->path->l[0], &iter->k); + k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); if (likely(k.k)) { if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { @@ -2268,7 +2316,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_path_put_nokeep(trans, iter->path, iter->flags & BTREE_ITER_INTENT); iter->path = saved_path; - saved_path = NULL; + saved_path = 0; iter->k = saved_k; k.v = saved_v; goto got_key; @@ -2282,6 +2330,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) iter->flags & BTREE_ITER_INTENT); saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_INTENT); + path = btree_iter_path(trans, iter); saved_k = *k.k; saved_v = k.v; } @@ -2298,10 +2347,11 @@ got_key: continue; } + btree_path_set_should_be_locked(path); break; - } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { + } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ - search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); + search_key = bpos_predecessor(path->l[0].b->data->min_key); } else { /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); @@ -2318,8 +2368,6 @@ got_key: if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) iter->pos.snapshot = iter->snapshot; - - btree_path_set_should_be_locked(iter->path); out_no_locked: if (saved_path) bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); @@ -2354,7 +2402,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct 
btree_iter *iter) bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && @@ -2399,7 +2447,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } - k = bch2_btree_path_peek_slot(iter->path, &iter->k); + k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; } else { @@ -2409,7 +2457,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_IS_EXTENTS) end.offset = U64_MAX; - EBUG_ON(iter->path->level); + EBUG_ON(btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; @@ -2455,7 +2503,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - btree_path_set_should_be_locked(iter->path); + btree_path_set_should_be_locked(btree_iter_path(trans, iter)); out_no_locked: bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); @@ -2502,11 +2550,11 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) struct btree_path *path; unsigned i; - BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, BTREE_ITER_MAX)); + BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1); - trans_for_each_path(trans, path) { + trans_for_each_path(trans, path, i) { BUG_ON(path->sorted_idx >= trans->nr_sorted); - BUG_ON(trans->sorted[path->sorted_idx] != path->idx); + BUG_ON(trans->sorted[path->sorted_idx] != i); } for (i = 0; i < trans->nr_sorted; i++) { @@ -2520,12 +2568,12 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans) static void btree_trans_verify_sorted(struct btree_trans *trans) { struct btree_path *path, *prev = NULL; - 
unsigned i; + struct trans_for_each_path_inorder_iter iter; if (!bch2_debug_check_iterators) return; - trans_for_each_path_inorder(trans, path, i) { + trans_for_each_path_inorder(trans, path, iter) { if (prev && btree_path_cmp(prev, path) > 0) { __bch2_dump_trans_paths_updates(trans, true); panic("trans paths out of order!\n"); @@ -2600,21 +2648,22 @@ static inline void btree_path_list_remove(struct btree_trans *trans, } static inline void btree_path_list_add(struct btree_trans *trans, - struct btree_path *pos, - struct btree_path *path) + btree_path_idx_t pos, + btree_path_idx_t path_idx) { + struct btree_path *path = trans->paths + path_idx; unsigned i; - path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; + path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, trans->sorted + path->sorted_idx, DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); trans->nr_sorted++; - trans->sorted[path->sorted_idx] = path->idx; + trans->sorted[path->sorted_idx] = path_idx; #else - array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx); #endif for (i = path->sorted_idx; i < trans->nr_sorted; i++) @@ -2634,9 +2683,10 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, iter->flags & BTREE_ITER_INTENT); - iter->path = NULL; - iter->update_path = NULL; - iter->key_cache_path = NULL; + iter->path = 0; + iter->update_path = 0; + iter->key_cache_path = 0; + iter->trans = NULL; } void bch2_trans_iter_init_outlined(struct btree_trans *trans, @@ -2667,41 +2717,47 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, iter->min_depth = depth; - BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); - BUG_ON(iter->path->level != 
depth); - BUG_ON(iter->min_depth != depth); + struct btree_path *path = btree_iter_path(trans, iter); + BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(path->level != depth); + BUG_ON(iter->min_depth != depth); } void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) { + struct btree_trans *trans = src->trans; + *dst = *src; if (src->path) - __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) - __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); - dst->key_cache_path = NULL; + __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT); + dst->key_cache_path = 0; } void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { + struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; - size_t old_bytes = trans->mem_bytes; - size_t new_bytes = roundup_pow_of_two(new_top); + unsigned old_bytes = trans->mem_bytes; + unsigned new_bytes = roundup_pow_of_two(new_top); int ret; void *new_mem; void *p; - trans->mem_max = max(trans->mem_max, new_top); - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + struct btree_transaction_stats *s = btree_trans_stats(trans); + if (s) + s->max_mem = max(s->max_mem, new_bytes); + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); if (unlikely(!new_mem)) { bch2_trans_unlock(trans); new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); new_bytes = BTREE_TRANS_MEM_MAX; kfree(trans->mem); } @@ -2721,7 +2777,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); + trace_and_count(c, 
trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); } @@ -2743,8 +2799,9 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans) if (trans->srcu_held) { struct bch_fs *c = trans->c; struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->cached && !btree_node_locked(path, 0)) path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); @@ -2754,7 +2811,7 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans) } } -void bch2_trans_srcu_lock(struct btree_trans *trans) +static void bch2_trans_srcu_lock(struct btree_trans *trans) { if (!trans->srcu_held) { trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier); @@ -2776,14 +2833,16 @@ void bch2_trans_srcu_lock(struct btree_trans *trans) u32 bch2_trans_begin(struct btree_trans *trans) { struct btree_path *path; + unsigned i; u64 now; bch2_trans_reset_updates(trans); trans->restart_count++; trans->mem_top = 0; + trans->journal_entries = NULL; - trans_for_each_path(trans, path) { + trans_for_each_path(trans, path, i) { path->should_be_locked = false; /* @@ -2800,7 +2859,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) * iterators if we do that */ if (!path->ref && !path->preserve) - __bch2_path_free(trans, path); + __bch2_path_free(trans, i); else path->preserve = false; } @@ -2827,25 +2886,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) return trans->restart_count; } -static struct btree_trans *bch2_trans_alloc(struct bch_fs *c) -{ - struct btree_trans *trans; - - if (IS_ENABLED(__KERNEL__)) { - trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); - if (trans) - return trans; - } - - trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); - /* - * paths need to be zeroed, bch2_check_for_deadlock looks at - * paths in other threads - */ - memset(&trans->paths, 0, sizeof(trans->paths)); - return trans; -} - const char 
*bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; unsigned bch2_trans_get_fn_idx(const char *fn) @@ -2867,69 +2907,85 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) __acquires(&c->btree_trans_barrier) { struct btree_trans *trans; - struct btree_transaction_stats *s; - - trans = bch2_trans_alloc(c); - - memset(trans, 0, sizeof(*trans)); - trans->c = c; - trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) - ? bch2_btree_transaction_fns[fn_idx] : NULL; - trans->last_begin_time = local_clock(); - trans->fn_idx = fn_idx; - trans->locking_wait.task = current; - trans->journal_replay_not_finished = - unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && - atomic_inc_not_zero(&c->journal_keys.ref); - closure_init_stack(&trans->ref); - - s = btree_trans_stats(trans); - if (s && s->max_mem) { - unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); - trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); - - if (!unlikely(trans->mem)) { - trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); - trans->mem_bytes = BTREE_TRANS_MEM_MAX; - } else { - trans->mem_bytes = expected_mem_bytes; + if (IS_ENABLED(__KERNEL__)) { + trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL); + if (trans) { + memset(trans, 0, offsetof(struct btree_trans, list)); + goto got_trans; } } - if (s) { - trans->nr_max_paths = s->nr_max_paths; - trans->wb_updates_size = s->wb_updates_size; - } - - trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); - trans->srcu_lock_time = jiffies; - trans->srcu_held = true; + trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS); + memset(trans, 0, sizeof(*trans)); + closure_init_stack(&trans->ref); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + seqmutex_lock(&c->btree_trans_lock); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct btree_trans *pos; + pid_t pid = current->pid; + + trans->locking_wait.task = current; - seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(pos, 
&c->btree_trans_list, list) { + struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task); /* * We'd much prefer to be stricter here and completely * disallow multiple btree_trans in the same thread - * but the data move path calls bch2_write when we * already have a btree_trans initialized. */ - BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && + BUG_ON(pos_task && + pid == pos_task->pid && bch2_trans_locked(pos)); - if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { + if (pos_task && pid < pos_task->pid) { list_add_tail(&trans->list, &pos->list); goto list_add_done; } } - list_add_tail(&trans->list, &c->btree_trans_list); + } + list_add_tail(&trans->list, &c->btree_trans_list); list_add_done: - seqmutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); +got_trans: + trans->c = c; + trans->last_begin_time = local_clock(); + trans->fn_idx = fn_idx; + trans->locking_wait.task = current; + trans->journal_replay_not_finished = + unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) && + atomic_inc_not_zero(&c->journal_keys.ref); + trans->nr_paths = ARRAY_SIZE(trans->_paths); + trans->paths_allocated = trans->_paths_allocated; + trans->sorted = trans->_sorted; + trans->paths = trans->_paths; + trans->updates = trans->_updates; + + *trans_paths_nr(trans->paths) = BTREE_ITER_MAX; + + trans->paths_allocated[0] = 1; + + if (fn_idx < BCH_TRANSACTIONS_NR) { + trans->fn = bch2_btree_transaction_fns[fn_idx]; + + struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx]; + + if (s->max_mem) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); + if (likely(trans->mem)) + trans->mem_bytes = expected_mem_bytes; + } + + trans->nr_paths_max = s->nr_max_paths; + trans->journal_entries_size = s->journal_entries_size; } + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held 
= true; return trans; } @@ -2938,14 +2994,15 @@ static void check_btree_paths_leaked(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG struct bch_fs *c = trans->c; struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->ref) goto leaked; return; leaked: bch_err(c, "btree paths leaked from %s!", trans->fn); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->ref) printk(KERN_ERR " btree %s %pS\n", bch2_btree_id_str(path->btree_id), @@ -2960,24 +3017,13 @@ void bch2_trans_put(struct btree_trans *trans) { struct btree_insert_entry *i; struct bch_fs *c = trans->c; - struct btree_transaction_stats *s = btree_trans_stats(trans); bch2_trans_unlock(trans); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { - seqmutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - seqmutex_unlock(&c->btree_trans_lock); - } - - closure_sync(&trans->ref); - - if (s) - s->max_mem = max(s->max_mem, trans->mem_max); - trans_for_each_update(trans, i) - __btree_path_put(i->path, true); - trans->nr_updates = 0; + __btree_path_put(trans->paths + i->path, true); + trans->nr_updates = 0; + trans->locking_wait.task = NULL; check_btree_paths_leaked(trans); @@ -2986,8 +3032,6 @@ void bch2_trans_put(struct btree_trans *trans) srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); } - kfree(trans->extra_journal_entries.data); - if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) @@ -3000,6 +3044,13 @@ void bch2_trans_put(struct btree_trans *trans) if (unlikely(trans->journal_replay_not_finished)) bch2_journal_keys_put(c); + unsigned long *paths_allocated = trans->paths_allocated; + trans->paths_allocated = NULL; + trans->paths = NULL; + + if (paths_allocated != trans->_paths_allocated) + kfree_rcu_mightsleep(paths_allocated); + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, 
&c->btree_trans_mem_pool); else @@ -3008,8 +3059,16 @@ void bch2_trans_put(struct btree_trans *trans) /* Userspace doesn't have a real percpu implementation: */ if (IS_ENABLED(__KERNEL__)) trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans); - if (trans) + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + mempool_free(trans, &c->btree_trans_pool); + } } static void __maybe_unused @@ -3037,12 +3096,14 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) { - struct btree_path *path; struct btree_bkey_cached_common *b; static char lock_types[] = { 'r', 'i', 'w' }; struct task_struct *task = READ_ONCE(trans->locking_wait.task); unsigned l, idx; + /* before rcu_read_lock(): */ + bch2_printbuf_make_room(out, 4096); + if (!out->nr_tabstops) { printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 32); @@ -3050,12 +3111,23 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); - trans_for_each_path_safe(trans, path, idx) { + /* trans->paths is rcu protected vs. freeing */ + rcu_read_lock(); + out->atomic++; + + struct btree_path *paths = rcu_dereference(trans->paths); + if (!paths) + goto out; + + unsigned long *paths_allocated = trans_paths_allocated(paths); + + trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) { + struct btree_path *path = paths + idx; if (!path->nodes_locked) continue; prt_printf(out, " path %u %c l=%u %s:", - path->idx, + idx, path->cached ? 
'c' : 'b', path->level, bch2_btree_id_str(path->btree_id)); @@ -3083,6 +3155,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) bch2_btree_bkey_cached_common_to_text(out, b); prt_newline(out); } +out: + --out->atomic; + rcu_read_unlock(); } void bch2_fs_btree_iter_exit(struct bch_fs *c) @@ -3091,15 +3166,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) struct btree_trans *trans; int cpu; + if (c->btree_trans_bufs) + for_each_possible_cpu(cpu) { + struct btree_trans *trans = + per_cpu_ptr(c->btree_trans_bufs, cpu)->trans; + + if (trans) { + closure_sync(&trans->ref); + + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + } + kfree(trans); + } + free_percpu(c->btree_trans_bufs); + trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list); if (trans) panic("%s leaked btree_trans\n", trans->fn); - if (c->btree_trans_bufs) - for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans); - free_percpu(c->btree_trans_bufs); - for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { @@ -3113,10 +3199,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) mempool_exit(&c->btree_trans_pool); } -int bch2_fs_btree_iter_init(struct bch_fs *c) +void bch2_fs_btree_iter_init_early(struct bch_fs *c) { struct btree_transaction_stats *s; - int ret; for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); @@ -3127,6 +3212,11 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) INIT_LIST_HEAD(&c->btree_trans_list); seqmutex_init(&c->btree_trans_lock); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) +{ + int ret; c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf); if (!c->btree_trans_bufs) diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index ea4fc8a..a361000 100644 --- a/libbcachefs/btree_iter.h +++ 
b/libbcachefs/btree_iter.h @@ -63,47 +63,57 @@ static inline void btree_trans_sort_paths(struct btree_trans *trans) __bch2_btree_trans_sort_paths(trans); } -static inline struct btree_path * -__trans_next_path(struct btree_trans *trans, unsigned idx) +static inline unsigned long *trans_paths_nr(struct btree_path *paths) { - idx = find_next_bit(trans->paths_allocated, BTREE_ITER_MAX, idx); - if (idx == BTREE_ITER_MAX) - return NULL; - EBUG_ON(idx > BTREE_ITER_MAX); - EBUG_ON(trans->paths[idx].idx != idx); - return &trans->paths[idx]; + return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths; } -#define trans_for_each_path_from(_trans, _path, _start) \ - for (_path = __trans_next_path((_trans), _start); \ - (_path); \ - _path = __trans_next_path((_trans), (_path)->idx + 1)) +static inline unsigned long *trans_paths_allocated(struct btree_path *paths) +{ + unsigned long *v = trans_paths_nr(paths); + return v - BITS_TO_LONGS(*v); +} -#define trans_for_each_path(_trans, _path) \ - trans_for_each_path_from(_trans, _path, 0) +#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\ + for (_idx = _start; \ + (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \ + _idx++) static inline struct btree_path * -__trans_next_path_safe(struct btree_trans *trans, unsigned *idx) -{ - *idx = find_next_bit(trans->paths_allocated, BTREE_ITER_MAX, *idx); - if (*idx == BTREE_ITER_MAX) - return NULL; +__trans_next_path(struct btree_trans *trans, unsigned *idx) +{ + unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG; + /* + * Open coded find_next_bit(), because + * - this is fast path, we can't afford the function call + * - and we know that nr_paths is a multiple of BITS_PER_LONG, + */ + while (*idx < trans->nr_paths) { + unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1)); + if (v) { + *idx += __ffs(v); + return trans->paths + *idx; + } + + *idx += BITS_PER_LONG; + *idx &= ~(BITS_PER_LONG - 1); + w++; + } - EBUG_ON(*idx > 
BTREE_ITER_MAX); - return &trans->paths[*idx]; + return NULL; } /* * This version is intended to be safe for use on a btree_trans that is owned by * another thread, for bch2_btree_trans_to_text(); */ -#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \ +#define trans_for_each_path_from(_trans, _path, _idx, _start) \ for (_idx = _start; \ - (_path = __trans_next_path_safe((_trans), &_idx)); \ + (_path = __trans_next_path((_trans), &_idx)); \ _idx++) -#define trans_for_each_path_safe(_trans, _path, _idx) \ - trans_for_each_path_safe_from(_trans, _path, _idx, 0) +#define trans_for_each_path(_trans, _path, _idx) \ + trans_for_each_path_from(_trans, _path, _idx, 1) static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { @@ -125,10 +135,23 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru : NULL; } -#define trans_for_each_path_inorder(_trans, _path, _i) \ - for (_i = 0; \ - ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ - _i++) +#define trans_for_each_path_idx_inorder(_trans, _iter) \ + for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ + (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ + _iter.sorted_idx < (_trans)->nr_sorted); \ + _iter.sorted_idx++) + +struct trans_for_each_path_inorder_iter { + btree_path_idx_t sorted_idx; + btree_path_idx_t path_idx; +}; + +#define trans_for_each_path_inorder(_trans, _path, _iter) \ + for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \ + (_iter.path_idx = trans->sorted[_iter.sorted_idx], \ + _path = (_trans)->paths + _iter.path_idx, \ + _iter.sorted_idx < (_trans)->nr_sorted); \ + _iter.sorted_idx++) #define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ for (_i = trans->nr_sorted - 1; \ @@ -144,67 +167,65 @@ static inline bool __path_has_node(const struct btree_path *path, static inline struct btree_path * __trans_next_path_with_node(struct btree_trans *trans, struct 
btree *b, - unsigned idx) + unsigned *idx) { - struct btree_path *path = __trans_next_path(trans, idx); + struct btree_path *path; - while (path && !__path_has_node(path, b)) - path = __trans_next_path(trans, path->idx + 1); + while ((path = __trans_next_path(trans, idx)) && + !__path_has_node(path, b)) + (*idx)++; return path; } -#define trans_for_each_path_with_node(_trans, _b, _path) \ - for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ - (_path); \ - _path = __trans_next_path_with_node((_trans), (_b), \ - (_path)->idx + 1)) +#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \ + for (_iter = 1; \ + (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\ + _iter++) -struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, - bool, unsigned long); +btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t, + bool, unsigned long); -static inline struct btree_path * __must_check +static inline btree_path_idx_t __must_check bch2_btree_path_make_mut(struct btree_trans *trans, - struct btree_path *path, bool intent, + btree_path_idx_t path, bool intent, unsigned long ip) { - if (path->ref > 1 || path->preserve) + if (trans->paths[path].ref > 1 || + trans->paths[path].preserve) path = __bch2_btree_path_make_mut(trans, path, intent, ip); - path->should_be_locked = false; + trans->paths[path].should_be_locked = false; return path; } -struct btree_path * __must_check -__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, - struct bpos, bool, unsigned long, int); +btree_path_idx_t __must_check +__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t, + struct bpos, bool, unsigned long); -static inline struct btree_path * __must_check +static inline btree_path_idx_t __must_check bch2_btree_path_set_pos(struct btree_trans *trans, - struct btree_path *path, struct bpos new_pos, - bool intent, unsigned long ip) + btree_path_idx_t path, struct bpos new_pos, + bool 
intent, unsigned long ip) { - int cmp = bpos_cmp(new_pos, path->pos); - - return cmp - ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) + return !bpos_eq(new_pos, trans->paths[path].pos) + ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip) : path; } -int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, +int __must_check bch2_btree_path_traverse_one(struct btree_trans *, + btree_path_idx_t, unsigned, unsigned long); static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - struct btree_path *path, unsigned flags) + btree_path_idx_t path, unsigned flags) { - if (path->uptodate < BTREE_ITER_NEED_RELOCK) + if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); } -int __must_check bch2_btree_path_traverse(struct btree_trans *, - struct btree_path *, unsigned); -struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, +btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, unsigned, unsigned, unsigned, unsigned long); struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); @@ -256,7 +277,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); -void bch2_path_put(struct btree_trans *, struct btree_path *, bool); +void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool); int bch2_trans_relock(struct btree_trans *); int bch2_trans_relock_notrace(struct btree_trans *); @@ -322,7 +343,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *); -void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, 
struct btree *); int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); @@ -361,10 +382,12 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { + struct btree_trans *trans = iter->trans; + if (unlikely(iter->update_path)) - bch2_path_put(iter->trans, iter->update_path, + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); - iter->update_path = NULL; + iter->update_path = 0; if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) new_pos.snapshot = iter->snapshot; @@ -433,8 +456,8 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, unsigned long ip) { iter->trans = trans; - iter->update_path = NULL; - iter->key_cache_path = NULL; + iter->update_path = 0; + iter->key_cache_path = 0; iter->btree_id = btree_id; iter->min_depth = 0; iter->flags = flags; @@ -473,8 +496,10 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); static inline void set_btree_iter_dontneed(struct btree_iter *iter) { - if (!iter->trans->restarted) - iter->path->preserve = false; + struct btree_trans *trans = iter->trans; + + if (!trans->restarted) + btree_iter_path(trans, iter)->preserve = false; } void *__bch2_trans_kmalloc(struct btree_trans *, size_t); @@ -496,7 +521,7 @@ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) { - size = roundup(size, 8); + size = round_up(size, 8); if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; @@ -565,7 +590,6 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, KEY_TYPE_##_type, sizeof(*_val), _val) void bch2_trans_srcu_unlock(struct btree_trans *); -void bch2_trans_srcu_lock(struct btree_trans *); u32 bch2_trans_begin(struct btree_trans *); @@ -618,7 +642,7 @@ int 
__bch2_btree_trans_too_many_iters(struct btree_trans *); static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (bitmap_weight(trans->paths_allocated, BTREE_ITER_MAX) > BTREE_ITER_MAX - 8) + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_MAX - 8) return __bch2_btree_trans_too_many_iters(trans); return 0; @@ -656,19 +680,24 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, return k; } +/* + * goto instead of loop, so that when used inside for_each_btree_key2() + * break/continue work correctly + */ #define lockrestart_do(_trans, _do) \ ({ \ + __label__ transaction_restart; \ u32 _restart_count; \ int _ret2; \ +transaction_restart: \ + _restart_count = bch2_trans_begin(_trans); \ + _ret2 = (_do); \ \ - do { \ - _restart_count = bch2_trans_begin(_trans); \ - _ret2 = (_do); \ - } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \ + if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \ + goto transaction_restart; \ \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ - \ _ret2; \ }) @@ -697,66 +726,34 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key2(_trans, _iter, _btree_id, \ - _start, _flags, _k, _do) \ -({ \ - int _ret3 = 0; \ - \ - bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - \ - _ret3 = 0; \ - (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ - if (!(_k).k) \ - break; \ - \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_advance(&(_iter))) \ - break; \ - } \ - \ - bch2_trans_iter_exit((_trans), &(_iter)); \ - _ret3; \ -}) - #define for_each_btree_key2_upto(_trans, _iter, 
_btree_id, \ - _start, _end, _flags, _k, _do) \ + _start, _end, _flags, _k, _do) \ ({ \ int _ret3 = 0; \ \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ + _end, (_flags)); \ + if (!(_k).k) \ + break; \ \ - _ret3 = 0; \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ - if (!(_k).k) \ - break; \ - \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_advance(&(_iter))) \ - break; \ - } \ + bkey_err(_k) ?: (_do); \ + })); \ + } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ }) +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ + for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, \ + SPOS_MAX, _flags, _k, _do) + #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ @@ -765,23 +762,16 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - while (1) { \ - u32 _restart_count = bch2_trans_begin(_trans); \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ - if (!(_k).k) { \ - _ret3 = 0; \ - break; \ - } \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \ + (_flags)); \ + if (!(_k).k) \ + break; \ \ - _ret3 = bkey_err(_k) ?: (_do); \ - if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\ - continue; \ - if (_ret3) \ - break; \ - bch2_trans_verify_not_restarted(_trans, _restart_count);\ - if (!bch2_btree_iter_rewind(&(_iter))) \ - break; \ - } \ + bkey_err(_k) ?: (_do); \ + })); \ + } while 
(!_ret3 && bch2_btree_iter_rewind(&(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -791,7 +781,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, _start, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) @@ -811,7 +801,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -#define for_each_btree_key(_trans, _iter, _btree_id, \ +#define for_each_btree_key_old(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ @@ -896,7 +886,6 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, /* new multiple iterator interface: */ void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); @@ -919,6 +908,7 @@ unsigned bch2_trans_get_fn_idx(const char *); void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); void bch2_fs_btree_iter_exit(struct bch_fs *); +void bch2_fs_btree_iter_init_early(struct bch_fs *); int bch2_fs_btree_iter_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index b39b28b..39fd4d8 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -630,7 +630,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, if (ret) goto out; - ck = (void *) c_iter.path->l[0].b; + ck = (void *) 
btree_iter_path(trans, &c_iter)->l[0].b; if (!ck) goto out; @@ -678,7 +678,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_journal_pin_drop(j, &ck->journal); - BUG_ON(!btree_node_locked(c_iter.path, 0)); + struct btree_path *path = btree_iter_path(trans, &c_iter); + BUG_ON(!btree_node_locked(path, 0)); if (!evict) { if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { @@ -687,19 +688,20 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, } } else { struct btree_path *path2; + unsigned i; evict: - trans_for_each_path(trans, path2) - if (path2 != c_iter.path) + trans_for_each_path(trans, path2, i) + if (path2 != path) __bch2_btree_path_unlock(trans, path2); - bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); + bch2_btree_node_lock_write_nofail(trans, path, &ck->c); if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_dec(&c->btree_key_cache.nr_dirty); } - mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); bkey_cached_evict(&c->btree_key_cache, ck); bkey_cached_free_fast(&c->btree_key_cache, ck); } @@ -747,28 +749,12 @@ unlock: return ret; } -/* - * Flush and evict a key from the key cache: - */ -int bch2_btree_key_cache_flush(struct btree_trans *trans, - enum btree_id id, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct bkey_cached_key key = { id, pos }; - - /* Fastpath - assume it won't be found: */ - if (!bch2_btree_key_cache_find(c, id, pos)) - return 0; - - return btree_key_cache_flush_pos(trans, key, 0, 0, true); -} - bool bch2_btree_insert_key_cached(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *insert_entry) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b; struct bkey_i *insert = insert_entry->k; bool kick_reclaim = false; 
diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index be3acde..e6b2cd0 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -31,8 +31,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, struct btree_insert_entry *); -int bch2_btree_key_cache_flush(struct btree_trans *, - enum btree_id, struct bpos); void bch2_btree_key_cache_drop(struct btree_trans *, struct btree_path *); diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 1eca320..1ed8327 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -32,13 +32,14 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, { struct btree_path *path; struct six_lock_count ret; + unsigned i; memset(&ret, 0, sizeof(ret)); if (IS_ERR_OR_NULL(b)) return ret; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path != skip && &path->l[level].b->c == b) { int t = btree_node_locked_type(path, level); @@ -94,9 +95,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + g->nr; i++) { + struct task_struct *task = i->trans->locking_wait.task; if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", i->trans->locking_wait.task->pid); + prt_printf(out, "%u ", task ?task->pid : 0); } prt_newline(out); } @@ -280,9 +282,8 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) struct lock_graph g; struct trans_waiting_for_lock *top; struct btree_bkey_cached_common *b; - struct btree_path *path; - unsigned path_idx; - int ret; + btree_path_idx_t path_idx; + int ret = 0; g.nr = 0; @@ -295,13 +296,26 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) } lock_graph_down(&g, trans); + + /* trans->paths is rcu protected vs. 
freeing */ + rcu_read_lock(); + if (cycle) + cycle->atomic++; next: if (!g.nr) - return 0; + goto out; top = &g.g[g.nr - 1]; - trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) { + struct btree_path *paths = rcu_dereference(top->trans->paths); + if (!paths) + goto up; + + unsigned long *paths_allocated = trans_paths_allocated(paths); + + trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), + path_idx, top->path_idx) { + struct btree_path *path = paths + path_idx; if (!path->nodes_locked) continue; @@ -367,18 +381,23 @@ next: ret = lock_graph_descend(&g, trans, cycle); if (ret) - return ret; + goto out; goto next; } raw_spin_unlock(&b->lock.wait_lock); } } - +up: if (g.nr > 1 && cycle) print_chain(cycle, &g); lock_graph_up(&g); goto next; +out: + if (cycle) + --cycle->atomic; + rcu_read_unlock(); + return ret; } int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) @@ -417,7 +436,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, struct btree_bkey_cached_common *b) { struct btree_path *linked; - unsigned i; + unsigned i, iter; int ret; /* @@ -431,7 +450,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, * already taken are no longer needed: */ - trans_for_each_path(trans, linked) { + trans_for_each_path(trans, linked, iter) { if (!linked->nodes_locked) continue; @@ -643,8 +662,6 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, unsigned new_locks_want, struct get_locks_fail *f) { - struct btree_path *linked; - if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f)) return true; @@ -667,8 +684,11 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, * before interior nodes - now that's handled by * bch2_btree_path_traverse_all(). 
*/ - if (!path->cached && !trans->in_traverse_all) - trans_for_each_path(trans, linked) + if (!path->cached && !trans->in_traverse_all) { + struct btree_path *linked; + unsigned i; + + trans_for_each_path(trans, linked, i) if (linked != path && linked->cached == path->cached && linked->btree_id == path->btree_id && @@ -676,6 +696,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->locks_want = new_locks_want; btree_path_get_locks(trans, linked, true, NULL); } + } return false; } @@ -708,7 +729,6 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, bch2_btree_path_verify_locks(path); - path->downgrade_seq++; trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } @@ -717,22 +737,24 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, void bch2_trans_downgrade(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (trans->restarted) return; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) bch2_btree_path_downgrade(trans, path); } int bch2_trans_relock(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->should_be_locked && !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); @@ -744,11 +766,12 @@ int bch2_trans_relock(struct btree_trans *trans) int bch2_trans_relock_notrace(struct btree_trans *trans) { struct btree_path *path; + unsigned i; if (unlikely(trans->restarted)) return -((int) trans->restarted); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->should_be_locked && !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); @@ -759,16 +782,18 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) void 
bch2_trans_unlock_noassert(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) __bch2_btree_path_unlock(trans, path); } void bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) __bch2_btree_path_unlock(trans, path); } @@ -781,8 +806,9 @@ void bch2_trans_unlock_long(struct btree_trans *trans) bool bch2_trans_locked(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->nodes_locked) return true; return false; @@ -828,8 +854,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) void bch2_trans_verify_locks(struct btree_trans *trans) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) bch2_btree_path_verify_locks(path); } diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index a49f1dd..64810ea 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -175,6 +175,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat struct btree *b) { struct btree_path *linked; + unsigned i; EBUG_ON(path->l[b->c.level].b != b); EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); @@ -182,7 +183,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked, i) linked->l[b->c.level].lock_seq++; six_unlock_write(&b->c.lock); @@ -242,8 +243,9 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, enum btree_node_locked_type want) { struct btree_path *path; + unsigned i; - trans_for_each_path(trans, path) + trans_for_each_path(trans, 
path, i) if (&path->l[level].b->c == b && btree_node_locked_type(path, level) >= want) { six_lock_increment(&b->lock, (enum six_lock_type) want); @@ -263,7 +265,6 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(!test_bit(path->idx, trans->paths_allocated)); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 336350b..47cbfe2 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -23,7 +23,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert #ifdef CONFIG_BCACHEFS_DEBUG struct bch_fs *c = trans->c; struct bkey u; - struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u); if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = @@ -41,23 +41,23 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert #endif } -static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i) { - return i->path->l + i->level; + return (trans->paths + i->path)->l + i->level; } static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { return i != trans->updates && - insert_l(&i[0])->b == insert_l(&i[-1])->b; + insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b; } static inline bool same_leaf_as_next(struct btree_trans *trans, struct btree_insert_entry *i) { return i + 1 < trans->updates + trans->nr_updates && - insert_l(&i[0])->b == insert_l(&i[1])->b; + insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b; } inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, @@ -84,7 +84,7 @@ 
static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre if (same_leaf_as_prev(trans, i)) continue; - bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b); } trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); @@ -101,11 +101,11 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) if (same_leaf_as_prev(trans, i)) continue; - if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c)) return trans_lock_write_fail(trans, i); if (!i->cached) - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b); } trans->write_locked = true; @@ -119,8 +119,8 @@ static inline void bch2_trans_unlock_write(struct btree_trans *trans) trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(trans, i->path, - insert_l(i)->b); + bch2_btree_node_unlock_write_inlined(trans, + trans->paths + i->path, insert_l(trans, i)->b); trans->write_locked = false; } } @@ -311,10 +311,12 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); - BUG_ON(i->cached != i->path->cached); - BUG_ON(i->level != i->path->level); - BUG_ON(i->btree_id != i->path->btree_id); + struct btree_path *path = trans->paths + i->path; + + BUG_ON(!bpos_eq(i->k->k.p, path->pos)); + BUG_ON(i->cached != path->cached); + BUG_ON(i->level != path->level); + BUG_ON(i->btree_id != path->btree_id); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && @@ -459,11 +461,9 @@ static int run_one_mem_trigger(struct btree_trans 
*trans, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { - struct bkey _deleted = KEY(0, 0, 0); + struct bkey _deleted = POS_KEY((trans->paths + i->path)->pos); struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - _deleted.p = i->path->pos; - ret = bch2_mark_key(trans, i->btree_id, i->level, deleted, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|flags) ?: @@ -607,7 +607,7 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) */ BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + if (gc_visited(c, gc_pos_btree_node(insert_l(trans, i)->b))) { ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); if (ret) break; @@ -624,7 +624,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; struct btree_trans_commit_hook *h; unsigned u64s = 0; int ret; @@ -649,8 +648,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, u64s += i->k->k.u64s; ret = !i->cached - ? btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, flags, i->path, u64s); + ? 
btree_key_can_insert(trans, insert_l(trans, i)->b, u64s) + : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s); if (ret) { *stopped_at = i; return ret; @@ -659,10 +658,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, i->k->k.needs_whiteout = false; } - if (trans->nr_wb_updates && - trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) - return -BCH_ERR_btree_insert_need_flush_buffer; - /* * Don't get journal reservation until after we know insert will * succeed: @@ -697,14 +692,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; - if (trans->nr_wb_updates) { - EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res); - - ret = bch2_btree_insert_keys_write_buffer(trans); - if (ret) - goto revert_fs_usage; - } - h = trans->hooks; while (h) { ret = h->fn(trans, h); @@ -726,15 +713,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } - if (unlikely(trans->extra_journal_entries.nr)) { - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries.data, - trans->extra_journal_entries.nr); - - trans->journal_res.offset += trans->extra_journal_entries.nr; - trans->journal_res.u64s -= trans->extra_journal_entries.nr; - } - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -764,26 +742,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bkey_copy((struct bkey_i *) entry->start, i->k); } - trans_for_each_wb_update(trans, wb) { - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_btree_keys, - wb->btree, 0, - wb->k.k.u64s); - bkey_copy((struct bkey_i *) entry->start, &wb->k); - } + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + 
trans->journal_entries, + trans->journal_entries_u64s); + + trans->journal_res.offset += trans->journal_entries_u64s; + trans->journal_res.u64s -= trans->journal_entries_u64s; if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } trans_for_each_update(trans, i) { + struct btree_path *path = trans->paths + i->path; + if (!i->cached) { - bch2_btree_insert_key_leaf(trans, i->path, i->k, trans->journal_res.seq); + bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq); } else if (!i->key_cache_already_flushed) bch2_btree_insert_key_cached(trans, flags, i); else { - bch2_btree_key_cache_drop(trans, i->path); - btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_key_cache_drop(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); } } @@ -799,13 +778,9 @@ revert_fs_usage: static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { struct btree_insert_entry *i; - struct btree_write_buffered_key *wb; trans_for_each_update(trans, i) bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); - - trans_for_each_wb_update(trans, wb) - bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); } static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, @@ -916,7 +891,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, case -BCH_ERR_btree_insert_btree_node_full: ret = bch2_btree_split_leaf(trans, i->path, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); + trace_and_count(c, trans_restart_btree_node_split, trans, + trace_ip, trans->paths + i->path); break; case -BCH_ERR_btree_insert_need_mark_replicas: ret = drop_locks_do(trans, @@ -950,30 +926,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ret = bch2_trans_relock(trans); break; - case -BCH_ERR_btree_insert_need_flush_buffer: { - struct btree_write_buffer 
*wb = &c->btree_write_buffer; - - ret = 0; - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_unlock(trans); - mutex_lock(&wb->flush_lock); - - if (wb->state.nr > wb->size * 3 / 4) { - bch2_trans_begin(trans); - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flush_lock); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - } else { - mutex_unlock(&wb->flush_lock); - ret = bch2_trans_relock(trans); - } - } - break; - } default: BUG_ON(ret >= 0); break; @@ -1031,12 +983,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; - struct btree_write_buffered_key *wb; int ret = 0; if (!trans->nr_updates && - !trans->nr_wb_updates && - !trans->extra_journal_entries.nr) + !trans->journal_entries_u64s) goto out_reset; ret = bch2_trans_commit_run_triggers(trans); @@ -1072,35 +1022,23 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; } - if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && - mutex_trylock(&c->btree_write_buffer.flush_lock)) { - bch2_trans_begin(trans); - bch2_trans_unlock(trans); - - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&c->btree_write_buffer.flush_lock); - if (!ret) { - trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); - } - goto out; - } - EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_u64s = trans->journal_entries_u64s; trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { - 
EBUG_ON(!i->path->should_be_locked); + struct btree_path *path = trans->paths + i->path; + + EBUG_ON(!path->should_be_locked); - ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + ret = bch2_btree_path_upgrade(trans, path, i->level + 1); if (unlikely(ret)) goto out; - EBUG_ON(!btree_node_intent_locked(i->path, i->level)); + EBUG_ON(!btree_node_intent_locked(path, i->level)); if (i->key_cache_already_flushed) continue; @@ -1116,12 +1054,9 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) trans->journal_u64s += jset_u64s(i->old_k.u64s); } - trans_for_each_wb_update(trans, wb) - trans->journal_u64s += jset_u64s(wb->k.k.u64s); - - if (trans->extra_journal_res) { + if (trans->extra_disk_res) { ret = bch2_disk_reservation_add(c, trans->disk_res, - trans->extra_journal_res, + trans->extra_disk_res, (flags & BCH_TRANS_COMMIT_no_enospc) ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 78d9f58..e4ebfc2 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -222,13 +222,12 @@ enum btree_path_uptodate { #define TRACK_PATH_ALLOCATED #endif +typedef u16 btree_path_idx_t; + struct btree_path { - u8 idx; - u8 sorted_idx; + btree_path_idx_t sorted_idx; u8 ref; u8 intent_ref; - u32 alloc_seq; - u32 downgrade_seq; /* btree_iter_copy starts here: */ struct bpos pos; @@ -282,9 +281,9 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path) */ struct btree_iter { struct btree_trans *trans; - struct btree_path *path; - struct btree_path *update_path; - struct btree_path *key_cache_path; + btree_path_idx_t path; + btree_path_idx_t update_path; + btree_path_idx_t key_cache_path; enum btree_id btree_id:8; u8 min_depth; @@ -351,8 +350,8 @@ struct btree_insert_entry { * to the size of the key being overwritten in the btree: */ u8 old_btree_u64s; + btree_path_idx_t path; struct bkey_i *k; - struct btree_path *path; /* key being overwritten: */ struct 
bkey old_k; const struct bch_val *old_v; @@ -373,25 +372,30 @@ struct btree_trans_commit_hook { #define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 +struct btree_trans_paths { + unsigned long nr_paths; + struct btree_path paths[]; +}; + struct btree_trans { struct bch_fs *c; - const char *fn; - struct closure ref; - struct list_head list; - u64 last_begin_time; - u8 lock_may_not_fail; - u8 lock_must_abort; - struct btree_bkey_cached_common *locking; - struct six_lock_waiter locking_wait; + unsigned long *paths_allocated; + struct btree_path *paths; + u8 *sorted; + struct btree_insert_entry *updates; - int srcu_idx; + void *mem; + unsigned mem_top; + unsigned mem_bytes; + btree_path_idx_t nr_sorted; + btree_path_idx_t nr_paths; + btree_path_idx_t nr_paths_max; u8 fn_idx; - u8 nr_sorted; u8 nr_updates; - u8 nr_wb_updates; - u8 wb_updates_size; + u8 lock_must_abort; + bool lock_may_not_fail:1; bool srcu_held:1; bool used_mempool:1; bool in_traverse_all:1; @@ -403,41 +407,56 @@ struct btree_trans { bool write_locked:1; enum bch_errcode restarted:16; u32 restart_count; + + u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; unsigned long srcu_lock_time; - /* - * For when bch2_trans_update notices we'll be splitting a compressed - * extent: - */ - unsigned extra_journal_res; - unsigned nr_max_paths; - - unsigned long paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)]; - - unsigned mem_top; - unsigned mem_max; - unsigned mem_bytes; - void *mem; - - u8 sorted[BTREE_ITER_MAX + 8]; - struct btree_path paths[BTREE_ITER_MAX]; - struct btree_insert_entry updates[BTREE_ITER_MAX]; - struct btree_write_buffered_key *wb_updates; + const char *fn; + struct btree_bkey_cached_common *locking; + struct six_lock_waiter locking_wait; + int srcu_idx; /* update path: */ + u16 journal_entries_u64s; + u16 journal_entries_size; + struct jset_entry *journal_entries; + struct btree_trans_commit_hook *hooks; - darray_u64 extra_journal_entries; struct journal_entry_pin 
*journal_pin; struct journal_res journal_res; u64 *journal_seq; struct disk_reservation *disk_res; unsigned journal_u64s; + unsigned extra_disk_res; /* XXX kill */ struct replicas_delta_list *fs_usage_deltas; + + /* Entries before this are zeroed out on every bch2_trans_get() call */ + + struct list_head list; + struct closure ref; + + unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)]; + struct btree_trans_paths trans_paths; + struct btree_path _paths[BTREE_ITER_MAX]; + u8 _sorted[BTREE_ITER_MAX + 8]; + struct btree_insert_entry _updates[BTREE_ITER_MAX]; }; +static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter) +{ + return trans->paths + iter->path; +} + +static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter) +{ + return iter->key_cache_path + ? trans->paths + iter->key_cache_path + : NULL; +} + #define BCH_BTREE_WRITE_TYPES() \ x(initial, 0) \ x(init_next_bset, 1) \ diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 254794c..74ec99d 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -24,7 +24,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, } static int __must_check -bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, +bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t, struct bkey_i *, enum btree_update_flags, unsigned long ip); @@ -198,7 +198,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans, if (((front_split && back_split) || ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && (compressed_sectors = bch2_bkey_sectors_compressed(old))) - trans->extra_journal_res += compressed_sectors; + trans->extra_disk_res += compressed_sectors; if (front_split) { update = bch2_bkey_make_mut_noupdate(trans, old); @@ -338,21 +338,22 @@ err: } static noinline int flush_new_cached_update(struct btree_trans 
*trans, - struct btree_path *path, struct btree_insert_entry *i, enum btree_update_flags flags, unsigned long ip) { - struct btree_path *btree_path; struct bkey k; int ret; - btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, - BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, btree_path, 0); + btree_path_idx_t path_idx = + bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, path_idx, 0); if (ret) goto out; + struct btree_path *btree_path = trans->paths + path_idx; + /* * The old key in the insert entry might actually refer to an existing * key in the btree that has been deleted from cache and not yet @@ -367,14 +368,14 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, i->flags |= BTREE_TRIGGER_NORUN; btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); + ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip); out: - bch2_path_put(trans, btree_path, true); + bch2_path_put(trans, path_idx, true); return ret; } static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, +bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, struct bkey_i *k, enum btree_update_flags flags, unsigned long ip) { @@ -382,8 +383,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, struct btree_insert_entry *i, n; int cmp; + struct btree_path *path = trans->paths + path_idx; EBUG_ON(!path->should_be_locked); - EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(trans->nr_updates >= trans->nr_paths); EBUG_ON(!bpos_eq(k->k.p, path->pos)); n = (struct btree_insert_entry) { @@ -392,7 +394,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, .btree_id = path->btree_id, .level = path->level, .cached = path->cached, - .path = path, + .path = path_idx, .k = k, 
.ip_allocated = ip, }; @@ -440,7 +442,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, } } - __btree_path_get(i->path, true); + __btree_path_get(trans->paths + i->path, true); /* * If a key is present in the key cache, it must also exist in the @@ -450,7 +452,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, * work: */ if (path->cached && bkey_deleted(&i->old_k)) - return flush_new_cached_update(trans, path, i, flags, ip); + return flush_new_cached_update(trans, i, flags, ip); return 0; } @@ -459,9 +461,11 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, struct btree_iter *iter, struct btree_path *path) { - if (!iter->key_cache_path || - !iter->key_cache_path->should_be_locked || - !bpos_eq(iter->key_cache_path->pos, iter->pos)) { + struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter); + + if (!key_cache_path || + !key_cache_path->should_be_locked || + !bpos_eq(key_cache_path->pos, iter->pos)) { struct bkey_cached *ck; int ret; @@ -476,19 +480,18 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, iter->flags & BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, - BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED); if (unlikely(ret)) return ret; - ck = (void *) iter->key_cache_path->l[0].b; + ck = (void *) trans->paths[iter->key_cache_path].l[0].b; if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); } - btree_path_set_should_be_locked(iter->key_cache_path); + btree_path_set_should_be_locked(trans->paths + iter->key_cache_path); } return 0; @@ -497,7 +500,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct 
btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { - struct btree_path *path = iter->update_path ?: iter->path; + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; if (iter->flags & BTREE_ITER_IS_EXTENTS) @@ -517,6 +520,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter /* * Ensure that updates to cached btrees go to the key cache: */ + struct btree_path *path = trans->paths + path_idx; if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && !path->cached && !path->level && @@ -525,15 +529,15 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter if (ret) return ret; - path = iter->key_cache_path; + path_idx = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); + return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_); } -static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) +int bch2_btree_insert_clone_trans(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) { struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k)); int ret = PTR_ERR_OR_ZERO(n); @@ -544,60 +548,32 @@ static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans, return bch2_btree_insert_trans(trans, btree, n, 0); } -int __must_check bch2_trans_update_buffered(struct btree_trans *trans, - enum btree_id btree, - struct bkey_i *k) +struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) { - struct btree_write_buffered_key *i; - int ret; - - EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); - EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - - if (unlikely(trans->journal_replay_not_finished)) - return bch2_btree_insert_clone_trans(trans, btree, k); - - trans_for_each_wb_update(trans, i) { - if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { - bkey_copy(&i->k, k); - return 0; - } 
- } - - if (!trans->wb_updates || - trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_write_buffered_key *u; - - if (trans->nr_wb_updates == trans->wb_updates_size) { - struct btree_transaction_stats *s = btree_trans_stats(trans); - - BUG_ON(trans->wb_updates_size > U8_MAX / 2); - trans->wb_updates_size = max(1, trans->wb_updates_size * 2); - if (s) - s->wb_updates_size = trans->wb_updates_size; - } + unsigned new_top = trans->journal_entries_u64s + u64s; + unsigned old_size = trans->journal_entries_size; - u = bch2_trans_kmalloc_nomemzero(trans, - trans->wb_updates_size * - sizeof(struct btree_write_buffered_key)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; + if (new_top > trans->journal_entries_size) { + trans->journal_entries_size = roundup_pow_of_two(new_top); - if (trans->nr_wb_updates) - memcpy(u, trans->wb_updates, trans->nr_wb_updates * - sizeof(struct btree_write_buffered_key)); - trans->wb_updates = u; + struct btree_transaction_stats *s = btree_trans_stats(trans); + if (s) + s->journal_entries_size = trans->journal_entries_size; } - trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { - .btree = btree, - }; + struct jset_entry *n = + bch2_trans_kmalloc_nomemzero(trans, + trans->journal_entries_size * sizeof(u64)); + if (IS_ERR(n)) + return ERR_CAST(n); - bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); - trans->nr_wb_updates++; + if (trans->journal_entries) + memcpy(n, trans->journal_entries, old_size * sizeof(u64)); + trans->journal_entries = n; - return 0; + struct jset_entry *e = btree_trans_journal_entries_top(trans); + trans->journal_entries_u64s = new_top; + return e; } int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, @@ -822,41 +798,17 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -__printf(2, 0) -static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list 
args) +static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) { - struct printbuf buf = PRINTBUF; - struct jset_entry_log *l; - unsigned u64s; - int ret; - - prt_vprintf(&buf, fmt, args); - ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); + int ret = PTR_ERR_OR_ZERO(e); if (ret) - goto err; - - u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - - ret = darray_make_room(entries, jset_u64s(u64s)); - if (ret) - goto err; + return ret; - l = (void *) &darray_top(*entries); - l->entry.u64s = cpu_to_le16(u64s); - l->entry.btree_id = 0; - l->entry.level = 1; - l->entry.type = BCH_JSET_ENTRY_log; - l->entry.pad[0] = 0; - l->entry.pad[1] = 0; - l->entry.pad[2] = 0; - memcpy(l->d, buf.buf, buf.pos); - while (buf.pos & 7) - l->d[buf.pos++] = '\0'; - - entries->nr += jset_u64s(u64s); -err: - printbuf_exit(&buf); - return ret; + struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); + journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); + memcpy(l->d, buf->buf, buf->pos); + return 0; } __printf(3, 0) @@ -864,16 +816,32 @@ static int __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, va_list args) { - int ret; + struct printbuf buf = PRINTBUF; + prt_vprintf(&buf, fmt, args); + + unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); + + int ret = buf.allocation_failure ? 
-BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + goto err; if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { - ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); + if (ret) + goto err; + + struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); + journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); + memcpy(l->d, buf.buf, buf.pos); + c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|commit_flags, - __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args)); + __bch2_trans_log_msg(trans, &buf, u64s)); } - +err: + printbuf_exit(&buf); return ret; } diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index fa19f32..ebad738 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -101,10 +101,44 @@ int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); -int __must_check bch2_trans_update_buffered(struct btree_trans *, - enum btree_id, struct bkey_i *); + +struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); + +static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) +{ + return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); +} + +static inline struct jset_entry * +bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +{ + if (!trans->journal_entries || + trans->journal_entries_u64s + u64s > trans->journal_entries_size) + return __bch2_trans_jset_entry_alloc(trans, u64s); + + struct jset_entry *e = btree_trans_journal_entries_top(trans); + 
trans->journal_entries_u64s += u64s; + return e; +} + +int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); + +static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + if (unlikely(trans->journal_replay_not_finished)) + return bch2_btree_insert_clone_trans(trans, btree, k); + + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); + int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; + + journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s); + bkey_copy(e->start, k); + return 0; +} void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); @@ -157,11 +191,6 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_wb_update(_trans, _i) \ - for ((_i) = (_trans)->wb_updates; \ - (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \ - (_i)++) - static inline void bch2_trans_reset_updates(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -169,12 +198,10 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans_for_each_update(trans, i) bch2_path_put(trans, i->path, true); - trans->extra_journal_res = 0; trans->nr_updates = 0; - trans->nr_wb_updates = 0; - trans->wb_updates = NULL; + trans->journal_entries_u64s = 0; trans->hooks = NULL; - trans->extra_journal_entries.nr = 0; + trans->extra_disk_res = 0; if (trans->fs_usage_deltas) { trans->fs_usage_deltas->used = 0; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 970faec..f644578 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -25,24 +25,24 @@ #include static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - struct btree_path *, struct btree *, + btree_path_idx_t, struct btree *, struct keylist 
*, unsigned); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, - enum btree_id btree_id, - unsigned level, - struct bpos pos) +static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) { - struct btree_path *path; - - path = bch2_path_get(trans, btree_id, pos, level + 1, level, + btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level, BTREE_ITER_NOPRESERVE| BTREE_ITER_INTENT, _RET_IP_); - path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); + path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_); + + struct btree_path *path = trans->paths + path_idx; bch2_btree_path_downgrade(trans, path); __bch2_btree_path_unlock(trans, path); - return path; + return path_idx; } /* Debug code: */ @@ -189,7 +189,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; - unsigned level = b->c.level; + unsigned i, level = b->c.level; bch2_btree_node_lock_write_nofail(trans, path, &b->c); bch2_btree_node_hash_remove(&c->btree_cache, b); @@ -197,7 +197,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, six_unlock_write(&b->c.lock); mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->l[level].b == b) { btree_node_unlock(trans, path, level); path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); @@ -211,7 +211,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, struct bch_fs *c = as->c; struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; struct btree_path *path; - unsigned level = b->c.level; + unsigned i, level = b->c.level; BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); @@ 
-234,7 +234,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, six_unlock_intent(&b->c.lock); - trans_for_each_path(trans, path) + trans_for_each_path(trans, path, i) if (path->l[level].b == b) { btree_node_unlock(trans, path, level); path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); @@ -556,16 +556,13 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, struct btree_update *as) { struct bkey_i *k; - int ret; - ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s); + int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; - memcpy(&darray_top(trans->extra_journal_entries), - as->journal_entries, - as->journal_u64s * sizeof(u64)); - trans->extra_journal_entries.nr += as->journal_u64s; + memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64)); trans->journal_pin = &as->journal; @@ -651,10 +648,11 @@ static void btree_update_nodes_written(struct btree_update *as) "%s(): error %s", __func__, bch2_err_str(ret)); err: if (as->b) { - struct btree_path *path; b = as->b; - path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p); + btree_path_idx_t path_idx = get_unlocked_mut_path(trans, + as->btree_id, b->c.level, b->key.k.p); + struct btree_path *path = trans->paths + path_idx; /* * @b is the node we did the final insert into: * @@ -724,7 +722,7 @@ err: btree_node_write_if_need(c, b, SIX_LOCK_intent); btree_node_unlock(trans, path, b->c.level); - bch2_path_put(trans, path, true); + bch2_path_put(trans, path_idx, true); } bch2_journal_pin_drop(&c->journal, &as->journal); @@ -1442,10 +1440,12 @@ static void __btree_split_node(struct btree_update *as, */ static void btree_split_insert_keys(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, struct btree *b, struct keylist *keys) { + struct btree_path *path = trans->paths + path_idx; + if 
(!bch2_keylist_empty(keys) && bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { struct btree_node_iter node_iter; @@ -1459,18 +1459,18 @@ static void btree_split_insert_keys(struct btree_update *as, } static int btree_split(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, + btree_path_idx_t path, struct btree *b, struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; - struct btree *parent = btree_node_parent(path, b); + struct btree *parent = btree_node_parent(trans->paths + path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; - struct btree_path *path1 = NULL, *path2 = NULL; + btree_path_idx_t path1 = 0, path2 = 0; u64 start_time = local_clock(); int ret = 0; BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); + BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); bch2_btree_interior_update_will_free_node(as, b); @@ -1498,15 +1498,15 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_write(&n2->c.lock); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path1, n1); + mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path1, n1); - path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); + path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p); six_lock_increment(&n2->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path2, n2); + 
mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path2, n2); /* * Note that on recursive parent_keys == keys, so we @@ -1523,11 +1523,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n3); six_unlock_write(&n3->c.lock); - path2->locks_want++; - BUG_ON(btree_node_locked(path2, n3->c.level)); + trans->paths[path2].locks_want++; + BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level)); six_lock_increment(&n3->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path2, n3); + mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path2, n3); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; @@ -1548,10 +1548,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, bch2_btree_update_add_new_node(as, n1); six_unlock_write(&n1->c.lock); - path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p); six_lock_increment(&n1->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, path1, n1); + mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + path1, n1); if (parent) bch2_keylist_add(&as->parent_keys, &n1->key); @@ -1565,10 +1565,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) goto err; } else if (n3) { - bch2_btree_set_root(as, trans, path, n3); + bch2_btree_set_root(as, trans, trans->paths + path, n3); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, trans, path, 
n1); + bch2_btree_set_root(as, trans, trans->paths + path, n1); } if (n3) { @@ -1588,13 +1588,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, * node after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, trans->paths + path, b); if (n3) - bch2_trans_node_add(trans, n3); + bch2_trans_node_add(trans, trans->paths + path, n3); if (n2) - bch2_trans_node_add(trans, n2); - bch2_trans_node_add(trans, n1); + bch2_trans_node_add(trans, trans->paths + path2, n2); + bch2_trans_node_add(trans, trans->paths + path1, n1); if (n3) six_unlock_intent(&n3->c.lock); @@ -1603,11 +1603,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, six_unlock_intent(&n1->c.lock); out: if (path2) { - __bch2_btree_path_unlock(trans, path2); + __bch2_btree_path_unlock(trans, trans->paths + path2); bch2_path_put(trans, path2, true); } if (path1) { - __bch2_btree_path_unlock(trans, path1); + __bch2_btree_path_unlock(trans, trans->paths + path1); bch2_path_put(trans, path1, true); } @@ -1635,13 +1635,14 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct keylist *keys) { struct btree_path *linked; + unsigned i; __bch2_btree_insert_keys_interior(as, trans, path, b, path->l[b->c.level].iter, keys); btree_update_updated_node(as, b); - trans_for_each_path_with_node(trans, b, linked) + trans_for_each_path_with_node(trans, b, linked, i) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); bch2_trans_verify_paths(trans); @@ -1664,10 +1665,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * for leaf nodes -- inserts into interior nodes have to be atomic. 
*/ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, - struct btree_path *path, struct btree *b, + btree_path_idx_t path_idx, struct btree *b, struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; @@ -1720,19 +1722,22 @@ split: return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys, flags); } int bch2_btree_split_leaf(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned flags) { - struct btree *b = path_l(path)->b; + /* btree_split & merge may both cause paths array to be reallocated */ + + struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; int ret = 0; - as = bch2_btree_update_start(trans, path, path->level, + as = bch2_btree_update_start(trans, trans->paths + path, + trans->paths[path].level, true, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1745,20 +1750,21 @@ int bch2_btree_split_leaf(struct btree_trans *trans, bch2_btree_update_done(as, trans); - for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) + for (l = trans->paths[path].level + 1; + btree_node_intent_locked(&trans->paths[path], l) && !ret; + l++) ret = bch2_foreground_maybe_merge(trans, path, l, flags); return ret; } int __bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned level, unsigned flags, enum btree_node_sibling sib) { struct bch_fs *c = trans->c; - struct btree_path *sib_path = NULL, *new_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1766,13 +1772,15 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, struct btree *b, *m, *n, 
*prev, *next, *parent; struct bpos sib_pos; size_t sib_u64s; + enum btree_id btree = trans->paths[path].btree_id; + btree_path_idx_t sib_path = 0, new_path = 0; u64 start_time = local_clock(); int ret = 0; - BUG_ON(!path->should_be_locked); - BUG_ON(!btree_node_locked(path, level)); + BUG_ON(!trans->paths[path].should_be_locked); + BUG_ON(!btree_node_locked(&trans->paths[path], level)); - b = path->l[level].b; + b = trans->paths[path].l[level].b; if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { @@ -1784,18 +1792,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, ? bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_path = bch2_path_get(trans, path->btree_id, sib_pos, + sib_path = bch2_path_get(trans, btree, sib_pos, U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; - btree_path_set_should_be_locked(sib_path); + btree_path_set_should_be_locked(trans->paths + sib_path); - m = sib_path->l[level].b; + m = trans->paths[sib_path].l[level].b; - if (btree_node_parent(path, b) != - btree_node_parent(sib_path, m)) { + if (btree_node_parent(trans->paths + path, b) != + btree_node_parent(trans->paths + sib_path, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1848,8 +1856,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, level, false, + parent = btree_node_parent(trans->paths + path, b); + as = bch2_btree_update_start(trans, trans->paths + path, level, false, BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) @@ -1879,10 +1887,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_add_new_node(as, n); six_unlock_write(&n->c.lock); - new_path = 
get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); + new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - bch2_btree_path_level_init(trans, new_path, n); + mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + new_path, n); bkey_init(&delete.k); delete.k.p = prev->key.k.p; @@ -1900,10 +1908,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - bch2_btree_node_free_inmem(trans, path, b); - bch2_btree_node_free_inmem(trans, sib_path, m); + bch2_btree_node_free_inmem(trans, trans->paths + path, b); + bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); - bch2_trans_node_add(trans, n); + bch2_trans_node_add(trans, trans->paths + path, n); bch2_trans_verify_paths(trans); @@ -1931,16 +1939,16 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_path *new_path = NULL; struct btree *n, *parent; struct btree_update *as; + btree_path_idx_t new_path = 0; int ret; flags |= BCH_TRANS_COMMIT_no_enospc; - parent = btree_node_parent(iter->path, b); - as = bch2_btree_update_start(trans, iter->path, b->c.level, - false, flags); + struct btree_path *path = btree_iter_path(trans, iter); + parent = btree_node_parent(path, b); + as = bch2_btree_update_start(trans, path, b->c.level, false, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -1955,27 +1963,27 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); six_lock_increment(&n->c.lock, SIX_LOCK_intent); - mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); - 
bch2_btree_path_level_init(trans, new_path, n); + mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + new_path, n); trace_and_count(c, btree_node_rewrite, trans, b); if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, parent, - &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, + parent, &as->parent_keys, flags); if (ret) goto err; } else { - bch2_btree_set_root(as, trans, iter->path, n); + bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n); } bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); - bch2_btree_node_free_inmem(trans, iter->path, b); + bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); - bch2_trans_node_add(trans, n); + bch2_trans_node_add(trans, trans->paths + iter->path, n); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as, trans); @@ -2153,7 +2161,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BUG_ON(ret); } - parent = btree_node_parent(iter->path, b); + parent = btree_node_parent(btree_iter_path(trans, iter), b); if (parent) { bch2_trans_copy_iter(&iter2, iter); @@ -2161,10 +2169,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, iter2.flags & BTREE_ITER_INTENT, _THIS_IP_); - BUG_ON(iter2.path->level != b->c.level); - BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); + struct btree_path *path2 = btree_iter_path(trans, &iter2); + BUG_ON(path2->level != b->c.level); + BUG_ON(!bpos_eq(path2->pos, new_key->k.p)); - btree_path_set_level_up(trans, iter2.path); + btree_path_set_level_up(trans, path2); trans->paths_sorted = false; @@ -2175,23 +2184,23 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, } else { BUG_ON(btree_node_root(c, b) != b); - ret = darray_make_room(&trans->extra_journal_entries, + struct jset_entry *e = 
bch2_trans_jset_entry_alloc(trans, jset_u64s(new_key->k.u64s)); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; - journal_entry_set((void *) &darray_top(trans->extra_journal_entries), + journal_entry_set(e, BCH_JSET_ENTRY_btree_root, b->c.btree_id, b->c.level, new_key, new_key->k.u64s); - trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); } ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); if (ret) goto err; - bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); + bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -2206,7 +2215,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bkey_copy(&b->key, new_key); } - bch2_btree_node_unlock_write(trans, iter->path, b); + bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b); out: bch2_trans_iter_exit(trans, &iter2); return ret; @@ -2225,7 +2234,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite { struct bch_fs *c = trans->c; struct btree *new_hash = NULL; - struct btree_path *path = iter->path; + struct btree_path *path = btree_iter_path(trans, iter); struct closure cl; int ret = 0; @@ -2283,7 +2292,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, goto out; /* has node been freed? 
*/ - if (iter.path->l[b->c.level].b != b) { + if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); goto out; diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 031076e..bb05bcd 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -121,16 +121,17 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, struct bkey_format); -int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); +int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); -int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, +int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, unsigned, unsigned, enum btree_node_sibling); static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path_idx, unsigned level, unsigned flags, enum btree_node_sibling sib) { + struct btree_path *path = trans->paths + path_idx; struct btree *b; EBUG_ON(!btree_node_locked(path, level)); @@ -139,11 +140,11 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; - return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); + return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib); } static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_path *path, + btree_path_idx_t path, unsigned level, unsigned flags) { diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index 6ab2657..1595dd2 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -7,37 +7,108 @@ #include "btree_write_buffer.h" #include "error.h" #include "journal.h" +#include "journal_io.h" #include "journal_reclaim.h" 
-#include +#include static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); -static int btree_write_buffered_key_cmp(const void *_l, const void *_r) +static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); + +static inline bool __wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; + return (cmp_int(l->hi, r->hi) ?: + cmp_int(l->mi, r->mi) ?: + cmp_int(l->lo, r->lo)) >= 0; +} - return cmp_int(l->btree, r->btree) ?: - bpos_cmp(l->k.k.p, r->k.k.p) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->journal_offset, r->journal_offset); +static inline bool wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) +{ +#ifdef CONFIG_X86_64 + int cmp; + + asm("mov (%[l]), %%rax;" + "sub (%[r]), %%rax;" + "mov 8(%[l]), %%rax;" + "sbb 8(%[r]), %%rax;" + "mov 16(%[l]), %%rax;" + "sbb 16(%[r]), %%rax;" + : "=@ccae" (cmp) + : [l] "r" (l), [r] "r" (r) + : "rax", "cc"); + + EBUG_ON(cmp != __wb_key_cmp(l, r)); + return cmp; +#else + return __wb_key_cmp(l, r); +#endif } -static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) +/* Compare excluding idx, the low 24 bits: */ +static inline bool wb_key_eq(const void *_l, const void *_r) { - const struct btree_write_buffered_key *l = _l; - const struct btree_write_buffered_key *r = _r; + const struct wb_key_ref *l = _l; + const struct wb_key_ref *r = _r; + + return !((l->hi ^ r->hi)| + (l->mi ^ r->mi)| + ((l->lo >> 24) ^ (r->lo >> 24))); +} + +static noinline void wb_sort(struct wb_key_ref *base, size_t num) +{ + size_t n = num, a = num / 2; + + if (!a) /* num < 2 || size == 0 */ + return; + + for (;;) { + size_t b, c, d; - return cmp_int(l->journal_seq, r->journal_seq); + if (a) /* Building heap: sift down --a */ + --a; + else if (--n) /* Sorting: Extract root to --n */ + swap(base[0], base[n]); + else /* Sort 
complete */ + break; + + /* + * Sift element at "a" down into heap. This is the + * "bottom-up" variant, which significantly reduces + * calls to cmp_func(): we find the sift-down path all + * the way to the leaves (one compare per level), then + * backtrack to find where to insert the target element. + * + * Because elements tend to sift down close to the leaves, + * this uses fewer compares than doing two per level + * on the way down. (A bit more than half as many on + * average, 3/4 worst-case.) + */ + for (b = a; c = 2*b + 1, (d = c + 1) < n;) + b = wb_key_cmp(base + c, base + d) ? c : d; + if (d == n) /* Special case last leaf with no sibling */ + b = c; + + /* Now backtrack from "b" to the correct location for "a" */ + while (b != a && wb_key_cmp(base + a, base + b)) + b = (b - 1) / 2; + c = b; /* Where "a" belongs */ + while (b != a) { /* Shift it into place */ + b = (b - 1) / 2; + swap(base[b], base[c]); + } + } } static noinline int wb_flush_one_slowpath(struct btree_trans *trans, struct btree_iter *iter, struct btree_write_buffered_key *wb) { - bch2_btree_node_unlock_write(trans, iter->path, iter->path->l[0].b); + struct btree_path *path = btree_iter_path(trans, iter); + + bch2_btree_node_unlock_write(trans, path, path->l[0].b); trans->journal_res.seq = wb->journal_seq; @@ -59,6 +130,9 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite int ret; EBUG_ON(!wb->journal_seq); + EBUG_ON(!c->btree_write_buffer.flushing.pin.seq); + EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); + ret = bch2_btree_iter_traverse(iter); if (ret) return ret; @@ -67,10 +141,10 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite * We can't clone a path that has write locks: unshare it now, before * set_pos and traverse(): */ - if (iter->path->ref > 1) + if (btree_iter_path(trans, iter)->ref > 1) iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_); - path = iter->path; + path = 
btree_iter_path(trans, iter); if (!*write_locked) { ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); @@ -91,26 +165,6 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite return 0; } -static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) -{ - union btree_write_buffer_state old, new; - u64 v = READ_ONCE(wb->state.v); - - do { - old.v = new.v = v; - - new.nr = 0; - new.idx++; - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); - - while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) - cpu_relax(); - - smp_mb(); - - return old; -} - /* * Update a btree with a write buffered key using the journal seq of the * original write buffer insert. @@ -140,28 +194,79 @@ btree_write_buffered_insert(struct btree_trans *trans, return ret; } -int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) +static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) +{ + struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); + struct journal *j = &c->journal; + + if (!wb->inc.keys.nr) + return; + + bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); + darray_resize(&wb->sorted, wb->flushing.keys.size); + + if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { + swap(wb->flushing.keys, wb->inc.keys); + goto out; + } + + size_t nr = min(darray_room(wb->flushing.keys), + wb->sorted.size - wb->flushing.keys.nr); + nr = min(nr, wb->inc.keys.nr); + + memcpy(&darray_top(wb->flushing.keys), + wb->inc.keys.data, + sizeof(wb->inc.keys.data[0]) * nr); + + memmove(wb->inc.keys.data, + wb->inc.keys.data + nr, + sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); + + wb->flushing.keys.nr += nr; + wb->inc.keys.nr -= nr; +out: + if (!wb->inc.keys.nr) + 
bch2_journal_pin_drop(j, &wb->inc.pin); + else + bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, + bch2_btree_write_buffer_journal_flush); + + if (j->watermark) { + spin_lock(&j->lock); + bch2_journal_set_watermark(j); + spin_unlock(&j->lock); + } + + BUG_ON(wb->sorted.size < wb->flushing.keys.nr); +} + +static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct journal_entry_pin pin; - struct btree_write_buffered_key *i, *keys; + struct wb_key_ref *i; struct btree_iter iter = { NULL }; - size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; + size_t skipped = 0, fast = 0, slowpath = 0; bool write_locked = false; - union btree_write_buffer_state s; int ret = 0; - memset(&pin, 0, sizeof(pin)); + bch2_trans_unlock(trans); + bch2_trans_begin(trans); - bch2_journal_pin_copy(j, &pin, &wb->journal_pin, - bch2_btree_write_buffer_journal_flush); - bch2_journal_pin_drop(j, &wb->journal_pin); + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); + mutex_unlock(&wb->inc.lock); - s = btree_write_buffer_switch(wb); - keys = wb->keys[s.idx]; - nr = s.nr; + for (size_t i = 0; i < wb->flushing.keys.nr; i++) { + wb->sorted.data[i].idx = i; + wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; + memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos)); + } + wb->sorted.nr = wb->flushing.keys.nr; /* * We first sort so that we can detect and skip redundant updates, and @@ -177,33 +282,44 @@ int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) * If that happens, simply skip the key so we can optimistically insert * as many keys as possible in the fast path. 
*/ - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_key_cmp, NULL); + wb_sort(wb->sorted.data, wb->sorted.nr); + + darray_for_each(wb->sorted, i) { + struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) + prefetch(&wb->flushing.keys.data[n->idx]); + + BUG_ON(!k->journal_seq); + + if (i + 1 < &darray_top(wb->sorted) && + wb_key_eq(i, i + 1)) { + struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; - for (i = keys; i < keys + nr; i++) { - if (i + 1 < keys + nr && - i[0].btree == i[1].btree && - bpos_eq(i[0].k.k.p, i[1].k.k.p)) { skipped++; - i->journal_seq = 0; + n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); + k->journal_seq = 0; continue; } - if (write_locked && - (iter.path->btree_id != i->btree || - bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); - write_locked = false; + if (write_locked) { + struct btree_path *path = btree_iter_path(trans, &iter); + + if (path->btree_id != i->btree || + bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + write_locked = false; + } } - if (!iter.path || iter.path->btree_id != i->btree) { + if (!iter.path || iter.btree_id != k->btree) { bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, + bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); } - bch2_btree_iter_set_pos(&iter, i->k.k.p); - iter.path->preserve = false; + bch2_btree_iter_set_pos(&iter, k->k.k.p); + btree_iter_path(trans, &iter)->preserve = false; do { if (race_fault()) { @@ -211,13 +327,13 @@ int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) break; } - ret = wb_flush_one(trans, &iter, i, &write_locked, &fast); + ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); if (!write_locked) 
bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); if (!ret) { - i->journal_seq = 0; + k->journal_seq = 0; } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { slowpath++; ret = 0; @@ -225,8 +341,10 @@ int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) break; } - if (write_locked) - bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + if (write_locked) { + struct btree_path *path = btree_iter_path(trans, &iter); + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + } bch2_trans_iter_exit(trans, &iter); if (ret) @@ -239,18 +357,17 @@ int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) * The fastpath zapped the seq of keys that were successfully flushed so * we can skip those here. */ - trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, nr); + trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); - sort(keys, nr, sizeof(keys[0]), - btree_write_buffered_journal_cmp, - NULL); - - for (i = keys; i < keys + nr; i++) { + struct btree_write_buffered_key *i; + darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) continue; - bch2_journal_pin_update(j, i->journal_seq, &pin, - bch2_btree_write_buffer_journal_flush); + bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, + bch2_btree_write_buffer_journal_flush); + + bch2_trans_begin(trans); ret = commit_do(trans, NULL, NULL, BCH_WATERMARK_reclaim| @@ -265,37 +382,78 @@ int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) } err: bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); - trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); - bch2_journal_pin_drop(j, &pin); + trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); + bch2_journal_pin_drop(j, &wb->flushing.pin); + wb->flushing.keys.nr = 0; return ret; } -int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +static int 
fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +{ + struct journal *j = &c->journal; + struct journal_buf *buf; + int ret = 0; + + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + ret = bch2_journal_keys_to_write_buffer(c, buf); + mutex_unlock(&j->buf_lock); + } + + return ret; +} + +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq) { struct bch_fs *c = trans->c; + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret = 0, fetch_from_journal_err; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) - return -BCH_ERR_erofs_no_writes; + do { + bch2_trans_unlock(trans); - trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); + fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + + /* + * On memory allocation failure, bch2_btree_write_buffer_flush_locked() + * is not guaranteed to empty wb->inc: + */ + mutex_lock(&wb->flushing.lock); + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&wb->flushing.lock); + } while (!ret && + (fetch_from_journal_err || + (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || + (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); - bch2_trans_unlock(trans); - mutex_lock(&c->btree_write_buffer.flush_lock); - int ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&c->btree_write_buffer.flush_lock); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); return ret; } +static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq)); +} + +int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + + trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); + + return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal)); +} + int 
bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; int ret = 0; - if (mutex_trylock(&wb->flush_lock)) { + if (mutex_trylock(&wb->flushing.lock)) { ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flush_lock); + mutex_unlock(&wb->flushing.lock); } return ret; @@ -313,90 +471,179 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) return ret; } -static int bch2_btree_write_buffer_journal_flush(struct journal *j, - struct journal_entry_pin *_pin, u64 seq) +static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret; - mutex_lock(&wb->flush_lock); - int ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); - mutex_unlock(&wb->flush_lock); + mutex_lock(&wb->flushing.lock); + do { + ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); + } while (!ret && bch2_btree_write_buffer_should_flush(c)); + mutex_unlock(&wb->flushing.lock); - return ret; + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); } -static inline u64 btree_write_buffer_ref(int idx) +int __bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) { - return ((union btree_write_buffer_state) { - .ref0 = idx == 0, - .ref1 = idx == 1, - }).v; + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret; +retry: + ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); + if (!ret && dst->wb == &wb->flushing) + ret = darray_resize(&wb->sorted, wb->flushing.keys.size); + + if (unlikely(ret)) { + if (dst->wb == &c->btree_write_buffer.flushing) { + mutex_unlock(&dst->wb->lock); + dst->wb = &c->btree_write_buffer.inc; + 
bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, + bch2_btree_write_buffer_journal_flush); + goto retry; + } + + return ret; + } + + dst->room = darray_room(dst->wb->keys); + if (dst->wb == &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + BUG_ON(!dst->room); + BUG_ON(!dst->seq); + + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; } -int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) { - struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_write_buffered_key *i; - union btree_write_buffer_state old, new; - int ret = 0; - u64 v; - trans_for_each_wb_update(trans, i) { - EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + if (mutex_trylock(&wb->flushing.lock)) { + mutex_lock(&wb->inc.lock); + move_keys_from_inc_to_flushing(wb); - i->journal_seq = trans->journal_res.seq; - i->journal_offset = trans->journal_res.offset; - } - - preempt_disable(); - v = READ_ONCE(wb->state.v); - do { - old.v = new.v = v; + /* + * Attempt to skip wb->inc, and add keys directly to + * wb->flushing, saving us a copy later: + */ - new.v += btree_write_buffer_ref(new.idx); - new.nr += trans->nr_wb_updates; - if (new.nr > wb->size) { - ret = -BCH_ERR_btree_insert_need_flush_buffer; - goto out; + if (!wb->inc.keys.nr) { + dst->wb = &wb->flushing; + } else { + mutex_unlock(&wb->flushing.lock); + dst->wb = &wb->inc; } - } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + } else { + mutex_lock(&wb->inc.lock); + dst->wb = &wb->inc; + } - memcpy(wb->keys[new.idx] + old.nr, - trans->wb_updates, - sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); + dst->room = darray_room(dst->wb->keys); + if (dst->wb 
== &wb->flushing) + dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); + dst->seq = seq; - bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, + bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, bch2_btree_write_buffer_journal_flush); +} + +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + if (!dst->wb->keys.nr) + bch2_journal_pin_drop(&c->journal, &dst->wb->pin); + + if (bch2_btree_write_buffer_should_flush(c) && + __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && + !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + + if (dst->wb == &wb->flushing) + mutex_unlock(&wb->flushing.lock); + mutex_unlock(&wb->inc.lock); +} + +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + struct jset_entry *entry; + struct bkey_i *k; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } + + buf->need_flush_to_write_buffer = false; out: - preempt_enable(); + bch2_journal_keys_to_write_buffer_end(c, &dst); return ret; } +static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) +{ + if (wb->keys.size >= new_size) + return 0; + + if (!mutex_trylock(&wb->lock)) + return -EINTR; + + int ret = darray_resize(&wb->keys, new_size); + mutex_unlock(&wb->lock); + return ret; +} + +int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) +{ + struct btree_write_buffer *wb = 
&c->btree_write_buffer; + + return wb_keys_resize(&wb->flushing, new_size) ?: + wb_keys_resize(&wb->inc, new_size); +} + void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); + BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && + !bch2_journal_error(&c->journal)); - kvfree(wb->keys[1]); - kvfree(wb->keys[0]); + darray_exit(&wb->sorted); + darray_exit(&wb->flushing.keys); + darray_exit(&wb->inc.keys); } int bch2_fs_btree_write_buffer_init(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - mutex_init(&wb->flush_lock); - wb->size = c->opts.btree_write_buffer_size; + mutex_init(&wb->inc.lock); + mutex_init(&wb->flushing.lock); + INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); - wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); - wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); - if (!wb->keys[0] || !wb->keys[1]) - return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; + /* Will be resized by journal as needed: */ + unsigned initial_size = 1 << 16; - return 0; + return darray_make_room(&wb->inc.keys, initial_size) ?: + darray_make_room(&wb->flushing.keys, initial_size) ?: + darray_make_room(&wb->sorted, initial_size); } diff --git a/libbcachefs/btree_write_buffer.h b/libbcachefs/btree_write_buffer.h index dec2c9a..1f645f5 100644 --- a/libbcachefs/btree_write_buffer.h +++ b/libbcachefs/btree_write_buffer.h @@ -2,13 +2,59 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H #define _BCACHEFS_BTREE_WRITE_BUFFER_H -int bch2_btree_write_buffer_flush_locked(struct btree_trans *); -int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); +#include "bkey.h" + +static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; +} + 
+static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; +} + +struct btree_trans; int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); -int bch2_btree_insert_keys_write_buffer(struct btree_trans *); +struct journal_keys_to_wb { + struct btree_write_buffer_keys *wb; + size_t room; + u64 seq; +}; + +int __bch2_journal_key_to_wb(struct bch_fs *, + struct journal_keys_to_wb *, + enum btree_id, struct bkey_i *); + +static inline int bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) +{ + EBUG_ON(!dst->seq); + + if (unlikely(!dst->room)) + return __bch2_journal_key_to_wb(c, dst, btree, k); + + struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); + wb_k->journal_seq = dst->seq; + wb_k->btree = btree; + bkey_copy(&wb_k->k, k); + dst->wb->keys.nr++; + dst->room--; + return 0; +} + +void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); +void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); +int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); diff --git a/libbcachefs/btree_write_buffer_types.h b/libbcachefs/btree_write_buffer_types.h index 99993ba..9b9433d 100644 --- a/libbcachefs/btree_write_buffer_types.h +++ b/libbcachefs/btree_write_buffer_types.h @@ -2,43 +2,56 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H +#include "darray.h" #include "journal_types.h" #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 #define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) -struct 
btree_write_buffered_key { - u64 journal_seq; - unsigned journal_offset; - enum btree_id btree; - __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); -}; - -union btree_write_buffer_state { +struct wb_key_ref { +union { struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned idx:24; + u8 pos[sizeof(struct bpos)]; + enum btree_id btree:8; +#else + enum btree_id btree:8; + u8 pos[sizeof(struct bpos)]; + unsigned idx:24; +#endif + } __packed; struct { - u64 nr:23; - u64 idx:1; - u64 ref0:20; - u64 ref1:20; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + u64 lo; + u64 mi; + u64 hi; +#else + u64 hi; + u64 mi; + u64 lo; +#endif }; }; +}; -struct btree_write_buffer { - struct mutex flush_lock; - struct journal_entry_pin journal_pin; +struct btree_write_buffered_key { + enum btree_id btree:8; + u64 journal_seq:56; + __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); +}; - union btree_write_buffer_state state; - size_t size; +struct btree_write_buffer_keys { + DARRAY(struct btree_write_buffered_key) keys; + struct journal_entry_pin pin; + struct mutex lock; +}; - struct btree_write_buffered_key *keys[2]; +struct btree_write_buffer { + DARRAY(struct wb_key_ref) sorted; + struct btree_write_buffer_keys inc; + struct btree_write_buffer_keys flushing; + struct work_struct flush_work; }; #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index a042e07..08922f7 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -7,6 +7,7 @@ #include "chardev.h" #include "journal.h" #include "move.h" +#include "recovery.h" #include "replicas.h" #include "super.h" #include "super-io.h" @@ -19,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +34,7 @@ static int copy_to_user_errcode(void __user *to, const void *from, unsigned long struct thread_with_file { struct task_struct *task; int ret; + bool done; }; static void 
thread_with_file_exit(struct thread_with_file *thr) @@ -42,6 +45,7 @@ static void thread_with_file_exit(struct thread_with_file *thr) } } +__printf(4, 0) static int run_thread_with_file(struct thread_with_file *thr, const struct file_operations *fops, int (*fn)(void *), const char *fmt, ...) @@ -227,6 +231,13 @@ static int bch2_fsck_thread_release(struct inode *inode, struct file *file) return 0; } +static bool fsck_thread_ready(struct fsck_thread *thr) +{ + return thr->output.buf.pos || + thr->output2.nr || + thr->thr.done; +} + static ssize_t bch2_fsck_thread_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { @@ -234,11 +245,18 @@ static ssize_t bch2_fsck_thread_read(struct file *file, char __user *buf, size_t copied = 0, b; int ret = 0; + if ((file->f_flags & O_NONBLOCK) && + !fsck_thread_ready(thr)) + return -EAGAIN; + ret = wait_event_interruptible(thr->output.wait, - thr->output.buf.pos || thr->output2.nr); + fsck_thread_ready(thr)); if (ret) return ret; + if (thr->thr.done) + return 0; + while (len) { ret = darray_make_room(&thr->output2, thr->output.buf.pos); if (ret) @@ -279,9 +297,21 @@ static ssize_t bch2_fsck_thread_read(struct file *file, char __user *buf, return copied ?: ret; } +static __poll_t bch2_fsck_thread_poll(struct file *file, struct poll_table_struct *wait) +{ + struct fsck_thread *thr = container_of(file->private_data, struct fsck_thread, thr); + + poll_wait(file, &thr->output.wait, wait); + + return fsck_thread_ready(thr) + ? 
EPOLLIN|EPOLLHUP + : 0; +} + static const struct file_operations fsck_thread_ops = { .release = bch2_fsck_thread_release, .read = bch2_fsck_thread_read, + .poll = bch2_fsck_thread_poll, .llseek = no_llseek, }; @@ -293,6 +323,9 @@ static int bch2_fsck_offline_thread_fn(void *arg) thr->thr.ret = PTR_ERR_OR_ZERO(c); if (!thr->thr.ret) bch2_fs_stop(c); + + thr->thr.done = true; + wake_up(&thr->output.wait); return 0; } @@ -309,6 +342,9 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a if (arg.flags) return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) || !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) || !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) { @@ -316,6 +352,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a goto err; } + thr->opts = bch2_opts_empty(); thr->nr_devs = arg.nr_devs; thr->output.buf = PRINTBUF; thr->output.buf.atomic++; @@ -899,13 +936,28 @@ static int bch2_fsck_online_thread_fn(void *arg) { struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); struct bch_fs *c = thr->c; -#if 0 - struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - thr->thr.ret = PTR_ERR_OR_ZERO(c); - if (!thr->thr.ret) - bch2_fs_stop(c); -#endif + c->output_filter = current; + c->output = &thr->output; + + /* + * XXX: can we figure out a way to do this without mucking with c->opts? 
+ */ + if (opt_defined(thr->opts, fix_errors)) + c->opts.fix_errors = thr->opts.fix_errors; + c->opts.fsck = true; + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + bch2_run_online_recovery_passes(c); + + c->output = NULL; + c->output_filter = NULL; + + thr->thr.done = true; + wake_up(&thr->output.wait); + + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); return 0; } @@ -918,24 +970,54 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, if (arg.flags) return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bch2_ro_ref_tryget(c)) + return -EROFS; + + if (down_trylock(&c->online_fsck_mutex)) { + bch2_ro_ref_put(c); + return -EAGAIN; + } + thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) - return -ENOMEM; + if (!thr) { + ret = -ENOMEM; + goto err; + } thr->c = c; + thr->opts = bch2_opts_empty(); thr->output.buf = PRINTBUF; thr->output.buf.atomic++; spin_lock_init(&thr->output.lock); init_waitqueue_head(&thr->output.wait); darray_init(&thr->output2); + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(c, &thr->opts, optstr); + kfree(optstr); + + if (ret) + goto err; + } + ret = run_thread_with_file(&thr->thr, &fsck_thread_ops, bch2_fsck_online_thread_fn, "bch-fsck"); - bch_err_fn(c, ret); - if (ret < 0) - bch2_fsck_thread_free(thr); +err: + if (ret < 0) { + bch_err_fn(c, ret); + if (thr) + bch2_fsck_thread_free(thr); + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + } return ret; } diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index c36bfc6..f418890 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -95,6 +95,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, unsigned long io_until, unsigned long cpu_timeout) { + bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait; wait.io_timer.expire = io_until; @@ -110,7 +111,7 @@ void bch2_kthread_io_clock_wait(struct io_clock 
*clock, while (1) { set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread && kthread_should_stop()) break; if (wait.expired) diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 22d4bb7..6652ef0 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -574,7 +574,8 @@ int bch2_data_update_init(struct btree_trans *trans, move_ctxt_wait_event(ctxt, (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, PTR_BUCKET_POS(c, &p.ptr), 0)) || - !atomic_read(&ctxt->read_sectors)); + (!atomic_read(&ctxt->read_sectors) && + !atomic_read(&ctxt->write_sectors))); if (!locked) bch2_bucket_nocow_lock(&c->nocow_locks, diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 57c5128..bc049ff 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -380,12 +380,13 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, return ret; trans = bch2_trans_get(i->c); - ret = for_each_btree_key2(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); - drop_locks_do(trans, flush_buf(i)); + bch2_trans_unlock(trans); + flush_buf(i); })); i->from = iter.pos; @@ -477,10 +478,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, trans = bch2_trans_get(i->c); - ret = for_each_btree_key2(trans, iter, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ({ - struct btree_path_level *l = &iter.path->l[0]; + ret = for_each_btree_key(trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + struct btree_path_level *l = + &btree_iter_path(trans, &iter)->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); @@ -490,7 +492,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, } 
bch2_bfloat_to_text(&i->buf, l->b, _k); - drop_locks_do(trans, flush_buf(i)); + bch2_trans_unlock(trans); + flush_buf(i); })); i->from = iter.pos; @@ -616,7 +619,6 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -632,7 +634,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -650,11 +654,11 @@ restart: prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&i->buf, task); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); - i->iter = trans->locking_wait.task->pid; + i->iter = task->pid; closure_put(&trans->ref); @@ -678,7 +682,6 @@ static const struct file_operations btree_transactions_ops = { .release = bch2_dump_release, .read = bch2_btree_transactions_read, }; -#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) @@ -835,7 +838,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, restart: seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid <= i->iter) + struct task_struct *task = READ_ONCE(trans->locking_wait.task); + + if (!task || task->pid <= i->iter) continue; closure_get(&trans->ref); @@ -850,7 +855,7 @@ restart: bch2_check_for_deadlock(trans, &i->buf); - i->iter = trans->locking_wait.task->pid; + 
i->iter = task->pid; closure_put(&trans->ref); @@ -897,10 +902,8 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, c->btree_debug, &cached_btree_nodes_ops); -#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, c->btree_debug, &btree_transactions_ops); -#endif debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 0542d99..580c1c9 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -486,20 +486,15 @@ retry: return ret; } -int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) +int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; - u32 snapshot; int ret; - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); - if (ret) - return ret; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, - SPOS(dir.inum, 0, snapshot), - POS(dir.inum, U64_MAX), 0, k, ret) + SPOS(dir, 0, snapshot), + POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { ret = -ENOTEMPTY; break; @@ -509,6 +504,14 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) return ret; } +int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) +{ + u32 snapshot; + + return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: + bch2_empty_dir_snapshot(trans, dir.inum, snapshot); +} + int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans *trans = bch2_trans_get(c); diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 8a55245..10dc3ad 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -65,6 +65,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *); +int bch2_empty_dir_snapshot(struct btree_trans *, 
u64, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index bc8b556..76163c2 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1005,7 +1005,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); + ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; @@ -1833,7 +1833,6 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; const struct bch_stripe *s; @@ -1841,36 +1840,33 @@ int bch2_stripes_read(struct bch_fs *c) unsigned i; int ret; - for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - s = bkey_s_c_to_stripe(k).v; + ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ({ + if (k.k->type != KEY_TYPE_stripe) + continue; - m = genradix_ptr(&c->stripes, k.k->p.offset); - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->blocks_nonempty = 0; + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + break; - for (i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); + s = bkey_s_c_to_stripe(k).v; - bch2_stripes_heap_insert(c, m, k.k->p.offset); - } - bch2_trans_iter_exit(trans, &iter); + m = genradix_ptr(&c->stripes, k.k->p.offset); + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + 
m->blocks_nonempty = 0; - bch2_trans_put(trans); + for (i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); - if (ret) - bch_err_fn(c, ret); + bch2_stripes_heap_insert(c, m, k.k->p.offset); + 0; + }))); + bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 87c13f1..bb82bb7 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -150,7 +150,6 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ - x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ diff --git a/libbcachefs/error.h b/libbcachefs/error.h index d167d65..fec17d1 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -157,6 +157,7 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) +__printf(4, 0) static inline void bch2_bkey_fsck_err(struct bch_fs *c, struct printbuf *err_msg, enum bch_sb_error_id err_type, @@ -167,7 +168,6 @@ static inline void bch2_bkey_fsck_err(struct bch_fs *c, va_start(args, fmt); prt_vprintf(err_msg, fmt, args); va_end(args); - } #define bkey_fsck_err(c, _err_msg, _err_type, ...) 
\ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index f6c92df..9d8afcb 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -1294,7 +1294,8 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, unsigned i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { rewrite_ptrs = 0; goto incompressible; } diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 31f40e5..98bd5ba 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -192,13 +192,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, ret2, ret3; + int ret; ret = file_write_and_wait_range(file, start, end); - ret2 = sync_inode_metadata(&inode->v, 1); - ret3 = bch2_flush_inode(c, inode); - - return bch2_err_class(ret ?: ret2 ?: ret3); + if (ret) + goto out; + ret = sync_inode_metadata(&inode->v, 1); + if (ret) + goto out; + ret = bch2_flush_inode(c, inode); +out: + return bch2_err_class(ret); } /* truncate: */ diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 561fc1d..8098a3a 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -333,10 +333,6 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) return -EINVAL; - if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && - !arg.src_ptr) - return -EOPNOTSUPP; - if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) create_flags |= BCH_CREATE_SNAPSHOT; @@ -409,7 +405,7 @@ retry: if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && !arg.src_ptr) - snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; + snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), dst_dentry, 
arg.mode|S_IFDIR, diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 0d0a37c..f6ea5ae 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1143,24 +1143,33 @@ static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_inode_info *dir = to_bch_ei(vdir); - - if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) - return FILEID_INVALID; + int min_len; if (!S_ISDIR(inode->v.i_mode) && dir) { struct bcachefs_fid_with_parent *fid = (void *) fh; + min_len = sizeof(*fid) / sizeof(u32); + if (*len < min_len) { + *len = min_len; + return FILEID_INVALID; + } + fid->fid = bch2_inode_to_fid(inode); fid->dir = bch2_inode_to_fid(dir); - *len = sizeof(*fid) / sizeof(u32); + *len = min_len; return FILEID_BCACHEFS_WITH_PARENT; } else { struct bcachefs_fid *fid = (void *) fh; + min_len = sizeof(*fid) / sizeof(u32); + if (*len < min_len) { + *len = min_len; + return FILEID_INVALID; + } *fid = bch2_inode_to_fid(inode); - *len = sizeof(*fid) / sizeof(u32); + *len = min_len; return FILEID_BCACHEFS_WITHOUT_PARENT; } } @@ -1733,6 +1742,9 @@ static int bch2_unfreeze(struct super_block *sb) struct bch_fs *c = sb->s_fs_info; int ret; + if (test_bit(BCH_FS_emergency_ro, &c->flags)) + return 0; + down_write(&c->state_lock); ret = bch2_fs_read_write(c); up_write(&c->state_lock); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index bc6b566..095453d 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -589,14 +589,13 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - u32 restart_count = trans->restart_count; int ret; w->recalculate_sums = false; w->inodes.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->p.offset != inum) break; 
@@ -609,8 +608,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return ret; w->first_this_inode = true; - - return trans_was_restarted(trans, restart_count); + return 0; } static struct inode_walker_entry * @@ -1212,7 +1210,7 @@ static int overlapping_extents_found(struct btree_trans *trans, swap(k1, k2); } - trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); + trans->extra_disk_res += bch2_bkey_sectors_compressed(k2); ret = bch2_trans_update_extent_overwrite(trans, old_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, @@ -1705,7 +1703,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - BUG_ON(!iter->path->should_be_locked); + BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); ret = PTR_ERR_OR_ZERO(i); @@ -1952,14 +1950,10 @@ static int check_root_trans(struct btree_trans *trans) root_subvol.v.flags = 0; root_subvol.v.snapshot = cpu_to_le32(snapshot); root_subvol.v.inode = cpu_to_le64(inum); - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc, - bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, - &root_subvol.k_i, 0)); + ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0); bch_err_msg(c, ret, "writing root subvol"); if (ret) goto err; - } ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); @@ -1986,9 +1980,7 @@ fsck_err: /* Get root directory, create if it doesn't exist: */ int bch2_check_root(struct bch_fs *c) { - int ret; - - ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_root_trans(trans)); bch_err_fn(c, ret); return ret; @@ -2146,19 +2138,14 @@ int bch2_check_directory_structure(struct bch_fs *c) pathbuf path = { 0, }; int ret; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + 
for_each_btree_key_old(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (!bkey_is_inode(k.k)) continue; - ret = bch2_inode_unpack(k, &u); - if (ret) { - /* Should have been caught earlier in fsck: */ - bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); - break; - } + BUG_ON(bch2_inode_unpack(k, &u)); if (u.bi_flags & BCH_INODE_unlinked) continue; @@ -2170,6 +2157,7 @@ int bch2_check_directory_structure(struct bch_fs *c) bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); darray_exit(&path); + bch_err_fn(c, ret); return ret; } @@ -2255,47 +2243,42 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, struct nlink_table *t, u64 start, u64 *end) { - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; - int ret = 0; - - for_each_btree_key(trans, iter, BTREE_ID_inodes, - POS(0, start), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (!bkey_is_inode(k.k)) - continue; - - /* Should never fail, checked by bch2_inode_invalid: */ - BUG_ON(bch2_inode_unpack(k, &u)); - /* - * Backpointer and directory structure checks are sufficient for - * directories, since they can't have hardlinks: - */ - if (S_ISDIR(u.bi_mode)) - continue; + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_inodes, + POS(0, start), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + if (!bkey_is_inode(k.k)) + continue; - if (!u.bi_nlink) - continue; + /* Should never fail, checked by bch2_inode_invalid: */ + BUG_ON(bch2_inode_unpack(k, &u)); - ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); - if (ret) { - *end = k.k->p.offset; - ret = 0; - break; - } + /* + * Backpointer and directory structure checks are sufficient for + * directories, since they can't have hardlinks: + */ + if (S_ISDIR(u.bi_mode)) + continue; - } - bch2_trans_iter_exit(trans, 
&iter); - bch2_trans_put(trans); + if (!u.bi_nlink) + continue; - if (ret) - bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; + break; + } + 0; + }))); + bch_err_fn(c, ret); return ret; } @@ -2303,42 +2286,39 @@ noinline_for_stack static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, u64 range_start, u64 range_end) { - struct btree_trans *trans = bch2_trans_get(c); struct snapshots_seen s; struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent d; - int ret; snapshots_seen_init(&s); - for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); - if (ret) - break; - - switch (k.k->type) { - case KEY_TYPE_dirent: - d = bkey_s_c_to_dirent(k); + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); + if (ret) + break; - if (d.v->d_type != DT_DIR && - d.v->d_type != DT_SUBVOL) - inc_link(c, &s, links, range_start, range_end, - le64_to_cpu(d.v->d_inum), - bch2_snapshot_equiv(c, d.k->p.snapshot)); - break; - } - } - bch2_trans_iter_exit(trans, &iter); + switch (k.k->type) { + case KEY_TYPE_dirent: + d = bkey_s_c_to_dirent(k); - if (ret) - bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), + bch2_snapshot_equiv(c, d.k->p.snapshot)); + break; + } + 0; + }))); - bch2_trans_put(trans); snapshots_seen_exit(&s); + + bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index b861ab2..7ee9ac5 100644 --- 
a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -7,6 +7,7 @@ #include "btree_update.h" #include "buckets.h" #include "compress.h" +#include "dirent.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -1093,11 +1094,15 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (ret) goto out; - if (fsck_err_on(S_ISDIR(inode.bi_mode), c, - deleted_inode_is_dir, - "directory %llu:%u in deleted_inodes btree", - pos.offset, pos.snapshot)) - goto delete; + if (S_ISDIR(inode.bi_mode)) { + ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot); + if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir, + "non empty directory %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + if (ret) + goto out; + } if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c, deleted_inode_not_unlinked, @@ -1163,29 +1168,29 @@ again: * but we can't retry because the btree write buffer won't have been * flushed and we'd spin: */ - for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_lazy_rw, - may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass)); - if (ret < 0) - break; - - if (ret) { - if (!test_bit(BCH_FS_rw, &c->flags)) { - bch2_trans_unlock(trans); - bch2_fs_lazy_rw(c); - } - + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + if (ret > 0) { bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - break; + /* + * We don't want to loop here: a transaction restart + * error here means we 
handled a transaction restart and + * we're actually done, but if we loop we'll retry the + * same key because the write buffer hasn't been flushed + * yet + */ + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } } - } - bch2_trans_iter_exit(trans, &iter); + + ret; + })); if (!ret && need_another_pass) { ret = bch2_btree_write_buffer_flush_sync(trans); diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index acf9c35..8294d7f 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -10,6 +10,7 @@ #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "error.h" #include "journal.h" @@ -251,7 +252,7 @@ static bool journal_entry_want_write(struct journal *j) return ret; } -static bool journal_entry_close(struct journal *j) +bool bch2_journal_entry_close(struct journal *j) { bool ret; @@ -332,6 +333,7 @@ static int journal_entry_open(struct journal *j) buf->must_flush = false; buf->separate_flush = false; buf->flush_time = 0; + buf->need_flush_to_write_buffer = true; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -380,7 +382,7 @@ static bool journal_quiesced(struct journal *j) bool ret = atomic64_read(&j->seq) == j->seq_ondisk; if (!ret) - journal_entry_close(j); + bch2_journal_entry_close(j); return ret; } @@ -433,7 +435,7 @@ retry: /* * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call journal_entry_close() + * that just did journal_entry_open() and call bch2_journal_entry_close() * unnecessarily */ if (journal_res_get_fast(j, res, flags)) { @@ -768,6 +770,48 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } +static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret = NULL; + + mutex_lock(&j->buf_lock); + spin_lock(&j->lock); + max_seq 
= min(max_seq, journal_cur_seq(j)); + + for (u64 seq = journal_last_unwritten_seq(j); + seq <= max_seq; + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *buf = j->buf + idx; + + if (buf->need_flush_to_write_buffer) { + if (seq == journal_cur_seq(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + + union journal_res_state s; + s.v = atomic64_read_acquire(&j->reservations.counter); + + ret = journal_state_count(s, idx) + ? ERR_PTR(-EAGAIN) + : buf; + break; + } + } + + spin_unlock(&j->lock); + if (IS_ERR_OR_NULL(ret)) + mutex_unlock(&j->buf_lock); + return ret; +} + +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +{ + struct journal_buf *ret; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); + return ret; +} + /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, @@ -1035,7 +1079,7 @@ void bch2_fs_journal_stop(struct journal *j) bch2_journal_reclaim_stop(j); bch2_journal_flush_all_pins(j); - wait_event(j->wait, journal_entry_close(j)); + wait_event(j->wait, bch2_journal_entry_close(j)); /* * Always write a new journal entry, to make sure the clock hands are up @@ -1219,6 +1263,7 @@ int bch2_fs_journal_init(struct journal *j) static struct lock_class_key res_key; unsigned i; + mutex_init(&j->buf_lock); spin_lock_init(&j->lock); spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index e1e9e60..1e14e6b 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -265,6 +265,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u return s; } +bool bch2_journal_entry_close(struct journal *); void bch2_journal_buf_put_final(struct journal *, u64, bool); static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) @@ -424,6 +425,7 @@ static inline void 
bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index fe4565f..3c3a6c4 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -4,6 +4,7 @@ #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" #include "disk_groups.h" @@ -721,6 +722,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); +} + +static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -1501,6 +1518,8 @@ done: static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* we aren't holding j->lock: */ unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; @@ -1508,6 +1527,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (buf->buf_size >= new_size) return; + size_t btree_write_buffer_size = new_size / 64; + + if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) + return; + 
new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1597,6 +1621,7 @@ static CLOSURE_CALLBACK(journal_write_done) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); + bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], @@ -1698,11 +1723,13 @@ static CLOSURE_CALLBACK(do_journal_write) static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *start, *end, *i, *next, *prev = NULL; + struct jset_entry *start, *end, *i; struct jset *jset = w->data; + struct journal_keys_to_wb wb = { NULL }; unsigned sectors, bytes, u64s; - bool validate_before_checksum = false; unsigned long btree_roots_have = 0; + bool validate_before_checksum = false; + u64 seq = le64_to_cpu(jset->seq); int ret; /* @@ -1713,7 +1740,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * If we wanted to be really fancy here, we could sort all the keys in * the jset and drop keys that were overwritten - probably not worth it: */ - vstruct_for_each_safe(jset, i, next) { + vstruct_for_each(jset, i) { unsigned u64s = le16_to_cpu(i->u64s); /* Empty entry: */ @@ -1730,40 +1757,40 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * to c->btree_roots we have to get any missing btree roots and * add them to this journal entry: */ - if (i->type == BCH_JSET_ENTRY_btree_root) { + switch (i->type) { + case BCH_JSET_ENTRY_btree_root: bch2_journal_entry_to_btree_root(c, i); __set_bit(i->btree_id, &btree_roots_have); + break; + case BCH_JSET_ENTRY_write_buffer_keys: + EBUG_ON(!w->need_flush_to_write_buffer); + + if (!wb.wb) + bch2_journal_keys_to_write_buffer_start(c, &wb, seq); + + struct bkey_i *k; + jset_entry_for_each_key(i, k) { + ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); + if (ret) { + 
bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); + bch2_journal_keys_to_write_buffer_end(c, &wb); + return ret; + } + } + i->type = BCH_JSET_ENTRY_btree_keys; + break; } - - /* Can we merge with previous entry? */ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - i->type == prev->type && - i->type == BCH_JSET_ENTRY_btree_keys && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); } - prev = prev ? vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); + if (wb.wb) + bch2_journal_keys_to_write_buffer_end(c, &wb); + w->need_flush_to_write_buffer = false; start = end = vstruct_last(jset); end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - bch2_journal_super_entries_add_common(c, &end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1786,7 +1813,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = le64_to_cpu(jset->seq); + j->last_empty_seq = seq; if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; @@ -1882,9 +1909,11 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + mutex_lock(&j->buf_lock); journal_buf_realloc(j, w); ret = bch2_journal_write_prep(j, w); + mutex_unlock(&j->buf_lock); if (ret) goto err; diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 658aaa2..60b9d35 100644 --- a/libbcachefs/journal_reclaim.c +++ 
b/libbcachefs/journal_reclaim.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "errcode.h" #include "error.h" @@ -50,20 +51,23 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -static inline void journal_set_watermark(struct journal *j) +void bch2_journal_set_watermark(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool low_on_space = j->space[journal_space_clean].total * 4 <= j->space[journal_space_total].total; bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; - unsigned watermark = low_on_space || low_on_pin + bool low_on_wb = bch2_btree_write_buffer_must_wait(c); + unsigned watermark = low_on_space || low_on_pin || low_on_wb ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], &j->low_on_space_start, low_on_space) || track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin)) + &j->low_on_pin_start, low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], + &j->write_buffer_full_start, low_on_wb)) trace_and_count(c, journal_full, c); swap(watermark, j->watermark); @@ -230,7 +234,7 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - journal_set_watermark(j); + bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret ? 
j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -303,6 +307,7 @@ void bch2_journal_reclaim_fast(struct journal *j) * all btree nodes got written out */ while (!fifo_empty(&j->pin) && + j->pin.front <= j->seq_ondisk && !atomic_read(&fifo_peek_front(&j->pin).count)) { j->pin.front++; popped = true; @@ -635,6 +640,7 @@ static u64 journal_seq_to_flush(struct journal *j) static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool kthread = (current->flags & PF_KTHREAD) != 0; u64 seq_to_flush; size_t min_nr, min_key_cache, nr_flushed; unsigned flags; @@ -650,7 +656,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) flags = memalloc_noreclaim_save(); do { - if (kthread_should_stop()) + if (kthread && kthread_should_stop()) break; if (bch2_journal_error(j)) { @@ -816,6 +822,9 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, (1U << JOURNAL_PIN_btree), 0, 0, 0)) *did_work = true; + if (seq_to_flush > journal_cur_seq(j)) + bch2_journal_entry_close(j); + spin_lock(&j->lock); /* * If journal replay hasn't completed, the unreplayed journal entries diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index 7b15d68..ec84c33 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j) unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, enum journal_space_from); +void bch2_journal_set_watermark(struct journal *); void bch2_journal_space_available(struct journal *); static inline bool journal_pin_active(struct journal_entry_pin *pin) diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 4ffae25..38817c7 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -36,6 +36,7 @@ struct journal_buf { bool noflush; /* write has 
already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; + bool need_flush_to_write_buffer; }; /* @@ -181,6 +182,12 @@ struct journal { */ darray_u64 early_journal_entries; + /* + * Protects journal_buf->data, when accessing without a journal + * reservation: for synchronization between the btree write buffer code + * and the journal write path: + */ + struct mutex buf_lock; /* * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. @@ -270,6 +277,7 @@ struct journal { u64 low_on_space_start; u64 low_on_pin_start; u64 max_in_flight_start; + u64 write_buffer_full_start; struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c index 9a76a9a..4b60374 100644 --- a/libbcachefs/logged_ops.c +++ b/libbcachefs/logged_ops.c @@ -59,8 +59,9 @@ int bch2_resume_logged_ops(struct bch_fs *c) int ret; ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, - BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k, + for_each_btree_key(trans, iter, + BTREE_ID_logged_ops, POS_MIN, + BTREE_ITER_PREFETCH, k, resume_logged_op(trans, &iter, k))); if (ret) bch_err_fn(c, ret); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 5ed9f53..9be421e 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -377,8 +377,8 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, io_opts->d.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), - BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ({ if (k.k->p.offset != extent_k.k->p.inode) break; @@ -391,11 +391,8 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; bch2_inode_opts_get(&e.io_opts, trans->c, 
&inode); - ret = darray_push(&io_opts->d, e); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); + darray_push(&io_opts->d, e); + })); io_opts->cur_inum = extent_k.k->p.inode; } @@ -449,25 +446,26 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, int bch2_move_ratelimit(struct moving_context *ctxt) { struct bch_fs *c = ctxt->trans->c; + bool is_kthread = current->flags & PF_KTHREAD; u64 delay; if (ctxt->wait_on_copygc && c->copygc_running) { bch2_moving_ctxt_flush_all(ctxt); wait_event_killable(c->copygc_running_wq, !c->copygc_running || - kthread_should_stop()); + (is_kthread && kthread_should_stop())); } do { delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0; - if (kthread_should_stop()) + if (is_kthread && kthread_should_stop()) return 1; if (delay) move_ctxt_wait_event_timeout(ctxt, freezing(current) || - kthread_should_stop(), + (is_kthread && kthread_should_stop()), delay); if (unlikely(freezing(current))) { @@ -642,6 +640,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; + bool is_kthread = current->flags & PF_KTHREAD; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_iter iter; struct bkey_buf sk; @@ -686,7 +685,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, goto err; while (!(ret = bch2_move_ratelimit(ctxt))) { - if (kthread_should_stop()) + if (is_kthread && kthread_should_stop()) break; bch2_trans_begin(trans); @@ -804,6 +803,7 @@ static int bch2_move_btree(struct bch_fs *c, move_btree_pred pred, void *arg, struct bch_move_stats *stats) { + bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct moving_context ctxt; struct btree_trans *trans; @@ -835,7 +835,7 @@ retry: while (bch2_trans_begin(trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { - if (kthread_should_stop()) + if (kthread && kthread_should_stop()) 
break; if ((cmp_int(btree, end.btree) ?: @@ -860,7 +860,7 @@ next: bch2_trans_iter_exit(trans, &iter); - if (kthread_should_stop()) + if (kthread && kthread_should_stop()) break; } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 91026df..cf69b92 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -233,11 +233,6 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ - x(btree_write_buffer_size, u32, \ - OPT_FS|OPT_MOUNT, \ - OPT_UINT(16, (1U << 20) - 1), \ - BCH2_NO_SB_OPT, 1U << 13, \ - NULL, "Number of btree write buffer entries") \ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ @@ -423,7 +418,7 @@ enum fsck_err_opts { 0, \ OPT_UINT(0, S64_MAX), \ BCH2_NO_SB_OPT, false, \ - NULL, "Allocate the buckets_nouse bitmap") \ + NULL, "Pointer to a struct log_output") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index a54647c..79724a7 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -617,11 +617,11 @@ int bch2_fs_quota_read(struct bch_fs *c) trans = bch2_trans_get(c); - ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas, - POS_MIN, BTREE_ITER_PREFETCH, k, + ret = for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, + BTREE_ITER_PREFETCH, k, __bch2_quota_set(c, k, NULL)) ?: - for_each_btree_key2(trans, iter, BTREE_ID_inodes, - POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, bch2_fs_quota_read_inode(trans, &iter, k)); bch2_trans_put(trans); diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 79bd4ad..ee452aa 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -334,7 +334,7 @@ static int do_rebalance(struct moving_context *ctxt) while (!bch2_move_ratelimit(ctxt)) { if (!r->enabled) { bch2_moving_ctxt_flush_all(ctxt); - 
kthread_wait_freezable(c->copy_gc_enabled || + kthread_wait_freezable(r->enabled || kthread_should_stop()); } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 69b4984..f51456f 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -150,7 +150,7 @@ static int bch2_journal_replay(struct bch_fs *c) u64 start_seq = c->journal_replay_seq_start; u64 end_seq = c->journal_replay_seq_start; struct btree_trans *trans = bch2_trans_get(c); - int ret; + int ret = 0; if (keys->nr) { ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", @@ -533,7 +533,8 @@ static int bch2_set_may_go_rw(struct bch_fs *c) keys->gap = keys->nr; set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck) + + if (keys->nr || c->opts.fsck || !c->sb.clean) return bch2_fs_read_write_early(c); return 0; } @@ -634,7 +635,7 @@ u64 bch2_fsck_recovery_passes(void) static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; + struct recovery_pass_fn *p = recovery_pass_fns + pass; if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) return false; @@ -651,39 +652,59 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + struct recovery_pass_fn *p = recovery_pass_fns + pass; int ret; - c->curr_recovery_pass = pass; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); - if (should_run_recovery_pass(c, pass)) { - struct recovery_pass_fn *p = recovery_pass_fns + pass; + return 0; +} - if (!(p->when & PASS_SILENT)) - bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), - bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) - return ret; - if 
(!(p->when & PASS_SILENT)) - bch2_print(c, KERN_CONT " done\n"); +static int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; - c->recovery_passes_complete |= BIT_ULL(pass); + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + if (should_run_recovery_pass(c, c->curr_recovery_pass)) { + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + continue; + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + } + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); } - return 0; + return ret; } -static int bch2_run_recovery_passes(struct bch_fs *c) +int bch2_run_online_recovery_passes(struct bch_fs *c) { int ret = 0; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { + struct recovery_pass_fn *p = recovery_pass_fns + i; + + if (!(p->when & PASS_ONLINE)) + continue; + + ret = bch2_run_recovery_pass(c, i); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { + i = c->curr_recovery_pass; continue; + } if (ret) break; - c->curr_recovery_pass++; } return ret; diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index 852d305..447590f 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -25,6 +25,7 @@ static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_online_recovery_passes(struct bch_fs *); u64 bch2_fsck_recovery_passes(void); int bch2_fs_recovery(struct bch_fs *); diff --git a/libbcachefs/recovery_types.h b/libbcachefs/recovery_types.h index 515e3d6..c6deceb 100644 --- a/libbcachefs/recovery_types.h +++ b/libbcachefs/recovery_types.h @@ -6,43 +6,44 @@ #define PASS_FSCK BIT(1) #define PASS_UNCLEAN BIT(2) #define PASS_ALWAYS BIT(3) +#define PASS_ONLINE BIT(4) 
-#define BCH_RECOVERY_PASSES() \ - x(alloc_read, PASS_ALWAYS) \ - x(stripes_read, PASS_ALWAYS) \ - x(initialize_subvolumes, 0) \ - x(snapshots_read, PASS_ALWAYS) \ - x(check_topology, 0) \ - x(check_allocations, PASS_FSCK) \ - x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \ - x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, PASS_ALWAYS) \ - x(check_alloc_info, PASS_FSCK) \ - x(check_lrus, PASS_FSCK) \ - x(check_btree_backpointers, PASS_FSCK) \ - x(check_backpointers_to_extents,PASS_FSCK) \ - x(check_extents_to_backpointers,PASS_FSCK) \ - x(check_alloc_to_lru_refs, PASS_FSCK) \ - x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 0) \ - x(check_snapshot_trees, PASS_FSCK) \ - x(check_snapshots, PASS_FSCK) \ - x(check_subvols, PASS_FSCK) \ - x(delete_dead_snapshots, PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 0) \ - x(resume_logged_ops, PASS_ALWAYS) \ - x(check_inodes, PASS_FSCK) \ - x(check_extents, PASS_FSCK) \ - x(check_indirect_extents, PASS_FSCK) \ - x(check_dirents, PASS_FSCK) \ - x(check_xattrs, PASS_FSCK) \ - x(check_root, PASS_FSCK) \ - x(check_directory_structure, PASS_FSCK) \ - x(check_nlinks, PASS_FSCK) \ - x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ - x(fix_reflink_p, 0) \ - x(set_fs_needs_rebalance, 0) \ +#define BCH_RECOVERY_PASSES() \ + x(alloc_read, PASS_ALWAYS) \ + x(stripes_read, PASS_ALWAYS) \ + x(initialize_subvolumes, 0) \ + x(snapshots_read, PASS_ALWAYS) \ + x(check_topology, 0) \ + x(check_allocations, PASS_FSCK) \ + x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \ + x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, PASS_ALWAYS) \ + x(check_alloc_info, PASS_ONLINE|PASS_FSCK) \ + x(check_lrus, PASS_ONLINE|PASS_FSCK) \ + x(check_btree_backpointers, PASS_ONLINE|PASS_FSCK) \ + x(check_backpointers_to_extents, PASS_ONLINE|PASS_FSCK) \ + x(check_extents_to_backpointers, PASS_ONLINE|PASS_FSCK) \ + 
x(check_alloc_to_lru_refs, PASS_ONLINE|PASS_FSCK) \ + x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 0) \ + x(check_snapshot_trees, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, PASS_ONLINE|PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 0) \ + x(resume_logged_ops, PASS_ALWAYS) \ + x(check_inodes, PASS_FSCK) \ + x(check_extents, PASS_FSCK) \ + x(check_indirect_extents, PASS_FSCK) \ + x(check_dirents, PASS_FSCK) \ + x(check_xattrs, PASS_FSCK) \ + x(check_root, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, PASS_FSCK) \ + x(check_nlinks, PASS_FSCK) \ + x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ + x(fix_reflink_p, 0) \ + x(set_fs_needs_rebalance, 0) \ enum bch_recovery_pass { #define x(n, when) BCH_RECOVERY_PASS_##n, diff --git a/libbcachefs/sb-errors.h b/libbcachefs/sb-errors.h index 57bce14..92289ce 100644 --- a/libbcachefs/sb-errors.h +++ b/libbcachefs/sb-errors.h @@ -65,7 +65,7 @@ x(btree_node_bkey_out_of_order, 57) \ x(btree_root_bkey_invalid, 58) \ x(btree_root_read_error, 59) \ - x(btree_root_bad_min_key, 50) \ + x(btree_root_bad_min_key, 60) \ x(btree_root_bad_max_key, 61) \ x(btree_node_read_error, 62) \ x(btree_node_topology_bad_min_key, 63) \ diff --git a/libbcachefs/six.h b/libbcachefs/six.h index a7104ac..68d46fd 100644 --- a/libbcachefs/six.h +++ b/libbcachefs/six.h @@ -15,7 +15,7 @@ * will have to take write locks for the full duration of the operation. * * But by adding an intent state, which is exclusive with other intent locks but - * not with readers, we can take intent locks at thte start of the operation, + * not with readers, we can take intent locks at the start of the operation, * and then take write locks only for the actual update to each individual * nodes, without deadlocking. 
* @@ -65,8 +65,8 @@ * * Reentrancy: * - * Six locks are not by themselves reentrent, but have counters for both the - * read and intent states that can be used to provide reentrency by an upper + * Six locks are not by themselves reentrant, but have counters for both the + * read and intent states that can be used to provide reentrancy by an upper * layer that tracks held locks. If a lock is known to already be held in the * read or intent state, six_lock_increment() can be used to bump the "lock * held in this state" counter, increasing the number of unlock calls that diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index b2d216f..8d1800e 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -123,7 +123,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) struct snapshot_table *t; bool ret; - EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots); rcu_read_lock(); t = rcu_dereference(c->snapshots); @@ -1402,27 +1402,24 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, bch2_snapshot_set_equiv(trans, k)); if (ret) { bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); goto err; } - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ({ if (k.k->type != KEY_TYPE_snapshot) continue; snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v)) { - ret = snapshot_list_add(c, &deleted, k.k->p.offset); - if (ret) - break; - } - } - bch2_trans_iter_exit(trans, &iter); + BCH_SNAPSHOT_DELETED(snap.v) + ? 
snapshot_list_add(c, &deleted, k.k->p.offset) + : 0; + })); if (ret) { bch_err_msg(c, ret, "walking snapshots"); @@ -1469,18 +1466,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) bch2_trans_unlock(trans); down_write(&c->snapshot_create_lock); - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ({ u32 snapshot = k.k->p.offset; u32 equiv = bch2_snapshot_equiv(c, snapshot); - if (equiv != snapshot) - snapshot_list_add(c, &deleted_interior, snapshot); - } - bch2_trans_iter_exit(trans, &iter); + equiv != snapshot + ? snapshot_list_add(c, &deleted_interior, snapshot) + : 0; + })); - if (ret) + if (ret) { + bch_err_msg(c, ret, "walking snapshots"); goto err_create_lock; + } /* * Fixing children of deleted snapshots can't be done completely @@ -1694,13 +1693,13 @@ int bch2_snapshots_read(struct bch_fs *c) int ret = 0; ret = bch2_trans_run(c, - for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: bch2_snapshot_set_equiv(trans, k) ?: bch2_check_snapshot_needs_deletion(trans, k)) ?: - for_each_btree_key2(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); if (ret) bch_err_fn(c, ret); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 3abccdb..88a762b 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -72,6 +72,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet "); MODULE_DESCRIPTION("bcachefs filesystem"); +MODULE_SOFTDEP("pre: crc32c"); +MODULE_SOFTDEP("pre: crc64"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: chacha20"); +MODULE_SOFTDEP("pre: poly1305"); +MODULE_SOFTDEP("pre: xxhash"); const char * const bch2_fs_flag_strs[] = { #define x(n) #n, @@ 
-82,19 +88,26 @@ const char * const bch2_fs_flag_strs[] = { void __bch2_print(struct bch_fs *c, const char *fmt, ...) { + struct log_output *output = c->output; va_list args; + if (c->output_filter && c->output_filter != current) + output = NULL; + va_start(args, fmt); - if (likely(!c->output)) { + if (likely(!output)) { vprintk(fmt, args); } else { unsigned long flags; - spin_lock_irqsave(&c->output->lock, flags); - prt_vprintf(&c->output->buf, fmt, args); - spin_unlock_irqrestore(&c->output->lock, flags); + if (fmt[0] == KERN_SOH[0]) + fmt += 2; + + spin_lock_irqsave(&output->lock, flags); + prt_vprintf(&output->buf, fmt, args); + spin_unlock_irqrestore(&output->lock, flags); - wake_up(&c->output->wait); + wake_up(&output->wait); } va_end(args); } @@ -350,7 +363,8 @@ void bch2_fs_read_only(struct bch_fs *c) BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.state.nr); + BUG_ON(c->btree_write_buffer.inc.keys.nr); + BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); @@ -612,6 +626,9 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_debug_exit(c); bch2_fs_chardev_exit(c); + bch2_ro_ref_put(c); + wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); + kobject_put(&c->counters_kobj); kobject_put(&c->time_stats); kobject_put(&c->opts_dir); @@ -744,6 +761,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + refcount_set(&c->ro_ref, 1); + init_waitqueue_head(&c->ro_ref_wait); + sema_init(&c->online_fsck_mutex, 1); + init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); atomic_set(&c->journal_keys.ref, 1); @@ -754,6 +775,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_copygc_init(c); 
bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_iter_init_early(c); bch2_fs_btree_interior_update_init_early(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 1b82a3a..e818ca7 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -278,8 +278,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c if (!btree_type_has_ptrs(id)) continue; - ret = for_each_btree_key2(trans, iter, id, POS_MIN, - BTREE_ITER_ALL_SNAPSHOTS, k, ({ + ret = for_each_btree_key(trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ({ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_extent_crc_unpacked crc; const union bch_extent_entry *entry; diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index cfa7ee7..427edb3 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -1145,8 +1145,6 @@ TRACE_EVENT(trans_restart_upgrade, __field(u8, level ) __field(u32, path_seq ) __field(u32, node_seq ) - __field(u32, path_alloc_seq ) - __field(u32, downgrade_seq) TRACE_BPOS_entries(pos) ), @@ -1159,12 +1157,10 @@ TRACE_EVENT(trans_restart_upgrade, __entry->level = f->l; __entry->path_seq = path->l[f->l].lock_seq; __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 
0 : f->b->c.lock.seq; - __entry->path_alloc_seq = path->alloc_seq; - __entry->downgrade_seq = path->downgrade_seq; TRACE_BPOS_assign(pos, path->pos) ), - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u", + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", __entry->trans_fn, (void *) __entry->caller_ip, bch2_btree_id_str(__entry->btree_id), @@ -1175,9 +1171,7 @@ TRACE_EVENT(trans_restart_upgrade, __entry->new_locks_want, __entry->level, __entry->path_seq, - __entry->node_seq, - __entry->path_alloc_seq, - __entry->downgrade_seq) + __entry->node_seq) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, -- 2.39.2