From e0a51ccce8533a91c7cc0cd0adc5662697c9bcfa Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Wed, 12 Oct 2022 16:29:56 -0400
Subject: [PATCH] Update bcachefs sources to 3e93567c51 bcachefs: Switch to
 local_clock() for fastpath time source

---
 .bcachefs_revision                |   2 +-
 Makefile                          |   6 +
 include/linux/mean_and_variance.h | 170 +++++++++++++++++++++++
 libbcachefs/backpointers.c        |   5 +-
 libbcachefs/backpointers.h        |   2 +-
 libbcachefs/btree_gc.c            |   8 +-
 libbcachefs/btree_iter.c          |  12 +-
 libbcachefs/btree_key_cache.c     |  52 +++++--
 libbcachefs/btree_locking.c       | 172 +++++++++++------------
 libbcachefs/btree_locking.h       |   6 +-
 libbcachefs/ec.c                  |   4 +-
 libbcachefs/fs-common.c           |   5 +
 libbcachefs/fs-io.c               |   9 +-
 libbcachefs/fs-ioctl.c            |  16 +++
 libbcachefs/fs.c                  |   6 +-
 libbcachefs/journal_io.c          | 217 +++++++++++++++++++-----------
 libbcachefs/journal_io.h          |   4 +-
 libbcachefs/move.c                |   3 +-
 libbcachefs/movinggc.c            |   2 +-
 libbcachefs/quota.c               | 173 +++++++++++++++++++++++-
 libbcachefs/super.c               |   6 +
 libbcachefs/util.c                | 173 ++++++++++++++++++------
 libbcachefs/util.h                |  11 +-
 libbcachefs/xattr.c               |   8 +-
 linux/int_sqrt.c                  |  71 ++++++++++
 linux/mean_and_variance.c         | 178 ++++++++++++++++++++++++
 linux/six.c                       |  11 +-
 27 files changed, 1058 insertions(+), 274 deletions(-)
 create mode 100644 include/linux/mean_and_variance.h
 create mode 100644 linux/int_sqrt.c
 create mode 100644 linux/mean_and_variance.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 2c908e6..11194e5 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-6ee8a33cee5dfb74a1fb6ff348578fd43aae3a14
+3e93567c5196ef0c80e2ac3c08295130d858dfd6
diff --git a/Makefile b/Makefile
index a5a74fe..d460a6d 100644
--- a/Makefile
+++ b/Makefile
@@ -199,6 +199,12 @@ update-bcachefs-sources:
 	git add include/linux/printbuf.h
 	cp $(LINUX_DIR)/lib/printbuf.c linux/
 	git add linux/printbuf.c
+	cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/
+	git add linux/mean_and_variance.c
+	cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/
+	git add include/linux/mean_and_variance.h
+	cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/
+	git add linux/int_sqrt.c
 	cp $(LINUX_DIR)/scripts/Makefile.compiler ./
 	git add Makefile.compiler
 	$(RM) libbcachefs/*.mod.c
diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h
new file mode 100644
index 0000000..3d62abe
--- /dev/null
+++ b/include/linux/mean_and_variance.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef MEAN_AND_VARIANCE_H_
+#define MEAN_AND_VARIANCE_H_
+
+#include <linux/types.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+
+#define SQRT_U64_MAX 4294967295ULL
+
+
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+typedef unsigned __int128 u128;
+
+static inline u128 u64_to_u128(u64 a)
+{
+	return (u128)a;
+}
+
+static inline u64 u128_to_u64(u128 a)
+{
+	return (u64)a;
+}
+
+static inline u64 u128_shr64_to_u64(u128 a)
+{
+	return (u64)(a >> 64);
+}
+
+static inline u128 u128_add(u128 a, u128 b)
+{
+	return a + b;
+}
+
+static inline u128 u128_sub(u128 a, u128 b)
+{
+	return a - b;
+}
+
+static inline u128 u128_shl(u128 i, s8 shift)
+{
+	return i << shift;
+}
+
+static inline u128 u128_shl64_add(u64 a, u64 b)
+{
+	return ((u128)a << 64) + b;
+}
+
+static inline u128 u128_square(u64 i)
+{
+	return (u128)i * i;
+}
+
+#else
+
+typedef struct {
+	u64 hi, lo;
+} u128;
+
+static inline u128 u64_to_u128(u64 a)
+{
+	return (u128){ .lo = a };
+}
+
+static inline u64 u128_to_u64(u128 a)
+{
+	return a.lo;
+}
+
+static inline u64 u128_shr64_to_u64(u128 a)
+{
+	return a.hi;
+}
+
+static inline u128 u128_add(u128 a, u128 b)
+{
+	u128 c;
+
+	c.lo = a.lo + b.lo;
+	c.hi = a.hi + b.hi + (c.lo < a.lo);
+	return c;
+}
+
+static inline u128 u128_sub(u128 a, u128 b)
+{
+	u128 c;
+
+	c.lo = a.lo - b.lo;
+	c.hi = a.hi - b.hi - (c.lo > a.lo);
+	return c;
+}
+
+static inline u128 u128_shl(u128 i, s8 shift)
+{
+	u128 r;
+
+	r.lo = i.lo << shift;
+	if (shift < 64)
+		r.hi = (i.hi << shift) | (i.lo >> (64 - shift));
+	else {
+		r.hi = i.lo << (shift - 64);
+		r.lo = 0;
+	}
+	return r;
+}
+
+static inline u128 u128_shl64_add(u64 a, u64 b)
+{
+	return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b));
+}
+
+static inline u128 u128_square(u64 i)
+{
+	u128 r;
+	u64  h = i >> 32, l = i & (u64)U32_MAX;
+
+	r =             u128_shl(u64_to_u128(h*h), 64);
+	r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
+	r = u128_add(r, u128_shl(u64_to_u128(l*h), 32));
+	r = u128_add(r,          u64_to_u128(l*l));
+	return r;
+}
+
+#endif
+
+static inline u128 u128_div(u128 n, u64 d)
+{
+	u128 r;
+	u64 rem;
+	u64 hi = u128_shr64_to_u64(n);
+	u64 lo = u128_to_u64(n);
+	u64  h =  hi & ((u64)U32_MAX  << 32);
+	u64  l = (hi &  (u64)U32_MAX) << 32;
+
+	r =             u128_shl(u64_to_u128(div64_u64_rem(h,                d, &rem)), 64);
+	r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l  + (rem << 32), d, &rem)), 32));
+	r = u128_add(r,          u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+	return r;
+}
+
+struct mean_and_variance {
+	s64 n;
+	s64 sum;
+	u128 sum_squares;
+};
+
+/* exponentially weighted variant */
+struct mean_and_variance_weighted {
+	bool init;
+	u8 w;
+	s64 mean;
+	u64 variance;
+};
+
+inline s64 fast_divpow2(s64 n, u8 d);
+
+struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1);
+       s64 mean_and_variance_get_mean(struct mean_and_variance s);
+       u64 mean_and_variance_get_variance(struct mean_and_variance s1);
+       u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+
+struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1);
+       s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+       u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+       u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+
+#endif // MEAN_AND_VARIANCE_H_
diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c
index b0e3059..a537768 100644
--- a/libbcachefs/backpointers.c
+++ b/libbcachefs/backpointers.c
@@ -414,7 +414,8 @@ err:
 int bch2_get_next_backpointer(struct btree_trans *trans,
 			      struct bpos bucket, int gen,
 			      u64 *bp_offset,
-			      struct bch_backpointer *dst)
+			      struct bch_backpointer *dst,
+			      unsigned iter_flags)
 {
 	struct bch_fs *c = trans->c;
 	struct bpos bp_pos, bp_end_pos;
@@ -1023,7 +1024,7 @@ static int check_one_backpointer(struct btree_trans *trans,
 	struct printbuf buf = PRINTBUF;
 	int ret;
 
-	ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp);
+	ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp, 0);
 	if (ret || *bp_offset == U64_MAX)
 		return ret;
 
diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h
index fe42af2..1c97e36 100644
--- a/libbcachefs/backpointers.h
+++ b/libbcachefs/backpointers.h
@@ -25,7 +25,7 @@ int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *,
 int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *,
 				struct bch_backpointer, struct bkey_s_c);
 int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int,
-			      u64 *, struct bch_backpointer *);
+			      u64 *, struct bch_backpointer *,
unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, struct bpos, u64, struct bch_backpointer); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index fd89165..a4d6998 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -1979,10 +1979,10 @@ int bch2_gc_gens(struct bch_fs *c) NULL, NULL, BTREE_INSERT_NOFAIL, gc_btree_gens_key(&trans, &iter, k)); - if (ret) { + if (ret && ret != -EROFS) bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); + if (ret) goto err; - } } ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, @@ -1992,10 +1992,10 @@ int bch2_gc_gens(struct bch_fs *c) NULL, NULL, BTREE_INSERT_NOFAIL, bch2_alloc_write_oldest_gen(&trans, &iter, k)); - if (ret) { + if (ret && ret != -EROFS) bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); + if (ret) goto err; - } c->gc_gens_btree = 0; c->gc_gens_pos = POS_MIN; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index af65839..0dfde9f 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -772,7 +772,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat bch2_bkey_buf_init(&tmp); - while (nr && !ret) { + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; @@ -807,7 +807,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p bch2_bkey_buf_init(&tmp); - while (nr && !ret) { + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; @@ -2386,6 +2386,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } k = bch2_btree_path_peek_slot(iter->path, &iter->k); + if (unlikely(!k.k)) + goto out_no_locked; } else { struct bpos next; @@ -2783,7 +2785,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (!trans->restarted && (need_resched() || - ktime_get_ns() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { bch2_trans_unlock(trans); cond_resched(); bch2_trans_relock(trans); @@ -2793,7 +2795,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) if (trans->restarted) bch2_btree_path_traverse_all(trans); - trans->last_begin_time = ktime_get_ns(); + trans->last_begin_time = local_clock(); return trans->restart_count; } @@ -2850,7 +2852,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char * memset(trans, 0, sizeof(*trans)); trans->c = c; trans->fn = fn; - trans->last_begin_time = ktime_get_ns(); + trans->last_begin_time = local_clock(); trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn); trans->locking_wait.task = current; closure_init_stack(&trans->ref); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 35e9419..958feac 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -112,6 +112,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); if (!ck->c.lock.readers) { +#ifdef __KERNEL__ preempt_disable(); f = this_cpu_ptr(bc->pcpu_freed); @@ -136,6 +137,11 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, list_move_tail(&ck->list, &bc->freed_nonpcpu); mutex_unlock(&bc->lock); } +#else + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_nonpcpu); + mutex_unlock(&bc->lock); +#endif } else { mutex_lock(&bc->lock); 
list_move_tail(&ck->list, &bc->freed_pcpu); @@ -174,6 +180,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); if (!pcpu_readers) { +#ifdef __KERNEL__ preempt_disable(); f = this_cpu_ptr(bc->pcpu_freed); if (f->nr) @@ -196,6 +203,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) preempt_enable(); mutex_unlock(&bc->lock); } +#else + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_nonpcpu)) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + } + mutex_unlock(&bc->lock); +#endif } else { mutex_lock(&bc->lock); if (!list_empty(&bc->freed_pcpu)) { @@ -228,6 +243,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path) return ck; } + /* GFP_NOFS because we're holding btree locks: */ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); if (likely(ck)) { INIT_LIST_HEAD(&ck->list); @@ -252,6 +268,7 @@ bkey_cached_reuse(struct btree_key_cache *c) struct bkey_cached *ck; unsigned i; + mutex_lock(&c->lock); rcu_read_lock(); tbl = rht_dereference_rcu(c->table.tbl, &c->table); for (i = 0; i < tbl->size; i++) @@ -259,13 +276,14 @@ bkey_cached_reuse(struct btree_key_cache *c) if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { bkey_cached_evict(c, ck); - rcu_read_unlock(); - return ck; + goto out; } } + ck = NULL; +out: rcu_read_unlock(); - - return NULL; + mutex_unlock(&c->lock); + return ck; } static struct bkey_cached * @@ -759,12 +777,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, unsigned start, flags; int srcu_idx; - /* Return -1 if we can't do anything right now */ - if (sc->gfp_mask & __GFP_FS) - mutex_lock(&bc->lock); - else if (!mutex_trylock(&bc->lock)) - return -1; - + mutex_lock(&bc->lock); srcu_idx = srcu_read_lock(&c->btree_trans_barrier); flags = memalloc_nofs_save(); @@ -869,7 +882,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) struct bkey_cached *ck, *n; struct rhash_head *pos; unsigned i; +#ifdef __KERNEL__ int cpu; +#endif if (bc->shrink.list.next) unregister_shrinker(&bc->shrink); @@ -886,6 +901,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) } rcu_read_unlock(); +#ifdef __KERNEL__ for_each_possible_cpu(cpu) { struct btree_key_cache_freelist *f = per_cpu_ptr(bc->pcpu_freed, cpu); @@ -895,6 +911,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_add(&ck->list, &bc->freed_nonpcpu); } } +#endif list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); @@ -910,10 +927,15 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) kmem_cache_free(bch2_key_cache, ck); } - BUG_ON(atomic_long_read(&bc->nr_dirty) && - !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_WAS_RW, &c->flags)); - BUG_ON(atomic_long_read(&bc->nr_keys)); + if (atomic_long_read(&bc->nr_dirty) && + !bch2_journal_error(&c->journal) && + test_bit(BCH_FS_WAS_RW, &c->flags)) + panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", + atomic_long_read(&bc->nr_dirty)); + + if (atomic_long_read(&bc->nr_keys)) + panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", + atomic_long_read(&bc->nr_keys)); mutex_unlock(&bc->lock); @@ -943,9 +965,11 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); int ret; +#ifdef __KERNEL__ bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); if (!bc->pcpu_freed) return -ENOMEM; 
+#endif ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params); if (ret) diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index 9a525d3..93a6ebe 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -94,6 +94,37 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) prt_newline(out); } +static void lock_graph_up(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + +static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ + closure_get(&trans->ref); + + g->g[g->nr++] = (struct trans_waiting_for_lock) { + .trans = trans, + .node_want = trans->locking, + .lock_want = trans->locking_wait.lock_want, + }; +} + +static bool lock_graph_remove_non_waiters(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { + while (g->g + g->nr > i) + lock_graph_up(g); + return true; + } + + return false; +} + static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { @@ -106,40 +137,42 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) } } -static noinline int break_cycle(struct lock_graph *g) +static int btree_trans_abort_preference(struct btree_trans *trans) { - struct trans_waiting_for_lock *i; - - /* - * We'd like to prioritize aborting transactions that have done less - * work - but it appears breaking cycles by telling other transactions - * to abort may still be buggy: - */ -#if 0 - for (i = g->g; i < g->g + g->nr; i++) { - if (i->trans->lock_may_not_fail || - i->trans->locking_wait.lock_want == SIX_LOCK_write) - continue; + if (trans->lock_may_not_fail) + return 0; + if (trans->locking_wait.lock_want == SIX_LOCK_write) + return 1; + if (!trans->in_traverse_all) + return 2; + return 3; +} - return abort_lock(g, i); - } +static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) +{ + struct trans_waiting_for_lock *i, *abort = NULL; + unsigned best = 0, pref; + int ret; - for (i = g->g; i < g->g + g->nr; i++) { - if (i->trans->lock_may_not_fail || - !i->trans->in_traverse_all) - continue; + if (lock_graph_remove_non_waiters(g)) + return 0; - return abort_lock(g, i); + /* Only checking, for debugfs: */ + if (cycle) { + print_cycle(cycle, g); + ret = -1; + goto out; } -#endif - for (i = g->g; i < g->g + g->nr; i++) { - if (i->trans->lock_may_not_fail) - continue; - return abort_lock(g, i); + for (i = g->g; i < g->g + g->nr; i++) { + pref = btree_trans_abort_preference(i->trans); + if (pref > best) { + abort = i; + best = pref; + } } - { + if (unlikely(!best)) { struct bch_fs *c = g->g->trans->c; struct printbuf buf = PRINTBUF; @@ -162,21 +195,13 @@ static noinline int break_cycle(struct lock_graph *g) printbuf_exit(&buf); BUG(); } -} - -static void lock_graph_pop(struct lock_graph *g) -{ - closure_put(&g->g[--g->nr].trans->ref); -} - -static void lock_graph_pop_above(struct lock_graph *g, struct trans_waiting_for_lock *above, - struct printbuf *cycle) -{ - if (g->nr > 1 && cycle) - print_chain(cycle, g); - while (g->g + g->nr > above) - lock_graph_pop(g); + ret = abort_lock(g, abort); +out: + if (ret) + while (g->nr) + lock_graph_up(g); + return ret; } static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, @@ -184,67 +209,23 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, { struct 
btree_trans *orig_trans = g->g->trans; struct trans_waiting_for_lock *i; - int ret = 0; - - for (i = g->g; i < g->g + g->nr; i++) { - if (i->trans->locking != i->node_want) { - lock_graph_pop_above(g, i - 1, cycle); - return 0; - } - if (i->trans == trans) { - if (cycle) { - /* Only checking: */ - print_cycle(cycle, g); - ret = -1; - } else { - ret = break_cycle(g); - } - - if (ret) - goto deadlock; - /* - * If we didn't abort (instead telling another - * transaction to abort), keep checking: - */ - } - } + for (i = g->g; i < g->g + g->nr; i++) + if (i->trans == trans) + return break_cycle(g, cycle); if (g->nr == ARRAY_SIZE(g->g)) { if (orig_trans->lock_may_not_fail) return 0; + while (g->nr) + lock_graph_up(g); trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); - ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); - goto deadlock; + return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); } - closure_get(&trans->ref); - - g->g[g->nr++] = (struct trans_waiting_for_lock) { - .trans = trans, - .node_want = trans->locking, - .lock_want = trans->locking_wait.lock_want, - }; - + lock_graph_down(g, trans); return 0; -deadlock: - lock_graph_pop_above(g, g->g, cycle); - return ret; -} - -static noinline void lock_graph_remove_non_waiters(struct lock_graph *g, - struct printbuf *cycle) -{ - struct trans_waiting_for_lock *i; - - for (i = g->g + 1; i < g->g + g->nr; i++) - if (i->trans->locking != i->node_want || - i->trans->locking_wait.start_time != i[-1].lock_start_time) { - lock_graph_pop_above(g, i - 1, cycle); - return; - } - BUG(); } static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) @@ -266,8 +247,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) } g.nr = 0; - ret = lock_graph_descend(&g, trans, cycle); - BUG_ON(ret); + lock_graph_down(&g, trans); next: if (!g.nr) return 0; @@ -295,7 +275,7 @@ next: b = &READ_ONCE(path->l[top->level].b)->c; if (unlikely(IS_ERR_OR_NULL(b))) { - lock_graph_remove_non_waiters(&g, cycle); + BUG_ON(!lock_graph_remove_non_waiters(&g)); goto next; } @@ -321,7 +301,7 @@ next: raw_spin_unlock(&b->lock.wait_lock); if (ret) - return ret < 0 ? 
ret : 0; + return ret; goto next; } @@ -331,7 +311,7 @@ next: if (g.nr > 1 && cycle) print_chain(cycle, &g); - lock_graph_pop(&g); + lock_graph_up(&g); goto next; } diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index d91b42b..bf8d188 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -88,7 +88,7 @@ static inline void mark_btree_node_locked(struct btree_trans *trans, { mark_btree_node_locked_noreset(path, level, type); #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[level].lock_taken_time = ktime_get_ns(); + path->l[level].lock_taken_time = local_clock(); #endif } @@ -120,7 +120,7 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, if (s) __bch2_time_stats_update(&s->lock_hold_times, path->l[level].lock_taken_time, - ktime_get_ns()); + local_clock()); #endif } @@ -260,7 +260,7 @@ static inline int btree_node_lock(struct btree_trans *trans, btree_node_lock_increment(trans, b, level, type) || !(ret = btree_node_lock_nopath(trans, b, type))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS - path->l[b->level].lock_taken_time = ktime_get_ns(); + path->l[b->level].lock_taken_time = local_clock(); #endif } diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index d3fa2d7..dfe3796 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -872,7 +872,9 @@ retry: while (1) { bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(trans, bucket_pos, bucket.gen, &bp_offset, &bp); + ret = bch2_get_next_backpointer(trans, bucket_pos, bucket.gen, + &bp_offset, &bp, + BTREE_ITER_CACHED); if (ret) break; if (bp_offset == U64_MAX) diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 53ffc68..e9dd1d1 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -212,6 +212,11 @@ int bch2_link_trans(struct btree_trans *trans, if (ret) goto err; + if (bch2_reinherit_attrs(inode_u, dir_u)) { + ret = -EXDEV; + goto err; + } + dir_u->bi_mtime = dir_u->bi_ctime = now; dir_hash = bch2_hash_info_init(c, dir_u); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 2ea6e79..02ef343 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -1527,7 +1527,7 @@ out: if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); if (ret) - goto out; + goto err; } ret = bch2_page_reservation_get(c, inode, page, res, @@ -3102,6 +3102,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode, inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); + ret = file_modified(file); + if (ret) + goto err; + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ret = bchfs_fallocate(inode, mode, offset, len); else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) @@ -3112,8 +3116,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, ret = bchfs_fcollapse_finsert(inode, offset, len, false); else ret = -EOPNOTSUPP; - - +err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); percpu_ref_put(&c->writes); diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index bab0707..2bb6808 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -26,6 +26,9 @@ struct flags_set { unsigned flags; unsigned projid; + + bool set_projinherit; + bool projinherit; }; static int bch2_inode_flags_set(struct bch_inode_info *inode, @@ -50,6 +53,11 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) return -EINVAL; + if 
(s->set_projinherit) { + bi->bi_fields_set &= ~(1 << Inode_opt_project); + bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); + } + bi->bi_flags &= ~s->mask; bi->bi_flags |= newflags; @@ -107,6 +115,10 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, struct fsxattr fa = { 0 }; fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); + + if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) + fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; return copy_to_user(arg, &fa, sizeof(fa)); @@ -138,6 +150,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; + s.set_projinherit = true; + s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; + fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; + s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); if (fa.fsx_xflags) return -EOPNOTSUPP; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 57e6e21..bf82737 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -419,7 +419,7 @@ static int bch2_mknod(struct user_namespace *mnt_userns, (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) - return PTR_ERR(inode); + return bch2_err_class(PTR_ERR(inode)); d_instantiate(dentry, &inode->v); return 0; @@ -529,7 +529,7 @@ static int bch2_symlink(struct user_namespace *mnt_userns, inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) - return PTR_ERR(inode); + return bch2_err_class(PTR_ERR(inode)); inode_lock(&inode->v); ret = page_symlink(&inode->v, symname, strlen(symname) + 1); @@ -838,7 +838,7 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) - return PTR_ERR(inode); + return bch2_err_class(PTR_ERR(inode)); d_mark_tmpfile(dentry, &inode->v); d_instantiate(dentry, &inode->v); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 253a6ae..68113a0 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -17,6 +17,23 @@ #include +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + +static bool jset_csum_good(struct bch_fs *c, struct jset *j) +{ + return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && + !bch2_crc_cmp(j->csum, + csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); +} + static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) { return (seq - c->journal_entries_base_seq) & (~0U >> 1); @@ -59,8 +76,7 @@ struct journal_list { */ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, struct journal_ptr entry_ptr, - struct journal_list *jlist, struct jset *j, - bool bad) + struct journal_list *jlist, struct jset *j) { struct genradix_iter iter; struct journal_replay **_i, *i, *dup; @@ -111,38 +127,53 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, */ dup = *_i; if (dup) { - if (dup->bad) { - /* we'll replace @dup: */ - } else if (bad) { + if (bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes)) { i = dup; goto found; - } else { - fsck_err_on(bytes != vstruct_bytes(&dup->j) || - memcmp(j, &dup->j, bytes), c, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); + } + + if (!entry_ptr.csum_good) { i = dup; goto found; } - } + if (!dup->csum_good) + goto replace; + + 
fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + i = dup; + goto found; + } +replace: i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) return -ENOMEM; - i->nr_ptrs = 0; - i->bad = bad; + i->nr_ptrs = 0; + i->csum_good = entry_ptr.csum_good; i->ignore = false; memcpy(&i->j, j, bytes); + i->ptrs[i->nr_ptrs++] = entry_ptr; if (dup) { - i->nr_ptrs = dup->nr_ptrs; - memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs)); + if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; + } + + /* The first ptr should represent the jset we kept: */ + memcpy(i->ptrs + i->nr_ptrs, + dup->ptrs, + sizeof(dup->ptrs[0]) * dup->nr_ptrs); + i->nr_ptrs += dup->nr_ptrs; __journal_replay_free(c, dup); } - *_i = i; + return 0; found: for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { if (ptr->dev == ca->dev_idx) { @@ -164,16 +195,6 @@ fsck_err: return ret; } -static struct nonce journal_nonce(const struct jset *jset) -{ - return (struct nonce) {{ - [0] = 0, - [1] = ((__le32 *) &jset->seq)[0], - [2] = ((__le32 *) &jset->seq)[1], - [3] = BCH_NONCE_JOURNAL, - }}; -} - /* this fills in a range with empty jset_entries: */ static void journal_entry_null_range(void *start, void *end) { @@ -715,12 +736,8 @@ fsck_err: static int jset_validate(struct bch_fs *c, struct bch_dev *ca, struct jset *jset, u64 sector, - unsigned bucket_sectors_left, - unsigned sectors_read, int write) { - size_t bytes = vstruct_bytes(jset); - struct bch_csum csum; unsigned version; int ret = 0; @@ -737,21 +754,7 @@ static int jset_validate(struct bch_fs *c, sector, le64_to_cpu(jset->seq), version)) { /* don't try to continue: */ - return EINVAL; - } - - if (bytes > (sectors_read << 9) && - sectors_read < bucket_sectors_left) - return JOURNAL_ENTRY_REREAD; - - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, - c, jset, NULL, - "%s sector %llu seq %llu: journal entry too big (%zu bytes)", - ca ? ca->name : c->name, - sector, le64_to_cpu(jset->seq), bytes)) { - ret = JOURNAL_ENTRY_BAD; - le32_add_cpu(&jset->u64s, - -((bytes - (bucket_sectors_left << 9)) / 8)); + return -EINVAL; } if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), @@ -759,28 +762,9 @@ static int jset_validate(struct bch_fs *c, "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), - JSET_CSUM_TYPE(jset))) { - ret = JOURNAL_ENTRY_BAD; - goto csum_done; - } - - if (write) - goto csum_done; - - csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), - c, jset, NULL, - "%s sector %llu seq %llu: journal checksum bad", - ca ? 
ca->name : c->name, - sector, le64_to_cpu(jset->seq))) + JSET_CSUM_TYPE(jset))) ret = JOURNAL_ENTRY_BAD; - ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - bch2_fs_fatal_err_on(ret, c, - "error decrypting journal entry: %i", ret); -csum_done: /* last_seq is ignored when JSET_NO_FLUSH is true */ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), @@ -791,16 +775,52 @@ csum_done: jset->last_seq = jset->seq; return JOURNAL_ENTRY_BAD; } + + ret = jset_validate_entries(c, jset, write); fsck_err: return ret; } -static int jset_validate_for_write(struct bch_fs *c, struct jset *jset) +static int jset_validate_early(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read) { - unsigned sectors = vstruct_sectors(jset, c->block_bits); + size_t bytes = vstruct_bytes(jset); + unsigned version; + int write = READ; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); + if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, + c, jset, NULL, + "%s sector %llu seq %llu: unknown journal entry version %u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), + version)) { + /* don't try to continue: */ + return -EINVAL; + } - return jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE) ?: - jset_validate_entries(c, jset, WRITE); + if (bytes > (sectors_read << 9) && + sectors_read < bucket_sectors_left) + return JOURNAL_ENTRY_REREAD; + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, + c, jset, NULL, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca ? 
ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) + le32_add_cpu(&jset->u64s, + -((bytes - (bucket_sectors_left << 9)) / 8)); +fsck_err: + return ret; } struct journal_read_buf { @@ -839,7 +859,7 @@ static int journal_read_bucket(struct bch_dev *ca, unsigned sectors, sectors_read = 0; u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), end = offset + ca->mi.bucket_size; - bool saw_bad = false; + bool saw_bad = false, csum_good; int ret = 0; pr_debug("reading %u", bucket); @@ -878,9 +898,8 @@ reread: j = buf->data; } - ret = jset_validate(c, ca, j, offset, - end - offset, sectors_read, - READ); + ret = jset_validate_early(c, ca, j, offset, + end - offset, sectors_read); switch (ret) { case 0: sectors = vstruct_sectors(j, c->block_bits); @@ -896,17 +915,13 @@ reread: case JOURNAL_ENTRY_NONE: if (!saw_bad) return 0; - sectors = block_sectors(c); - goto next_block; - case JOURNAL_ENTRY_BAD: - saw_bad = true; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading * again at next block boundary: */ sectors = block_sectors(c); - break; + goto next_block; default: return ret; } @@ -922,14 +937,25 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + csum_good = jset_csum_good(c, j); + if (!csum_good) + saw_bad = true; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), + j->encrypted_start, + vstruct_end(j) - (void *) j->encrypted_start); + bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret); + mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { + .csum_good = csum_good, .dev = ca->dev_idx, .bucket = bucket, .bucket_offset = offset - bucket_to_sector(ca, ja->buckets[bucket]), .sector = offset, - }, jlist, j, ret != 0); + }, jlist, j); mutex_unlock(&jlist->lock); switch (ret) { @@ -1128,6 +1154,19 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) *start_seq = le64_to_cpu(i->j.seq) + 1; if (!JSET_NO_FLUSH(&i->j)) { + int write = READ; + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), + c, &i->j, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) + i->j.last_seq = i->j.seq; + + pr_info("last flush %llu-%llu csum good %u", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq), + i->csum_good); + last_seq = le64_to_cpu(i->j.last_seq); *blacklist_seq = le64_to_cpu(i->j.seq) + 1; break; @@ -1231,7 +1270,21 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (!i || i->ignore) continue; - ret = jset_validate_entries(c, &i->j, READ); + for (ptr = 0; ptr < i->nr_ptrs; ptr++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + + if (!i->ptrs[ptr].csum_good) + printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n", + ca->name, i->ptrs[ptr].sector, + le64_to_cpu(i->j.seq), + i->csum_good ? 
" (had good copy on another device)" : ""); + } + + ret = jset_validate(c, + bch_dev_bkey_exists(c, i->ptrs[0].dev), + &i->j, + i->ptrs[0].sector, + READ); if (ret) goto err; @@ -1667,7 +1720,7 @@ void bch2_journal_write(struct closure *cl) validate_before_checksum = true; if (validate_before_checksum && - jset_validate_for_write(c, jset)) + jset_validate(c, NULL, jset, 0, WRITE)) goto err; ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), @@ -1681,7 +1734,7 @@ void bch2_journal_write(struct closure *cl) journal_nonce(jset), jset); if (!validate_before_checksum && - jset_validate_for_write(c, jset)) + jset_validate(c, NULL, jset, 0, WRITE)) goto err; sectors = vstruct_sectors(jset, c->block_bits); diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 1a91f2c..2f8bbf0 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -8,6 +8,7 @@ */ struct journal_replay { struct journal_ptr { + bool csum_good; u8 dev; u32 bucket; u32 bucket_offset; @@ -15,8 +16,7 @@ struct journal_replay { } ptrs[BCH_REPLICAS_MAX]; unsigned nr_ptrs; - /* checksum error, but we may want to try using it anyways: */ - bool bad; + bool csum_good; bool ignore; /* must be last: */ struct jset j; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 4f4dfaa..55fdaca 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -628,7 +628,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(&trans); ret = bch2_get_next_backpointer(&trans, bucket, gen, - &bp_offset, &bp); + &bp_offset, &bp, + BTREE_ITER_CACHED); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 35958c6..044eca8 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -162,7 +162,7 @@ static int bch2_copygc(struct bch_fs *c) bch2_moving_ctxt_exit(&ctxt); - if (ret < 0) + if (ret < 0 && ret != -EROFS) bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index c12d715..ad7130a 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -95,6 +95,113 @@ void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, #include #include +static void qc_info_to_text(struct printbuf *out, struct qc_info *i) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); + + prt_str(out, "i_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", i->i_fieldmask); + prt_newline(out); + + prt_str(out, "i_flags"); + prt_tab(out); + prt_printf(out, "%u", i->i_flags); + prt_newline(out); + + prt_str(out, "i_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_ino_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_timelimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_warnlimit); + prt_newline(out); + + prt_str(out, "i_ino_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_warnlimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_warnlimit); + prt_newline(out); +} + +static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); 
+ + prt_str(out, "d_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", q->d_fieldmask); + prt_newline(out); + + prt_str(out, "d_spc_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_hardlimit); + prt_newline(out); + + prt_str(out, "d_spc_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_softlimit); + prt_newline(out); + + prt_str(out, "d_ino_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_hardlimit); + prt_newline(out); + + prt_str(out, "d_ino_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_softlimit); + prt_newline(out); + + prt_str(out, "d_space"); + prt_tab(out); + prt_printf(out, "%llu", q->d_space); + prt_newline(out); + + prt_str(out, "d_ino_count"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_count); + prt_newline(out); + + prt_str(out, "d_ino_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_timer); + prt_newline(out); + + prt_str(out, "d_spc_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_timer); + prt_newline(out); + + prt_str(out, "d_ino_warns"); + prt_tab(out); + prt_printf(out, "%i", q->d_ino_warns); + prt_newline(out); + + prt_str(out, "d_spc_warns"); + prt_tab(out); + prt_printf(out, "%i", q->d_spc_warns); + prt_newline(out); +} + static inline unsigned __next_qtype(unsigned i, unsigned qtypes) { qtypes >>= i; @@ -413,6 +520,26 @@ void bch2_fs_quota_init(struct bch_fs *c) mutex_init(&c->quotas[i].lock); } +static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) +{ + struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb); + + if (sb_quota) + return sb_quota; + + sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64)); + if (sb_quota) { + unsigned qtype, qc; + + for (qtype = 0; qtype < QTYP_NR; qtype++) + for (qc = 0; qc < Q_COUNTERS; qc++) + sb_quota->q[qtype].c[qc].timelimit = + cpu_to_le32(7 * 24 * 60 * 60); + } + + return sb_quota; +} + static void bch2_sb_quota_read(struct bch_fs *c) { struct bch_sb_field_quota *sb_quota; @@ -471,12 +598,19 @@ advance: int bch2_fs_quota_read(struct bch_fs *c) { + struct bch_sb_field_quota *sb_quota; struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + mutex_unlock(&c->sb_lock); + return -BCH_ERR_ENOSPC_sb_quota; + } + bch2_sb_quota_read(c); mutex_unlock(&c->sb_lock); @@ -500,6 +634,8 @@ int bch2_fs_quota_read(struct bch_fs *c) static int bch2_quota_enable(struct super_block *sb, unsigned uflags) { struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + int ret = 0; if (sb->s_flags & SB_RDONLY) return -EROFS; @@ -519,6 +655,12 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) return -EINVAL; mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; + } + if (uflags & FS_QUOTA_UDQ_ENFD) SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); @@ -529,9 +671,10 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); bch2_write_super(c); +unlock: mutex_unlock(&c->sb_lock); - return 0; + return bch2_err_class(ret); } static int bch2_quota_disable(struct super_block *sb, unsigned uflags) @@ -643,6 +786,15 @@ static int bch2_quota_set_info(struct super_block *sb, int type, struct bch_fs *c = sb->s_fs_info; struct bch_sb_field_quota *sb_quota; struct bch_memquota_type *q; + int ret = 0; + + if (0) { + struct 
printbuf buf = PRINTBUF; + + qc_info_to_text(&buf, info); + pr_info("setting:\n%s", buf.buf); + printbuf_exit(&buf); + } if (sb->s_flags & SB_RDONLY) return -EROFS; @@ -660,12 +812,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type, q = &c->quotas[type]; mutex_lock(&c->sb_lock); - sb_quota = bch2_sb_get_quota(c->disk_sb.sb); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - sb_quota = bch2_sb_resize_quota(&c->disk_sb, - sizeof(*sb_quota) / sizeof(u64)); - if (!sb_quota) - return -BCH_ERR_ENOSPC_sb_quota; + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; } if (info->i_fieldmask & QC_SPC_TIMER) @@ -687,9 +837,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type, bch2_sb_quota_read(c); bch2_write_super(c); +unlock: mutex_unlock(&c->sb_lock); - return 0; + return bch2_err_class(ret); } /* Get/set individual quotas: */ @@ -794,6 +945,14 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, struct bkey_i_quota new_quota; int ret; + if (0) { + struct printbuf buf = PRINTBUF; + + qc_dqblk_to_text(&buf, qdq); + pr_info("setting:\n%s", buf.buf); + printbuf_exit(&buf); + } + if (sb->s_flags & SB_RDONLY) return -EROFS; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 9df0828..3f674bf 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -895,6 +895,12 @@ int bch2_fs_start(struct bch_fs *c) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { + mutex_lock(&c->btree_transaction_stats[i].lock); + bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); + mutex_unlock(&c->btree_transaction_stats[i].lock); + } + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? bch2_fs_recovery(c) : bch2_fs_initialize(c); diff --git a/libbcachefs/util.c b/libbcachefs/util.c index d191935..f08215a 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "eytzinger.h" #include "util.h" @@ -323,38 +324,44 @@ static void bch2_time_stats_update_one(struct time_stats *stats, { u64 duration, freq; - duration = time_after64(end, start) - ? end - start : 0; - freq = time_after64(end, stats->last_event) - ? end - stats->last_event : 0; - - stats->count++; - - stats->average_duration = stats->average_duration - ? ewma_add(stats->average_duration, duration, 6) - : duration; - - stats->average_frequency = stats->average_frequency - ? 
ewma_add(stats->average_frequency, freq, 6) - : freq; - - stats->max_duration = max(stats->max_duration, duration); - - stats->last_event = end; + if (time_after64(end, start)) { + duration = end - start; + stats->duration_stats = mean_and_variance_update(stats->duration_stats, + duration); + stats->duration_stats_weighted = mean_and_variance_weighted_update( + stats->duration_stats_weighted, + duration); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + bch2_quantiles_update(&stats->quantiles, duration); + } - bch2_quantiles_update(&stats->quantiles, duration); + if (time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq); + stats->freq_stats_weighted = mean_and_variance_weighted_update( + stats->freq_stats_weighted, + freq); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + stats->last_event = end; + } } void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) { unsigned long flags; + WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, + "time_stats: min_duration = %llu, min_freq = %llu", + stats->min_duration, stats->min_freq); + if (!stats->buffer) { spin_lock_irqsave(&stats->lock, flags); bch2_time_stats_update_one(stats, start, end); - if (stats->average_frequency < 32 && - stats->count > 1024) + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && + stats->duration_stats.n > 1024) stats->buffer = alloc_percpu_gfp(struct time_stat_buffer, GFP_ATOMIC); @@ -389,12 +396,15 @@ void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) static const struct time_unit { const char *name; - u32 nsecs; + u64 nsecs; } time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "sec", NSEC_PER_SEC }, + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", NSEC_PER_SEC * 60}, + { "h", NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, }; static const struct time_unit *pick_time_units(u64 ns) @@ -414,38 +424,117 @@ static void pr_time_units(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); - prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); + prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); + prt_tab_rjust(out); + prt_printf(out, "%s", u->name); +} + +#define TABSTOP_SIZE 12 + +static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) +{ + prt_printf(out, name); + prt_tab(out); + pr_time_units(out, ns); + prt_newline(out); } void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) { const struct time_unit *u; - u64 freq = READ_ONCE(stats->average_frequency); - u64 q, last_q = 0; + s64 f_mean = 0, d_mean = 0; + u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; int i; + /* + * avoid divide by zero + */ + if (stats->freq_stats.n) { + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } - prt_printf(out, "count:\t\t%llu", - stats->count); + printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); + prt_printf(out, "count:"); + prt_tab(out); + prt_printf(out, "%llu ", + stats->duration_stats.n); + printbuf_tabstop_pop(out); prt_newline(out); - prt_printf(out, "rate:\t\t%llu/sec", - freq 
? div64_u64(NSEC_PER_SEC, freq) : 0); + + printbuf_tabstops_reset(out); + + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + printbuf_tabstop_push(out, 0); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + + prt_tab(out); + prt_printf(out, "since mount"); + prt_tab_rjust(out); + prt_tab(out); + prt_printf(out, "recent"); + prt_tab_rjust(out); prt_newline(out); - prt_printf(out, "frequency:\t"); - pr_time_units(out, freq); + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE); + printbuf_tabstop_push(out, 2); + printbuf_tabstop_push(out, TABSTOP_SIZE); + prt_printf(out, "duration of events"); prt_newline(out); - prt_printf(out, "avg duration:\t"); - pr_time_units(out, stats->average_duration); + printbuf_indent_add(out, 2); + + pr_name_and_units(out, "min:", stats->min_duration); + pr_name_and_units(out, "max:", stats->max_duration); + prt_printf(out, "mean:"); + prt_tab(out); + pr_time_units(out, d_mean); + prt_tab(out); + pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); prt_newline(out); - prt_printf(out, "max duration:\t"); - pr_time_units(out, stats->max_duration); + + prt_printf(out, "stddev:"); + prt_tab(out); + pr_time_units(out, d_stddev); + prt_tab(out); + pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + + printbuf_indent_sub(out, 2); + prt_newline(out); + + prt_printf(out, "time between events"); + prt_newline(out); + printbuf_indent_add(out, 2); + + pr_name_and_units(out, "min:", stats->min_freq); + pr_name_and_units(out, "max:", stats->max_freq); + + prt_printf(out, "mean:"); + prt_tab(out); + pr_time_units(out, f_mean); + prt_tab(out); + pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + prt_newline(out); + + prt_printf(out, "stddev:"); + prt_tab(out); + pr_time_units(out, f_stddev); + prt_tab(out); + pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + + printbuf_indent_sub(out, 2); + prt_newline(out); + + printbuf_tabstops_reset(out); i = eytzinger0_first(NR_QUANTILES); u = pick_time_units(stats->quantiles.entries[i].m); - prt_newline(out); prt_printf(out, "quantiles (%s):\t", u->name); eytzinger0_for_each(i, NR_QUANTILES) { bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; @@ -467,6 +556,10 @@ void bch2_time_stats_exit(struct time_stats *stats) void bch2_time_stats_init(struct time_stats *stats) { memset(stats, 0, sizeof(*stats)); + stats->duration_stats_weighted.w = 8; + stats->freq_stats_weighted.w = 8; + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; spin_lock_init(&stats->lock); } diff --git a/libbcachefs/util.h b/libbcachefs/util.h index a7f68e1..846e602 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -18,6 +18,7 @@ #include #include #include +#include struct closure; @@ -380,14 +381,18 @@ struct time_stat_buffer { struct time_stats { spinlock_t lock; - u64 count; /* all fields are in nanoseconds */ - u64 average_duration; - u64 average_frequency; u64 max_duration; + u64 min_duration; + u64 max_freq; + u64 min_freq; u64 last_event; struct quantiles quantiles; + struct mean_and_variance duration_stats; + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance freq_stats; + struct mean_and_variance_weighted freq_stats_weighted; struct time_stat_buffer __percpu *buffer; }; diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 
6a5be6c..4fc1c3a 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -371,8 +371,10 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
 {
 	struct bch_inode_info *inode = to_bch_ei(vinode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	int ret;
 
-	return bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+	ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
+	return bch2_err_class(ret);
 }
 
 static int bch2_xattr_set_handler(const struct xattr_handler *handler,
@@ -384,11 +386,13 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
 	struct bch_inode_info *inode = to_bch_ei(vinode);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+	int ret;
 
-	return bch2_trans_do(c, NULL, NULL, 0,
+	ret = bch2_trans_do(c, NULL, NULL, 0,
 			bch2_xattr_set(&trans, inode_inum(inode), &hash,
 				       name, value, size,
 				       handler->flags, flags));
+	return bch2_err_class(ret);
 }
 
 static const struct xattr_handler bch_xattr_user_handler = {
diff --git a/linux/int_sqrt.c b/linux/int_sqrt.c
new file mode 100644
index 0000000..a8170bb
--- /dev/null
+++ b/linux/int_sqrt.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com>
+ *
+ *  Based on the shift-and-subtract algorithm for computing integer
+ *  square root from Guy L. Steele.
+ */
+
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+
+/**
+ * int_sqrt - computes the integer square root
+ * @x: integer of which to calculate the sqrt
+ *
+ * Computes: floor(sqrt(x))
+ */
+unsigned long int_sqrt(unsigned long x)
+{
+	unsigned long b, m, y = 0;
+
+	if (x <= 1)
+		return x;
+
+	m = 1UL << (__fls(x) & ~1UL);
+	while (m != 0) {
+		b = y + m;
+		y >>= 1;
+
+		if (x >= b) {
+			x -= b;
+			y += m;
+		}
+		m >>= 2;
+	}
+
+	return y;
+}
+EXPORT_SYMBOL(int_sqrt);
+
+#if BITS_PER_LONG < 64
+/**
+ * int_sqrt64 - strongly typed int_sqrt function when minimum 64 bit input
+ * is expected.
+ * @x: 64bit integer of which to calculate the sqrt
+ */
+u32 int_sqrt64(u64 x)
+{
+	u64 b, m, y = 0;
+
+	if (x <= ULONG_MAX)
+		return int_sqrt((unsigned long) x);
+
+	m = 1ULL << ((fls64(x) - 1) & ~1ULL);
+	while (m != 0) {
+		b = y + m;
+		y >>= 1;
+
+		if (x >= b) {
+			x -= b;
+			y += m;
+		}
+		m >>= 2;
+	}
+
+	return y;
+}
+EXPORT_SYMBOL(int_sqrt64);
+#endif
diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c
new file mode 100644
index 0000000..643e311
--- /dev/null
+++ b/linux/mean_and_variance.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Functions for incremental mean and variance.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Copyright © 2022 Daniel B. Hill
+ *
+ * Author: Daniel B. Hill
+ *
+ * Description:
+ *
+ * This includes some incremental algorithms for mean and variance calculation.
+ *
+ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf
+ *
+ * Create a struct, and if it is the weighted variant, set the w field (weight = 2^k).
+ *
+ * Use mean_and_variance[_weighted]_update() on the struct to update its state.
+ *
+ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance; some computation
+ * is deferred to these functions for performance reasons.
+ *
+ * see lib/math/mean_and_variance_test.c for examples of usage.
+ *
+ * DO NOT access the mean and variance fields of the weighted variants directly.
+ * DO NOT change the weight after calling update.
+ */
+
+#include <linux/bug.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/limits.h>
+#include <linux/math.h>
+#include <linux/math64.h>
+#include <linux/mean_and_variance.h>
+#include <linux/module.h>
+#include <linux/printbuf.h>
+
+
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
+ */
+inline s64 fast_divpow2(s64 n, u8 d)
+{
+	return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
+
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1
+ * and return it.
+ * @s1: the mean_and_variance to update.
+ * @v1: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
+struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1)
+{
+	struct mean_and_variance s2;
+	u64 v2 = abs(v1);
+
+	s2.n           = s1.n + 1;
+	s2.sum         = s1.sum + v1;
+	s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2));
+	return s2;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_update);
+
+/**
+ * mean_and_variance_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_get_mean(struct mean_and_variance s)
+{
+	return div64_u64(s.sum, s.n);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
+
+/**
+ * mean_and_variance_get_variance() - get variance from @s1
+ *
+ * see linked pdf equation 12.
+ */
+u64 mean_and_variance_get_variance(struct mean_and_variance s1)
+{
+	u128 s2 = u128_div(s1.sum_squares, s1.n);
+	u64  s3 = abs(mean_and_variance_get_mean(s1));
+
+	return u128_to_u64(u128_sub(s2, u128_square(s3)));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
+
+/**
+ * mean_and_variance_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_get_stddev(struct mean_and_variance s)
+{
+	return int_sqrt64(mean_and_variance_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
+
+/**
+ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
+ * @s1: the mean_and_variance_weighted to update.
+ * @x:  the new sample.
+ *
+ * see linked pdf: function derived from equations 140-143 where alpha = 2^-w.
+ * values are stored bitshifted for performance and added precision.
+ */
+struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1,
+								    s64 x)
+{
+	struct mean_and_variance_weighted s2;
+	// previous weighted variance.
+	u64 var_w0 = s1.variance;
+	u8 w       = s2.w = s1.w;
+	// new value weighted.
+	s64 x_w    = x << w;
+	s64 diff_w = x_w - s1.mean;
+	s64 diff   = fast_divpow2(diff_w, w);
+	// new mean weighted.
+	s64 u_w1   = s1.mean + diff;
+
+	BUG_ON(w % 2 != 0);
+
+	if (!s1.init) {
+		s2.mean = x_w;
+		s2.variance = 0;
+	} else {
+		s2.mean = u_w1;
+		s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+	}
+	s2.init = true;
+
+	return s2;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
+
+/**
+ * mean_and_variance_weighted_get_mean() - get mean from @s
+ */
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+{
+	return fast_divpow2(s.mean, s.w);
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
+
+/**
+ * mean_and_variance_weighted_get_variance() - get variance from @s
+ */
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+{
+	// always positive; no need for fast_divpow2
+	return s.variance >> s.w;
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
+
+/**
+ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ */
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+{
+	return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+}
+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
+
+MODULE_AUTHOR("Daniel B. Hill");
+MODULE_LICENSE("GPL");
diff --git a/linux/six.c b/linux/six.c
index b11660a..39f7ea7 100644
--- a/linux/six.c
+++ b/linux/six.c
@@ -148,6 +148,14 @@ static int __do_six_trylock_type(struct six_lock *lock,
 		atomic64_add(__SIX_VAL(write_locking, 1),
 			     &lock->state.counter);
 		smp_mb__after_atomic();
+	} else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) {
+		atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write),
+			     &lock->state.counter);
+		/*
+		 * pairs with barrier after unlock and before checking
+		 * for readers in unlock path
+		 */
+		smp_mb__after_atomic();
 	}
 
 	ret = !pcpu_read_count(lock);
@@ -162,9 +170,6 @@ if (ret || try)
 	if (ret || try)
 		v -= __SIX_VAL(write_locking, 1);
 
-	if (!ret && !try && !(lock->state.waiters & (1 << SIX_LOCK_write)))
-		v += __SIX_VAL(waiters, 1 << SIX_LOCK_write);
-
 	if (try && !ret) {
 		old.v = atomic64_add_return(v, &lock->state.counter);
 		if (old.waiters & (1 << SIX_LOCK_read))
--
2.39.2
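
Editor's notes on the more novel pieces of this update follow; none of this text is part of the patch.

On the u128 fallback in include/linux/mean_and_variance.h: architectures without a native 128-bit type build addition and subtraction from two u64 halves, recovering the carry or borrow by comparing the low words after the operation. A minimal userspace sketch of that trick (standalone stdint types here; the patch uses kernel u64):

  #include <stdint.h>
  #include <stdio.h>

  typedef struct { uint64_t hi, lo; } u128_t;

  static u128_t u128_add(u128_t a, u128_t b)
  {
          u128_t c;

          c.lo = a.lo + b.lo;
          /* (c.lo < a.lo) is 1 exactly when the low-word add wrapped,
           * i.e. when a carry must propagate into the high word */
          c.hi = a.hi + b.hi + (c.lo < a.lo);
          return c;
  }

  int main(void)
  {
          u128_t a = { .hi = 0, .lo = UINT64_MAX };
          u128_t s = u128_add(a, (u128_t){ .hi = 0, .lo = 1 });

          /* prints "1 0": UINT64_MAX + 1 = 2^64 */
          printf("%llu %llu\n", (unsigned long long)s.hi,
                                (unsigned long long)s.lo);
          return 0;
  }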
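
On the unweighted statistics that now back bcachefs time stats: the struct keeps n, sum(x), and sum(x^2), and derives variance = E[x^2] - E[x]^2 on demand (equation 12 of the linked paper). A userspace sketch of the same scheme, with GCC/Clang __int128 standing in for the header's u128:

  #include <stdio.h>
  #include <stdint.h>

  struct mv {
          int64_t n, sum;
          unsigned __int128 sum_squares;
  };

  static struct mv mv_update(struct mv s, int64_t v)
  {
          uint64_t a = v < 0 ? -v : v;    /* |v|, as the patch does */

          s.n++;
          s.sum += v;
          s.sum_squares += (unsigned __int128)a * a;
          return s;
  }

  static int64_t mv_mean(struct mv s) { return s.sum / s.n; }

  static uint64_t mv_variance(struct mv s)
  {
          int64_t m = mv_mean(s);

          /* E[x^2] - E[x]^2, population variance */
          return (uint64_t)(s.sum_squares / s.n) - (uint64_t)(m * m);
  }

  int main(void)
  {
          struct mv s = { 0 };
          int64_t samples[] = { 10, 20, 30, 40, 50 };

          for (int i = 0; i < 5; i++)
                  s = mv_update(s, samples[i]);

          /* prints mean=30 variance=200 */
          printf("mean=%lld variance=%llu\n",
                 (long long)mv_mean(s), (unsigned long long)mv_variance(s));
          return 0;
  }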
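
On the weighted variant: the mean is stored pre-shifted left by w bits, so each update is only adds and shifts (alpha = 2^-w), and fast_divpow2() supplies the matching truncating divide for the signed, shifted values when reading the result back. A sketch of that fixed-point trick; w = 3 is chosen here for readability (the patch's time stats use w = 8), and arithmetic right shift of negative values is assumed, as the kernel does:

  #include <stdio.h>
  #include <stdint.h>

  static int64_t fast_divpow2(int64_t n, uint8_t d)
  {
          /* rounds toward zero, matching the patch */
          return (n + (n < 0 ? (1 << d) - 1 : 0)) >> d;
  }

  int main(void)
  {
          const uint8_t w = 3;
          int64_t mean_w = 100 << w;      /* stored mean, pre-shifted by w */
          int64_t samples[] = { 100, 100, 180, 100 };

          for (int i = 0; i < 4; i++) {
                  int64_t x_w = samples[i] << w;

                  /* EWMA step: move 1/2^w of the way toward the sample */
                  mean_w += fast_divpow2(x_w - mean_w, w);
                  printf("after %lld: mean = %lld\n",
                         (long long)samples[i],
                         (long long)fast_divpow2(mean_w, w));
          }
          return 0;
  }

After the one far-off sample (180) the mean moves only an eighth of the way toward it, which is exactly the smoothing the "recent" column in the new time-stats output relies on.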
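
On linux/int_sqrt.c: the imported shift-and-subtract square root consumes two bits of the radicand per iteration, starting from the highest even bit position. A standalone check, with __builtin_clzll standing in for the kernel's __fls() and a 64-bit unsigned long assumed:

  #include <stdio.h>

  static unsigned long int_sqrt(unsigned long x)
  {
          unsigned long b, m, y = 0;

          if (x <= 1)
                  return x;

          /* highest even bit position <= msb(x) */
          m = 1UL << ((63 - __builtin_clzll(x)) & ~1UL);
          while (m != 0) {
                  b = y + m;
                  y >>= 1;

                  if (x >= b) {
                          x -= b;
                          y += m;
                  }
                  m >>= 2;
          }
          return y;
  }

  int main(void)
  {
          /* prints "0 9 10": floor of the square roots */
          printf("%lu %lu %lu\n", int_sqrt(0), int_sqrt(99), int_sqrt(100));
          return 0;
  }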
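
On the btree_locking.c rewrite: break_cycle() now ranks every transaction in a detected lock cycle with btree_trans_abort_preference() (0 = may not fail, up to 3 = in traverse_all) and restarts the highest-ranked one, instead of always aborting the caller. The shape of that selection, reduced to a standalone sketch with made-up transaction flags (the real code walks a lock_graph of trans_waiting_for_lock entries):

  #include <stdio.h>

  struct trans {
          int lock_may_not_fail;
          int wants_write_lock;
          int in_traverse_all;
  };

  static int abort_preference(const struct trans *t)
  {
          if (t->lock_may_not_fail)
                  return 0;       /* never abort this one */
          if (t->wants_write_lock)
                  return 1;       /* cheap to restart */
          if (!t->in_traverse_all)
                  return 2;
          return 3;               /* best candidate to abort */
  }

  int main(void)
  {
          struct trans cycle[] = {
                  { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 },
          };
          int best = 0, victim = -1;

          /* argmax over the cycle, as in break_cycle() */
          for (int i = 0; i < 3; i++) {
                  int pref = abort_preference(&cycle[i]);

                  if (pref > best) {
                          best = pref;
                          victim = i;
                  }
          }

          /* prints "abort transaction 2 (pref 3)" */
          printf("abort transaction %d (pref %d)\n", victim, best);
          return 0;
  }

If every member of the cycle reports preference 0, nothing may be aborted, which is why the patch keeps the BUG() path with the full cycle printout.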