From: Kent Overstreet
Date: Thu, 25 May 2023 21:52:28 +0000 (-0400)
Subject: Update bcachefs sources to 31c09369cd six locks: Fix an uninitialized var
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=1f78fed4693a5361f56508daac59bebd5b556379;p=bcachefs-tools-debian

Update bcachefs sources to 31c09369cd six locks: Fix an uninitialized var

Signed-off-by: Kent Overstreet
---

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 1f415ca..1d85f95 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-799716df00709f7480f575e8fd626915bafba006
+31c09369cd01b34fb8ba845fa09776576b03a1e2
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index a9852fa..79cf5aa 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -32,6 +32,8 @@ typedef struct {
 #define __ATOMIC_SUB(v, p)		uatomic_sub(p, v)
 #define __ATOMIC_INC(p)			uatomic_inc(p)
 #define __ATOMIC_DEC(p)			uatomic_dec(p)
+#define __ATOMIC_AND(v, p)		uatomic_and(p, v)
+#define __ATOMIC_OR(v, p)		uatomic_or(p, v)
 
 #define xchg(p, v)			uatomic_xchg(p, v)
 #define xchg_acquire(p, v)		uatomic_xchg(p, v)
@@ -56,6 +58,8 @@ typedef struct {
 #define __ATOMIC_SUB_RETURN(v, p)	__atomic_sub_fetch(p, v, __ATOMIC_RELAXED)
 #define __ATOMIC_SUB_RETURN_RELEASE(v, p)				\
 					__atomic_sub_fetch(p, v, __ATOMIC_RELEASE)
+#define __ATOMIC_AND(v, p)		__atomic_and_fetch(p, v, __ATOMIC_RELAXED)
+#define __ATOMIC_OR(v, p)		__atomic_or_fetch(p, v, __ATOMIC_RELAXED)
 
 #define xchg(p, v)			__atomic_exchange_n(p, v, __ATOMIC_SEQ_CST)
 #define xchg_acquire(p, v)		__atomic_exchange_n(p, v, __ATOMIC_ACQUIRE)
@@ -244,6 +248,16 @@ static inline bool a_type##_inc_not_zero(a_type##_t *v)	\
	return a_type##_add_unless(v, 1, 0);				\
 }									\
									\
+static inline void a_type##_and(i_type a, a_type##_t *v)		\
+{									\
+	__ATOMIC_AND(a, v);						\
+}									\
+									\
+static inline void a_type##_or(i_type a, a_type##_t *v)			\
+{									\
+	__ATOMIC_OR(a, v);						\
+}									\
+									\
 static inline i_type a_type##_xchg(a_type##_t *v, i_type i)		\
 {									\
	return xchg(&v->counter, i);					\
diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h
index 756eb3d..9ed79f4 100644
--- a/include/linux/mean_and_variance.h
+++ b/include/linux/mean_and_variance.h
@@ -2,122 +2,112 @@
 #ifndef MEAN_AND_VARIANCE_H_
 #define MEAN_AND_VARIANCE_H_
 
-#include
 #include
+#include
 #include
 #include
+#include
 
 #define SQRT_U64_MAX 4294967295ULL
 
-/**
- * abs - return absolute value of an argument
- * @x: the value.  If it is unsigned type, it is converted to signed type first.
- *     char is treated as if it was signed (regardless of whether it really is)
- *     but the macro's return type is preserved as char.
- *
- * Return: an absolute value of x.
+/*
+ * u128_u: u128 user mode, because not all architectures support a real int128
+ * type
 */
-#define abs(x)	__abs_choose_expr(x, long long,				\
-		__abs_choose_expr(x, long,				\
-		__abs_choose_expr(x, int,				\
-		__abs_choose_expr(x, short,				\
-		__abs_choose_expr(x, char,				\
-		__builtin_choose_expr(					\
-			__builtin_types_compatible_p(typeof(x), char),	\
-			(char)({ signed char __x = (x); __x<0?-__x:__x; }), \
-			((void)0)))))))
-#define __abs_choose_expr(x, type, other) __builtin_choose_expr(	\
-	__builtin_types_compatible_p(typeof(x), signed type) ||	\
-	__builtin_types_compatible_p(typeof(x), unsigned type),	\
-	({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
+#ifdef __SIZEOF_INT128__
 
-#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
-
-typedef unsigned __int128 u128;
+typedef struct {
+	unsigned __int128 v;
+} __aligned(16) u128_u;
 
-static inline u128 u64_to_u128(u64 a)
+static inline u128_u u64_to_u128(u64 a)
 {
-	return (u128)a;
+	return (u128_u) { .v = a };
 }
 
-static inline u64 u128_to_u64(u128 a)
+static inline u64 u128_lo(u128_u a)
 {
-	return (u64)a;
+	return a.v;
 }
 
-static inline u64 u128_shr64_to_u64(u128 a)
+static inline u64 u128_hi(u128_u a)
 {
-	return (u64)(a >> 64);
+	return a.v >> 64;
 }
 
-static inline u128 u128_add(u128 a, u128 b)
+static inline u128_u u128_add(u128_u a, u128_u b)
 {
-	return a + b;
+	a.v += b.v;
+	return a;
 }
 
-static inline u128 u128_sub(u128 a, u128 b)
+static inline u128_u u128_sub(u128_u a, u128_u b)
 {
-	return a - b;
+	a.v -= b.v;
+	return a;
 }
 
-static inline u128 u128_shl(u128 i, s8 shift)
+static inline u128_u u128_shl(u128_u a, s8 shift)
 {
-	return i << shift;
+	a.v <<= shift;
+	return a;
 }
 
-static inline u128 u128_shl64_add(u64 a, u64 b)
+static inline u128_u u128_square(u64 a)
 {
-	return ((u128)a << 64) + b;
-}
+	u128_u b = u64_to_u128(a);
 
-static inline u128 u128_square(u64 i)
-{
-	return i*i;
+	b.v *= b.v;
+	return b;
 }
 
 #else
 
 typedef struct {
	u64 hi, lo;
-} u128;
+} __aligned(16) u128_u;
+
+/* conversions */
 
-static inline u128 u64_to_u128(u64 a)
+static inline u128_u u64_to_u128(u64 a)
 {
-	return (u128){ .lo = a };
+	return (u128_u) { .lo = a };
 }
 
-static inline u64 u128_to_u64(u128 a)
+static inline u64 u128_lo(u128_u a)
 {
	return a.lo;
 }
 
-static inline u64 u128_shr64_to_u64(u128 a)
+static inline u64 u128_hi(u128_u a)
 {
	return a.hi;
 }
 
-static inline u128 u128_add(u128 a, u128 b)
+/* arithmetic */
+
+static inline u128_u u128_add(u128_u a, u128_u b)
 {
-	u128 c;
+	u128_u c;
 
	c.lo = a.lo + b.lo;
	c.hi = a.hi + b.hi + (c.lo < a.lo);
	return c;
 }
 
-static inline u128 u128_sub(u128 a, u128 b)
+static inline u128_u u128_sub(u128_u a, u128_u b)
 {
-	u128 c;
+	u128_u c;
 
	c.lo = a.lo - b.lo;
	c.hi = a.hi - b.hi - (c.lo > a.lo);
	return c;
 }
 
-static inline u128 u128_shl(u128 i, s8 shift)
+static inline u128_u u128_shl(u128_u i, s8 shift)
 {
-	u128 r;
+	u128_u r;
 
	r.lo = i.lo << shift;
 
	if (shift < 64)
@@ -129,15 +119,10 @@ static inline u128 u128_shl(u128 i, s8 shift)
	return r;
 }
 
-static inline u128 u128_shl64_add(u64 a, u64 b)
+static inline u128_u u128_square(u64 i)
 {
-	return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b));
-}
-
-static inline u128 u128_square(u64 i)
-{
-	u128 r;
-	u64 h = i >> 32, l = i & (u64)U32_MAX;
+	u128_u r;
+	u64 h = i >> 32, l = i & U32_MAX;
 
	r = u128_shl(u64_to_u128(h*h), 64);
	r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
@@ -148,85 +133,69 @@
 
 #endif
 
-static inline u128 u128_div(u128 n, u64 d)
+static inline u128_u u64s_to_u128(u64 hi, u64 lo)
 {
-	u128 r;
-	u64 rem;
-	u64 hi = u128_shr64_to_u64(n);
-	u64 lo = u128_to_u64(n);
-	u64 h = hi & ((u64)U32_MAX << 32);
-	u64 l = (hi & (u64)U32_MAX) << 32;
+	u128_u c = u64_to_u128(hi);
 
-	r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
-	r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
-	r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
-	return r;
+	c = u128_shl(c, 64);
+	c = u128_add(c, u64_to_u128(lo));
+	return c;
 }
 
+u128_u u128_div(u128_u n, u64 d);
+
 struct mean_and_variance {
-	s64 n;
-	s64 sum;
-	u128 sum_squares;
+	s64		n;
+	s64		sum;
+	u128_u		sum_squares;
 };
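The representation above makes updates O(1): mean and variance fall straight
out of (n, sum, sum_squares). A minimal user-space sketch of the same idea,
using a native unsigned __int128 in place of u128_u (the mv_* names are
illustrative only, not part of this patch):

	#include <stdint.h>

	struct mv {
		int64_t			n, sum;
		unsigned __int128	sum_squares;
	};

	static void mv_update(struct mv *s, int64_t v)
	{
		uint64_t a = v < 0 ? -v : v;	/* abs(v), as in mean_and_variance_update() */

		s->n++;
		s->sum += v;
		s->sum_squares += (unsigned __int128) a * a;
	}

	/* variance = E[x^2] - E[x]^2; compare mean_and_variance_get_variance(). Assumes n > 0. */
	static uint64_t mv_variance(const struct mv *s)
	{
		int64_t mean = s->sum / s->n;
		uint64_t m = mean < 0 ? -mean : mean;

		return (uint64_t) (s->sum_squares / (uint64_t) s->n - (unsigned __int128) m * m);
	}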
 
 /* exponentially weighted variant */
 struct mean_and_variance_weighted {
-	bool init;
-	u8 w;
-	s64 mean;
-	u64 variance;
+	bool		init;
+	u8		weight;		/* base 2 logarithm */
+	s64		mean;
+	u64		variance;
 };
 
-s64 fast_divpow2(s64 n, u8 d);
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
+ */
+static inline s64 fast_divpow2(s64 n, u8 d)
+{
+	return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
 
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s with a new
+ * sample @v and return it.
+ * @s: the mean_and_variance to update.
+ * @v: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
 static inline struct mean_and_variance
-mean_and_variance_update_inlined(struct mean_and_variance s1, s64 v1)
-{
-	struct mean_and_variance s2;
-	u64 v2 = abs(v1);
-
-	s2.n		= s1.n + 1;
-	s2.sum		= s1.sum + v1;
-	s2.sum_squares	= u128_add(s1.sum_squares, u128_square(v2));
-	return s2;
-}
-
-static inline struct mean_and_variance_weighted
-mean_and_variance_weighted_update_inlined(struct mean_and_variance_weighted s1, s64 x)
-{
-	struct mean_and_variance_weighted s2;
-	// previous weighted variance.
-	u64 var_w0 = s1.variance;
-	u8 w = s2.w = s1.w;
-	// new value weighted.
-	s64 x_w = x << w;
-	s64 diff_w = x_w - s1.mean;
-	s64 diff = fast_divpow2(diff_w, w);
-	// new mean weighted.
-	s64 u_w1 = s1.mean + diff;
-
-	BUG_ON(w % 2 != 0);
-
-	if (!s1.init) {
-		s2.mean = x_w;
-		s2.variance = 0;
-	} else {
-		s2.mean = u_w1;
-		s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
-	}
-	s2.init = true;
-
-	return s2;
+mean_and_variance_update(struct mean_and_variance s, s64 v)
+{
+	return (struct mean_and_variance) {
+		.n		= s.n + 1,
+		.sum		= s.sum + v,
+		.sum_squares	= u128_add(s.sum_squares, u128_square(abs(v))),
+	};
 }
 
-struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1);
-  s64 mean_and_variance_get_mean(struct mean_and_variance s);
-  u64 mean_and_variance_get_variance(struct mean_and_variance s1);
-  u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+s64 mean_and_variance_get_mean(struct mean_and_variance s);
+u64 mean_and_variance_get_variance(struct mean_and_variance s1);
+u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
 
-struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1);
-  s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
-  u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
-  u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
 
 #endif // MEAN_AND_VARIANCE_H_
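Before moving on, the rounding behaviour of fast_divpow2() deserves a worked
example: the bias term only fires for negative numerators, which is what makes
the arithmetic shift round toward zero like C integer division. A standalone
sketch of the same formula:

	#include <assert.h>
	#include <stdint.h>

	static inline int64_t fast_divpow2(int64_t n, uint8_t d)
	{
		return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
	}

	int main(void)
	{
		assert(fast_divpow2( 7, 1) ==  3);	/* 7 / 2; no bias for positive n */
		assert(fast_divpow2(-7, 1) == -3);	/* (-7 + 1) >> 1; a plain shift would give -4 */
		assert(fast_divpow2(-8, 2) == -2);	/* (-8 + 3) >> 2; exact division is unaffected */
		return 0;
	}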
diff --git a/include/linux/six.h b/include/linux/six.h
index 83023f6..394da42 100644
--- a/include/linux/six.h
+++ b/include/linux/six.h
@@ -3,59 +3,124 @@
 #ifndef _LINUX_SIX_H
 #define _LINUX_SIX_H
 
-/*
- * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
- * semaphores, except with a third intermediate state, intent. Basic operations
- * are:
+/**
+ * DOC: SIX locks overview
 *
- * six_lock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
+ * but with an additional state: read/shared, intent, exclusive/write
 *
- * six_lock_intent(&foo->lock);
- * six_unlock_intent(&foo->lock);
+ * The purpose of the intent state is to allow for greater concurrency on tree
+ * structures without deadlocking. In general, a read can't be upgraded to a
+ * write lock without deadlocking, so an operation that updates multiple nodes
+ * will have to take write locks for the full duration of the operation.
 *
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
+ * But by adding an intent state, which is exclusive with other intent locks but
+ * not with readers, we can take intent locks at the start of the operation,
+ * and then take write locks only for the actual update to each individual
+ * node, without deadlocking.
 *
- * Intent locks block other intent locks, but do not block read locks, and you
- * must have an intent lock held before taking a write lock, like so:
+ * Example usage:
+ *   six_lock_read(&foo->lock);
+ *   six_unlock_read(&foo->lock);
 *
- * six_lock_intent(&foo->lock);
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
- * six_unlock_intent(&foo->lock);
+ * An intent lock must be held before taking a write lock:
+ *   six_lock_intent(&foo->lock);
+ *   six_lock_write(&foo->lock);
+ *   six_unlock_write(&foo->lock);
+ *   six_unlock_intent(&foo->lock);
 *
 * Other operations:
- *
 *   six_trylock_read()
 *   six_trylock_intent()
 *   six_trylock_write()
 *
- * six_lock_downgrade(): convert from intent to read
- * six_lock_tryupgrade(): attempt to convert from read to intent
- *
- * Locks also embed a sequence number, which is incremented when the lock is
- * locked or unlocked for write. The current sequence number can be grabbed
- * while a lock is held from lock->state.seq; then, if you drop the lock you can
- * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock
- * iff it hasn't been locked for write in the meantime.
- *
- * There are also operations that take the lock type as a parameter, where the
- * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
- *
- *   six_lock_type(lock, type)
- *   six_unlock_type(lock, type)
- *   six_relock(lock, type, seq)
- *   six_trylock_type(lock, type)
- *   six_trylock_convert(lock, from, to)
- *
- * A lock may be held multiple times by the same thread (for read or intent,
- * not write). However, the six locks code does _not_ implement the actual
- * recursive checks itself though - rather, if your code (e.g. btree iterator
- * code) knows that the current thread already has a lock held, and for the
- * correct type, six_lock_increment() may be used to bump up the counter for
- * that type - the only effect is that one more call to unlock will be required
- * before the lock is unlocked.
+ *   six_lock_downgrade()	convert from intent to read
+ *   six_lock_tryupgrade()	attempt to convert from read to intent, may fail
+ *
+ * There are also interfaces that take the lock type as an enum:
+ *
+ *   six_lock_type(&foo->lock, SIX_LOCK_read);
+ *   six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
+ *   six_lock_type(&foo->lock, SIX_LOCK_write);
+ *   six_unlock_type(&foo->lock, SIX_LOCK_write);
+ *   six_unlock_type(&foo->lock, SIX_LOCK_intent);
+ *
+ * Lock sequence numbers - unlock(), relock():
+ *
+ * Locks embed sequence numbers, which are incremented on write lock/unlock.
+ * This allows locks to be dropped and then retaken iff the state they protect
+ * hasn't changed; this makes it much easier to avoid holding locks while e.g.
+ * doing IO or allocating memory.
+ *
+ * Example usage:
+ *   six_lock_read(&foo->lock);
+ *   u32 seq = six_lock_seq(&foo->lock);
+ *   six_unlock_read(&foo->lock);
+ *
+ *   some_operation_that_may_block();
+ *
+ *   if (six_relock_read(&foo->lock, seq)) { ... }
+ *
+ * If the relock operation succeeds, it is as if the lock was never unlocked.
+ *
+ * Reentrancy:
+ *
+ * Six locks are not by themselves reentrant, but have counters for both the
+ * read and intent states that can be used to provide reentrancy by an upper
+ * layer that tracks held locks. If a lock is known to already be held in the
+ * read or intent state, six_lock_increment() can be used to bump the "lock
+ * held in this state" counter, increasing the number of unlock calls that
+ * will be required to fully unlock it.
+ *
+ * Example usage:
+ *   six_lock_read(&foo->lock);
+ *   six_lock_increment(&foo->lock, SIX_LOCK_read);
+ *   six_unlock_read(&foo->lock);
+ *   six_unlock_read(&foo->lock);
+ *   foo->lock is now fully unlocked.
+ *
+ * Since the intent state supersedes read, it's legal to increment the read
+ * counter when holding an intent lock, but not the reverse.
+ *
+ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
+ * is not legal.
+ *
+ * should_sleep_fn:
+ *
+ * There is a six_lock() variant that takes a function pointer that is called
+ * immediately prior to schedule() when blocking, and may return an error to
+ * abort.
+ *
+ * One possible use for this feature is when objects being locked are part of
+ * a cache and may be reused, and lock ordering is based on a property of the
+ * object that will change when the object is reused - i.e. logical key order.
+ *
+ * If looking up an object in the cache may race with object reuse, and lock
+ * ordering is required to prevent deadlock, object reuse may change the
+ * correct lock order for that object and cause a deadlock. should_sleep_fn
+ * can be used to check if the object is still the object we want and avoid
+ * this deadlock.
+ *
+ * Wait list entry interface:
+ *
+ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
+ * wait list entry. By embedding six_lock_waiter into another object, and by
+ * traversing lock waitlists, it is then possible for an upper layer to
+ * implement full cycle detection for deadlock avoidance.
+ *
+ * should_sleep_fn should be used for invoking the cycle detector, walking the
+ * graph of held locks to check for a deadlock. The upper layer must track
+ * held locks for each thread, and each thread's held locks must be reachable
+ * from its six_lock_waiter object.
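+ *
+ * For illustration, a should_sleep_fn that invokes such a cycle detector
+ * might look like this (a sketch; my_trans and my_check_for_cycle() are
+ * hypothetical upper-layer names, not part of this API). Returning an
+ * error here aborts the lock attempt instead of sleeping into a cycle:
+ *
+ *   static int my_should_sleep(struct six_lock *lock, void *p)
+ *   {
+ *           struct my_trans *trans = p;
+ *
+ *           return my_check_for_cycle(trans) ? -EDEADLK : 0;
+ *   }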
+ *
+ * six_lock_waiter() will add the wait object to the waitlist before retrying
+ * the lock and before calling should_sleep_fn, and the wait object will not
+ * be removed from the waitlist until either the lock has been successfully
+ * acquired, or we aborted because should_sleep_fn returned an error.
+ *
+ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
+ * have timestamps in strictly ascending order - this is so the timestamp can
+ * be used as a cursor for lock graph traversal.
 */
 
 #include
@@ -63,41 +128,6 @@
 #include
 #include
 
-#define SIX_LOCK_SEPARATE_LOCKFNS
-
-union six_lock_state {
-	struct {
-		atomic64_t	counter;
-	};
-
-	struct {
-		u64		v;
-	};
-
-	struct {
-		/* for waitlist_bitnr() */
-		unsigned long	l;
-	};
-
-	struct {
-		unsigned	read_lock:26;
-		unsigned	write_locking:1;
-		unsigned	intent_lock:1;
-		unsigned	nospin:1;
-		unsigned	waiters:3;
-		/*
-		 * seq works much like in seqlocks: it's incremented every time
-		 * we lock and unlock for write.
-		 *
-		 * If it's odd write lock is held, even unlocked.
-		 *
-		 * Thus readers can unlock, and then lock again later iff it
-		 * hasn't been modified in the meantime.
-		 */
-		u32		seq;
-	};
-};
-
 enum six_lock_type {
	SIX_LOCK_read,
	SIX_LOCK_intent,
@@ -105,7 +135,8 @@ enum six_lock_type {
 };
 
 struct six_lock {
-	union six_lock_state	state;
+	atomic_t		state;
+	u32			seq;
	unsigned		intent_lock_recurse;
	struct task_struct	*owner;
	unsigned __percpu	*readers;
@@ -127,59 +158,210 @@ struct six_lock_waiter {
 
 typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
 
-static __always_inline void __six_lock_init(struct six_lock *lock,
-					    const char *name,
-					    struct lock_class_key *key)
-{
-	atomic64_set(&lock->state.counter, 0);
-	raw_spin_lock_init(&lock->wait_lock);
-	INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	debug_check_no_locks_freed((void *) lock, sizeof(*lock));
-	lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-}
+void six_lock_exit(struct six_lock *lock);
+
+enum six_lock_init_flags {
+	SIX_LOCK_INIT_PCPU	= 1U << 0,
+};
 
-#define six_lock_init(lock) \
+void __six_lock_init(struct six_lock *lock, const char *name,
+		     struct lock_class_key *key, enum six_lock_init_flags flags);
+
+/**
+ * six_lock_init - initialize a six lock
+ * @lock: lock to initialize
+ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
+ */
+#define six_lock_init(lock, flags)					\
 do {									\
	static struct lock_class_key __key;				\
									\
-	__six_lock_init((lock), #lock, &__key);				\
+	__six_lock_init((lock), #lock, &__key, flags);			\
 } while (0)
 
-#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
+/**
+ * six_lock_seq - obtain current lock sequence number
+ * @lock: six_lock to obtain sequence number for
+ *
+ * @lock should be held for read or intent, and not write
+ *
+ * By saving the lock sequence number, we can unlock @lock and then (typically
+ * after some blocking operation) attempt to relock it: the relock will succeed
+ * if the sequence number hasn't changed, meaning no write locks have been taken
+ * and state corresponding to what @lock protects is still valid.
+ */
+static inline u32 six_lock_seq(const struct six_lock *lock)
+{
+	return lock->seq;
+}
+
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_trylock_type - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+	return six_trylock_ip(lock, type, _THIS_IP_);
+}
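Tying the pieces so far together, initializing a lock and doing a guarded read
might look like this (a sketch only; struct foo and its field are
hypothetical, and six_lock_read()/six_unlock_read() are generated by the
__SIX_LOCK() macro further down):

	struct foo {
		struct six_lock	lock;
		u64		value;
	};

	static void foo_init(struct foo *f)
	{
		/* SIX_LOCK_INIT_PCPU: use percpu reader counts for this lock */
		six_lock_init(&f->lock, SIX_LOCK_INIT_PCPU);
	}

	static u64 foo_get(struct foo *f)
	{
		u64 v;

		six_lock_read(&f->lock, NULL, NULL);	/* no should_sleep_fn: cannot fail */
		v = f->value;
		six_unlock_read(&f->lock);
		return v;
	}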
+
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+		       struct six_lock_waiter *wait,
+		       six_lock_should_sleep_fn should_sleep_fn, void *p,
+		       unsigned long ip);
+
+/**
+ * six_lock_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *		     to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function
+ * for full documentation.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
+				  struct six_lock_waiter *wait,
+				  six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+	return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+/**
+ * six_lock_ip - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *		     to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
+			      six_lock_should_sleep_fn should_sleep_fn, void *p,
+			      unsigned long ip)
+{
+	struct six_lock_waiter wait;
+
+	return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
+}
+
+/**
+ * six_lock_type - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *		     to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+				six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+	struct six_lock_waiter wait;
+
+	return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+		   unsigned seq, unsigned long ip);
+
+/**
+ * six_relock_type - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ *	 held previously
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+				   unsigned seq)
+{
+	return six_relock_ip(lock, type, seq, _THIS_IP_);
+}
+
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_unlock_type - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ *   six_lock_read(&foo->lock);				read count 1
+ *   six_lock_increment(&foo->lock, SIX_LOCK_read);	read count 2
+ *   six_unlock_type(&foo->lock, SIX_LOCK_read);	read count 1
+ *   six_unlock_type(&foo->lock, SIX_LOCK_read);	read count 0
+ */
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+	six_unlock_ip(lock, type, _THIS_IP_);
+}
 
 #define __SIX_LOCK(type)						\
-bool six_trylock_ip_##type(struct six_lock *, unsigned long);		\
-bool six_relock_ip_##type(struct six_lock *, u32, unsigned long);	\
-int six_lock_ip_##type(struct six_lock *, six_lock_should_sleep_fn,	\
-		       void *, unsigned long);				\
-int six_lock_ip_waiter_##type(struct six_lock *, struct six_lock_waiter *,\
-			      six_lock_should_sleep_fn, void *, unsigned long);\
-void six_unlock_ip_##type(struct six_lock *, unsigned long);		\
+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
+{									\
+	return six_trylock_ip(lock, SIX_LOCK_##type, ip);		\
+}									\
									\
 static inline bool six_trylock_##type(struct six_lock *lock)		\
 {									\
-	return six_trylock_ip_##type(lock, _THIS_IP_);			\
+	return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_);	\
+}									\
+									\
+static inline int six_lock_ip_waiter_##type(struct six_lock *lock,	\
+			   struct six_lock_waiter *wait,		\
+			   six_lock_should_sleep_fn should_sleep_fn, void *p,\
+			   unsigned long ip)				\
+{									\
+	return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
+}									\
+									\
+static inline int six_lock_ip_##type(struct six_lock *lock,		\
+		    six_lock_should_sleep_fn should_sleep_fn, void *p,	\
+		    unsigned long ip)					\
+{									\
+	return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
+}									\
+									\
+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
+{									\
+	return six_relock_ip(lock, SIX_LOCK_##type, seq, ip);		\
 }									\
+									\
 static inline bool six_relock_##type(struct six_lock *lock, u32 seq)	\
 {									\
-	return six_relock_ip_##type(lock, seq, _THIS_IP_);		\
+	return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_);	\
 }									\
+									\
 static inline int six_lock_##type(struct six_lock *lock,		\
				  six_lock_should_sleep_fn fn, void *p)\
 {									\
	return six_lock_ip_##type(lock, fn, p, _THIS_IP_);		\
 }									\
-static inline int six_lock_waiter_##type(struct six_lock *lock,	\
-			struct six_lock_waiter *wait,			\
-			six_lock_should_sleep_fn fn, void *p)		\
+									\
+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)\
 {									\
-	return six_lock_ip_waiter_##type(lock, wait, fn, p, _THIS_IP_);\
+	six_unlock_ip(lock, SIX_LOCK_##type, ip);			\
 }									\
+									\
 static inline void six_unlock_##type(struct six_lock *lock)		\
 {									\
-	return six_unlock_ip_##type(lock, _THIS_IP_);			\
+	six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_);		\
 }
 
 __SIX_LOCK(read)
@@ -187,55 +369,6 @@ __SIX_LOCK(intent)
 __SIX_LOCK(write)
 #undef __SIX_LOCK
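For reference, hand-expanding __SIX_LOCK(read) from the generator above yields
wrappers like the following (three of the generated functions shown; this is
just the macro written out, not new API):

	static inline bool six_trylock_read(struct six_lock *lock)
	{
		return six_trylock_ip(lock, SIX_LOCK_read, _THIS_IP_);
	}

	static inline bool six_relock_read(struct six_lock *lock, u32 seq)
	{
		return six_relock_ip(lock, SIX_LOCK_read, seq, _THIS_IP_);
	}

	static inline void six_unlock_read(struct six_lock *lock)
	{
		six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);
	}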
 
-#define SIX_LOCK_DISPATCH(type, fn, ...)
\ - switch (type) { \ - case SIX_LOCK_read: \ - return fn##_read(__VA_ARGS__); \ - case SIX_LOCK_intent: \ - return fn##_intent(__VA_ARGS__); \ - case SIX_LOCK_write: \ - return fn##_write(__VA_ARGS__); \ - default: \ - BUG(); \ - } - -static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -{ - SIX_LOCK_DISPATCH(type, six_trylock, lock); -} - -static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) -{ - SIX_LOCK_DISPATCH(type, six_relock, lock, seq); -} - -static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); -} - -static inline int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p, - unsigned long ip) -{ - SIX_LOCK_DISPATCH(type, six_lock_ip_waiter, lock, wait, should_sleep_fn, p, ip); -} - -static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p); -} - -static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -{ - SIX_LOCK_DISPATCH(type, six_unlock, lock); -} - void six_lock_downgrade(struct six_lock *); bool six_lock_tryupgrade(struct six_lock *); bool six_trylock_convert(struct six_lock *, enum six_lock_type, @@ -245,13 +378,11 @@ void six_lock_increment(struct six_lock *, enum six_lock_type); void six_lock_wakeup_all(struct six_lock *); -void six_lock_pcpu_free(struct six_lock *); -void six_lock_pcpu_alloc(struct six_lock *); - struct six_lock_count { unsigned n[3]; }; struct six_lock_count six_lock_counts(struct six_lock *); +void six_lock_readers_add(struct six_lock *, int); #endif /* _LINUX_SIX_H */ diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index dcdef3b..f774a66 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -269,9 +269,9 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int rw = flags & WRITE; - if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { - prt_printf(err, "bad val size (%lu != %u)", - bkey_val_u64s(k.k), alloc_v4_u64s(a.v)); + if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { + prt_printf(err, "bad val size (%u > %lu)", + alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); return -BCH_ERR_invalid_bkey; } diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index b58b876..ee7ba70 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -724,7 +724,7 @@ unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) return 0; } -#ifdef CONFIG_X86_64 +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK #define I(_x) (*(out)++ = (_x)) #define I1(i0) I(i0) diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 727bed9..e81fb3e 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -9,9 +9,17 @@ #include "util.h" #include "vstructs.h" +#if 0 + +/* + * compiled unpack functions are disabled, pending a new interface for + * dynamically allocating executable memory: + */ + #ifdef CONFIG_X86_64 #define HAVE_BCACHEFS_COMPILED_UNPACK 1 #endif +#endif void bch2_bkey_packed_to_binary_text(struct printbuf *, const struct bkey_format *, diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 
73d3268..f840270 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -62,10 +62,12 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) EBUG_ON(btree_node_write_in_flight(b)); + clear_btree_node_just_written(b); + kvpfree(b->data, btree_bytes(c)); b->data = NULL; #ifdef __KERNEL__ - vfree(b->aux_data); + kvfree(b->aux_data); #else munmap(b->aux_data, btree_aux_data_bytes(b)); #endif @@ -100,7 +102,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ - b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); + b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); #else b->aux_data = mmap(NULL, btree_aux_data_bytes(b), PROT_READ|PROT_WRITE|PROT_EXEC, @@ -126,7 +128,6 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) return NULL; bkey_btree_ptr_init(&b->key); - bch2_btree_lock_init(&b->c); #ifdef CONFIG_DEBUG_LOCK_ALLOC lockdep_set_no_check_recursion(&b->c.lock.dep_map); #endif @@ -150,6 +151,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } + bch2_btree_lock_init(&b->c, 0); + bc->used++; list_add(&b->list, &bc->freeable); return b; @@ -484,7 +487,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) while (!list_empty(&bc->freed_nonpcpu)) { b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); list_del(&b->list); - six_lock_pcpu_free(&b->c.lock); + six_lock_exit(&b->c.lock); kfree(b); } @@ -645,8 +648,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea mutex_lock(&bc->lock); } - if (pcpu_read_locks) - six_lock_pcpu_alloc(&b->c.lock); + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); @@ -700,6 +702,7 @@ err: /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); + clear_btree_node_just_written(b2); bch2_btree_node_hash_remove(bc, b2); if (b) { @@ -784,7 +787,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, set_btree_node_read_in_flight(b); six_unlock_write(&b->c.lock); - seq = b->c.lock.state.seq; + seq = six_lock_seq(&b->c.lock); six_unlock_intent(&b->c.lock); /* Unlock before doing IO: */ @@ -908,7 +911,7 @@ retry: } if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = b->c.lock.state.seq; + u32 seq = six_lock_seq(&b->c.lock); six_unlock_type(&b->c.lock, lock_type); bch2_trans_unlock(trans); @@ -1006,7 +1009,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * } if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = b->c.lock.state.seq; + u32 seq = six_lock_seq(&b->c.lock); six_unlock_type(&b->c.lock, lock_type); bch2_trans_unlock(trans); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index decbbaa..0a7a18e 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -483,7 +483,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) struct btree_node_entry *bne; bool reinit_iter = false; - EBUG_ON(!(b->c.lock.state.seq & 1)); + EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]); BUG_ON(bset_written(b, bset(b, &b->set[1]))); BUG_ON(btree_node_just_written(b)); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 365794d..4b9c04d 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -652,9 +652,8 @@ void 
bch2_btree_path_level_init(struct btree_trans *trans, BUG_ON(path->cached); EBUG_ON(!btree_path_pos_in_node(path, b)); - EBUG_ON(b->c.lock.state.seq & 1); - path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; __btree_path_level_init(path, b->c.level); } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 02dd81a..198e381 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -42,14 +42,7 @@ static inline struct btree *btree_path_node(struct btree_path *path, static inline bool btree_node_lock_seq_matches(const struct btree_path *path, const struct btree *b, unsigned level) { - /* - * We don't compare the low bits of the lock sequence numbers because - * @path might have taken a write lock on @b, and we don't want to skip - * the linked path if the sequence numbers were equal before taking that - * write lock. The lock sequence number is incremented by taking and - * releasing write locks and is even when unlocked: - */ - return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; + return path->l[level].lock_seq == six_lock_seq(&b->c.lock); } static inline struct btree *btree_node_parent(struct btree_path *path, diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 3b333e3..645fa99 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -252,7 +252,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, } path->l[0].b = (void *) ck; - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); ret = bch2_btree_node_lock_write(trans, path, &ck->c); @@ -283,9 +283,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return NULL; init: INIT_LIST_HEAD(&ck->list); - bch2_btree_lock_init(&ck->c); - if (pcpu_readers) - six_lock_pcpu_alloc(&ck->c.lock); + bch2_btree_lock_init(&ck->c, pcpu_readers ? 
SIX_LOCK_INIT_PCPU : 0); ck->c.cached = true; BUG_ON(!six_trylock_intent(&ck->c.lock)); @@ -341,9 +339,6 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) } mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); - } else { - if (path->btree_id == BTREE_ID_subvolumes) - six_lock_pcpu_alloc(&ck->c.lock); } ck->c.level = 0; @@ -512,7 +507,7 @@ retry: mark_btree_node_locked(trans, path, 0, lock_want); } - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); path->l[0].b = (void *) ck; fill: path->uptodate = BTREE_ITER_UPTODATE; @@ -594,7 +589,7 @@ retry: mark_btree_node_locked(trans, path, 0, lock_want); } - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); path->l[0].b = (void *) ck; fill: if (!ck->valid) @@ -872,7 +867,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, break; list_del(&ck->list); - six_lock_pcpu_free(&ck->c.lock); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); scanned++; @@ -888,7 +883,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, break; list_del(&ck->list); - six_lock_pcpu_free(&ck->c.lock); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); scanned++; @@ -1013,7 +1008,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_del(&ck->list); kfree(ck->k); - six_lock_pcpu_free(&ck->c.lock); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); } diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index b999866..70639a1 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -6,9 +6,10 @@ static struct lock_class_key bch2_btree_node_lock_key; -void bch2_btree_lock_init(struct btree_bkey_cached_common *b) +void bch2_btree_lock_init(struct btree_bkey_cached_common *b, + enum six_lock_init_flags flags) { - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key); + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); } #ifdef CONFIG_LOCKDEP @@ -20,16 +21,6 @@ void bch2_assert_btree_nodes_not_locked(void) /* Btree node locking: */ -static inline void six_lock_readers_add(struct six_lock *lock, int nr) -{ - if (lock->readers) - this_cpu_add(*lock->readers, nr); - else if (nr > 0) - atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); - else - atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter); -} - struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, struct btree_path *skip, struct btree_bkey_cached_common *b, diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 327780c..b341cc8 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -14,7 +14,7 @@ #include "btree_iter.h" -void bch2_btree_lock_init(struct btree_bkey_cached_common *); +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); #ifdef CONFIG_LOCKDEP void bch2_assert_btree_nodes_not_locked(void); @@ -176,13 +176,13 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat struct btree_path *linked; EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); 
trans_for_each_path_with_node(trans, b, linked) - linked->l[b->c.level].lock_seq += 2; + linked->l[b->c.level].lock_seq++; six_unlock_write(&b->c.lock); } @@ -206,8 +206,8 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, trans->lock_must_abort = false; trans->locking = b; - ret = six_lock_type_ip_waiter(&b->lock, type, &trans->locking_wait, - bch2_six_check_for_deadlock, trans, ip); + ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); WRITE_ONCE(trans->locking, NULL); WRITE_ONCE(trans->locking_wait.start_time, 0); return ret; @@ -284,7 +284,7 @@ static inline int __btree_node_lock_write(struct btree_trans *trans, bool lock_may_not_fail) { EBUG_ON(&path->l[b->level].b->c != b); - EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); + EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock)); EBUG_ON(!btree_node_intent_locked(path, b->level)); /* diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 6ba0954..1319337 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -688,7 +688,7 @@ err: bch2_trans_unlock(&trans); btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); - path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; bch2_btree_node_lock_write_nofail(&trans, path, &b->c); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index bce42ee..bd14418 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -137,17 +137,17 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage_online *ret; - unsigned seq, i, v, u64s = fs_usage_u64s(c) + 1; + unsigned nr_replicas = READ_ONCE(c->replicas.nr); + unsigned seq, i; retry: - ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); + ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_NOFS); if (unlikely(!ret)) return NULL; percpu_down_read(&c->mark_lock); - v = fs_usage_u64s(c) + 1; - if (unlikely(u64s != v)) { - u64s = v; + if (nr_replicas != c->replicas.nr) { + nr_replicas = c->replicas.nr; percpu_up_read(&c->mark_lock); kfree(ret); goto retry; @@ -157,10 +157,12 @@ retry: do { seq = read_seqcount_begin(&c->usage_lock); - unsafe_memcpy(&ret->u, c->usage_base, u64s * sizeof(u64), + unsafe_memcpy(&ret->u, c->usage_base, + __fs_usage_u64s(nr_replicas) * sizeof(u64), "embedded variable length struct"); for (i = 0; i < ARRAY_SIZE(c->usage); i++) - acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); + acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], + __fs_usage_u64s(nr_replicas)); } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index d677b02..bdf4fff 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -207,10 +207,24 @@ static inline u64 dev_buckets_available(struct bch_dev *ca, /* Filesystem usage: */ +static inline unsigned __fs_usage_u64s(unsigned nr_replicas) +{ + return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas; +} + static inline unsigned fs_usage_u64s(struct bch_fs *c) { - return sizeof(struct bch_fs_usage) / sizeof(u64) + - READ_ONCE(c->replicas.nr); + return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); +} + +static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas) +{ + 
return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;
+}
+
+static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
+{
+	return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
 }
 
 static inline unsigned dev_usage_u64s(void)
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 8027c2a..cfb1779 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -420,7 +420,9 @@ TRACE_EVENT(btree_path_relock_fail,
		else
			scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
		__entry->iter_lock_seq	= path->l[level].lock_seq;
-		__entry->node_lock_seq	= is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
+		__entry->node_lock_seq	= is_btree_node(path, level)
+			? six_lock_seq(&path->l[level].b->c.lock)
+			: 0;
	),
 
	TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u",
@@ -475,7 +477,9 @@ TRACE_EVENT(btree_path_upgrade_fail,
		__entry->read_count	= c.n[SIX_LOCK_read];
		__entry->intent_count	= c.n[SIX_LOCK_intent];
		__entry->iter_lock_seq	= path->l[level].lock_seq;
-		__entry->node_lock_seq	= is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
+		__entry->node_lock_seq	= is_btree_node(path, level)
+			? six_lock_seq(&path->l[level].b->c.lock)
+			: 0;
	),
 
	TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index dfc55fe..9079686 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -350,11 +350,8 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
	if (time_after64(end, start)) {
		duration = end - start;
-		stats->duration_stats = mean_and_variance_update_inlined(stats->duration_stats,
-								 duration);
-		stats->duration_stats_weighted = mean_and_variance_weighted_update(
-			stats->duration_stats_weighted,
-			duration);
+		stats->duration_stats = mean_and_variance_update(stats->duration_stats, duration);
+		mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
		stats->max_duration = max(stats->max_duration, duration);
		stats->min_duration = min(stats->min_duration, duration);
		bch2_quantiles_update(&stats->quantiles, duration);
@@ -362,10 +359,8 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
	if (time_after64(end, stats->last_event)) {
		freq = end - stats->last_event;
-		stats->freq_stats = mean_and_variance_update_inlined(stats->freq_stats, freq);
-		stats->freq_stats_weighted = mean_and_variance_weighted_update(
-			stats->freq_stats_weighted,
-			freq);
+		stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq);
+		mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
		stats->max_freq = max(stats->max_freq, freq);
		stats->min_freq = min(stats->min_freq, freq);
		stats->last_event = end;
@@ -594,8 +589,8 @@ void bch2_time_stats_exit(struct bch2_time_stats *stats)
 void bch2_time_stats_init(struct bch2_time_stats *stats)
 {
	memset(stats, 0, sizeof(*stats));
-	stats->duration_stats_weighted.w = 8;
-	stats->freq_stats_weighted.w = 8;
+	stats->duration_stats_weighted.weight = 8;
+	stats->freq_stats_weighted.weight = 8;
	stats->min_duration = U64_MAX;
	stats->min_freq = U64_MAX;
	spin_lock_init(&stats->lock);
diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c
index bd08da5..eb5f2ba 100644
--- a/linux/mean_and_variance.c
+++ b/linux/mean_and_variance.c
@@ -43,38 +43,28 @@
 #include
 #include
 
-/**
- * fast_divpow2() - fast approximation for n / (1 << d)
- * @n: numerator
- * @d: the power of 2 denominator.
- * - * note: this rounds towards 0. - */ -s64 fast_divpow2(s64 n, u8 d) +u128_u u128_div(u128_u n, u64 d) { - return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; -} + u128_u r; + u64 rem; + u64 hi = u128_hi(n); + u64 lo = u128_lo(n); + u64 h = hi & ((u64) U32_MAX << 32); + u64 l = (hi & (u64) U32_MAX) << 32; -/** - * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 - * and return it. - * @s1: the mean_and_variance to update. - * @v1: the new sample. - * - * see linked pdf equation 12. - */ -struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1) -{ - return mean_and_variance_update_inlined(s1, v1); + r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); + r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); + r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); + return r; } -EXPORT_SYMBOL_GPL(mean_and_variance_update); +EXPORT_SYMBOL_GPL(u128_div); /** * mean_and_variance_get_mean() - get mean from @s */ s64 mean_and_variance_get_mean(struct mean_and_variance s) { - return div64_u64(s.sum, s.n); + return s.n ? div64_u64(s.sum, s.n) : 0; } EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); @@ -85,10 +75,14 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); */ u64 mean_and_variance_get_variance(struct mean_and_variance s1) { - u128 s2 = u128_div(s1.sum_squares, s1.n); - u64 s3 = abs(mean_and_variance_get_mean(s1)); + if (s1.n) { + u128_u s2 = u128_div(s1.sum_squares, s1.n); + u64 s3 = abs(mean_and_variance_get_mean(s1)); - return u128_to_u64(u128_sub(s2, u128_square(s3))); + return u128_lo(u128_sub(s2, u128_square(s3))); + } else { + return 0; + } } EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); @@ -109,10 +103,26 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. */ -struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, - s64 x) +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) { - return mean_and_variance_weighted_update_inlined(s1, x); + // previous weighted variance. + u8 w = s->weight; + u64 var_w0 = s->variance; + // new value weighted. + s64 x_w = x << w; + s64 diff_w = x_w - s->mean; + s64 diff = fast_divpow2(diff_w, w); + // new mean weighted. 
+ s64 u_w1 = s->mean + diff; + + if (!s->init) { + s->mean = x_w; + s->variance = 0; + } else { + s->mean = u_w1; + s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; + } + s->init = true; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); @@ -121,7 +131,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); */ s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) { - return fast_divpow2(s.mean, s.w); + return fast_divpow2(s.mean, s.weight); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); @@ -131,7 +141,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) { // always positive don't need fast divpow2 - return s.variance >> s.w; + return s.variance >> s.weight; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); diff --git a/linux/six.c b/linux/six.c index 3d366a8..a47cd6d 100644 --- a/linux/six.c +++ b/linux/six.c @@ -14,9 +14,9 @@ #include #ifdef DEBUG -#define EBUG_ON(cond) BUG_ON(cond) +#define EBUG_ON(cond) BUG_ON(cond) #else -#define EBUG_ON(cond) do {} while (0) +#define EBUG_ON(cond) do {} while (0) #endif #define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) @@ -24,59 +24,69 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); +#define SIX_LOCK_HELD_read_OFFSET 0 +#define SIX_LOCK_HELD_read ~(~0U << 26) +#define SIX_LOCK_HELD_intent (1U << 26) +#define SIX_LOCK_HELD_write (1U << 27) +#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) +#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) +#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) +#define SIX_LOCK_NOSPIN (1U << 31) + struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ - u64 lock_val; + u32 lock_val; /* If the lock has this value (used as a mask), taking the lock fails: */ - u64 lock_fail; - - /* Value we add to the lock in order to release the lock: */ - u64 unlock_val; + u32 lock_fail; /* Mask that indicates lock is held for this type: */ - u64 held_mask; + u32 held_mask; /* Waitlist we wakeup when releasing the lock: */ enum six_lock_type unlock_wakeup; }; -#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) -#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) -#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) - #define LOCK_VALS { \ [SIX_LOCK_read] = { \ - .lock_val = __SIX_VAL(read_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\ - .unlock_val = -__SIX_VAL(read_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_read, \ + .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, \ + .lock_fail = SIX_LOCK_HELD_write, \ + .held_mask = SIX_LOCK_HELD_read, \ .unlock_wakeup = SIX_LOCK_write, \ }, \ [SIX_LOCK_intent] = { \ - .lock_val = __SIX_VAL(intent_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_intent, \ - .unlock_val = -__SIX_VAL(intent_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_intent, \ + .lock_val = SIX_LOCK_HELD_intent, \ + .lock_fail = SIX_LOCK_HELD_intent, \ + .held_mask = SIX_LOCK_HELD_intent, \ .unlock_wakeup = SIX_LOCK_intent, \ }, \ [SIX_LOCK_write] = { \ - .lock_val = __SIX_VAL(seq, 1), \ - .lock_fail = __SIX_LOCK_HELD_read, \ - .unlock_val = __SIX_VAL(seq, 1), \ - .held_mask = __SIX_LOCK_HELD_write, \ + .lock_val = SIX_LOCK_HELD_write, \ + .lock_fail = SIX_LOCK_HELD_read, \ + .held_mask = SIX_LOCK_HELD_write, \ .unlock_wakeup = SIX_LOCK_read, \ }, \ } +static inline void six_set_bitmask(struct six_lock *lock, u32 mask) +{ + if 
((atomic_read(&lock->state) & mask) != mask)
+		atomic_or(mask, &lock->state);
+}
+
+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
+{
+	if (atomic_read(&lock->state) & mask)
+		atomic_and(~mask, &lock->state);
+}
+
 static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
-				 union six_lock_state old,
-				 struct task_struct *owner)
+				 u32 old, struct task_struct *owner)
 {
	if (type != SIX_LOCK_intent)
		return;
 
-	if (!old.intent_lock) {
+	if (!(old & SIX_LOCK_HELD_intent)) {
		EBUG_ON(lock->owner);
		lock->owner = owner;
	} else {
@@ -94,22 +104,25 @@ static inline unsigned pcpu_read_count(struct six_lock *lock)
	return read_count;
 }
 
-/* This is probably up there with the more evil things I've done */
-#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-
-static int __do_six_trylock_type(struct six_lock *lock,
-				 enum six_lock_type type,
-				 struct task_struct *task,
-				 bool try)
+/*
+ * __do_six_trylock() - main trylock routine
+ *
+ * Returns 1 on success, 0 on failure
+ *
+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure
+ * for another thread taking the competing lock type, and we may have to do a
+ * wakeup: when a wakeup is required, we return -1 - wakeup_type.
+ */
+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
+			    struct task_struct *task, bool try)
 {
	const struct six_lock_vals l[] = LOCK_VALS;
-	union six_lock_state old, new;
	int ret;
-	u64 v;
+	u32 old, new, v;
 
	EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
-	EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
-	EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
+	EBUG_ON(type == SIX_LOCK_write &&
+		(try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
 
	/*
	 * Percpu reader mode:
	 *
@@ -124,101 +137,75 @@ static int __do_six_trylock_type(struct six_lock *lock,
	 * the lock, then issues a full memory barrier, then reads from the
	 * other thread's variable to check if the other thread thinks it has
	 * the lock. If we raced, we back off and retry/sleep.
+	 *
+	 * Failure to take the lock may cause a spurious trylock failure in
+	 * another thread, because we temporarily set the lock to indicate that
+	 * we held it. This would be a problem for a thread in six_lock(), when
+	 * it calls trylock after adding itself to the waitlist and prior to
+	 * sleeping.
+	 *
+	 * Therefore, if we fail to get the lock, and there were waiters of the
+	 * type we conflict with, we will have to issue a wakeup.
+	 *
+	 * Since we may be called under wait_lock (and by the wakeup code
+	 * itself), we return that the wakeup has to be done instead of doing it
+	 * here.
*/ - if (type == SIX_LOCK_read && lock->readers) { preempt_disable(); this_cpu_inc(*lock->readers); /* signal that we own lock */ smp_mb(); - old.v = READ_ONCE(lock->state.v); - ret = !(old.v & l[type].lock_fail); + old = atomic_read(&lock->state); + ret = !(old & l[type].lock_fail); this_cpu_sub(*lock->readers, !ret); preempt_enable(); - /* - * If we failed because a writer was trying to take the - * lock, issue a wakeup because we might have caused a - * spurious trylock failure: - */ -#if 0 - /* - * This code should be sufficient, but we're seeing unexplained - * lost wakeups: - */ - if (old.write_locking) + if (!ret && (old & SIX_LOCK_WAITING_write)) ret = -1 - SIX_LOCK_write; -#else - if (!ret) - ret = -1 - SIX_LOCK_write; -#endif } else if (type == SIX_LOCK_write && lock->readers) { if (try) { - atomic64_add(__SIX_VAL(write_locking, 1), - &lock->state.counter); - smp_mb__after_atomic(); - } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) { - atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write), - &lock->state.counter); - /* - * pairs with barrier after unlock and before checking - * for readers in unlock path - */ + atomic_add(SIX_LOCK_HELD_write, &lock->state); smp_mb__after_atomic(); } ret = !pcpu_read_count(lock); - /* - * On success, we increment lock->seq; also we clear - * write_locking unless we failed from the lock path: - */ - v = 0; - if (ret) - v += __SIX_VAL(seq, 1); - if (ret || try) - v -= __SIX_VAL(write_locking, 1); - if (try && !ret) { - old.v = atomic64_add_return(v, &lock->state.counter); - if (old.waiters & (1 << SIX_LOCK_read)) + old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); + if (old & SIX_LOCK_WAITING_read) ret = -1 - SIX_LOCK_read; - } else { - atomic64_add(v, &lock->state.counter); } } else { - v = READ_ONCE(lock->state.v); + v = atomic_read(&lock->state); do { - new.v = old.v = v; + new = old = v; - if (!(old.v & l[type].lock_fail)) { - new.v += l[type].lock_val; + ret = !(old & l[type].lock_fail); - if (type == SIX_LOCK_write) - new.write_locking = 0; - } else if (!try && !(new.waiters & (1 << type))) - new.waiters |= 1 << type; - else - break; /* waiting bit already set */ - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, new.v)) != old.v); + if (!ret || (type == SIX_LOCK_write && !try)) { + smp_mb(); + break; + } - ret = !(old.v & l[type].lock_fail); + new += l[type].lock_val; + } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old); - EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); + EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); } if (ret > 0) six_set_owner(lock, type, old, task); - EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking)); + EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && + (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); return ret; } -static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) +static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) { struct six_lock_waiter *w, *next; struct task_struct *task; @@ -237,7 +224,7 @@ again: goto unlock; saw_one = true; - ret = __do_six_trylock_type(lock, lock_type, w->task, false); + ret = __do_six_trylock(lock, lock_type, w->task, false); if (ret <= 0) goto unlock; @@ -252,7 +239,7 @@ again: wake_up_process(task); } - clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v); + six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); unlock: raw_spin_unlock(&lock->wait_lock); @@ -262,96 +249,74 @@ 
 	}
 }
 
-static inline void six_lock_wakeup(struct six_lock *lock,
-				   union six_lock_state state,
-				   enum six_lock_type lock_type)
+__always_inline
+static void six_lock_wakeup(struct six_lock *lock, u32 state,
+			    enum six_lock_type lock_type)
 {
-	if (lock_type == SIX_LOCK_write && state.read_lock)
+	if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
 		return;
 
-	if (!(state.waiters & (1 << lock_type)))
+	if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
 		return;
 
 	__six_lock_wakeup(lock, lock_type);
 }
 
-static bool do_six_trylock_type(struct six_lock *lock,
-				enum six_lock_type type,
-				bool try)
+__always_inline
+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
 {
 	int ret;
 
-	ret = __do_six_trylock_type(lock, type, current, try);
+	ret = __do_six_trylock(lock, type, current, try);
 	if (ret < 0)
 		__six_lock_wakeup(lock, -ret - 1);
 
 	return ret > 0;
 }
 
-__always_inline __flatten
-static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type,
-			       unsigned long ip)
+/**
+ * six_trylock_ip - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
 {
-	if (!do_six_trylock_type(lock, type, true))
+	if (!do_six_trylock(lock, type, true))
 		return false;
 
 	if (type != SIX_LOCK_write)
 		six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
 	return true;
 }
-
-__always_inline __flatten
-static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
-			      unsigned seq, unsigned long ip)
+EXPORT_SYMBOL_GPL(six_trylock_ip);
+
+/**
+ * six_relock_ip - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ * held previously
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+		   unsigned seq, unsigned long ip)
 {
-	const struct six_lock_vals l[] = LOCK_VALS;
-	union six_lock_state old;
-	u64 v;
-
-	EBUG_ON(type == SIX_LOCK_write);
-
-	if (type == SIX_LOCK_read &&
-	    lock->readers) {
-		bool ret;
-
-		preempt_disable();
-		this_cpu_inc(*lock->readers);
-
-		smp_mb();
-
-		old.v = READ_ONCE(lock->state.v);
-		ret = !(old.v & l[type].lock_fail) && old.seq == seq;
-
-		this_cpu_sub(*lock->readers, !ret);
-		preempt_enable();
-
-		/*
-		 * Similar to the lock path, we may have caused a spurious write
-		 * lock fail and need to issue a wakeup:
-		 */
-		if (ret)
-			six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
-		else
-			six_lock_wakeup(lock, old, SIX_LOCK_write);
+	if (lock->seq != seq || !six_trylock_ip(lock, type, ip))
+		return false;
 
-		return ret;
+	if (lock->seq != seq) {
+		six_unlock_ip(lock, type, ip);
+		return false;
 	}
 
-	v = READ_ONCE(lock->state.v);
-	do {
-		old.v = v;
-
-		if (old.seq != seq || old.v & l[type].lock_fail)
-			return false;
-	} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-				old.v,
-				old.v + l[type].lock_val)) != old.v);
-
-	six_set_owner(lock, type, old, current);
-	if (type != SIX_LOCK_write)
-		six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
 	return true;
 }
+EXPORT_SYMBOL_GPL(six_relock_ip);
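
A usage sketch (not part of this patch) of the seq-based relock above: a caller
samples the sequence number with six_lock_seq() (the six.h accessor mentioned in
the kernel-doc) while holding the lock, drops it, and later re-takes it only if
no write lock was taken in the interim. peek_then_relock() is a hypothetical
caller, not anything in the tree:

static bool peek_then_relock(struct six_lock *lock)
{
	u32 seq;

	if (!six_trylock_ip(lock, SIX_LOCK_read, _THIS_IP_))
		return false;
	seq = six_lock_seq(lock);	/* sample seq while held */
	six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);

	/* ... work that must be revalidated afterwards ... */

	/* fails if a write lock was taken in the interim: */
	return six_relock_ip(lock, SIX_LOCK_read, seq, _THIS_IP_);
}
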
 
 #ifdef CONFIG_LOCK_SPIN_ON_OWNER
 
@@ -371,17 +336,6 @@ static inline bool six_can_spin_on_owner(struct six_lock *lock)
 	return ret;
 }
 
-static inline void six_set_nospin(struct six_lock *lock)
-{
-	union six_lock_state old, new;
-	u64 v = READ_ONCE(lock->state.v);
-
-	do {
-		new.v = old.v = v;
-		new.nospin = true;
-	} while ((v = atomic64_cmpxchg(&lock->state.counter, old.v, new.v)) != old.v);
-}
-
 static inline bool six_spin_on_owner(struct six_lock *lock,
 				     struct task_struct *owner,
 				     u64 end_time)
@@ -405,7 +359,7 @@ static inline bool six_spin_on_owner(struct six_lock *lock,
 		}
 
 		if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
-			six_set_nospin(lock);
+			six_set_bitmask(lock, SIX_LOCK_NOSPIN);
 			ret = false;
 			break;
 		}
@@ -445,7 +399,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
 		if (owner && !six_spin_on_owner(lock, owner, end_time))
 			break;
 
-		if (do_six_trylock_type(lock, type, false)) {
+		if (do_six_trylock(lock, type, false)) {
 			osq_unlock(&lock->osq);
 			preempt_enable();
 			return true;
@@ -494,17 +448,16 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
 #endif
 
 noinline
-static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
-				    struct six_lock_waiter *wait,
-				    six_lock_should_sleep_fn should_sleep_fn, void *p,
-				    unsigned long ip)
+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
+			     struct six_lock_waiter *wait,
+			     six_lock_should_sleep_fn should_sleep_fn, void *p,
+			     unsigned long ip)
 {
-	union six_lock_state old;
 	int ret = 0;
 
 	if (type == SIX_LOCK_write) {
-		EBUG_ON(lock->state.write_locking);
-		atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
+		EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+		atomic_add(SIX_LOCK_HELD_write, &lock->state);
 		smp_mb__after_atomic();
 	}
 
@@ -519,13 +472,12 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
 	wait->lock_acquired = false;
 
 	raw_spin_lock(&lock->wait_lock);
-	if (!(lock->state.waiters & (1 << type)))
-		set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v);
+	six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
 	/*
-	 * Retry taking the lock after taking waitlist lock, have raced with an
-	 * unlock:
+	 * Retry taking the lock after taking waitlist lock, in case we raced
+	 * with an unlock:
 	 */
-	ret = __do_six_trylock_type(lock, type, current, false);
+	ret = __do_six_trylock(lock, type, current, false);
 	if (ret <= 0) {
 		wait->start_time = local_clock();
 
@@ -565,7 +517,7 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
 		list_del(&wait->list);
 		raw_spin_unlock(&lock->wait_lock);
 
-		if (wait->lock_acquired)
+		if (unlikely(wait->lock_acquired))
 			do_six_unlock_type(lock, type);
 		break;
 	}
@@ -575,21 +527,49 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
 	__set_current_state(TASK_RUNNING);
out:
-	if (ret && type == SIX_LOCK_write && lock->state.write_locking) {
-		old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
-					    &lock->state.counter);
-		six_lock_wakeup(lock, old, SIX_LOCK_read);
+	if (ret && type == SIX_LOCK_write) {
+		six_clear_bitmask(lock, SIX_LOCK_HELD_write);
+		six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
 	}
 	trace_contention_end(lock, 0);
 
 	return ret;
 }
 
-__always_inline __flatten
-static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
-				  struct six_lock_waiter *wait,
-				  six_lock_should_sleep_fn should_sleep_fn, void *p,
-				  unsigned long ip)
+/**
+ * six_lock_ip_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * This is the most general six_lock() variant, with parameters to support full
+ * cycle detection for deadlock avoidance.
+ *
+ * The code calling this function must implement tracking of held locks, and the
+ * @wait object should be embedded into the struct that tracks held locks -
+ * which must also be accessible in a thread-safe way.
+ *
+ * @should_sleep_fn should invoke the cycle detector; it should walk each
+ * lock's waiters, and for each waiter recursively walk their held locks.
+ *
+ * When this function must block, @wait will be added to @lock's waitlist before
+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be
+ * removed from the lock waitlist until the lock has been successfully acquired,
+ * or we abort.
+ *
+ * @wait.start_time will be monotonically increasing for any given waitlist, and
+ * thus may be used as a loop cursor.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+		       struct six_lock_waiter *wait,
+		       six_lock_should_sleep_fn should_sleep_fn, void *p,
+		       unsigned long ip)
 {
 	int ret;
 
@@ -598,8 +578,8 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type
 	if (type != SIX_LOCK_write)
 		six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
 
-	ret = do_six_trylock_type(lock, type, true) ? 0
-	    : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p, ip);
+	ret = do_six_trylock(lock, type, true) ? 0
+	      : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
 
 	if (ret && type != SIX_LOCK_write)
 		six_release(&lock->dep_map, ip);
@@ -608,22 +588,13 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
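
A minimal sketch (not part of this patch) of the waitlist interface documented
above. The trivial should_sleep_fn here never aborts; a real caller would run
its cycle detector and return a negative error to abort the lock operation.
demo_should_sleep() and demo_lock_intent() are hypothetical names:

static int demo_should_sleep(struct six_lock *lock, void *p)
{
	/*
	 * Walk lock->wait_list / the caller's held-lock tracking here;
	 * return 0 to go to sleep, or a negative error to abort.
	 */
	return 0;
}

static int demo_lock_intent(struct six_lock *lock)
{
	struct six_lock_waiter wait;

	return six_lock_ip_waiter(lock, SIX_LOCK_intent, &wait,
				  demo_should_sleep, NULL, _THIS_IP_);
}
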
 
 __always_inline
-static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
-			   six_lock_should_sleep_fn should_sleep_fn, void *p,
-			   unsigned long ip)
-{
-	struct six_lock_waiter wait;
-
-	return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-__always_inline __flatten
 static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
 {
 	const struct six_lock_vals l[] = LOCK_VALS;
-	union six_lock_state state;
+	u32 state;
 
 	if (type == SIX_LOCK_intent)
 		lock->owner = NULL;
@@ -633,26 +604,39 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
 		smp_mb(); /* unlock barrier */
 		this_cpu_dec(*lock->readers);
 		smp_mb(); /* between unlocking and checking for waiters */
-		state.v = READ_ONCE(lock->state.v);
+		state = atomic_read(&lock->state);
 	} else {
-		u64 v = l[type].unlock_val;
+		u32 v = l[type].lock_val;
 
 		if (type != SIX_LOCK_read)
-			v -= lock->state.v & __SIX_VAL(nospin, 1);
+			v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
 
-		EBUG_ON(!(lock->state.v & l[type].held_mask));
-		state.v = atomic64_add_return_release(v, &lock->state.counter);
+		EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
+		state = atomic_sub_return_release(v, &lock->state);
 	}
 
 	six_lock_wakeup(lock, state, l[type].unlock_wakeup);
 }
 
-__always_inline __flatten
-static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type,
-			      unsigned long ip)
+/**
+ * six_unlock_ip - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock);                       read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);   read count 2
+ * six_unlock_read(&foo->lock);                     read count 1
+ * six_unlock_read(&foo->lock);                     read count 0
+ */
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
 {
 	EBUG_ON(type == SIX_LOCK_write &&
-		!(lock->state.v & __SIX_LOCK_HELD_intent));
+		!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
 	EBUG_ON((type == SIX_LOCK_write ||
 		 type == SIX_LOCK_intent) &&
 		lock->owner != current);
@@ -666,52 +650,18 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type,
 		return;
 	}
 
+	lock->seq += type == SIX_LOCK_write;
+
 	do_six_unlock_type(lock, type);
 }
+EXPORT_SYMBOL_GPL(six_unlock_ip);
 
-#define __SIX_LOCK(type)						\
-bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)	\
-{									\
-	return __six_trylock_type(lock, SIX_LOCK_##type, ip);		\
-}									\
-EXPORT_SYMBOL_GPL(six_trylock_ip_##type);				\
-									\
-bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{									\
-	return __six_relock_type(lock, SIX_LOCK_##type, seq, ip);	\
-}									\
-EXPORT_SYMBOL_GPL(six_relock_ip_##type);				\
-									\
-int six_lock_ip_##type(struct six_lock *lock,				\
-		       six_lock_should_sleep_fn should_sleep_fn, void *p,\
-		       unsigned long ip)				\
-{									\
-	return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-}									\
-EXPORT_SYMBOL_GPL(six_lock_ip_##type);					\
-									\
-int six_lock_ip_waiter_##type(struct six_lock *lock,			\
-			      struct six_lock_waiter *wait,		\
-			      six_lock_should_sleep_fn should_sleep_fn, void *p,\
-			      unsigned long ip)				\
-{									\
-	return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-}									\
-EXPORT_SYMBOL_GPL(six_lock_ip_waiter_##type);				\
-									\
-void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)	\
-{									\
-	__six_unlock_type(lock, SIX_LOCK_##type, ip);			\
-}									\
-EXPORT_SYMBOL_GPL(six_unlock_ip_##type);
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-
-#undef __SIX_LOCK
-
-/* Convert from intent to read: */
+/**
+ * six_lock_downgrade - convert an intent lock to a read lock
+ * @lock: lock to downgrade
+ *
+ * @lock will have read count incremented and intent count decremented
+ */
 void six_lock_downgrade(struct six_lock *lock)
 {
 	six_lock_increment(lock, SIX_LOCK_read);
@@ -719,25 +669,33 @@ void six_lock_downgrade(struct six_lock *lock)
 }
 EXPORT_SYMBOL_GPL(six_lock_downgrade);
 
+/**
+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock
+ * @lock: lock to upgrade
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
 bool six_lock_tryupgrade(struct six_lock *lock)
 {
-	union six_lock_state old, new;
-	u64 v = READ_ONCE(lock->state.v);
+	const struct six_lock_vals l[] = LOCK_VALS;
+	u32 old, new, v = atomic_read(&lock->state);
 
 	do {
-		new.v = old.v = v;
+		new = old = v;
 
-		if (new.intent_lock)
+		if (new & SIX_LOCK_HELD_intent)
 			return false;
 
 		if (!lock->readers) {
-			EBUG_ON(!new.read_lock);
-			new.read_lock--;
+			EBUG_ON(!(new & SIX_LOCK_HELD_read));
+			new -= l[SIX_LOCK_read].lock_val;
 		}
 
-		new.intent_lock = 1;
-	} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-				old.v, new.v)) != old.v);
+		new |= SIX_LOCK_HELD_intent;
+	} while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old);
 
 	if (lock->readers)
 		this_cpu_dec(*lock->readers);
@@ -748,6 +706,17 @@ bool six_lock_tryupgrade(struct six_lock *lock)
 }
 EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
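
A sketch (not part of this patch) of the usual upgrade pattern around
six_lock_tryupgrade(): on failure - i.e. when another thread already holds
intent - drop the read lock and re-take the lock as intent from scratch.
demo_upgrade() is hypothetical, and a NULL @should_sleep_fn (no cycle
detection) is assumed to be permitted:

static void demo_upgrade(struct six_lock *lock)
{
	if (!six_lock_tryupgrade(lock)) {
		struct six_lock_waiter wait;

		six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);
		six_lock_ip_waiter(lock, SIX_LOCK_intent, &wait,
				   NULL, NULL, _THIS_IP_);
	}

	/* ... modify under intent ... */

	six_lock_downgrade(lock);	/* intent count--, read count++ */
}
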
 
+/**
+ * six_trylock_convert - attempt to convert a held lock from one type to another
+ * @lock: lock to convert
+ * @from: SIX_LOCK_read or SIX_LOCK_intent
+ * @to: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * On success, @lock will no longer be held for @from, and will be held for
+ * @to instead
+ *
+ * Return: true on success, false on failure
+ */
 bool six_trylock_convert(struct six_lock *lock,
 			 enum six_lock_type from,
 			 enum six_lock_type to)
@@ -766,9 +735,16 @@ bool six_trylock_convert(struct six_lock *lock,
 }
 EXPORT_SYMBOL_GPL(six_trylock_convert);
 
-/*
- * Increment read/intent lock count, assuming we already have it read or intent
- * locked:
+/**
+ * six_lock_increment - increase held lock count on a lock that is already held
+ * @lock: lock to increment
+ * @type: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * @lock must already be held, with a lock type that is greater than or equal to
+ * @type
+ *
+ * A corresponding six_unlock_type() call will be required for @lock to be fully
+ * unlocked.
 */
 void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 {
@@ -783,13 +759,14 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 		if (lock->readers) {
 			this_cpu_inc(*lock->readers);
 		} else {
-			EBUG_ON(!lock->state.read_lock &&
-				!lock->state.intent_lock);
-			atomic64_add(l[type].lock_val, &lock->state.counter);
+			EBUG_ON(!(atomic_read(&lock->state) &
+				  (SIX_LOCK_HELD_read|
+				   SIX_LOCK_HELD_intent)));
+			atomic_add(l[type].lock_val, &lock->state);
 		}
 		break;
 	case SIX_LOCK_intent:
-		EBUG_ON(!lock->state.intent_lock);
+		EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
 		lock->intent_lock_recurse++;
 		break;
 	case SIX_LOCK_write:
@@ -799,9 +776,19 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 }
 EXPORT_SYMBOL_GPL(six_lock_increment);
 
+/**
+ * six_lock_wakeup_all - wake up all waiters on @lock
+ * @lock: lock to wake up waiters for
+ *
+ * Waking up waiters will cause them to re-run should_sleep_fn, which may then
+ * abort the lock operation.
+ *
+ * This function is never needed in a bug-free program; it's only useful in
+ * debug code, e.g. to determine if a cycle detector is at fault.
+ */
 void six_lock_wakeup_all(struct six_lock *lock)
 {
-	union six_lock_state state = lock->state;
+	u32 state = atomic_read(&lock->state);
 	struct six_lock_waiter *w;
 
 	six_lock_wakeup(lock, state, SIX_LOCK_read);
@@ -815,38 +802,96 @@ void six_lock_wakeup_all(struct six_lock *lock)
 }
 EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
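
Tying six_lock_increment() to the counter example in the six_unlock_ip()
kernel-doc, a sketch (not part of this patch) of recursive read acquisition;
demo_nested_read() is a hypothetical caller using only the ip-variant entry
points added by this patch:

static void demo_nested_read(struct six_lock *lock)
{
	if (!six_trylock_ip(lock, SIX_LOCK_read, _THIS_IP_))	/* read count 1 */
		return;
	six_lock_increment(lock, SIX_LOCK_read);		/* read count 2 */

	/* ... */

	six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);		/* read count 1 */
	six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);		/* fully unlocked */
}
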
 
-void six_lock_pcpu_free(struct six_lock *lock)
-{
-	BUG_ON(lock->readers && pcpu_read_count(lock));
-	BUG_ON(lock->state.read_lock);
-
-	free_percpu(lock->readers);
-	lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
-
-void six_lock_pcpu_alloc(struct six_lock *lock)
-{
-#ifdef __KERNEL__
-	if (!lock->readers)
-		lock->readers = alloc_percpu(unsigned);
-#endif
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
-
-/*
- * Returns lock held counts, for both read and intent
+/**
+ * six_lock_counts - return held lock counts, for each lock type
+ * @lock: lock to return counters for
+ *
+ * Return: the number of times a lock is held for read, intent and write.
 */
struct six_lock_count six_lock_counts(struct six_lock *lock)
{
	struct six_lock_count ret;

	ret.n[SIX_LOCK_read]   = !lock->readers
-		? lock->state.read_lock
+		? atomic_read(&lock->state) & SIX_LOCK_HELD_read
		: pcpu_read_count(lock);
-	ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse;
-	ret.n[SIX_LOCK_write] = lock->state.seq & 1;
+	ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
+		lock->intent_lock_recurse;
+	ret.n[SIX_LOCK_write]  = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);

	return ret;
}
EXPORT_SYMBOL_GPL(six_lock_counts);
+
+/**
+ * six_lock_readers_add - directly manipulate reader count of a lock
+ * @lock: lock to add/subtract readers for
+ * @nr: reader count to add/subtract
+ *
+ * When an upper layer is implementing lock reentrancy, we may have both read
+ * and intent locks on the same lock.
+ *
+ * When we need to take a write lock, the read locks will cause self-deadlock,
+ * because six locks themselves do not track which read locks are held by the
+ * current thread and which are held by a different thread - they do no
+ * per-thread tracking of held locks.
+ *
+ * The upper layer that is tracking held locks may, however, if trylock() has
+ * failed, count up its own read locks, subtract them, take the write lock, and
+ * then re-add them.
+ *
+ * As in any other situation when taking a write lock, @lock must be held for
+ * intent one (or more) times, so @lock will never be left unlocked.
+ */
+void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+	if (lock->readers) {
+		this_cpu_add(*lock->readers, nr);
+	} else {
+		EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
+		/* reader count starts at bit 0 */
+		atomic_add(nr, &lock->state);
+	}
+}
+EXPORT_SYMBOL_GPL(six_lock_readers_add);
+
+/**
+ * six_lock_exit - release resources held by a lock prior to freeing
+ * @lock: lock to exit
+ *
+ * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
+ * required to free the percpu read counts.
+ */
+void six_lock_exit(struct six_lock *lock)
+{
+	WARN_ON(lock->readers && pcpu_read_count(lock));
+	WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
+
+	free_percpu(lock->readers);
+	lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_exit);
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+		     struct lock_class_key *key, enum six_lock_init_flags flags)
+{
+	atomic_set(&lock->state, 0);
+	raw_spin_lock_init(&lock->wait_lock);
+	INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+	lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+
+	if (flags & SIX_LOCK_INIT_PCPU) {
+		/*
+		 * We don't return an error here on memory allocation failure
+		 * since percpu is an optimization, and locks will work with the
+		 * same semantics in non-percpu mode: callers can check for
+		 * failure if they wish by checking lock->readers, but generally
+		 * will not want to treat it as an error.
+		 */
+		lock->readers = alloc_percpu(unsigned);
+	}
+}
+EXPORT_SYMBOL_GPL(__six_lock_init);
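
Finally, a lifecycle sketch (not part of this patch) for the new init/exit API
in percpu mode. six_lock_init() is assumed to be the six.h wrapper that
supplies a lock_class_key to __six_lock_init(); demo_lifecycle() is
hypothetical:

static void demo_lifecycle(void)
{
	struct six_lock lock;

	six_lock_init(&lock, SIX_LOCK_INIT_PCPU);	/* lock.readers may be percpu */

	if (six_trylock_ip(&lock, SIX_LOCK_read, _THIS_IP_))
		six_unlock_ip(&lock, SIX_LOCK_read, _THIS_IP_);

	six_lock_exit(&lock);	/* frees the percpu read counts, if allocated */
}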