git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 31c09369cd six locks: Fix an uninitialized var
author     Kent Overstreet <kent.overstreet@linux.dev>
           Thu, 25 May 2023 21:52:28 +0000 (17:52 -0400)
committer  Kent Overstreet <kent.overstreet@linux.dev>
           Fri, 26 May 2023 02:25:34 +0000 (22:25 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
21 files changed:
.bcachefs_revision
include/linux/atomic.h
include/linux/mean_and_variance.h
include/linux/six.h
libbcachefs/alloc_background.c
libbcachefs/bkey.c
libbcachefs/bkey.h
libbcachefs/btree_cache.c
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_locking.c
libbcachefs/btree_locking.h
libbcachefs/btree_update_interior.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/trace.h
libbcachefs/util.c
linux/mean_and_variance.c
linux/six.c

index 1f415ca7aa3c24a93d5e5131795a9ddd21f1214b..1d85f952dc54a4bdb00c9839aeba858ba98e87a8 100644 (file)
@@ -1 +1 @@
-799716df00709f7480f575e8fd626915bafba006
+31c09369cd01b34fb8ba845fa09776576b03a1e2
index a9852fa1f99a3370f0baa1c489dada515e1402a2..79cf5aa9c9f689429b8ccdb4b16f3be6af54a45a 100644 (file)
@@ -32,6 +32,8 @@ typedef struct {
 #define __ATOMIC_SUB(v, p)             uatomic_sub(p, v)
 #define __ATOMIC_INC(p)                        uatomic_inc(p)
 #define __ATOMIC_DEC(p)                        uatomic_dec(p)
+#define __ATOMIC_AND(v, p)             uatomic_and(p, v)
+#define __ATOMIC_OR(v, p)              uatomic_or(p, v)
 
 #define xchg(p, v)                     uatomic_xchg(p, v)
 #define xchg_acquire(p, v)             uatomic_xchg(p, v)
@@ -56,6 +58,8 @@ typedef struct {
 #define __ATOMIC_SUB_RETURN(v, p)      __atomic_sub_fetch(p, v, __ATOMIC_RELAXED)
 #define __ATOMIC_SUB_RETURN_RELEASE(v, p)                              \
                                        __atomic_sub_fetch(p, v, __ATOMIC_RELEASE)
+#define __ATOMIC_AND(v, p)             __atomic_and_fetch(p, v, __ATOMIC_RELAXED)
+#define __ATOMIC_OR(v, p)              __atomic_or_fetch(p, v, __ATOMIC_RELAXED)
 
 #define xchg(p, v)                     __atomic_exchange_n(p, v, __ATOMIC_SEQ_CST)
 #define xchg_acquire(p, v)             __atomic_exchange_n(p, v, __ATOMIC_ACQUIRE)
@@ -244,6 +248,16 @@ static inline bool a_type##_inc_not_zero(a_type##_t *v)                    \
        return a_type##_add_unless(v, 1, 0);                            \
 }                                                                      \
                                                                        \
+static inline void a_type##_and(i_type a, a_type##_t *v)               \
+{                                                                      \
+       __ATOMIC_AND(a, v);                                             \
+}                                                                      \
+                                                                       \
+static inline void a_type##_or(i_type a, a_type##_t *v)                        \
+{                                                                      \
+       __ATOMIC_OR(a, v);                                              \
+}                                                                      \
+                                                                       \
 static inline i_type a_type##_xchg(a_type##_t *v, i_type i)            \
 {                                                                      \
        return xchg(&v->counter, i);                                    \
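The two hunks above add bitwise AND/OR primitives to both the liburcu and the
GCC-builtin backends, and the type template then exposes them as
<type>_and()/<type>_or() wrappers. A minimal usage sketch, assuming the usual
atomic_t instantiation of this template (flag_demo() and the bit values are
made up for illustration):

    static void flag_demo(void)
    {
            atomic_t flags;

            atomic_set(&flags, 0);
            atomic_or(0x3, &flags);         /* set bits 0 and 1 */
            atomic_and(~0x2, &flags);       /* clear bit 1 again */
    }
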
index 756eb3d1ca641a2acebf5d52b05ebb551eaad5f2..9ed79f42a40439472a4367ef8e5c5d566890a9f3 100644 (file)
 #ifndef MEAN_AND_VARIANCE_H_
 #define MEAN_AND_VARIANCE_H_
 
-#include <linux/kernel.h>
 #include <linux/types.h>
+#include <linux/kernel.h>
 #include <linux/limits.h>
 #include <linux/math64.h>
+#include <stdlib.h>
 
 #define SQRT_U64_MAX 4294967295ULL
 
-/**
- * abs - return absolute value of an argument
- * @x: the value.  If it is unsigned type, it is converted to signed type first.
- *     char is treated as if it was signed (regardless of whether it really is)
- *     but the macro's return type is preserved as char.
- *
- * Return: an absolute value of x.
+/*
+ * u128_u: u128 user mode, because not all architectures support a real int128
+ * type
  */
-#define abs(x) __abs_choose_expr(x, long long,                         \
-               __abs_choose_expr(x, long,                              \
-               __abs_choose_expr(x, int,                               \
-               __abs_choose_expr(x, short,                             \
-               __abs_choose_expr(x, char,                              \
-               __builtin_choose_expr(                                  \
-                       __builtin_types_compatible_p(typeof(x), char),  \
-                       (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
-                       ((void)0)))))))
 
-#define __abs_choose_expr(x, type, other) __builtin_choose_expr(       \
-       __builtin_types_compatible_p(typeof(x),   signed type) ||       \
-       __builtin_types_compatible_p(typeof(x), unsigned type),         \
-       ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
+#ifdef __SIZEOF_INT128__
 
-#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
-
-typedef unsigned __int128 u128;
+typedef struct {
+       unsigned __int128 v;
+} __aligned(16) u128_u;
 
-static inline u128 u64_to_u128(u64 a)
+static inline u128_u u64_to_u128(u64 a)
 {
-       return (u128)a;
+       return (u128_u) { .v = a };
 }
 
-static inline u64 u128_to_u64(u128 a)
+static inline u64 u128_lo(u128_u a)
 {
-       return (u64)a;
+       return a.v;
 }
 
-static inline u64 u128_shr64_to_u64(u128 a)
+static inline u64 u128_hi(u128_u a)
 {
-       return (u64)(a >> 64);
+       return a.v >> 64;
 }
 
-static inline u128 u128_add(u128 a, u128 b)
+static inline u128_u u128_add(u128_u a, u128_u b)
 {
-       return a + b;
+       a.v += b.v;
+       return a;
 }
 
-static inline u128 u128_sub(u128 a, u128 b)
+static inline u128_u u128_sub(u128_u a, u128_u b)
 {
-       return a - b;
+       a.v -= b.v;
+       return a;
 }
 
-static inline u128 u128_shl(u128 i, s8 shift)
+static inline u128_u u128_shl(u128_u a, s8 shift)
 {
-       return i << shift;
+       a.v <<= shift;
+       return a;
 }
 
-static inline u128 u128_shl64_add(u64 a, u64 b)
+static inline u128_u u128_square(u64 a)
 {
-       return ((u128)a << 64) + b;
-}
+       u128_u b = u64_to_u128(a);
 
-static inline u128 u128_square(u64 i)
-{
-       return i*i;
+       b.v *= b.v;
+       return b;
 }
 
 #else
 
 typedef struct {
        u64 hi, lo;
-} u128;
+} __aligned(16) u128_u;
+
+/* conversions */
 
-static inline u128 u64_to_u128(u64 a)
+static inline u128_u u64_to_u128(u64 a)
 {
-       return (u128){ .lo = a };
+       return (u128_u) { .lo = a };
 }
 
-static inline u64 u128_to_u64(u128 a)
+static inline u64 u128_lo(u128_u a)
 {
        return a.lo;
 }
 
-static inline u64 u128_shr64_to_u64(u128 a)
+static inline u64 u128_hi(u128_u a)
 {
        return a.hi;
 }
 
-static inline u128 u128_add(u128 a, u128 b)
+/* arithmetic */
+
+static inline u128_u u128_add(u128_u a, u128_u b)
 {
-       u128 c;
+       u128_u c;
 
        c.lo = a.lo + b.lo;
        c.hi = a.hi + b.hi + (c.lo < a.lo);
        return c;
 }
 
-static inline u128 u128_sub(u128 a, u128 b)
+static inline u128_u u128_sub(u128_u a, u128_u b)
 {
-       u128 c;
+       u128_u c;
 
        c.lo = a.lo - b.lo;
        c.hi = a.hi - b.hi - (c.lo > a.lo);
        return c;
 }
 
-static inline u128 u128_shl(u128 i, s8 shift)
+static inline u128_u u128_shl(u128_u i, s8 shift)
 {
-       u128 r;
+       u128_u r;
 
        r.lo = i.lo << shift;
        if (shift < 64)
@@ -129,15 +119,10 @@ static inline u128 u128_shl(u128 i, s8 shift)
        return r;
 }
 
-static inline u128 u128_shl64_add(u64 a, u64 b)
+static inline u128_u u128_square(u64 i)
 {
-       return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b));
-}
-
-static inline u128 u128_square(u64 i)
-{
-       u128 r;
-       u64  h = i >> 32, l = i & (u64)U32_MAX;
+       u128_u r;
+       u64  h = i >> 32, l = i & U32_MAX;
 
        r =             u128_shl(u64_to_u128(h*h), 64);
        r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
@@ -148,85 +133,69 @@ static inline u128 u128_square(u64 i)
 
 #endif
 
-static inline u128 u128_div(u128 n, u64 d)
+static inline u128_u u64s_to_u128(u64 hi, u64 lo)
 {
-       u128 r;
-       u64 rem;
-       u64 hi = u128_shr64_to_u64(n);
-       u64 lo = u128_to_u64(n);
-       u64  h =  hi & ((u64)U32_MAX  << 32);
-       u64  l = (hi &  (u64)U32_MAX) << 32;
+       u128_u c = u64_to_u128(hi);
 
-       r =             u128_shl(u64_to_u128(div64_u64_rem(h,                d, &rem)), 64);
-       r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l  + (rem << 32), d, &rem)), 32));
-       r = u128_add(r,          u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
-       return r;
+       c = u128_shl(c, 64);
+       c = u128_add(c, u64_to_u128(lo));
+       return c;
 }
 
+u128_u u128_div(u128_u n, u64 d);
+
 struct mean_and_variance {
-       s64 n;
-       s64 sum;
-       u128 sum_squares;
+       s64     n;
+       s64     sum;
+       u128_u  sum_squares;
 };
 
 /* exponentially weighted variant */
 struct mean_and_variance_weighted {
-       bool init;
-       u8 w;
-       s64 mean;
-       u64 variance;
+       bool    init;
+       u8      weight; /* base 2 logarithm */
+       s64     mean;
+       u64     variance;
 };
 
-s64 fast_divpow2(s64 n, u8 d);
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
+ */
+static inline s64 fast_divpow2(s64 n, u8 d)
+{
+       return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
 
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s with a new
+ * sample @v and return it.
+ * @s: the mean_and_variance to update.
+ * @v: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
 static inline struct mean_and_variance
-mean_and_variance_update_inlined(struct mean_and_variance s1, s64 v1)
-{
-       struct mean_and_variance s2;
-       u64 v2 = abs(v1);
-
-       s2.n           = s1.n + 1;
-       s2.sum         = s1.sum + v1;
-       s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2));
-       return s2;
-}
-
-static inline struct mean_and_variance_weighted
-mean_and_variance_weighted_update_inlined(struct mean_and_variance_weighted s1, s64 x)
-{
-       struct mean_and_variance_weighted s2;
-       // previous weighted variance.
-       u64 var_w0 = s1.variance;
-       u8 w = s2.w = s1.w;
-       // new value weighted.
-       s64 x_w = x << w;
-       s64 diff_w = x_w - s1.mean;
-       s64 diff = fast_divpow2(diff_w, w);
-       // new mean weighted.
-       s64 u_w1     = s1.mean + diff;
-
-       BUG_ON(w % 2 != 0);
-
-       if (!s1.init) {
-               s2.mean = x_w;
-               s2.variance = 0;
-       } else {
-               s2.mean = u_w1;
-               s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
-       }
-       s2.init = true;
-
-       return s2;
+mean_and_variance_update(struct mean_and_variance s, s64 v)
+{
+       return (struct mean_and_variance) {
+               .n           = s.n + 1,
+               .sum         = s.sum + v,
+               .sum_squares = u128_add(s.sum_squares, u128_square(abs(v))),
+       };
 }
 
-struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1);
-       s64              mean_and_variance_get_mean(struct mean_and_variance s);
-       u64              mean_and_variance_get_variance(struct mean_and_variance s1);
-       u32              mean_and_variance_get_stddev(struct mean_and_variance s);
+s64 mean_and_variance_get_mean(struct mean_and_variance s);
+u64 mean_and_variance_get_variance(struct mean_and_variance s1);
+u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
 
-struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1);
-       s64                       mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
-       u64                       mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
-       u32                       mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
 
 #endif // MEAN_AND_VARIANCE_H_
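The refactored header keeps mean_and_variance_update() as a by-value inline that
returns the updated accumulator, while the weighted variant (updated in
mean_and_variance.c below) now modifies its argument in place. A minimal sketch
of the unweighted API, assuming the usual kernel helpers and a hypothetical
stats_demo() caller with made-up samples:

    static void stats_demo(void)
    {
            struct mean_and_variance mv = {};
            s64 samples[] = { 8, 10, 12, 10 };
            unsigned i;

            for (i = 0; i < ARRAY_SIZE(samples); i++)
                    mv = mean_and_variance_update(mv, samples[i]);

            /* for these samples: mean 10, variance 2 */
            pr_info("mean %lli variance %llu stddev %u\n",
                    mean_and_variance_get_mean(mv),
                    mean_and_variance_get_variance(mv),
                    mean_and_variance_get_stddev(mv));
    }
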
index 83023f64a30e2a20108c79d495487fac6f916728..394da423c28e511f4e0d733708941c9385104b80 100644 (file)
 #ifndef _LINUX_SIX_H
 #define _LINUX_SIX_H
 
-/*
- * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
- * semaphores, except with a third intermediate state, intent. Basic operations
- * are:
+/**
+ * DOC: SIX locks overview
  *
- * six_lock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
+ * but with an additional state: read/shared, intent, exclusive/write
  *
- * six_lock_intent(&foo->lock);
- * six_unlock_intent(&foo->lock);
+ * The purpose of the intent state is to allow for greater concurrency on tree
+ * structures without deadlocking. In general, a read can't be upgraded to a
+ * write lock without deadlocking, so an operation that updates multiple nodes
+ * will have to take write locks for the full duration of the operation.
  *
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
+ * But by adding an intent state, which is exclusive with other intent locks but
+ * not with readers, we can take intent locks at the start of the operation,
+ * and then take write locks only for the actual update to each individual
+ * node, without deadlocking.
  *
- * Intent locks block other intent locks, but do not block read locks, and you
- * must have an intent lock held before taking a write lock, like so:
+ * Example usage:
+ *   six_lock_read(&foo->lock);
+ *   six_unlock_read(&foo->lock);
  *
- * six_lock_intent(&foo->lock);
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
- * six_unlock_intent(&foo->lock);
+ * An intent lock must be held before taking a write lock:
+ *   six_lock_intent(&foo->lock);
+ *   six_lock_write(&foo->lock);
+ *   six_unlock_write(&foo->lock);
+ *   six_unlock_intent(&foo->lock);
  *
  * Other operations:
- *
  *   six_trylock_read()
  *   six_trylock_intent()
  *   six_trylock_write()
  *
- *   six_lock_downgrade():     convert from intent to read
- *   six_lock_tryupgrade():    attempt to convert from read to intent
- *
- * Locks also embed a sequence number, which is incremented when the lock is
- * locked or unlocked for write. The current sequence number can be grabbed
- * while a lock is held from lock->state.seq; then, if you drop the lock you can
- * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock
- * iff it hasn't been locked for write in the meantime.
- *
- * There are also operations that take the lock type as a parameter, where the
- * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
- *
- *   six_lock_type(lock, type)
- *   six_unlock_type(lock, type)
- *   six_relock(lock, type, seq)
- *   six_trylock_type(lock, type)
- *   six_trylock_convert(lock, from, to)
- *
- * A lock may be held multiple times by the same thread (for read or intent,
- * not write). However, the six locks code does _not_ implement the actual
- * recursive checks itself though - rather, if your code (e.g. btree iterator
- * code) knows that the current thread already has a lock held, and for the
- * correct type, six_lock_increment() may be used to bump up the counter for
- * that type - the only effect is that one more call to unlock will be required
- * before the lock is unlocked.
+ *   six_lock_downgrade()      convert from intent to read
+ *   six_lock_tryupgrade()     attempt to convert from read to intent, may fail
+ *
+ * There are also interfaces that take the lock type as an enum:
+ *
+ *   six_lock_type(&foo->lock, SIX_LOCK_read);
+ *   six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
+ *   six_lock_type(&foo->lock, SIX_LOCK_write);
+ *   six_unlock_type(&foo->lock, SIX_LOCK_write);
+ *   six_unlock_type(&foo->lock, SIX_LOCK_intent);
+ *
+ * Lock sequence numbers - unlock(), relock():
+ *
+ *   Locks embed sequence numbers, which are incremented on write lock/unlock.
+ *   This allows locks to be dropped and then retaken iff the state they protect
+ *   hasn't changed; this makes it much easier to avoid holding locks while e.g.
+ *   doing IO or allocating memory.
+ *
+ *   Example usage:
+ *     six_lock_read(&foo->lock);
+ *     u32 seq = six_lock_seq(&foo->lock);
+ *     six_unlock_read(&foo->lock);
+ *
+ *     some_operation_that_may_block();
+ *
+ *     if (six_relock_read(&foo->lock, seq)) { ... }
+ *
+ *   If the relock operation succeeds, it is as if the lock was never unlocked.
+ *
+ * Reentrancy:
+ *
+ *   Six locks are not by themselves reentrant, but have counters for both the
+ *   read and intent states that can be used to provide reentrancy by an upper
+ *   layer that tracks held locks. If a lock is known to already be held in the
+ *   read or intent state, six_lock_increment() can be used to bump the "lock
+ *   held in this state" counter, increasing the number of unlock calls that
+ *   will be required to fully unlock it.
+ *
+ *   Example usage:
+ *     six_lock_read(&foo->lock);
+ *     six_lock_increment(&foo->lock, SIX_LOCK_read);
+ *     six_unlock_read(&foo->lock);
+ *     six_unlock_read(&foo->lock);
+ *   foo->lock is now fully unlocked.
+ *
+ *   Since the intent state supersedes read, it's legal to increment the read
+ *   counter when holding an intent lock, but not the reverse.
+ *
+ *   A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
+ *   is not legal.
+ *
+ * should_sleep_fn:
+ *
+ *   There is a six_lock() variant that takes a function pointer that is called
+ *   immediately prior to schedule() when blocking, and may return an error to
+ *   abort.
+ *
+ *   One possible use for this feature is when objects being locked are part of
+ *   a cache and may be reused, and lock ordering is based on a property of the
+ *   object that will change when the object is reused - i.e. logical key order.
+ *
+ *   If looking up an object in the cache may race with object reuse, and lock
+ *   ordering is required to prevent deadlock, object reuse may change the
+ *   correct lock order for that object and cause a deadlock. should_sleep_fn
+ *   can be used to check if the object is still the object we want and avoid
+ *   this deadlock.
+ *
+ * Wait list entry interface:
+ *
+ *   There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
+ *   wait list entry. By embedding six_lock_waiter into another object, and by
+ *   traversing lock waitlists, it is then possible for an upper layer to
+ *   implement full cycle detection for deadlock avoidance.
+ *
+ *   should_sleep_fn should be used for invoking the cycle detector, walking the
+ *   graph of held locks to check for a deadlock. The upper layer must track
+ *   held locks for each thread, and each thread's held locks must be reachable
+ *   from its six_lock_waiter object.
+ *
+ *   six_lock_waiter() will add the wait object to the waitlist before retrying
+ *   to take the lock and before calling should_sleep_fn, and the wait object
+ *   will not be removed from the waitlist until either the lock has been
+ *   successfully acquired, or we aborted because should_sleep_fn returned an
+ *   error.
+ *
+ *   Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
+ *   have timestamps in strictly ascending order - this is so the timestamp can
+ *   be used as a cursor for lock graph traversal.
  */
 
 #include <linux/lockdep.h>
 #include <linux/sched.h>
 #include <linux/types.h>
 
-#define SIX_LOCK_SEPARATE_LOCKFNS
-
-union six_lock_state {
-       struct {
-               atomic64_t      counter;
-       };
-
-       struct {
-               u64             v;
-       };
-
-       struct {
-               /* for waitlist_bitnr() */
-               unsigned long   l;
-       };
-
-       struct {
-               unsigned        read_lock:26;
-               unsigned        write_locking:1;
-               unsigned        intent_lock:1;
-               unsigned        nospin:1;
-               unsigned        waiters:3;
-               /*
-                * seq works much like in seqlocks: it's incremented every time
-                * we lock and unlock for write.
-                *
-                * If it's odd write lock is held, even unlocked.
-                *
-                * Thus readers can unlock, and then lock again later iff it
-                * hasn't been modified in the meantime.
-                */
-               u32             seq;
-       };
-};
-
 enum six_lock_type {
        SIX_LOCK_read,
        SIX_LOCK_intent,
@@ -105,7 +135,8 @@ enum six_lock_type {
 };
 
 struct six_lock {
-       union six_lock_state    state;
+       atomic_t                state;
+       u32                     seq;
        unsigned                intent_lock_recurse;
        struct task_struct      *owner;
        unsigned __percpu       *readers;
@@ -127,59 +158,210 @@ struct six_lock_waiter {
 
 typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
 
-static __always_inline void __six_lock_init(struct six_lock *lock,
-                                           const char *name,
-                                           struct lock_class_key *key)
-{
-       atomic64_set(&lock->state.counter, 0);
-       raw_spin_lock_init(&lock->wait_lock);
-       INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       debug_check_no_locks_freed((void *) lock, sizeof(*lock));
-       lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-}
+void six_lock_exit(struct six_lock *lock);
+
+enum six_lock_init_flags {
+       SIX_LOCK_INIT_PCPU      = 1U << 0,
+};
 
-#define six_lock_init(lock)                                            \
+void __six_lock_init(struct six_lock *lock, const char *name,
+                    struct lock_class_key *key, enum six_lock_init_flags flags);
+
+/**
+ * six_lock_init - initialize a six lock
+ * @lock:      lock to initialize
+ * @flags:     optional flags, e.g. SIX_LOCK_INIT_PCPU
+ */
+#define six_lock_init(lock, flags)                                     \
 do {                                                                   \
        static struct lock_class_key __key;                             \
                                                                        \
-       __six_lock_init((lock), #lock, &__key);                         \
+       __six_lock_init((lock), #lock, &__key, flags);                  \
 } while (0)
 
-#define __SIX_VAL(field, _v)   (((union six_lock_state) { .field = _v }).v)
+/**
+ * six_lock_seq - obtain current lock sequence number
+ * @lock:      six_lock to obtain sequence number for
+ *
+ * @lock should be held for read or intent, and not write
+ *
+ * By saving the lock sequence number, we can unlock @lock and then (typically
+ * after some blocking operation) attempt to relock it: the relock will succeed
+ * if the sequence number hasn't changed, meaning no write locks have been taken
+ * and state corresponding to what @lock protects is still valid.
+ */
+static inline u32 six_lock_seq(const struct six_lock *lock)
+{
+       return lock->seq;
+}
+
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_trylock_type - attempt to take a six lock without blocking
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       return six_trylock_ip(lock, type, _THIS_IP_);
+}
+
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+                      struct six_lock_waiter *wait,
+                      six_lock_should_sleep_fn should_sleep_fn, void *p,
+                      unsigned long ip);
+
+/**
+ * six_lock_waiter - take a lock, with full waitlist interface
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait:      pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *             to scheduling
+ * @p:         passed through to @should_sleep_fn
+ *
+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function
+ * for full documentation.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
+                                 struct six_lock_waiter *wait,
+                                 six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+       return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+/**
+ * six_lock_ip - take a six lock
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *             to scheduling
+ * @p:         passed through to @should_sleep_fn
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
+                             six_lock_should_sleep_fn should_sleep_fn, void *p,
+                             unsigned long ip)
+{
+       struct six_lock_waiter wait;
+
+       return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
+}
+
+/**
+ * six_lock_type - take a six lock
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *             to scheduling
+ * @p:         passed through to @should_sleep_fn
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+                               six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+       struct six_lock_waiter wait;
+
+       return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+                  unsigned seq, unsigned long ip);
+
+/**
+ * six_relock_type - attempt to re-take a lock that was held previously
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq:       lock sequence number obtained from six_lock_seq() while lock was
+ *             held previously
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+                                  unsigned seq)
+{
+       return six_relock_ip(lock, type, seq, _THIS_IP_);
+}
+
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_unlock_type - drop a six lock
+ * @lock:      lock to unlock
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock);                          read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);      read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 0
+ */
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       six_unlock_ip(lock, type, _THIS_IP_);
+}
 
 #define __SIX_LOCK(type)                                               \
-bool six_trylock_ip_##type(struct six_lock *, unsigned long);          \
-bool six_relock_ip_##type(struct six_lock *, u32, unsigned long);      \
-int six_lock_ip_##type(struct six_lock *, six_lock_should_sleep_fn,    \
-                      void *, unsigned long);                          \
-int six_lock_ip_waiter_##type(struct six_lock *, struct six_lock_waiter *,\
-                       six_lock_should_sleep_fn, void *, unsigned long);\
-void six_unlock_ip_##type(struct six_lock *, unsigned long);           \
+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
+{                                                                      \
+       return six_trylock_ip(lock, SIX_LOCK_##type, ip);               \
+}                                                                      \
                                                                        \
 static inline bool six_trylock_##type(struct six_lock *lock)           \
 {                                                                      \
-       return six_trylock_ip_##type(lock, _THIS_IP_);                  \
+       return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_);        \
+}                                                                      \
+                                                                       \
+static inline int six_lock_ip_waiter_##type(struct six_lock *lock,     \
+                          struct six_lock_waiter *wait,                \
+                          six_lock_should_sleep_fn should_sleep_fn, void *p,\
+                          unsigned long ip)                            \
+{                                                                      \
+       return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
+}                                                                      \
+                                                                       \
+static inline int six_lock_ip_##type(struct six_lock *lock,            \
+                   six_lock_should_sleep_fn should_sleep_fn, void *p,  \
+                   unsigned long ip)                                   \
+{                                                                      \
+       return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
+}                                                                      \
+                                                                       \
+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
+{                                                                      \
+       return six_relock_ip(lock, SIX_LOCK_##type, seq, ip);           \
 }                                                                      \
+                                                                       \
 static inline bool six_relock_##type(struct six_lock *lock, u32 seq)   \
 {                                                                      \
-       return six_relock_ip_##type(lock, seq, _THIS_IP_);              \
+       return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_);    \
 }                                                                      \
+                                                                       \
 static inline int six_lock_##type(struct six_lock *lock,               \
                                  six_lock_should_sleep_fn fn, void *p)\
 {                                                                      \
        return six_lock_ip_##type(lock, fn, p, _THIS_IP_);              \
 }                                                                      \
-static inline int six_lock_waiter_##type(struct six_lock *lock,                \
-                       struct six_lock_waiter *wait,                   \
-                       six_lock_should_sleep_fn fn, void *p)           \
+                                                                       \
+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)       \
 {                                                                      \
-       return six_lock_ip_waiter_##type(lock, wait, fn, p, _THIS_IP_); \
+       six_unlock_ip(lock, SIX_LOCK_##type, ip);                       \
 }                                                                      \
+                                                                       \
 static inline void six_unlock_##type(struct six_lock *lock)            \
 {                                                                      \
-       return six_unlock_ip_##type(lock, _THIS_IP_);                   \
+       six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_);                \
 }
 
 __SIX_LOCK(read)
@@ -187,55 +369,6 @@ __SIX_LOCK(intent)
 __SIX_LOCK(write)
 #undef __SIX_LOCK
 
-#define SIX_LOCK_DISPATCH(type, fn, ...)                       \
-       switch (type) {                                         \
-       case SIX_LOCK_read:                                     \
-               return fn##_read(__VA_ARGS__);                  \
-       case SIX_LOCK_intent:                                   \
-               return fn##_intent(__VA_ARGS__);                \
-       case SIX_LOCK_write:                                    \
-               return fn##_write(__VA_ARGS__);                 \
-       default:                                                \
-               BUG();                                          \
-       }
-
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
-       SIX_LOCK_DISPATCH(type, six_trylock, lock);
-}
-
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
-                                  unsigned seq)
-{
-       SIX_LOCK_DISPATCH(type, six_relock, lock, seq);
-}
-
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
-                               six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-       SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
-}
-
-static inline int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type,
-                               struct six_lock_waiter *wait,
-                               six_lock_should_sleep_fn should_sleep_fn, void *p,
-                               unsigned long ip)
-{
-       SIX_LOCK_DISPATCH(type, six_lock_ip_waiter, lock, wait, should_sleep_fn, p, ip);
-}
-
-static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
-                               struct six_lock_waiter *wait,
-                               six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-       SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p);
-}
-
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
-       SIX_LOCK_DISPATCH(type, six_unlock, lock);
-}
-
 void six_lock_downgrade(struct six_lock *);
 bool six_lock_tryupgrade(struct six_lock *);
 bool six_trylock_convert(struct six_lock *, enum six_lock_type,
@@ -245,13 +378,11 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
 
 void six_lock_wakeup_all(struct six_lock *);
 
-void six_lock_pcpu_free(struct six_lock *);
-void six_lock_pcpu_alloc(struct six_lock *);
-
 struct six_lock_count {
        unsigned n[3];
 };
 
 struct six_lock_count six_lock_counts(struct six_lock *);
+void six_lock_readers_add(struct six_lock *, int);
 
 #endif /* _LINUX_SIX_H */
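The DOC comment above describes the sequence-number and relock pattern; the
commit also replaces six_lock_pcpu_alloc()/six_lock_pcpu_free() with an init
flag and six_lock_exit(). A sketch of the updated API, where struct foo and
foo_demo() are hypothetical:

    struct foo {
            struct six_lock lock;
            /* state protected by lock */
    };

    static void foo_demo(struct foo *foo)
    {
            u32 seq;

            six_lock_init(&foo->lock, SIX_LOCK_INIT_PCPU);

            six_lock_read(&foo->lock, NULL, NULL);
            seq = six_lock_seq(&foo->lock);
            six_unlock_read(&foo->lock);

            /* ... do IO or allocate memory without holding the lock ... */

            if (six_relock_read(&foo->lock, seq)) {
                    /* no write lock was taken in between; state is still valid */
                    six_unlock_read(&foo->lock);
            }

            six_lock_exit(&foo->lock);
    }
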
index dcdef3bcd4c49159fb1962601acb1e5bb9d495c4..f774a660a68122e561082399e0f84d306ddb8eeb 100644 (file)
@@ -269,9 +269,9 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
        struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
        int rw = flags & WRITE;
 
-       if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) {
-               prt_printf(err, "bad val size (%lu != %u)",
-                      bkey_val_u64s(k.k), alloc_v4_u64s(a.v));
+       if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
+               prt_printf(err, "bad val size (%u > %lu)",
+                      alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
                return -BCH_ERR_invalid_bkey;
        }
 
index b58b876fdc7181aa13d46968a7098b0e4d4ea441..ee7ba700e75f4ee3afbac3ab241c7d6d9b1278c3 100644 (file)
@@ -724,7 +724,7 @@ unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
        return 0;
 }
 
-#ifdef CONFIG_X86_64
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
 
 #define I(_x)                  (*(out)++ = (_x))
 #define I1(i0)                                         I(i0)
index 727bed99554bde5336665569ecde728388f6b199..e81fb3e00c602dfca2e544ac85de7b4584c5b92d 100644 (file)
@@ -9,9 +9,17 @@
 #include "util.h"
 #include "vstructs.h"
 
+#if 0
+
+/*
+ * compiled unpack functions are disabled, pending a new interface for
+ * dynamically allocating executable memory:
+ */
+
 #ifdef CONFIG_X86_64
 #define HAVE_BCACHEFS_COMPILED_UNPACK  1
 #endif
+#endif
 
 void bch2_bkey_packed_to_binary_text(struct printbuf *,
                                     const struct bkey_format *,
index 73d326880cbb1168986f011d22a64b4fea9c8167..f8402709190079a6e108f901193880212e0a794a 100644 (file)
@@ -62,10 +62,12 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 
        EBUG_ON(btree_node_write_in_flight(b));
 
+       clear_btree_node_just_written(b);
+
        kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
 #ifdef __KERNEL__
-       vfree(b->aux_data);
+       kvfree(b->aux_data);
 #else
        munmap(b->aux_data, btree_aux_data_bytes(b));
 #endif
@@ -100,7 +102,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
        if (!b->data)
                return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 #ifdef __KERNEL__
-       b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
+       b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
 #else
        b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
                           PROT_READ|PROT_WRITE|PROT_EXEC,
@@ -126,7 +128,6 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
                return NULL;
 
        bkey_btree_ptr_init(&b->key);
-       bch2_btree_lock_init(&b->c);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
        lockdep_set_no_check_recursion(&b->c.lock.dep_map);
 #endif
@@ -150,6 +151,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
                return NULL;
        }
 
+       bch2_btree_lock_init(&b->c, 0);
+
        bc->used++;
        list_add(&b->list, &bc->freeable);
        return b;
@@ -484,7 +487,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
        while (!list_empty(&bc->freed_nonpcpu)) {
                b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
                list_del(&b->list);
-               six_lock_pcpu_free(&b->c.lock);
+               six_lock_exit(&b->c.lock);
                kfree(b);
        }
 
@@ -645,8 +648,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
                mutex_lock(&bc->lock);
        }
 
-       if (pcpu_read_locks)
-               six_lock_pcpu_alloc(&b->c.lock);
+       bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
 
        BUG_ON(!six_trylock_intent(&b->c.lock));
        BUG_ON(!six_trylock_write(&b->c.lock));
@@ -700,6 +702,7 @@ err:
        /* Try to cannibalize another cached btree node: */
        if (bc->alloc_lock == current) {
                b2 = btree_node_cannibalize(c);
+               clear_btree_node_just_written(b2);
                bch2_btree_node_hash_remove(bc, b2);
 
                if (b) {
@@ -784,7 +787,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
        set_btree_node_read_in_flight(b);
 
        six_unlock_write(&b->c.lock);
-       seq = b->c.lock.state.seq;
+       seq = six_lock_seq(&b->c.lock);
        six_unlock_intent(&b->c.lock);
 
        /* Unlock before doing IO: */
@@ -908,7 +911,7 @@ retry:
        }
 
        if (unlikely(btree_node_read_in_flight(b))) {
-               u32 seq = b->c.lock.state.seq;
+               u32 seq = six_lock_seq(&b->c.lock);
 
                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(trans);
@@ -1006,7 +1009,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
        }
 
        if (unlikely(btree_node_read_in_flight(b))) {
-               u32 seq = b->c.lock.state.seq;
+               u32 seq = six_lock_seq(&b->c.lock);
 
                six_unlock_type(&b->c.lock, lock_type);
                bch2_trans_unlock(trans);
index decbbaace1eef03e98a143325a296fceafdd30a8..0a7a18eca3977189bf5330ffcac73f5fe50ee696 100644 (file)
@@ -483,7 +483,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
        struct btree_node_entry *bne;
        bool reinit_iter = false;
 
-       EBUG_ON(!(b->c.lock.state.seq & 1));
+       EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
        BUG_ON(bset_written(b, bset(b, &b->set[1])));
        BUG_ON(btree_node_just_written(b));
 
index 365794dc4dcd66c4b18a1519987e01ed6628d25d..4b9c04dc58db3bc73e9c0121b98cac7a4649f41d 100644 (file)
@@ -652,9 +652,8 @@ void bch2_btree_path_level_init(struct btree_trans *trans,
        BUG_ON(path->cached);
 
        EBUG_ON(!btree_path_pos_in_node(path, b));
-       EBUG_ON(b->c.lock.state.seq & 1);
 
-       path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+       path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
        path->l[b->c.level].b = b;
        __btree_path_level_init(path, b->c.level);
 }
index 02dd81a1d70416baa5593f75301a2f96d99fad61..198e3815093eee7d691732111b68deeb99dcba59 100644 (file)
@@ -42,14 +42,7 @@ static inline struct btree *btree_path_node(struct btree_path *path,
 static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
                                        const struct btree *b, unsigned level)
 {
-       /*
-        * We don't compare the low bits of the lock sequence numbers because
-        * @path might have taken a write lock on @b, and we don't want to skip
-        * the linked path if the sequence numbers were equal before taking that
-        * write lock. The lock sequence number is incremented by taking and
-        * releasing write locks and is even when unlocked:
-        */
-       return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
+       return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
 }
 
 static inline struct btree *btree_node_parent(struct btree_path *path,
index 3b333e3bc2436c8188556121b98308400e5f97f9..645fa994bdc6f54c8c74cce5f1a760a9a80574af 100644 (file)
@@ -252,7 +252,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                }
 
                path->l[0].b = (void *) ck;
-               path->l[0].lock_seq = ck->c.lock.state.seq;
+               path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
                mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
 
                ret = bch2_btree_node_lock_write(trans, path, &ck->c);
@@ -283,9 +283,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                return NULL;
 init:
        INIT_LIST_HEAD(&ck->list);
-       bch2_btree_lock_init(&ck->c);
-       if (pcpu_readers)
-               six_lock_pcpu_alloc(&ck->c.lock);
+       bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
 
        ck->c.cached = true;
        BUG_ON(!six_trylock_intent(&ck->c.lock));
@@ -341,9 +339,6 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
                }
 
                mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
-       } else {
-               if (path->btree_id == BTREE_ID_subvolumes)
-                       six_lock_pcpu_alloc(&ck->c.lock);
        }
 
        ck->c.level             = 0;
@@ -512,7 +507,7 @@ retry:
                mark_btree_node_locked(trans, path, 0, lock_want);
        }
 
-       path->l[0].lock_seq     = ck->c.lock.state.seq;
+       path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
        path->l[0].b            = (void *) ck;
 fill:
        path->uptodate = BTREE_ITER_UPTODATE;
@@ -594,7 +589,7 @@ retry:
                mark_btree_node_locked(trans, path, 0, lock_want);
        }
 
-       path->l[0].lock_seq     = ck->c.lock.state.seq;
+       path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
        path->l[0].b            = (void *) ck;
 fill:
        if (!ck->valid)
@@ -872,7 +867,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                        break;
 
                list_del(&ck->list);
-               six_lock_pcpu_free(&ck->c.lock);
+               six_lock_exit(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
                scanned++;
@@ -888,7 +883,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                        break;
 
                list_del(&ck->list);
-               six_lock_pcpu_free(&ck->c.lock);
+               six_lock_exit(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
                scanned++;
@@ -1013,7 +1008,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 
                list_del(&ck->list);
                kfree(ck->k);
-               six_lock_pcpu_free(&ck->c.lock);
+               six_lock_exit(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
        }
 
index b99986653adefe24ac4a312ab8d960194307d958..70639a1572ff9ef1516287b46e6da56f3b3f236e 100644 (file)
@@ -6,9 +6,10 @@
 
 static struct lock_class_key bch2_btree_node_lock_key;
 
-void bch2_btree_lock_init(struct btree_bkey_cached_common *b)
+void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
+                         enum six_lock_init_flags flags)
 {
-       __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key);
+       __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
 }
 
 #ifdef CONFIG_LOCKDEP
@@ -20,16 +21,6 @@ void bch2_assert_btree_nodes_not_locked(void)
 
 /* Btree node locking: */
 
-static inline void six_lock_readers_add(struct six_lock *lock, int nr)
-{
-       if (lock->readers)
-               this_cpu_add(*lock->readers, nr);
-       else if (nr > 0)
-               atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
-       else
-               atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter);
-}
-
 struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
                                                  struct btree_path *skip,
                                                  struct btree_bkey_cached_common *b,
index 327780ce8e9ae0772e325dedd4b331684962d3ad..b341cc894c61c06dc0c689945e6b0f943a5f2b12 100644 (file)
@@ -14,7 +14,7 @@
 
 #include "btree_iter.h"
 
-void bch2_btree_lock_init(struct btree_bkey_cached_common *);
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
 
 #ifdef CONFIG_LOCKDEP
 void bch2_assert_btree_nodes_not_locked(void);
@@ -176,13 +176,13 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
        struct btree_path *linked;
 
        EBUG_ON(path->l[b->c.level].b != b);
-       EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+       EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
        EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
 
        mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
 
        trans_for_each_path_with_node(trans, b, linked)
-               linked->l[b->c.level].lock_seq += 2;
+               linked->l[b->c.level].lock_seq++;
 
        six_unlock_write(&b->c.lock);
 }
@@ -206,8 +206,8 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
        trans->lock_must_abort  = false;
        trans->locking          = b;
 
-       ret = six_lock_type_ip_waiter(&b->lock, type, &trans->locking_wait,
-                                  bch2_six_check_for_deadlock, trans, ip);
+       ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+                                bch2_six_check_for_deadlock, trans, ip);
        WRITE_ONCE(trans->locking, NULL);
        WRITE_ONCE(trans->locking_wait.start_time, 0);
        return ret;
@@ -284,7 +284,7 @@ static inline int __btree_node_lock_write(struct btree_trans *trans,
                                          bool lock_may_not_fail)
 {
        EBUG_ON(&path->l[b->level].b->c != b);
-       EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
+       EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
        EBUG_ON(!btree_node_intent_locked(path, b->level));
 
        /*
index 6ba0954e648e6ac9161f84fbdc3aa654c780d349..1319337c53828368c9fdebd139bb8bb8b47aaf2b 100644 (file)
@@ -688,7 +688,7 @@ err:
                bch2_trans_unlock(&trans);
                btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
                mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
-               path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+               path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
                path->l[b->c.level].b = b;
 
                bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
index bce42eef6f576001341fe94283a05ea46652a1d9..bd144182c1e12bc2014f7b1359fa03987e77ebf9 100644 (file)
@@ -137,17 +137,17 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
 struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
 {
        struct bch_fs_usage_online *ret;
-       unsigned seq, i, v, u64s = fs_usage_u64s(c) + 1;
+       unsigned nr_replicas = READ_ONCE(c->replicas.nr);
+       unsigned seq, i;
 retry:
-       ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
+       ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_NOFS);
        if (unlikely(!ret))
                return NULL;
 
        percpu_down_read(&c->mark_lock);
 
-       v = fs_usage_u64s(c) + 1;
-       if (unlikely(u64s != v)) {
-               u64s = v;
+       if (nr_replicas != c->replicas.nr) {
+               nr_replicas = c->replicas.nr;
                percpu_up_read(&c->mark_lock);
                kfree(ret);
                goto retry;
@@ -157,10 +157,12 @@ retry:
 
        do {
                seq = read_seqcount_begin(&c->usage_lock);
-               unsafe_memcpy(&ret->u, c->usage_base, u64s * sizeof(u64),
+               unsafe_memcpy(&ret->u, c->usage_base,
+                             __fs_usage_u64s(nr_replicas) * sizeof(u64),
                              "embedded variable length struct");
                for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-                       acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
+                       acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
+                                       __fs_usage_u64s(nr_replicas));
        } while (read_seqcount_retry(&c->usage_lock, seq));
 
        return ret;
index d677b0225c52bf1d9388790e8680805a124f572e..bdf4fff9cb8a8a69e372a03daf9051231ae352bd 100644 (file)
@@ -207,10 +207,24 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,
 
 /* Filesystem usage: */
 
+static inline unsigned __fs_usage_u64s(unsigned nr_replicas)
+{
+       return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas;
+}
+
 static inline unsigned fs_usage_u64s(struct bch_fs *c)
 {
-       return sizeof(struct bch_fs_usage) / sizeof(u64) +
-               READ_ONCE(c->replicas.nr);
+       return __fs_usage_u64s(READ_ONCE(c->replicas.nr));
+}
+
+static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas)
+{
+       return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;
+}
+
+static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
+{
+       return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
 }
 
 static inline unsigned dev_usage_u64s(void)
index 8027c2a14199bc02db05fc73cd8b379b6cf2d330..cfb1779d712a3a51b500f4718c08887126edff3a 100644 (file)
@@ -420,7 +420,9 @@ TRACE_EVENT(btree_path_relock_fail,
                else
                        scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
                __entry->iter_lock_seq          = path->l[level].lock_seq;
-               __entry->node_lock_seq          = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
+               __entry->node_lock_seq          = is_btree_node(path, level)
+                       ? six_lock_seq(&path->l[level].b->c.lock)
+                       : 0;
        ),
 
        TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u",
@@ -475,7 +477,9 @@ TRACE_EVENT(btree_path_upgrade_fail,
                __entry->read_count             = c.n[SIX_LOCK_read];
                __entry->intent_count           = c.n[SIX_LOCK_intent];
                __entry->iter_lock_seq          = path->l[level].lock_seq;
-               __entry->node_lock_seq          = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
+               __entry->node_lock_seq          = is_btree_node(path, level)
+                       ? six_lock_seq(&path->l[level].b->c.lock)
+                       : 0;
        ),
 
        TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
index dfc55fe4c7d17e1f57de2f823ad1b6dc0104e7d0..90796863f7cca6f413139fd4f68ca8452c997686 100644 (file)
@@ -350,11 +350,8 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
 
        if (time_after64(end, start)) {
                duration = end - start;
-               stats->duration_stats = mean_and_variance_update_inlined(stats->duration_stats,
-                                                                duration);
-               stats->duration_stats_weighted = mean_and_variance_weighted_update(
-                       stats->duration_stats_weighted,
-                       duration);
+               stats->duration_stats = mean_and_variance_update(stats->duration_stats, duration);
+               mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
                stats->max_duration = max(stats->max_duration, duration);
                stats->min_duration = min(stats->min_duration, duration);
                bch2_quantiles_update(&stats->quantiles, duration);
@@ -362,10 +359,8 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
 
        if (time_after64(end, stats->last_event)) {
                freq = end - stats->last_event;
-               stats->freq_stats = mean_and_variance_update_inlined(stats->freq_stats, freq);
-               stats->freq_stats_weighted = mean_and_variance_weighted_update(
-                       stats->freq_stats_weighted,
-                       freq);
+               stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq);
+               mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
                stats->max_freq = max(stats->max_freq, freq);
                stats->min_freq = min(stats->min_freq, freq);
                stats->last_event = end;
@@ -594,8 +589,8 @@ void bch2_time_stats_exit(struct bch2_time_stats *stats)
 void bch2_time_stats_init(struct bch2_time_stats *stats)
 {
        memset(stats, 0, sizeof(*stats));
-       stats->duration_stats_weighted.w = 8;
-       stats->freq_stats_weighted.w = 8;
+       stats->duration_stats_weighted.weight = 8;
+       stats->freq_stats_weighted.weight = 8;
        stats->min_duration = U64_MAX;
        stats->min_freq = U64_MAX;
        spin_lock_init(&stats->lock);
index bd08da5f9e70c0c108fcb99932b82fb82f52df5d..eb5f2ba03b7fbdd920b0080600989a0dc76f8e61 100644 (file)
 #include <linux/mean_and_variance.h>
 #include <linux/module.h>
 
-/**
- * fast_divpow2() - fast approximation for n / (1 << d)
- * @n: numerator
- * @d: the power of 2 denominator.
- *
- * note: this rounds towards 0.
- */
-s64 fast_divpow2(s64 n, u8 d)
+u128_u u128_div(u128_u n, u64 d)
 {
-       return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
-}
+       u128_u r;
+       u64 rem;
+       u64 hi = u128_hi(n);
+       u64 lo = u128_lo(n);
+       u64  h =  hi & ((u64) U32_MAX  << 32);
+       u64  l = (hi &  (u64) U32_MAX) << 32;
 
-/**
- * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1
- * and return it.
- * @s1: the mean_and_variance to update.
- * @v1: the new sample.
- *
- * see linked pdf equation 12.
- */
-struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1)
-{
-       return mean_and_variance_update_inlined(s1, v1);
+       r =             u128_shl(u64_to_u128(div64_u64_rem(h,                d, &rem)), 64);
+       r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l  + (rem << 32), d, &rem)), 32));
+       r = u128_add(r,          u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+       return r;
 }
-EXPORT_SYMBOL_GPL(mean_and_variance_update);
+EXPORT_SYMBOL_GPL(u128_div);
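The new u128_div() above is schoolbook long division in 32-bit chunks: divide the most significant chunk, carry the remainder into the next, and accumulate the shifted quotients. Below is a minimal userspace sketch of the same technique, assuming a compiler with unsigned __int128 for the reference result and a hypothetical div64_rem() standing in for the kernel's div64_u64_rem(); the chunked result is exact as long as the carried terms do not overflow 64 bits, which holds for divisors well below 2^32:

#include <stdint.h>
#include <stdio.h>

/* stand-in for the kernel's div64_u64_rem() */
static uint64_t div64_rem(uint64_t n, uint64_t d, uint64_t *rem)
{
	*rem = n % d;
	return n / d;
}

int main(void)
{
	unsigned __int128 n = ((unsigned __int128) 0x123456789abcdef0ULL << 64)
				| 0xfedcba9876543210ULL;
	uint64_t d = 1000003;
	uint64_t hi = n >> 64, lo = (uint64_t) n, rem;

	/* split the high word into two 32-bit chunks, keeping bit positions */
	uint64_t h =  hi & ((uint64_t) UINT32_MAX << 32);
	uint64_t l = (hi &  (uint64_t) UINT32_MAX) << 32;

	/* divide chunk by chunk, feeding each remainder into the next step */
	unsigned __int128 r;
	r  = (unsigned __int128) div64_rem(h,                d, &rem) << 64;
	r += (unsigned __int128) div64_rem(l  + (rem << 32), d, &rem) << 32;
	r += (unsigned __int128) div64_rem(lo + (rem << 32), d, &rem);

	printf("chunked result matches __int128 division: %d\n", r == n / d);
	return 0;
}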
 
 /**
  * mean_and_variance_get_mean() - get mean from @s
  */
 s64 mean_and_variance_get_mean(struct mean_and_variance s)
 {
-       return div64_u64(s.sum, s.n);
+       return s.n ? div64_u64(s.sum, s.n) : 0;
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
 
@@ -85,10 +75,14 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
  */
 u64 mean_and_variance_get_variance(struct mean_and_variance s1)
 {
-       u128 s2 = u128_div(s1.sum_squares, s1.n);
-       u64  s3 = abs(mean_and_variance_get_mean(s1));
+       if (s1.n) {
+               u128_u s2 = u128_div(s1.sum_squares, s1.n);
+               u64  s3 = abs(mean_and_variance_get_mean(s1));
 
-       return u128_to_u64(u128_sub(s2, u128_square(s3)));
+               return u128_lo(u128_sub(s2, u128_square(s3)));
+       } else {
+               return 0;
+       }
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
 
@@ -109,10 +103,26 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
  * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
  * values are stored bitshifted for performance and added precision.
  */
-struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1,
-                                                                   s64 x)
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
 {
-       return mean_and_variance_weighted_update_inlined(s1, x);
+       // previous weighted variance.
+       u8 w            = s->weight;
+       u64 var_w0      = s->variance;
+       // new value weighted.
+       s64 x_w         = x << w;
+       s64 diff_w      = x_w - s->mean;
+       s64 diff        = fast_divpow2(diff_w, w);
+       // new mean weighted.
+       s64 u_w1        = s->mean + diff;
+
+       if (!s->init) {
+               s->mean = x_w;
+               s->variance = 0;
+       } else {
+               s->mean = u_w1;
+               s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+       }
+       s->init = true;
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
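The fixed-point update above is easier to follow against a plain floating-point version of the same recurrence: an exponentially weighted mean and variance with smoothing factor alpha = 2^-w, where the kernel code stores mean and variance pre-shifted by w bits and replaces the divisions with shifts. A sketch only, not the kernel API (bch2_time_stats_init() above uses weight 8; a smaller weight is used here so the movement is visible over a few samples):

#include <stdio.h>

struct ew_stats {
	double	mean;
	double	variance;
	int	init;
};

/* one update step with smoothing factor alpha = 2^-w */
static void ew_update(struct ew_stats *s, double x, unsigned w)
{
	double alpha = 1.0 / (double) (1UL << w);

	if (!s->init) {
		s->mean		= x;
		s->variance	= 0;
		s->init		= 1;
		return;
	}

	double old_mean = s->mean;

	s->mean		+= alpha * (x - old_mean);
	s->variance	 = (1 - alpha) * s->variance +
			   alpha * (x - old_mean) * (x - s->mean);
}

int main(void)
{
	struct ew_stats s = { 0 };
	double samples[] = { 10, 12, 11, 50, 13, 12 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		ew_update(&s, samples[i], 3);

	printf("mean=%.2f variance=%.2f\n", s.mean, s.variance);
	return 0;
}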
 
@@ -121,7 +131,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
  */
 s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
 {
-       return fast_divpow2(s.mean, s.w);
+       return fast_divpow2(s.mean, s.weight);
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
 
@@ -131,7 +141,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
 u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
 {
        // always positive don't need fast divpow2
-       return s.variance >> s.w;
+       return s.variance >> s.weight;
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
 
index 3d366a843eb55b243c0f0c07b9245fcc76c40ae6..a47cd6d008c501fb4c021150671fde3e93b0fccb 100644 (file)
@@ -14,9 +14,9 @@
 #include <trace/events/lock.h>
 
 #ifdef DEBUG
-#define EBUG_ON(cond)          BUG_ON(cond)
+#define EBUG_ON(cond)                  BUG_ON(cond)
 #else
-#define EBUG_ON(cond)          do {} while (0)
+#define EBUG_ON(cond)                  do {} while (0)
 #endif
 
 #define six_acquire(l, t, r, ip)       lock_acquire(l, 0, t, r, 1, NULL, ip)
 
 static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
 
+#define SIX_LOCK_HELD_read_OFFSET      0
+#define SIX_LOCK_HELD_read             ~(~0U << 26)
+#define SIX_LOCK_HELD_intent           (1U << 26)
+#define SIX_LOCK_HELD_write            (1U << 27)
+#define SIX_LOCK_WAITING_read          (1U << (28 + SIX_LOCK_read))
+#define SIX_LOCK_WAITING_intent                (1U << (28 + SIX_LOCK_intent))
+#define SIX_LOCK_WAITING_write         (1U << (28 + SIX_LOCK_write))
+#define SIX_LOCK_NOSPIN                        (1U << 31)
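The new packed layout replaces the old union six_lock_state bitfields: bits 0-25 hold the reader count, bit 26 intent, bit 27 write, bits 28-30 the per-type waiting flags, and bit 31 nospin. An illustration-only decoder over the same masks, showing how six_lock_counts() and the waitlist code read a raw state word:

#include <stdint.h>
#include <stdio.h>

#define HELD_read	(~(~0U << 26))		/* bits 0..25: reader count */
#define HELD_intent	(1U << 26)
#define HELD_write	(1U << 27)
#define WAITING_read	(1U << 28)
#define WAITING_intent	(1U << 29)
#define WAITING_write	(1U << 30)
#define NOSPIN		(1U << 31)

static void decode(uint32_t state)
{
	printf("readers=%u intent=%d write=%d waiting r/i/w=%d/%d/%d nospin=%d\n",
	       state & HELD_read,
	       !!(state & HELD_intent), !!(state & HELD_write),
	       !!(state & WAITING_read), !!(state & WAITING_intent),
	       !!(state & WAITING_write), !!(state & NOSPIN));
}

int main(void)
{
	decode(2);				/* two read locks held */
	decode(HELD_intent | WAITING_write);	/* intent held, writer waiting */
	decode(HELD_write | NOSPIN);		/* write held, spinning disabled */
	return 0;
}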
+
 struct six_lock_vals {
        /* Value we add to the lock in order to take the lock: */
-       u64                     lock_val;
+       u32                     lock_val;
 
        /* If the lock has this value (used as a mask), taking the lock fails: */
-       u64                     lock_fail;
-
-       /* Value we add to the lock in order to release the lock: */
-       u64                     unlock_val;
+       u32                     lock_fail;
 
        /* Mask that indicates lock is held for this type: */
-       u64                     held_mask;
+       u32                     held_mask;
 
        /* Waitlist we wakeup when releasing the lock: */
        enum six_lock_type      unlock_wakeup;
 };
 
-#define __SIX_LOCK_HELD_read   __SIX_VAL(read_lock, ~0)
-#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
-#define __SIX_LOCK_HELD_write  __SIX_VAL(seq, 1)
-
 #define LOCK_VALS {                                                    \
        [SIX_LOCK_read] = {                                             \
-               .lock_val       = __SIX_VAL(read_lock, 1),              \
-               .lock_fail      = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
-               .unlock_val     = -__SIX_VAL(read_lock, 1),             \
-               .held_mask      = __SIX_LOCK_HELD_read,                 \
+               .lock_val       = 1U << SIX_LOCK_HELD_read_OFFSET,      \
+               .lock_fail      = SIX_LOCK_HELD_write,                  \
+               .held_mask      = SIX_LOCK_HELD_read,                   \
                .unlock_wakeup  = SIX_LOCK_write,                       \
        },                                                              \
        [SIX_LOCK_intent] = {                                           \
-               .lock_val       = __SIX_VAL(intent_lock, 1),            \
-               .lock_fail      = __SIX_LOCK_HELD_intent,               \
-               .unlock_val     = -__SIX_VAL(intent_lock, 1),           \
-               .held_mask      = __SIX_LOCK_HELD_intent,               \
+               .lock_val       = SIX_LOCK_HELD_intent,                 \
+               .lock_fail      = SIX_LOCK_HELD_intent,                 \
+               .held_mask      = SIX_LOCK_HELD_intent,                 \
                .unlock_wakeup  = SIX_LOCK_intent,                      \
        },                                                              \
        [SIX_LOCK_write] = {                                            \
-               .lock_val       = __SIX_VAL(seq, 1),                    \
-               .lock_fail      = __SIX_LOCK_HELD_read,                 \
-               .unlock_val     = __SIX_VAL(seq, 1),                    \
-               .held_mask      = __SIX_LOCK_HELD_write,                \
+               .lock_val       = SIX_LOCK_HELD_write,                  \
+               .lock_fail      = SIX_LOCK_HELD_read,                   \
+               .held_mask      = SIX_LOCK_HELD_write,                  \
                .unlock_wakeup  = SIX_LOCK_read,                        \
        },                                                              \
 }
 
+static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
+{
+       if ((atomic_read(&lock->state) & mask) != mask)
+               atomic_or(mask, &lock->state);
+}
+
+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
+{
+       if (atomic_read(&lock->state) & mask)
+               atomic_and(~mask, &lock->state);
+}
+
 static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
-                                union six_lock_state old,
-                                struct task_struct *owner)
+                                u32 old, struct task_struct *owner)
 {
        if (type != SIX_LOCK_intent)
                return;
 
-       if (!old.intent_lock) {
+       if (!(old & SIX_LOCK_HELD_intent)) {
                EBUG_ON(lock->owner);
                lock->owner = owner;
        } else {
@@ -94,22 +104,25 @@ static inline unsigned pcpu_read_count(struct six_lock *lock)
        return read_count;
 }
 
-/* This is probably up there with the more evil things I've done */
-#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-
-static int __do_six_trylock_type(struct six_lock *lock,
-                                enum six_lock_type type,
-                                struct task_struct *task,
-                                bool try)
+/*
+ * __do_six_trylock() - main trylock routine
+ *
+ * Returns 1 on success, 0 on failure
+ *
+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure
+ * for another thread taking the competing lock type, and we may have to do a
+ * wakeup: when a wakeup is required, we return -1 - wakeup_type.
+ */
+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
+                           struct task_struct *task, bool try)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
-       union six_lock_state old, new;
        int ret;
-       u64 v;
+       u32 old, new, v;
 
        EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
-       EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
-       EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
+       EBUG_ON(type == SIX_LOCK_write &&
+               (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
 
        /*
         * Percpu reader mode:
@@ -124,101 +137,75 @@ static int __do_six_trylock_type(struct six_lock *lock,
         * the lock, then issues a full memory barrier, then reads from the
         * other thread's variable to check if the other thread thinks it has
         * the lock. If we raced, we backoff and retry/sleep.
+        *
+        * Failure to take the lock may cause a spurious trylock failure in
+        * another thread, because we temporarily set the lock to indicate that
+        * we held it. This would be a problem for a thread in six_lock(), when
+        * it calls trylock after adding itself to the waitlist and prior to
+        * sleeping.
+        *
+        * Therefore, if we fail to get the lock, and there were waiters of the
+        * type we conflict with, we will have to issue a wakeup.
+        *
+        * Since we may be called under wait_lock (and by the wakeup code
+        * itself), we return that the wakeup has to be done instead of doing it
+        * here.
         */
-
        if (type == SIX_LOCK_read && lock->readers) {
                preempt_disable();
                this_cpu_inc(*lock->readers); /* signal that we own lock */
 
                smp_mb();
 
-               old.v = READ_ONCE(lock->state.v);
-               ret = !(old.v & l[type].lock_fail);
+               old = atomic_read(&lock->state);
+               ret = !(old & l[type].lock_fail);
 
                this_cpu_sub(*lock->readers, !ret);
                preempt_enable();
 
-               /*
-                * If we failed because a writer was trying to take the
-                * lock, issue a wakeup because we might have caused a
-                * spurious trylock failure:
-                */
-#if 0
-               /*
-                * This code should be sufficient, but we're seeing unexplained
-                * lost wakeups:
-                */
-               if (old.write_locking)
+               if (!ret && (old & SIX_LOCK_WAITING_write))
                        ret = -1 - SIX_LOCK_write;
-#else
-               if (!ret)
-                       ret = -1 - SIX_LOCK_write;
-#endif
        } else if (type == SIX_LOCK_write && lock->readers) {
                if (try) {
-                       atomic64_add(__SIX_VAL(write_locking, 1),
-                                    &lock->state.counter);
-                       smp_mb__after_atomic();
-               } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) {
-                       atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write),
-                                    &lock->state.counter);
-                       /*
-                        * pairs with barrier after unlock and before checking
-                        * for readers in unlock path
-                        */
+                       atomic_add(SIX_LOCK_HELD_write, &lock->state);
                        smp_mb__after_atomic();
                }
 
                ret = !pcpu_read_count(lock);
 
-               /*
-                * On success, we increment lock->seq; also we clear
-                * write_locking unless we failed from the lock path:
-                */
-               v = 0;
-               if (ret)
-                       v += __SIX_VAL(seq, 1);
-               if (ret || try)
-                       v -= __SIX_VAL(write_locking, 1);
-
                if (try && !ret) {
-                       old.v = atomic64_add_return(v, &lock->state.counter);
-                       if (old.waiters & (1 << SIX_LOCK_read))
+                       old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
+                       if (old & SIX_LOCK_WAITING_read)
                                ret = -1 - SIX_LOCK_read;
-               } else {
-                       atomic64_add(v, &lock->state.counter);
                }
        } else {
-               v = READ_ONCE(lock->state.v);
+               v = atomic_read(&lock->state);
                do {
-                       new.v = old.v = v;
+                       new = old = v;
 
-                       if (!(old.v & l[type].lock_fail)) {
-                               new.v += l[type].lock_val;
+                       ret = !(old & l[type].lock_fail);
 
-                               if (type == SIX_LOCK_write)
-                                       new.write_locking = 0;
-                       } else if (!try && !(new.waiters & (1 << type)))
-                               new.waiters |= 1 << type;
-                       else
-                               break; /* waiting bit already set */
-               } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-                                       old.v, new.v)) != old.v);
+                       if (!ret || (type == SIX_LOCK_write && !try)) {
+                               smp_mb();
+                               break;
+                       }
 
-               ret = !(old.v & l[type].lock_fail);
+                       new += l[type].lock_val;
+               } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old);
 
-               EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
+               EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
        }
 
        if (ret > 0)
                six_set_owner(lock, type, old, task);
 
-       EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking));
+       EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
+               (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
 
        return ret;
 }
 
-static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
 {
        struct six_lock_waiter *w, *next;
        struct task_struct *task;
@@ -237,7 +224,7 @@ again:
                        goto unlock;
                saw_one = true;
 
-               ret = __do_six_trylock_type(lock, lock_type, w->task, false);
+               ret = __do_six_trylock(lock, lock_type, w->task, false);
                if (ret <= 0)
                        goto unlock;
 
@@ -252,7 +239,7 @@ again:
                wake_up_process(task);
        }
 
-       clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v);
+       six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
 unlock:
        raw_spin_unlock(&lock->wait_lock);
 
@@ -262,96 +249,74 @@ unlock:
        }
 }
 
-static inline void six_lock_wakeup(struct six_lock *lock,
-                                  union six_lock_state state,
-                                  enum six_lock_type lock_type)
+__always_inline
+static void six_lock_wakeup(struct six_lock *lock, u32 state,
+                           enum six_lock_type lock_type)
 {
-       if (lock_type == SIX_LOCK_write && state.read_lock)
+       if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
                return;
 
-       if (!(state.waiters & (1 << lock_type)))
+       if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
                return;
 
        __six_lock_wakeup(lock, lock_type);
 }
 
-static bool do_six_trylock_type(struct six_lock *lock,
-                               enum six_lock_type type,
-                               bool try)
+__always_inline
+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
 {
        int ret;
 
-       ret = __do_six_trylock_type(lock, type, current, try);
+       ret = __do_six_trylock(lock, type, current, try);
        if (ret < 0)
                __six_lock_wakeup(lock, -ret - 1);
 
        return ret > 0;
 }
 
-__always_inline __flatten
-static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type,
-                              unsigned long ip)
+/**
+ * six_trylock_ip - attempt to take a six lock without blocking
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
 {
-       if (!do_six_trylock_type(lock, type, true))
+       if (!do_six_trylock(lock, type, true))
                return false;
 
        if (type != SIX_LOCK_write)
                six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
        return true;
 }
-
-__always_inline __flatten
-static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
-                             unsigned seq, unsigned long ip)
+EXPORT_SYMBOL_GPL(six_trylock_ip);
+
+/**
+ * six_relock_ip - attempt to re-take a lock that was held previously
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq:       lock sequence number obtained from six_lock_seq() while lock was
+ *             held previously
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+                  unsigned seq, unsigned long ip)
 {
-       const struct six_lock_vals l[] = LOCK_VALS;
-       union six_lock_state old;
-       u64 v;
-
-       EBUG_ON(type == SIX_LOCK_write);
-
-       if (type == SIX_LOCK_read &&
-           lock->readers) {
-               bool ret;
-
-               preempt_disable();
-               this_cpu_inc(*lock->readers);
-
-               smp_mb();
-
-               old.v = READ_ONCE(lock->state.v);
-               ret = !(old.v & l[type].lock_fail) && old.seq == seq;
-
-               this_cpu_sub(*lock->readers, !ret);
-               preempt_enable();
-
-               /*
-                * Similar to the lock path, we may have caused a spurious write
-                * lock fail and need to issue a wakeup:
-                */
-               if (ret)
-                       six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
-               else
-                       six_lock_wakeup(lock, old, SIX_LOCK_write);
+       if (lock->seq != seq || !six_trylock_ip(lock, type, ip))
+               return false;
 
-               return ret;
+       if (lock->seq != seq) {
+               six_unlock_ip(lock, type, ip);
+               return false;
        }
 
-       v = READ_ONCE(lock->state.v);
-       do {
-               old.v = v;
-
-               if (old.seq != seq || old.v & l[type].lock_fail)
-                       return false;
-       } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-                               old.v,
-                               old.v + l[type].lock_val)) != old.v);
-
-       six_set_owner(lock, type, old, current);
-       if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
        return true;
 }
+EXPORT_SYMBOL_GPL(six_relock_ip);
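six_relock_ip() supports the drop-and-retake pattern bcachefs uses for btree node locks: record the sequence number while the lock is held, drop the lock, and later retake it only if no write lock intervened. A kernel-context sketch, assuming the six_lock_seq() helper referenced in the trace.h hunk above and a caller that currently holds the lock for read:

static bool drop_and_retake(struct six_lock *lock)
{
	u32 seq = six_lock_seq(lock);		/* remember seq while held */

	six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);

	/* ... work that must not be done under the lock ... */

	/*
	 * Fails if a write lock was taken in the interim; the caller must
	 * then revalidate whatever the lock was protecting.
	 */
	return six_relock_ip(lock, SIX_LOCK_read, seq, _THIS_IP_);
}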
 
 #ifdef CONFIG_LOCK_SPIN_ON_OWNER
 
@@ -371,17 +336,6 @@ static inline bool six_can_spin_on_owner(struct six_lock *lock)
        return ret;
 }
 
-static inline void six_set_nospin(struct six_lock *lock)
-{
-       union six_lock_state old, new;
-       u64 v = READ_ONCE(lock->state.v);
-
-       do {
-               new.v = old.v = v;
-               new.nospin = true;
-       } while ((v = atomic64_cmpxchg(&lock->state.counter, old.v, new.v)) != old.v);
-}
-
 static inline bool six_spin_on_owner(struct six_lock *lock,
                                     struct task_struct *owner,
                                     u64 end_time)
@@ -405,7 +359,7 @@ static inline bool six_spin_on_owner(struct six_lock *lock,
                }
 
                if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
-                       six_set_nospin(lock);
+                       six_set_bitmask(lock, SIX_LOCK_NOSPIN);
                        ret = false;
                        break;
                }
@@ -445,7 +399,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
                if (owner && !six_spin_on_owner(lock, owner, end_time))
                        break;
 
-               if (do_six_trylock_type(lock, type, false)) {
+               if (do_six_trylock(lock, type, false)) {
                        osq_unlock(&lock->osq);
                        preempt_enable();
                        return true;
@@ -494,17 +448,16 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
 #endif
 
 noinline
-static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
-                                   struct six_lock_waiter *wait,
-                                   six_lock_should_sleep_fn should_sleep_fn, void *p,
-                                   unsigned long ip)
+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
+                            struct six_lock_waiter *wait,
+                            six_lock_should_sleep_fn should_sleep_fn, void *p,
+                            unsigned long ip)
 {
-       union six_lock_state old;
        int ret = 0;
 
        if (type == SIX_LOCK_write) {
-               EBUG_ON(lock->state.write_locking);
-               atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
+               EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+               atomic_add(SIX_LOCK_HELD_write, &lock->state);
                smp_mb__after_atomic();
        }
 
@@ -519,13 +472,12 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
        wait->lock_acquired     = false;
 
        raw_spin_lock(&lock->wait_lock);
-       if (!(lock->state.waiters & (1 << type)))
-               set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v);
+       six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
        /*
-        * Retry taking the lock after taking waitlist lock, have raced with an
-        * unlock:
+        * Retry taking the lock after taking waitlist lock, in case we raced
+        * with an unlock:
         */
-       ret = __do_six_trylock_type(lock, type, current, false);
+       ret = __do_six_trylock(lock, type, current, false);
        if (ret <= 0) {
                wait->start_time = local_clock();
 
@@ -565,7 +517,7 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
                                list_del(&wait->list);
                        raw_spin_unlock(&lock->wait_lock);
 
-                       if (wait->lock_acquired)
+                       if (unlikely(wait->lock_acquired))
                                do_six_unlock_type(lock, type);
                        break;
                }
@@ -575,21 +527,49 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
 
        __set_current_state(TASK_RUNNING);
 out:
-       if (ret && type == SIX_LOCK_write && lock->state.write_locking) {
-               old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
-                                           &lock->state.counter);
-               six_lock_wakeup(lock, old, SIX_LOCK_read);
+       if (ret && type == SIX_LOCK_write) {
+               six_clear_bitmask(lock, SIX_LOCK_HELD_write);
+               six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
        }
        trace_contention_end(lock, 0);
 
        return ret;
 }
 
-__always_inline __flatten
-static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
-                        struct six_lock_waiter *wait,
-                        six_lock_should_sleep_fn should_sleep_fn, void *p,
-                        unsigned long ip)
+/**
+ * six_lock_ip_waiter - take a lock, with full waitlist interface
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait:      pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *             to scheduling
+ * @p:         passed through to @should_sleep_fn
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * This is the most general six_lock() variant, with parameters to support full
+ * cycle detection for deadlock avoidance.
+ *
+ * The code calling this function must implement tracking of held locks, and the
+ * @wait object should be embedded into the struct that tracks held locks -
+ * which must also be accessible in a thread-safe way.
+ *
+ * @should_sleep_fn should invoke the cycle detector; it should walk each
+ * lock's waiters, and for each waiter recursively walk their held locks.
+ *
+ * When this function must block, @wait will be added to @lock's waitlist before
+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be
+ * removed from the lock waitlist until the lock has been successfully acquired,
+ * or we abort.
+ *
+ * @wait.start_time will be monotonically increasing for any given waitlist, and
+ * thus may be used as a loop cursor.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+                      struct six_lock_waiter *wait,
+                      six_lock_should_sleep_fn should_sleep_fn, void *p,
+                      unsigned long ip)
 {
        int ret;
 
@@ -598,8 +578,8 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type
        if (type != SIX_LOCK_write)
                six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
 
-       ret = do_six_trylock_type(lock, type, true) ? 0
-               : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p, ip);
+       ret = do_six_trylock(lock, type, true) ? 0
+               : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
 
        if (ret && type != SIX_LOCK_write)
                six_release(&lock->dep_map, ip);
@@ -608,22 +588,13 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type
 
        return ret;
 }
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
 
 __always_inline
-static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
-                          six_lock_should_sleep_fn should_sleep_fn, void *p,
-                          unsigned long ip)
-{
-       struct six_lock_waiter wait;
-
-       return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-__always_inline __flatten
 static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
 {
        const struct six_lock_vals l[] = LOCK_VALS;
-       union six_lock_state state;
+       u32 state;
 
        if (type == SIX_LOCK_intent)
                lock->owner = NULL;
@@ -633,26 +604,39 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
                smp_mb(); /* unlock barrier */
                this_cpu_dec(*lock->readers);
                smp_mb(); /* between unlocking and checking for waiters */
-               state.v = READ_ONCE(lock->state.v);
+               state = atomic_read(&lock->state);
        } else {
-               u64 v = l[type].unlock_val;
+               u32 v = l[type].lock_val;
 
                if (type != SIX_LOCK_read)
-                       v -= lock->state.v & __SIX_VAL(nospin, 1);
+                       v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
 
-               EBUG_ON(!(lock->state.v & l[type].held_mask));
-               state.v = atomic64_add_return_release(v, &lock->state.counter);
+               EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
+               state = atomic_sub_return_release(v, &lock->state);
        }
 
        six_lock_wakeup(lock, state, l[type].unlock_wakeup);
 }
 
-__always_inline __flatten
-static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type,
-                             unsigned long ip)
+/**
+ * six_unlock_ip - drop a six lock
+ * @lock:      lock to unlock
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock);                          read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);      read count 2
+ * six_unlock_read(&foo->lock);                         read count 1
+ * six_unlock_read(&foo->lock);                         read count 0
+ */
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
 {
        EBUG_ON(type == SIX_LOCK_write &&
-               !(lock->state.v & __SIX_LOCK_HELD_intent));
+               !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
        EBUG_ON((type == SIX_LOCK_write ||
                 type == SIX_LOCK_intent) &&
                lock->owner != current);
@@ -666,52 +650,18 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type,
                return;
        }
 
+       lock->seq += type == SIX_LOCK_write;
+
        do_six_unlock_type(lock, type);
 }
+EXPORT_SYMBOL_GPL(six_unlock_ip);
 
-#define __SIX_LOCK(type)                                               \
-bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)    \
-{                                                                      \
-       return __six_trylock_type(lock, SIX_LOCK_##type, ip);           \
-}                                                                      \
-EXPORT_SYMBOL_GPL(six_trylock_ip_##type);                              \
-                                                                       \
-bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{                                                                      \
-       return __six_relock_type(lock, SIX_LOCK_##type, seq, ip);       \
-}                                                                      \
-EXPORT_SYMBOL_GPL(six_relock_ip_##type);                               \
-                                                                       \
-int six_lock_ip_##type(struct six_lock *lock,                          \
-                   six_lock_should_sleep_fn should_sleep_fn, void *p,  \
-                   unsigned long ip)                                   \
-{                                                                      \
-       return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-}                                                                      \
-EXPORT_SYMBOL_GPL(six_lock_ip_##type);                                 \
-                                                                       \
-int six_lock_ip_waiter_##type(struct six_lock *lock,                   \
-                          struct six_lock_waiter *wait,                \
-                          six_lock_should_sleep_fn should_sleep_fn, void *p,\
-                          unsigned long ip)                            \
-{                                                                      \
-       return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-}                                                                      \
-EXPORT_SYMBOL_GPL(six_lock_ip_waiter_##type);                          \
-                                                                       \
-void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)     \
-{                                                                      \
-       __six_unlock_type(lock, SIX_LOCK_##type, ip);                   \
-}                                                                      \
-EXPORT_SYMBOL_GPL(six_unlock_ip_##type);
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-
-#undef __SIX_LOCK
-
-/* Convert from intent to read: */
+/**
+ * six_lock_downgrade - convert an intent lock to a read lock
+ * @lock:      lock to downgrade
+ *
+ * @lock will have read count incremented and intent count decremented
+ */
 void six_lock_downgrade(struct six_lock *lock)
 {
        six_lock_increment(lock, SIX_LOCK_read);
@@ -719,25 +669,33 @@ void six_lock_downgrade(struct six_lock *lock)
 }
 EXPORT_SYMBOL_GPL(six_lock_downgrade);
 
+/**
+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock
+ * @lock:      lock to upgrade
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
 bool six_lock_tryupgrade(struct six_lock *lock)
 {
-       union six_lock_state old, new;
-       u64 v = READ_ONCE(lock->state.v);
+       const struct six_lock_vals l[] = LOCK_VALS;
+       u32 old, new, v = atomic_read(&lock->state);
 
        do {
-               new.v = old.v = v;
+               new = old = v;
 
-               if (new.intent_lock)
+               if (new & SIX_LOCK_HELD_intent)
                        return false;
 
                if (!lock->readers) {
-                       EBUG_ON(!new.read_lock);
-                       new.read_lock--;
+                       EBUG_ON(!(new & SIX_LOCK_HELD_read));
+                       new -= l[SIX_LOCK_read].lock_val;
                }
 
-               new.intent_lock = 1;
-       } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-                               old.v, new.v)) != old.v);
+               new |= SIX_LOCK_HELD_intent;
+       } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old);
 
        if (lock->readers)
                this_cpu_dec(*lock->readers);
@@ -748,6 +706,17 @@ bool six_lock_tryupgrade(struct six_lock *lock)
 }
 EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
 
+/**
+ * six_trylock_convert - attempt to convert a held lock from one type to another
+ * @lock:      lock to upgrade
+ * @from:      SIX_LOCK_read or SIX_LOCK_intent
+ * @to:                SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * On success, @lock will have the @from count decremented and the @to count
+ * incremented.
+ *
+ * Return: true on success, false on failure
+ */
 bool six_trylock_convert(struct six_lock *lock,
                         enum six_lock_type from,
                         enum six_lock_type to)
@@ -766,9 +735,16 @@ bool six_trylock_convert(struct six_lock *lock,
 }
 EXPORT_SYMBOL_GPL(six_trylock_convert);
 
-/*
- * Increment read/intent lock count, assuming we already have it read or intent
- * locked:
+/**
+ * six_lock_increment - increase held lock count on a lock that is already held
+ * @lock:      lock to increment
+ * @type:      SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * @lock must already be held, with a lock type that is greater than or equal to
+ * @type
+ *
+ * A corresponding six_unlock_type() call will be required for @lock to be fully
+ * unlocked.
  */
 void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 {
@@ -783,13 +759,14 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
                if (lock->readers) {
                        this_cpu_inc(*lock->readers);
                } else {
-                       EBUG_ON(!lock->state.read_lock &&
-                               !lock->state.intent_lock);
-                       atomic64_add(l[type].lock_val, &lock->state.counter);
+                       EBUG_ON(!(atomic_read(&lock->state) &
+                                 (SIX_LOCK_HELD_read|
+                                  SIX_LOCK_HELD_intent)));
+                       atomic_add(l[type].lock_val, &lock->state);
                }
                break;
        case SIX_LOCK_intent:
-               EBUG_ON(!lock->state.intent_lock);
+               EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
                lock->intent_lock_recurse++;
                break;
        case SIX_LOCK_write:
@@ -799,9 +776,19 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 }
 EXPORT_SYMBOL_GPL(six_lock_increment);
 
+/**
+ * six_lock_wakeup_all - wake up all waiters on @lock
+ * @lock:      lock to wake up waiters for
+ *
+ * Waking up waiters will cause them to re-run should_sleep_fn, which may then
+ * abort the lock operation.
+ *
+ * This function is never needed in a bug-free program; it's only useful in
+ * debug code, e.g. to determine if a cycle detector is at fault.
+ */
 void six_lock_wakeup_all(struct six_lock *lock)
 {
-       union six_lock_state state = lock->state;
+       u32 state = atomic_read(&lock->state);
        struct six_lock_waiter *w;
 
        six_lock_wakeup(lock, state, SIX_LOCK_read);
@@ -815,38 +802,96 @@ void six_lock_wakeup_all(struct six_lock *lock)
 }
 EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
 
-void six_lock_pcpu_free(struct six_lock *lock)
-{
-       BUG_ON(lock->readers && pcpu_read_count(lock));
-       BUG_ON(lock->state.read_lock);
-
-       free_percpu(lock->readers);
-       lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
-
-void six_lock_pcpu_alloc(struct six_lock *lock)
-{
-#ifdef __KERNEL__
-       if (!lock->readers)
-               lock->readers = alloc_percpu(unsigned);
-#endif
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
-
-/*
- * Returns lock held counts, for both read and intent
+/**
+ * six_lock_counts - return held lock counts, for each lock type
+ * @lock:      lock to return counters for
+ *
+ * Return: the number of times a lock is held for read, intent and write.
  */
 struct six_lock_count six_lock_counts(struct six_lock *lock)
 {
        struct six_lock_count ret;
 
        ret.n[SIX_LOCK_read]    = !lock->readers
-               ? lock->state.read_lock
+               ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
                : pcpu_read_count(lock);
-       ret.n[SIX_LOCK_intent]  = lock->state.intent_lock + lock->intent_lock_recurse;
-       ret.n[SIX_LOCK_write]   = lock->state.seq & 1;
+       ret.n[SIX_LOCK_intent]  = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
+               lock->intent_lock_recurse;
+       ret.n[SIX_LOCK_write]   = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
 
        return ret;
 }
 EXPORT_SYMBOL_GPL(six_lock_counts);
+
+/**
+ * six_lock_readers_add - directly manipulate reader count of a lock
+ * @lock:      lock to add/subtract readers for
+ * @nr:                reader count to add/subtract
+ *
+ * When an upper layer is implementing lock reentrancy, we may have both read
+ * and intent locks on the same lock.
+ *
+ * When we need to take a write lock, the read locks will cause self-deadlock,
+ * because six locks themselves do not track which read locks are held by the
+ * current thread and which are held by a different thread - they do no
+ * per-thread tracking of held locks.
+ *
+ * The upper layer that is tracking held locks may however, if trylock() has
+ * failed, count up its own read locks, subtract them, take the write lock, and
+ * then re-add them.
+ *
+ * As in any other situation when taking a write lock, @lock must be held for
+ * intent one (or more) times, so @lock will never be left unlocked.
+ */
+void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+       if (lock->readers) {
+               this_cpu_add(*lock->readers, nr);
+       } else {
+               EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
+               /* reader count starts at bit 0 */
+               atomic_add(nr, &lock->state);
+       }
+}
+EXPORT_SYMBOL_GPL(six_lock_readers_add);
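A sketch of the pattern described above, calling six_lock_ip_waiter() (exported earlier in this file) directly; the caller, its my_reads count of read locks held by this thread, and the use of a NULL should_sleep_fn are assumptions for illustration, and the lock is taken to already be held for intent:

static void take_write_despite_own_readers(struct six_lock *lock, unsigned my_reads)
{
	struct six_lock_waiter wait;

	six_lock_readers_add(lock, -(int) my_reads);	/* hide our own readers */
	six_lock_ip_waiter(lock, SIX_LOCK_write, &wait,
			   NULL, NULL, _THIS_IP_);	/* only foreign readers block us now */
	six_lock_readers_add(lock, my_reads);		/* restore the count */
}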
+
+/**
+ * six_lock_exit - release resources held by a lock prior to freeing
+ * @lock:      lock to exit
+ *
+ * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
+ * required to free the percpu read counts.
+ */
+void six_lock_exit(struct six_lock *lock)
+{
+       WARN_ON(lock->readers && pcpu_read_count(lock));
+       WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
+
+       free_percpu(lock->readers);
+       lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_exit);
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+                    struct lock_class_key *key, enum six_lock_init_flags flags)
+{
+       atomic_set(&lock->state, 0);
+       raw_spin_lock_init(&lock->wait_lock);
+       INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+       lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+
+       if (flags & SIX_LOCK_INIT_PCPU) {
+               /*
+                * We don't return an error here on memory allocation failure
+                * since percpu is an optimization, and locks will work with the
+                * same semantics in non-percpu mode: callers can check for
+                * failure if they wish by checking lock->readers, but generally
+                * will not want to treat it as an error.
+                */
+               lock->readers = alloc_percpu(unsigned);
+       }
+}
+EXPORT_SYMBOL_GPL(__six_lock_init);
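Finally, a usage sketch for bringing a lock up in percpu-reader mode and tearing it down again; the six_lock_init() convenience macro is assumed to be provided by six.h, since only __six_lock_init() and six_lock_exit() appear in this diff:

static struct six_lock demo_lock;

static int demo_init(void)
{
	six_lock_init(&demo_lock, SIX_LOCK_INIT_PCPU);

	/* percpu allocation may have failed; the lock still works, just slower */
	if (!demo_lock.readers)
		pr_debug("six lock falling back to non-percpu mode\n");
	return 0;
}

static void demo_exit(void)
{
	six_lock_exit(&demo_lock);	/* frees the percpu read counts, if any */
}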