X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fsix.c;h=458a1de0a6e39c89bb6840be729a4fc3f2a52ec5;hb=9799b119c34d7be1ee96d143209cfe5fc543d92a;hp=c60a67307773327e8c5d6e4e915e4f11c43c0d83;hpb=c35fbbc025c6099969befb4dfaf065215cf40cf3;p=bcachefs-tools-debian diff --git a/libbcachefs/six.c b/libbcachefs/six.c index c60a673..458a1de 100644 --- a/libbcachefs/six.c +++ b/libbcachefs/six.c @@ -1,158 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0 +#include #include +#include #include #include #include +#include #include +#include +#include + +#include #include "six.h" -#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) -#define six_release(l) lock_release(l, 0, _RET_IP_) +#ifdef DEBUG +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) do {} while (0) +#endif + +#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) +#define six_release(l, ip) lock_release(l, ip) -#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) -#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) -#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); + +#define SIX_LOCK_HELD_read_OFFSET 0 +#define SIX_LOCK_HELD_read ~(~0U << 26) +#define SIX_LOCK_HELD_intent (1U << 26) +#define SIX_LOCK_HELD_write (1U << 27) +#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) +#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) +#define SIX_LOCK_NOSPIN (1U << 31) struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ - u64 lock_val; + u32 lock_val; /* If the lock has this value (used as a mask), taking the lock fails: */ - u64 lock_fail; - - /* Value we add to the lock in order to release the lock: */ - u64 unlock_val; + u32 lock_fail; /* Mask that indicates lock is held for this type: */ - u64 held_mask; + u32 held_mask; /* Waitlist we wakeup when releasing the lock: */ enum six_lock_type unlock_wakeup; }; -#define LOCK_VALS { \ - [SIX_LOCK_read] = { \ - .lock_val = __SIX_VAL(read_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_write, \ - .unlock_val = -__SIX_VAL(read_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_read, \ - .unlock_wakeup = SIX_LOCK_write, \ - }, \ - [SIX_LOCK_intent] = { \ - .lock_val = __SIX_VAL(intent_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_intent, \ - .unlock_val = -__SIX_VAL(intent_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_intent, \ - .unlock_wakeup = SIX_LOCK_intent, \ - }, \ - [SIX_LOCK_write] = { \ - .lock_val = __SIX_VAL(seq, 1), \ - .lock_fail = __SIX_LOCK_HELD_read, \ - .unlock_val = __SIX_VAL(seq, 1), \ - .held_mask = __SIX_LOCK_HELD_write, \ - .unlock_wakeup = SIX_LOCK_read, \ - }, \ -} +static const struct six_lock_vals l[] = { + [SIX_LOCK_read] = { + .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, + .lock_fail = SIX_LOCK_HELD_write, + .held_mask = SIX_LOCK_HELD_read, + .unlock_wakeup = SIX_LOCK_write, + }, + [SIX_LOCK_intent] = { + .lock_val = SIX_LOCK_HELD_intent, + .lock_fail = SIX_LOCK_HELD_intent, + .held_mask = SIX_LOCK_HELD_intent, + .unlock_wakeup = SIX_LOCK_intent, + }, + [SIX_LOCK_write] = { + .lock_val = SIX_LOCK_HELD_write, + .lock_fail = SIX_LOCK_HELD_read, + .held_mask = SIX_LOCK_HELD_write, + .unlock_wakeup = SIX_LOCK_read, + }, +}; -static void six_set_owner(struct six_lock *lock, enum six_lock_type type) +static inline void six_set_bitmask(struct six_lock *lock, u32 mask) { - if (type == SIX_LOCK_intent) - lock->owner = current; + if ((atomic_read(&lock->state) & mask) != mask) + atomic_or(mask, &lock->state); } 
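/*
 * Illustrative sketch (not part of the patch): how the packed u32 state word
 * defined above decomposes. six_state_read_count() and six_state_held() are
 * hypothetical helpers, shown only to clarify the bit layout: bits 0-25 hold
 * the read count, bit 26 the intent bit, bit 27 the write bit, bits 28-29 the
 * per-type waiting bits and bit 31 the nospin flag.
 */
static inline unsigned six_state_read_count(u32 state)
{
	return (state & SIX_LOCK_HELD_read) >> SIX_LOCK_HELD_read_OFFSET;
}

static inline bool six_state_held(u32 state, enum six_lock_type type)
{
	return state & l[type].held_mask;
}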
-static void six_clear_owner(struct six_lock *lock, enum six_lock_type type)
+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
 {
-	if (type == SIX_LOCK_intent)
-		lock->owner = NULL;
+	if (atomic_read(&lock->state) & mask)
+		atomic_and(~mask, &lock->state);
 }
 
-static inline bool __six_trylock_type(struct six_lock *lock,
-				      enum six_lock_type type)
+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
+				 u32 old, struct task_struct *owner)
 {
-	const struct six_lock_vals l[] = LOCK_VALS;
-	union six_lock_state old;
-	u64 v = READ_ONCE(lock->state.v);
+	if (type != SIX_LOCK_intent)
+		return;
 
-	do {
-		old.v = v;
+	if (!(old & SIX_LOCK_HELD_intent)) {
+		EBUG_ON(lock->owner);
+		lock->owner = owner;
+	} else {
+		EBUG_ON(lock->owner != current);
+	}
+}
 
-		EBUG_ON(type == SIX_LOCK_write &&
-			((old.v & __SIX_LOCK_HELD_write) ||
-			 !(old.v & __SIX_LOCK_HELD_intent)));
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+	unsigned read_count = 0;
+	int cpu;
 
-		if (old.v & l[type].lock_fail)
-			return false;
-	} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-				old.v,
-				old.v + l[type].lock_val)) != old.v);
-	return true;
+	for_each_possible_cpu(cpu)
+		read_count += *per_cpu_ptr(lock->readers, cpu);
+	return read_count;
 }
 
-bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+/*
+ * __do_six_trylock() - main trylock routine
+ *
+ * Returns 1 on success, 0 on failure
+ *
+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure
+ * for another thread taking the competing lock type, and we may have to do a
+ * wakeup: when a wakeup is required, we return -1 - wakeup_type.
+ */
+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
+			    struct task_struct *task, bool try)
 {
-	bool ret = __six_trylock_type(lock, type);
+	int ret;
+	u32 old;
+
+	EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
+	EBUG_ON(type == SIX_LOCK_write &&
+		(try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
+
+	/*
+	 * Percpu reader mode:
+	 *
+	 * The basic idea behind this algorithm is that you can implement a lock
+	 * between two threads without any atomics, just memory barriers:
+	 *
+	 * For two threads you'll need two variables, one variable for "thread a
+	 * has the lock" and another for "thread b has the lock".
+	 *
+	 * To take the lock, a thread sets its variable indicating that it holds
+	 * the lock, then issues a full memory barrier, then reads from the
+	 * other thread's variable to check if the other thread thinks it has
+	 * the lock. If we raced, we backoff and retry/sleep.
+	 *
+	 * Failure to take the lock may cause a spurious trylock failure in
+	 * another thread, because we temporarily set the lock to indicate that
+	 * we held it. This would be a problem for a thread in six_lock(), when
+	 * they are calling trylock after adding themself to the waitlist and
+	 * prior to sleeping.
+	 *
+	 * Therefore, if we fail to get the lock, and there were waiters of the
+	 * type we conflict with, we will have to issue a wakeup.
+	 *
+	 * Since we may be called under wait_lock (and by the wakeup code
+	 * itself), we return that the wakeup has to be done instead of doing it
+	 * here.
+ */ + if (type == SIX_LOCK_read && lock->readers) { + preempt_disable(); + this_cpu_inc(*lock->readers); /* signal that we own lock */ + + smp_mb(); + + old = atomic_read(&lock->state); + ret = !(old & l[type].lock_fail); - if (ret) { - six_acquire(&lock->dep_map, 1); - six_set_owner(lock, type); + this_cpu_sub(*lock->readers, !ret); + preempt_enable(); + + if (!ret && (old & SIX_LOCK_WAITING_write)) + ret = -1 - SIX_LOCK_write; + } else if (type == SIX_LOCK_write && lock->readers) { + if (try) { + atomic_add(SIX_LOCK_HELD_write, &lock->state); + smp_mb__after_atomic(); + } + + ret = !pcpu_read_count(lock); + + if (try && !ret) { + old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); + if (old & SIX_LOCK_WAITING_read) + ret = -1 - SIX_LOCK_read; + } + } else { + old = atomic_read(&lock->state); + do { + ret = !(old & l[type].lock_fail); + if (!ret || (type == SIX_LOCK_write && !try)) { + smp_mb(); + break; + } + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); + + EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); } + if (ret > 0) + six_set_owner(lock, type, old, task); + + EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && + (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); + return ret; } -bool six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) +static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) { - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old; - u64 v = READ_ONCE(lock->state.v); + struct six_lock_waiter *w, *next; + struct task_struct *task; + bool saw_one; + int ret; +again: + ret = 0; + saw_one = false; + raw_spin_lock(&lock->wait_lock); - do { - old.v = v; + list_for_each_entry_safe(w, next, &lock->wait_list, list) { + if (w->lock_want != lock_type) + continue; - if (old.seq != seq || old.v & l[type].lock_fail) - return false; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, - old.v + l[type].lock_val)) != old.v); + if (saw_one && lock_type != SIX_LOCK_read) + goto unlock; + saw_one = true; + + ret = __do_six_trylock(lock, lock_type, w->task, false); + if (ret <= 0) + goto unlock; + + /* + * Similar to percpu_rwsem_wake_function(), we need to guard + * against the wakee noticing w->lock_acquired, returning, and + * then exiting before we do the wakeup: + */ + task = get_task_struct(w->task); + __list_del(w->list.prev, w->list.next); + /* + * The release barrier here ensures the ordering of the + * __list_del before setting w->lock_acquired; @w is on the + * stack of the thread doing the waiting and will be reused + * after it sees w->lock_acquired with no other locking: + * pairs with smp_load_acquire() in six_lock_slowpath() + */ + smp_store_release(&w->lock_acquired, true); + wake_up_process(task); + put_task_struct(task); + } + + six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); +unlock: + raw_spin_unlock(&lock->wait_lock); + + if (ret < 0) { + lock_type = -ret - 1; + goto again; + } +} - six_acquire(&lock->dep_map, 1); - six_set_owner(lock, type); +__always_inline +static void six_lock_wakeup(struct six_lock *lock, u32 state, + enum six_lock_type lock_type) +{ + if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) + return; + + if (!(state & (SIX_LOCK_WAITING_read << lock_type))) + return; + + __six_lock_wakeup(lock, lock_type); +} + +__always_inline +static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) +{ + int ret; + + ret = __do_six_trylock(lock, type, 
current, try); + if (ret < 0) + __six_lock_wakeup(lock, -ret - 1); + + return ret > 0; +} + +/** + * six_trylock_ip - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. + */ +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) +{ + if (!do_six_trylock(lock, type, true)) + return false; + + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } +EXPORT_SYMBOL_GPL(six_trylock_ip); + +/** + * six_relock_ip - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. + */ +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip) +{ + if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) + return false; -struct six_lock_waiter { - struct list_head list; - struct task_struct *task; -}; + if (six_lock_seq(lock) != seq) { + six_unlock_ip(lock, type, ip); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(six_relock_ip); -/* This is probably up there with the more evil things I've done */ -#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) +#ifdef CONFIG_LOCK_SPIN_ON_OWNER -static inline int six_can_spin_on_owner(struct six_lock *lock) +static inline bool six_can_spin_on_owner(struct six_lock *lock) { struct task_struct *owner; - int retval = 1; + bool ret; if (need_resched()) - return 0; + return false; rcu_read_lock(); owner = READ_ONCE(lock->owner); - if (owner) - retval = owner->on_cpu; + ret = !owner || owner_on_cpu(owner); rcu_read_unlock(); - /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. 
- */ - return retval; + + return ret; } -static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner) +static inline bool six_spin_on_owner(struct six_lock *lock, + struct task_struct *owner, + u64 end_time) { bool ret = true; + unsigned loop = 0; rcu_read_lock(); while (lock->owner == owner) { @@ -164,7 +356,13 @@ static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner) */ barrier(); - if (!owner->on_cpu || need_resched()) { + if (!owner_on_cpu(owner) || need_resched()) { + ret = false; + break; + } + + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); ret = false; break; } @@ -176,9 +374,10 @@ static bool six_spin_on_owner(struct six_lock *lock, struct task_struct *owner) return ret; } -static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) { struct task_struct *task = current; + u64 end_time; if (type == SIX_LOCK_write) return false; @@ -190,6 +389,8 @@ static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) if (!osq_lock(&lock->osq)) goto fail; + end_time = sched_clock() + 10 * NSEC_PER_USEC; + while (1) { struct task_struct *owner; @@ -198,10 +399,10 @@ static bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) * release the lock or go to sleep. */ owner = READ_ONCE(lock->owner); - if (owner && !six_spin_on_owner(lock, owner)) + if (owner && !six_spin_on_owner(lock, owner, end_time)) break; - if (__six_trylock_type(lock, type)) { + if (do_six_trylock(lock, type, false)) { osq_unlock(&lock->osq); preempt_enable(); return true; @@ -240,160 +441,477 @@ fail: return false; } -void six_lock_type(struct six_lock *lock, enum six_lock_type type) +#else /* CONFIG_LOCK_SPIN_ON_OWNER */ + +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + return false; +} + +#endif + +noinline +static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old, new; - struct six_lock_waiter wait; - u64 v; + int ret = 0; - six_acquire(&lock->dep_map, 0); + if (type == SIX_LOCK_write) { + EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + atomic_add(SIX_LOCK_HELD_write, &lock->state); + smp_mb__after_atomic(); + } - if (__six_trylock_type(lock, type)) - goto done; + trace_contention_begin(lock, 0); + lock_contended(&lock->dep_map, ip); if (six_optimistic_spin(lock, type)) - goto done; + goto out; + + wait->task = current; + wait->lock_want = type; + wait->lock_acquired = false; - lock_contended(&lock->dep_map, _RET_IP_); + raw_spin_lock(&lock->wait_lock); + six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); + /* + * Retry taking the lock after taking waitlist lock, in case we raced + * with an unlock: + */ + ret = __do_six_trylock(lock, type, current, false); + if (ret <= 0) { + wait->start_time = local_clock(); - INIT_LIST_HEAD(&wait.list); - wait.task = current; + if (!list_empty(&lock->wait_list)) { + struct six_lock_waiter *last = + list_last_entry(&lock->wait_list, + struct six_lock_waiter, list); + + if (time_before_eq64(wait->start_time, last->start_time)) + wait->start_time = last->start_time + 1; + } + + list_add_tail(&wait->list, &lock->wait_list); + } + raw_spin_unlock(&lock->wait_lock); + + if (unlikely(ret > 
0)) { + ret = 0; + goto out; + } + + if (unlikely(ret < 0)) { + __six_lock_wakeup(lock, -ret - 1); + ret = 0; + } while (1) { set_current_state(TASK_UNINTERRUPTIBLE); - if (list_empty_careful(&wait.list)) { + + /* + * Ensures that writes to the waitlist entry happen after we see + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup + */ + if (smp_load_acquire(&wait->lock_acquired)) + break; + + ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; + if (unlikely(ret)) { + bool acquired; + + /* + * If should_sleep_fn() returns an error, we are + * required to return that error even if we already + * acquired the lock - should_sleep_fn() might have + * modified external state (e.g. when the deadlock cycle + * detector in bcachefs issued a transaction restart) + */ raw_spin_lock(&lock->wait_lock); - list_add_tail(&wait.list, &lock->wait_list[type]); + acquired = wait->lock_acquired; + if (!acquired) + list_del(&wait->list); raw_spin_unlock(&lock->wait_lock); - } - v = READ_ONCE(lock->state.v); - do { - new.v = old.v = v; - - if (!(old.v & l[type].lock_fail)) - new.v += l[type].lock_val; - else if (!(new.waiters & (1 << type))) - new.waiters |= 1 << type; - else - break; /* waiting bit already set */ - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, new.v)) != old.v); - - if (!(old.v & l[type].lock_fail)) + if (unlikely(acquired)) + do_six_unlock_type(lock, type); break; + } schedule(); } __set_current_state(TASK_RUNNING); - - if (!list_empty_careful(&wait.list)) { - raw_spin_lock(&lock->wait_lock); - list_del_init(&wait.list); - raw_spin_unlock(&lock->wait_lock); +out: + if (ret && type == SIX_LOCK_write) { + six_clear_bitmask(lock, SIX_LOCK_HELD_write); + six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); } -done: - lock_acquired(&lock->dep_map, _RET_IP_); - six_set_owner(lock, type); + trace_contention_end(lock, 0); + + return ret; } -static inline void six_lock_wakeup(struct six_lock *lock, - union six_lock_state state, - unsigned waitlist_id) +/** + * six_lock_ip_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * This is the most general six_lock() variant, with parameters to support full + * cycle detection for deadlock avoidance. + * + * The code calling this function must implement tracking of held locks, and the + * @wait object should be embedded into the struct that tracks held locks - + * which must also be accessible in a thread-safe way. + * + * @should_sleep_fn should invoke the cycle detector; it should walk each + * lock's waiters, and for each waiter recursively walk their held locks. + * + * When this function must block, @wait will be added to @lock's waitlist before + * calling trylock, and before calling @should_sleep_fn, and @wait will not be + * removed from the lock waitlist until the lock has been successfully acquired, + * or we abort. + * + * @wait.start_time will be monotonically increasing for any given waitlist, and + * thus may be used as a loop cursor. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. 
+ */
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+		       struct six_lock_waiter *wait,
+		       six_lock_should_sleep_fn should_sleep_fn, void *p,
+		       unsigned long ip)
 {
-	struct list_head *wait_list = &lock->wait_list[waitlist_id];
-	struct six_lock_waiter *w, *next;
+	int ret;
 
-	if (waitlist_id == SIX_LOCK_write && state.read_lock)
-		return;
+	wait->start_time = 0;
 
-	if (!(state.waiters & (1 << waitlist_id)))
-		return;
+	if (type != SIX_LOCK_write)
+		six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
 
-	clear_bit(waitlist_bitnr(waitlist_id),
-		  (unsigned long *) &lock->state.v);
+	ret = do_six_trylock(lock, type, true) ? 0
+		: six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
 
-	raw_spin_lock(&lock->wait_lock);
+	if (ret && type != SIX_LOCK_write)
+		six_release(&lock->dep_map, ip);
+	if (!ret)
+		lock_acquired(&lock->dep_map, ip);
 
-	list_for_each_entry_safe(w, next, wait_list, list) {
-		list_del_init(&w->list);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
 
-		if (wake_up_process(w->task) &&
-		    waitlist_id != SIX_LOCK_read) {
-			if (!list_empty(wait_list))
-				set_bit(waitlist_bitnr(waitlist_id),
-					(unsigned long *) &lock->state.v);
-			break;
-		}
+__always_inline
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+	u32 state;
+
+	if (type == SIX_LOCK_intent)
+		lock->owner = NULL;
+
+	if (type == SIX_LOCK_read &&
+	    lock->readers) {
+		smp_mb(); /* unlock barrier */
+		this_cpu_dec(*lock->readers);
+		smp_mb(); /* between unlocking and checking for waiters */
+		state = atomic_read(&lock->state);
+	} else {
+		u32 v = l[type].lock_val;
+
+		if (type != SIX_LOCK_read)
+			v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
+
+		EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
+		state = atomic_sub_return_release(v, &lock->state);
 	}
 
-	raw_spin_unlock(&lock->wait_lock);
+	six_lock_wakeup(lock, state, l[type].unlock_wakeup);
 }
 
-void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+/**
+ * six_unlock_ip - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock);				read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);	read count 2
+ * six_lock_unlock(&foo->lock, SIX_LOCK_read);		read count 1
+ * six_lock_unlock(&foo->lock, SIX_LOCK_read);		read count 0
+ */
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
 {
-	const struct six_lock_vals l[] = LOCK_VALS;
-	union six_lock_state state;
-
-	six_clear_owner(lock, type);
-
-	EBUG_ON(!(lock->state.v & l[type].held_mask));
 	EBUG_ON(type == SIX_LOCK_write &&
-		!(lock->state.v & __SIX_LOCK_HELD_intent));
+		!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+	EBUG_ON((type == SIX_LOCK_write ||
+		 type == SIX_LOCK_intent) &&
+		lock->owner != current);
+
+	if (type != SIX_LOCK_write)
+		six_release(&lock->dep_map, ip);
+	else
+		lock->seq++;
+
+	if (type == SIX_LOCK_intent &&
+	    lock->intent_lock_recurse) {
+		--lock->intent_lock_recurse;
+		return;
+	}
 
-	state.v = atomic64_add_return_release(l[type].unlock_val,
-					      &lock->state.counter);
-	six_release(&lock->dep_map);
-	six_lock_wakeup(lock, state, l[type].unlock_wakeup);
+	do_six_unlock_type(lock, type);
 }
+EXPORT_SYMBOL_GPL(six_unlock_ip);
 
-bool six_trylock_convert(struct six_lock *lock,
-			 enum six_lock_type from,
-			 enum six_lock_type to)
+/**
+ * six_lock_downgrade - convert an intent lock to a read lock
+ * @lock: lock to downgrade
+ *
+ * @lock will have read count incremented and intent count decremented
+ */
+void six_lock_downgrade(struct six_lock *lock)
 {
-	const struct six_lock_vals l[] = LOCK_VALS;
-	union six_lock_state old, new;
-	u64 v = READ_ONCE(lock->state.v);
+	six_lock_increment(lock, SIX_LOCK_read);
+	six_unlock_intent(lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_downgrade);
+
+/**
+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock
+ * @lock: lock to upgrade
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
+bool six_lock_tryupgrade(struct six_lock *lock)
+{
+	u32 old = atomic_read(&lock->state), new;
 
 	do {
-		new.v = old.v = v;
-		new.v += l[from].unlock_val;
+		new = old;
 
-		if (new.v & l[to].lock_fail)
+		if (new & SIX_LOCK_HELD_intent)
 			return false;
-	} while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
-				old.v,
-				new.v + l[to].lock_val)) != old.v);
 
-	six_clear_owner(lock, from);
-	six_set_owner(lock, to);
+		if (!lock->readers) {
+			EBUG_ON(!(new & SIX_LOCK_HELD_read));
+			new -= l[SIX_LOCK_read].lock_val;
+		}
+
+		new |= SIX_LOCK_HELD_intent;
+	} while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
+
+	if (lock->readers)
+		this_cpu_dec(*lock->readers);
 
-	six_lock_wakeup(lock, new, l[from].unlock_wakeup);
+	six_set_owner(lock, SIX_LOCK_intent, old, current);
 
 	return true;
 }
+EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
+
+/**
+ * six_trylock_convert - attempt to convert a held lock from one type to another
+ * @lock: lock to upgrade
+ * @from: SIX_LOCK_read or SIX_LOCK_intent
+ * @to: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
+bool six_trylock_convert(struct six_lock *lock,
+			 enum six_lock_type from,
+			 enum six_lock_type to)
+{
+	EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
 
-/*
- * Increment read/intent lock count, assuming we already have it read or intent
- * locked:
+	if (to == from)
+		return true;
+
+	if (to == SIX_LOCK_read) {
+		six_lock_downgrade(lock);
+		return true;
+	} else {
+		return six_lock_tryupgrade(lock);
+	}
+}
+EXPORT_SYMBOL_GPL(six_trylock_convert);
+
+/**
+ * six_lock_increment - increase held lock count on a lock that is already held
+ * @lock: lock to increment
+ * @type: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * @lock must already be held, with a lock type that is greater than or equal to
+ * @type
+ *
+ * A corresponding six_unlock_type() call will be required for @lock to be fully
+ * unlocked.
  */
 void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
 {
-	const struct six_lock_vals l[] = LOCK_VALS;
-
-	EBUG_ON(type == SIX_LOCK_write);
-	six_acquire(&lock->dep_map, 0);
+	six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
 
 	/* XXX: assert already locked, and that we don't overflow: */
 
-	atomic64_add(l[type].lock_val, &lock->state.counter);
+	switch (type) {
+	case SIX_LOCK_read:
+		if (lock->readers) {
+			this_cpu_inc(*lock->readers);
+		} else {
+			EBUG_ON(!(atomic_read(&lock->state) &
+				  (SIX_LOCK_HELD_read|
+				   SIX_LOCK_HELD_intent)));
+			atomic_add(l[type].lock_val, &lock->state);
+		}
+		break;
+	case SIX_LOCK_intent:
+		EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+		lock->intent_lock_recurse++;
+		break;
+	case SIX_LOCK_write:
+		BUG();
+		break;
+	}
 }
+EXPORT_SYMBOL_GPL(six_lock_increment);
+
+/**
+ * six_lock_wakeup_all - wake up all waiters on @lock
+ * @lock: lock to wake up waiters for
+ *
+ * Waking up waiters will cause them to re-run should_sleep_fn, which may then
+ * abort the lock operation.
+ *
+ * This function is never needed in a bug-free program; it's only useful in
+ * debug code, e.g. to determine if a cycle detector is at fault.
+ */
+void six_lock_wakeup_all(struct six_lock *lock)
+{
+	u32 state = atomic_read(&lock->state);
+	struct six_lock_waiter *w;
 
-/* Convert from intent to read: */
-void six_lock_downgrade(struct six_lock *lock)
+	six_lock_wakeup(lock, state, SIX_LOCK_read);
+	six_lock_wakeup(lock, state, SIX_LOCK_intent);
+	six_lock_wakeup(lock, state, SIX_LOCK_write);
+
+	raw_spin_lock(&lock->wait_lock);
+	list_for_each_entry(w, &lock->wait_list, list)
+		wake_up_process(w->task);
+	raw_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+/**
+ * six_lock_counts - return held lock counts, for each lock type
+ * @lock: lock to return counters for
+ *
+ * Return: the number of times a lock is held for read, intent and write.
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
 {
-	six_lock_increment(lock, SIX_LOCK_read);
-	six_unlock_intent(lock);
+	struct six_lock_count ret;
+
+	ret.n[SIX_LOCK_read] = !lock->readers
+		? atomic_read(&lock->state) & SIX_LOCK_HELD_read
+		: pcpu_read_count(lock);
+	ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
+		lock->intent_lock_recurse;
+	ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
+
+/**
+ * six_lock_readers_add - directly manipulate reader count of a lock
+ * @lock: lock to add/subtract readers for
+ * @nr: reader count to add/subtract
+ *
+ * When an upper layer is implementing lock reentrancy, we may have both read
+ * and intent locks on the same lock.
+ *
+ * When we need to take a write lock, the read locks will cause self-deadlock,
+ * because six locks themselves do not track which read locks are held by the
+ * current thread and which are held by a different thread - they do no
+ * per-thread tracking of held locks.
+ *
+ * The upper layer that is tracking held locks may however, if trylock() has
+ * failed, count up its own read locks, subtract them, take the write lock, and
+ * then re-add them.
+ *
+ * As in any other situation when taking a write lock, @lock must be held for
+ * intent one (or more) times, so @lock will never be left unlocked.
+ */
+void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+	if (lock->readers) {
+		this_cpu_add(*lock->readers, nr);
+	} else {
+		EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
+		/* reader count starts at bit 0 */
+		atomic_add(nr, &lock->state);
+	}
+}
+EXPORT_SYMBOL_GPL(six_lock_readers_add);
+
+/**
+ * six_lock_exit - release resources held by a lock prior to freeing
+ * @lock: lock to exit
+ *
+ * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
+ * required to free the percpu read counts.
+ */
+void six_lock_exit(struct six_lock *lock)
+{
+	WARN_ON(lock->readers && pcpu_read_count(lock));
+	WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
+
+	free_percpu(lock->readers);
+	lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_exit);
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+		     struct lock_class_key *key, enum six_lock_init_flags flags)
+{
+	atomic_set(&lock->state, 0);
+	raw_spin_lock_init(&lock->wait_lock);
+	INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+	lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+
+	/*
+	 * Don't assume that we have real percpu variables available in
+	 * userspace:
+	 */
+#ifdef __KERNEL__
+	if (flags & SIX_LOCK_INIT_PCPU) {
+		/*
+		 * We don't return an error here on memory allocation failure
+		 * since percpu is an optimization, and locks will work with the
+		 * same semantics in non-percpu mode: callers can check for
+		 * failure if they wish by checking lock->readers, but generally
+		 * will not want to treat it as an error.
+		 */
+		lock->readers = alloc_percpu(unsigned);
+	}
+#endif
 }
+EXPORT_SYMBOL_GPL(__six_lock_init);
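
Usage sketch (not part of the patch): a minimal example of how the shared/intent/exclusive lock types compose after this change, using the six_lock_read()/six_lock_intent()/six_lock_write() and six_unlock_*() wrappers declared in six.h. struct foo and its helpers are hypothetical, and six_lock_init() is assumed to take the init-flags argument that __six_lock_init() above accepts; NULL is passed for should_sleep_fn, so the lock calls cannot fail here.

#include "six.h"

struct foo {
	struct six_lock	lock;
	u64		value;
};

static void foo_init(struct foo *f)
{
	six_lock_init(&f->lock, 0);	/* 0: no SIX_LOCK_INIT_PCPU */
	f->value = 0;
}

static u64 foo_read_value(struct foo *f)
{
	u64 v;

	/* read is shared: any number of readers may hold it concurrently */
	six_lock_read(&f->lock, NULL, NULL);
	v = f->value;
	six_unlock_read(&f->lock);
	return v;
}

static void foo_update_value(struct foo *f, u64 new)
{
	/* intent excludes other intent/write holders, but not readers */
	six_lock_intent(&f->lock, NULL, NULL);
	/* write requires intent to already be held; it excludes readers */
	six_lock_write(&f->lock, NULL, NULL);
	f->value = new;
	six_unlock_write(&f->lock);
	six_unlock_intent(&f->lock);
}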