author      Kent Overstreet <kent.overstreet@linux.dev>   2023-05-25 17:52:28 -0400
committer   Kent Overstreet <kent.overstreet@linux.dev>   2023-05-25 22:25:34 -0400
commit      1f78fed4693a5361f56508daac59bebd5b556379 (patch)
tree        267c710018040b6caa9193a1ee34e514317709c4
parent      b8b8dcfaed641eabeec8ba070e1e23665bc4ceb2 (diff)
download    bcachefs-tools-1f78fed4693a5361f56508daac59bebd5b556379.tar.gz
Update bcachefs sources to 31c09369cd six locks: Fix an uninitialized var
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r--  .bcachefs_revision                    |   2
-rw-r--r--  include/linux/atomic.h                |  14
-rw-r--r--  include/linux/mean_and_variance.h     | 219
-rw-r--r--  include/linux/six.h                   | 449
-rw-r--r--  libbcachefs/alloc_background.c        |   6
-rw-r--r--  libbcachefs/bkey.c                    |   2
-rw-r--r--  libbcachefs/bkey.h                    |   8
-rw-r--r--  libbcachefs/btree_cache.c             |  21
-rw-r--r--  libbcachefs/btree_io.c                |   2
-rw-r--r--  libbcachefs/btree_iter.c              |   3
-rw-r--r--  libbcachefs/btree_iter.h              |   9
-rw-r--r--  libbcachefs/btree_key_cache.c         |  19
-rw-r--r--  libbcachefs/btree_locking.c           |  15
-rw-r--r--  libbcachefs/btree_locking.h           |  12
-rw-r--r--  libbcachefs/btree_update_interior.c   |   2
-rw-r--r--  libbcachefs/buckets.c                 |  16
-rw-r--r--  libbcachefs/buckets.h                 |  18
-rw-r--r--  libbcachefs/trace.h                   |   8
-rw-r--r--  libbcachefs/util.c                    |  17
-rw-r--r--  linux/mean_and_variance.c             |  72
-rw-r--r--  linux/six.c                           | 643
21 files changed, 865 insertions, 692 deletions
diff --git a/.bcachefs_revision b/.bcachefs_revision
index 1f415ca7..1d85f952 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-799716df00709f7480f575e8fd626915bafba006
+31c09369cd01b34fb8ba845fa09776576b03a1e2
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index a9852fa1..79cf5aa9 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -32,6 +32,8 @@ typedef struct {
#define __ATOMIC_SUB(v, p) uatomic_sub(p, v)
#define __ATOMIC_INC(p) uatomic_inc(p)
#define __ATOMIC_DEC(p) uatomic_dec(p)
+#define __ATOMIC_AND(v, p) uatomic_and(p, v)
+#define __ATOMIC_OR(v, p) uatomic_or(p, v)
#define xchg(p, v) uatomic_xchg(p, v)
#define xchg_acquire(p, v) uatomic_xchg(p, v)
@@ -56,6 +58,8 @@ typedef struct {
#define __ATOMIC_SUB_RETURN(v, p) __atomic_sub_fetch(p, v, __ATOMIC_RELAXED)
#define __ATOMIC_SUB_RETURN_RELEASE(v, p) \
__atomic_sub_fetch(p, v, __ATOMIC_RELEASE)
+#define __ATOMIC_AND(v, p) __atomic_and_fetch(p, v, __ATOMIC_RELAXED)
+#define __ATOMIC_OR(v, p) __atomic_or_fetch(p, v, __ATOMIC_RELAXED)
#define xchg(p, v) __atomic_exchange_n(p, v, __ATOMIC_SEQ_CST)
#define xchg_acquire(p, v) __atomic_exchange_n(p, v, __ATOMIC_ACQUIRE)
@@ -244,6 +248,16 @@ static inline bool a_type##_inc_not_zero(a_type##_t *v) \
return a_type##_add_unless(v, 1, 0); \
} \
\
+static inline void a_type##_and(i_type a, a_type##_t *v) \
+{ \
+ __ATOMIC_AND(a, v); \
+} \
+ \
+static inline void a_type##_or(i_type a, a_type##_t *v) \
+{ \
+ __ATOMIC_OR(a, v); \
+} \
+ \
static inline i_type a_type##_xchg(a_type##_t *v, i_type i) \
{ \
return xchg(&v->counter, i); \
diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h
index 756eb3d1..9ed79f42 100644
--- a/include/linux/mean_and_variance.h
+++ b/include/linux/mean_and_variance.h
@@ -2,122 +2,112 @@
#ifndef MEAN_AND_VARIANCE_H_
#define MEAN_AND_VARIANCE_H_
-#include <linux/kernel.h>
#include <linux/types.h>
+#include <linux/kernel.h>
#include <linux/limits.h>
#include <linux/math64.h>
+#include <stdlib.h>
#define SQRT_U64_MAX 4294967295ULL
-/**
- * abs - return absolute value of an argument
- * @x: the value. If it is unsigned type, it is converted to signed type first.
- * char is treated as if it was signed (regardless of whether it really is)
- * but the macro's return type is preserved as char.
- *
- * Return: an absolute value of x.
+/*
+ * u128_u: u128 user mode, because not all architectures support a real int128
+ * type
*/
-#define abs(x) __abs_choose_expr(x, long long, \
- __abs_choose_expr(x, long, \
- __abs_choose_expr(x, int, \
- __abs_choose_expr(x, short, \
- __abs_choose_expr(x, char, \
- __builtin_choose_expr( \
- __builtin_types_compatible_p(typeof(x), char), \
- (char)({ signed char __x = (x); __x<0?-__x:__x; }), \
- ((void)0)))))))
-#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \
- __builtin_types_compatible_p(typeof(x), signed type) || \
- __builtin_types_compatible_p(typeof(x), unsigned type), \
- ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other)
+#ifdef __SIZEOF_INT128__
-#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
-
-typedef unsigned __int128 u128;
+typedef struct {
+ unsigned __int128 v;
+} __aligned(16) u128_u;
-static inline u128 u64_to_u128(u64 a)
+static inline u128_u u64_to_u128(u64 a)
{
- return (u128)a;
+ return (u128_u) { .v = a };
}
-static inline u64 u128_to_u64(u128 a)
+static inline u64 u128_lo(u128_u a)
{
- return (u64)a;
+ return a.v;
}
-static inline u64 u128_shr64_to_u64(u128 a)
+static inline u64 u128_hi(u128_u a)
{
- return (u64)(a >> 64);
+ return a.v >> 64;
}
-static inline u128 u128_add(u128 a, u128 b)
+static inline u128_u u128_add(u128_u a, u128_u b)
{
- return a + b;
+ a.v += b.v;
+ return a;
}
-static inline u128 u128_sub(u128 a, u128 b)
+static inline u128_u u128_sub(u128_u a, u128_u b)
{
- return a - b;
+ a.v -= b.v;
+ return a;
}
-static inline u128 u128_shl(u128 i, s8 shift)
+static inline u128_u u128_shl(u128_u a, s8 shift)
{
- return i << shift;
+ a.v <<= shift;
+ return a;
}
-static inline u128 u128_shl64_add(u64 a, u64 b)
+static inline u128_u u128_square(u64 a)
{
- return ((u128)a << 64) + b;
-}
+ u128_u b = u64_to_u128(a);
-static inline u128 u128_square(u64 i)
-{
- return i*i;
+ b.v *= b.v;
+ return b;
}
#else
typedef struct {
u64 hi, lo;
-} u128;
+} __aligned(16) u128_u;
+
+/* conversions */
-static inline u128 u64_to_u128(u64 a)
+static inline u128_u u64_to_u128(u64 a)
{
- return (u128){ .lo = a };
+ return (u128_u) { .lo = a };
}
-static inline u64 u128_to_u64(u128 a)
+static inline u64 u128_lo(u128_u a)
{
return a.lo;
}
-static inline u64 u128_shr64_to_u64(u128 a)
+static inline u64 u128_hi(u128_u a)
{
return a.hi;
}
-static inline u128 u128_add(u128 a, u128 b)
+/* arithmetic */
+
+static inline u128_u u128_add(u128_u a, u128_u b)
{
- u128 c;
+ u128_u c;
c.lo = a.lo + b.lo;
c.hi = a.hi + b.hi + (c.lo < a.lo);
return c;
}
-static inline u128 u128_sub(u128 a, u128 b)
+static inline u128_u u128_sub(u128_u a, u128_u b)
{
- u128 c;
+ u128_u c;
c.lo = a.lo - b.lo;
c.hi = a.hi - b.hi - (c.lo > a.lo);
return c;
}
-static inline u128 u128_shl(u128 i, s8 shift)
+static inline u128_u u128_shl(u128_u i, s8 shift)
{
- u128 r;
+ u128_u r;
r.lo = i.lo << shift;
if (shift < 64)
@@ -129,15 +119,10 @@ static inline u128 u128_shl(u128 i, s8 shift)
return r;
}
-static inline u128 u128_shl64_add(u64 a, u64 b)
+static inline u128_u u128_square(u64 i)
{
- return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b));
-}
-
-static inline u128 u128_square(u64 i)
-{
- u128 r;
- u64 h = i >> 32, l = i & (u64)U32_MAX;
+ u128_u r;
+ u64 h = i >> 32, l = i & U32_MAX;
r = u128_shl(u64_to_u128(h*h), 64);
r = u128_add(r, u128_shl(u64_to_u128(h*l), 32));
@@ -148,85 +133,69 @@ static inline u128 u128_square(u64 i)
#endif
-static inline u128 u128_div(u128 n, u64 d)
+static inline u128_u u64s_to_u128(u64 hi, u64 lo)
{
- u128 r;
- u64 rem;
- u64 hi = u128_shr64_to_u64(n);
- u64 lo = u128_to_u64(n);
- u64 h = hi & ((u64)U32_MAX << 32);
- u64 l = (hi & (u64)U32_MAX) << 32;
+ u128_u c = u64_to_u128(hi);
- r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
- r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
- r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
- return r;
+ c = u128_shl(c, 64);
+ c = u128_add(c, u64_to_u128(lo));
+ return c;
}
+u128_u u128_div(u128_u n, u64 d);
+
struct mean_and_variance {
- s64 n;
- s64 sum;
- u128 sum_squares;
+ s64 n;
+ s64 sum;
+ u128_u sum_squares;
};
/* exponentially weighted variant */
struct mean_and_variance_weighted {
- bool init;
- u8 w;
- s64 mean;
- u64 variance;
+ bool init;
+ u8 weight; /* base 2 logarithm */
+ s64 mean;
+ u64 variance;
};
-s64 fast_divpow2(s64 n, u8 d);
+/**
+ * fast_divpow2() - fast approximation for n / (1 << d)
+ * @n: numerator
+ * @d: the power of 2 denominator.
+ *
+ * note: this rounds towards 0.
+ */
+static inline s64 fast_divpow2(s64 n, u8 d)
+{
+ return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
+}
+/**
+ * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1
+ * and return it.
+ * @s1: the mean_and_variance to update.
+ * @v1: the new sample.
+ *
+ * see linked pdf equation 12.
+ */
static inline struct mean_and_variance
-mean_and_variance_update_inlined(struct mean_and_variance s1, s64 v1)
-{
- struct mean_and_variance s2;
- u64 v2 = abs(v1);
-
- s2.n = s1.n + 1;
- s2.sum = s1.sum + v1;
- s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2));
- return s2;
-}
-
-static inline struct mean_and_variance_weighted
-mean_and_variance_weighted_update_inlined(struct mean_and_variance_weighted s1, s64 x)
-{
- struct mean_and_variance_weighted s2;
- // previous weighted variance.
- u64 var_w0 = s1.variance;
- u8 w = s2.w = s1.w;
- // new value weighted.
- s64 x_w = x << w;
- s64 diff_w = x_w - s1.mean;
- s64 diff = fast_divpow2(diff_w, w);
- // new mean weighted.
- s64 u_w1 = s1.mean + diff;
-
- BUG_ON(w % 2 != 0);
-
- if (!s1.init) {
- s2.mean = x_w;
- s2.variance = 0;
- } else {
- s2.mean = u_w1;
- s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
- }
- s2.init = true;
-
- return s2;
+mean_and_variance_update(struct mean_and_variance s, s64 v)
+{
+ return (struct mean_and_variance) {
+ .n = s.n + 1,
+ .sum = s.sum + v,
+ .sum_squares = u128_add(s.sum_squares, u128_square(abs(v))),
+ };
}
-struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1);
- s64 mean_and_variance_get_mean(struct mean_and_variance s);
- u64 mean_and_variance_get_variance(struct mean_and_variance s1);
- u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+s64 mean_and_variance_get_mean(struct mean_and_variance s);
+u64 mean_and_variance_get_variance(struct mean_and_variance s1);
+u32 mean_and_variance_get_stddev(struct mean_and_variance s);
+
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
-struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1);
- s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
- u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
- u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
#endif // MEAN_AND_VARIANCE_H_
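
The u128_u fallback above emulates 128-bit arithmetic with a hi/lo pair of u64s, and fast_divpow2() is documented as rounding towards zero. A minimal standalone sketch of both ideas, written against plain stdint.h types rather than this tree's headers (the names u128_emul_add and fast_divpow2_emul are illustrative only, not part of the sources):

#include <stdint.h>
#include <stdio.h>

/* hi/lo emulation of a 128-bit unsigned integer, mirroring the fallback path */
struct u128_emul { uint64_t hi, lo; };

static struct u128_emul u128_emul_add(struct u128_emul a, struct u128_emul b)
{
	struct u128_emul c;

	c.lo = a.lo + b.lo;
	c.hi = a.hi + b.hi + (c.lo < a.lo);	/* carry out of the low word */
	return c;
}

/* fast_divpow2(): n / 2^d, rounding towards zero even for negative n */
static int64_t fast_divpow2_emul(int64_t n, uint8_t d)
{
	return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
}

int main(void)
{
	struct u128_emul a = { .hi = 0, .lo = UINT64_MAX };
	struct u128_emul b = { .hi = 0, .lo = 1 };
	struct u128_emul c = u128_emul_add(a, b);	/* carries into hi: c = 1:0 */

	printf("hi=%llu lo=%llu\n", (unsigned long long) c.hi, (unsigned long long) c.lo);
	printf("%lld %lld\n",
	       (long long) fast_divpow2_emul(-7, 1),	/* -3, not -4 as a plain >> would give */
	       (long long) fast_divpow2_emul(7, 1));	/* 3 */
	return 0;
}
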
diff --git a/include/linux/six.h b/include/linux/six.h
index 83023f64..394da423 100644
--- a/include/linux/six.h
+++ b/include/linux/six.h
@@ -3,59 +3,124 @@
#ifndef _LINUX_SIX_H
#define _LINUX_SIX_H
-/*
- * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
- * semaphores, except with a third intermediate state, intent. Basic operations
- * are:
+/**
+ * DOC: SIX locks overview
*
- * six_lock_read(&foo->lock);
- * six_unlock_read(&foo->lock);
+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
+ * but with an additional state: read/shared, intent, exclusive/write
*
- * six_lock_intent(&foo->lock);
- * six_unlock_intent(&foo->lock);
+ * The purpose of the intent state is to allow for greater concurrency on tree
+ * structures without deadlocking. In general, a read can't be upgraded to a
+ * write lock without deadlocking, so an operation that updates multiple nodes
+ * will have to take write locks for the full duration of the operation.
*
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
+ * But by adding an intent state, which is exclusive with other intent locks but
+ * not with readers, we can take intent locks at the start of the operation,
+ * and then take write locks only for the actual update to each individual
+ * node, without deadlocking.
*
- * Intent locks block other intent locks, but do not block read locks, and you
- * must have an intent lock held before taking a write lock, like so:
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
*
- * six_lock_intent(&foo->lock);
- * six_lock_write(&foo->lock);
- * six_unlock_write(&foo->lock);
- * six_unlock_intent(&foo->lock);
+ * An intent lock must be held before taking a write lock:
+ * six_lock_intent(&foo->lock);
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ * six_unlock_intent(&foo->lock);
*
* Other operations:
- *
* six_trylock_read()
* six_trylock_intent()
* six_trylock_write()
*
- * six_lock_downgrade(): convert from intent to read
- * six_lock_tryupgrade(): attempt to convert from read to intent
- *
- * Locks also embed a sequence number, which is incremented when the lock is
- * locked or unlocked for write. The current sequence number can be grabbed
- * while a lock is held from lock->state.seq; then, if you drop the lock you can
- * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock
- * iff it hasn't been locked for write in the meantime.
- *
- * There are also operations that take the lock type as a parameter, where the
- * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
- *
- * six_lock_type(lock, type)
- * six_unlock_type(lock, type)
- * six_relock(lock, type, seq)
- * six_trylock_type(lock, type)
- * six_trylock_convert(lock, from, to)
- *
- * A lock may be held multiple times by the same thread (for read or intent,
- * not write). However, the six locks code does _not_ implement the actual
- * recursive checks itself though - rather, if your code (e.g. btree iterator
- * code) knows that the current thread already has a lock held, and for the
- * correct type, six_lock_increment() may be used to bump up the counter for
- * that type - the only effect is that one more call to unlock will be required
- * before the lock is unlocked.
+ * six_lock_downgrade() convert from intent to read
+ * six_lock_tryupgrade() attempt to convert from read to intent, may fail
+ *
+ * There are also interfaces that take the lock type as an enum:
+ *
+ * six_lock_type(&foo->lock, SIX_LOCK_read);
+ * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
+ * six_lock_type(&foo->lock, SIX_LOCK_write);
+ * six_unlock_type(&foo->lock, SIX_LOCK_write);
+ * six_unlock_type(&foo->lock, SIX_LOCK_intent);
+ *
+ * Lock sequence numbers - unlock(), relock():
+ *
+ * Locks embed sequence numbers, which are incremented on write lock/unlock.
+ * This allows locks to be dropped and then retaken iff the state they protect
+ * hasn't changed; this makes it much easier to avoid holding locks while e.g.
+ * doing IO or allocating memory.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * u32 seq = six_lock_seq(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * some_operation_that_may_block();
+ *
+ * if (six_relock_read(&foo->lock, seq)) { ... }
+ *
+ * If the relock operation succeeds, it is as if the lock was never unlocked.
+ *
+ * Reentrancy:
+ *
+ * Six locks are not by themselves reentrant, but have counters for both the
+ * read and intent states that can be used to provide reentrancy by an upper
+ * layer that tracks held locks. If a lock is known to already be held in the
+ * read or intent state, six_lock_increment() can be used to bump the "lock
+ * held in this state" counter, increasing the number of unlock calls that
+ * will be required to fully unlock it.
+ *
+ * Example usage:
+ * six_lock_read(&foo->lock);
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);
+ * six_unlock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ * foo->lock is now fully unlocked.
+ *
+ * Since the intent state supersedes read, it's legal to increment the read
+ * counter when holding an intent lock, but not the reverse.
+ *
+ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
+ * is not legal.
+ *
+ * should_sleep_fn:
+ *
+ * There is a six_lock() variant that takes a function pointer that is called
+ * immediately prior to schedule() when blocking, and may return an error to
+ * abort.
+ *
+ * One possible use for this feature is when objects being locked are part of
+ * a cache and may be reused, and lock ordering is based on a property of the
+ * object that will change when the object is reused - i.e. logical key order.
+ *
+ * If looking up an object in the cache may race with object reuse, and lock
+ * ordering is required to prevent deadlock, object reuse may change the
+ * correct lock order for that object and cause a deadlock. should_sleep_fn
+ * can be used to check if the object is still the object we want and avoid
+ * this deadlock.
+ *
+ * Wait list entry interface:
+ *
+ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
+ * wait list entry. By embedding six_lock_waiter into another object, and by
+ * traversing lock waitlists, it is then possible for an upper layer to
+ * implement full cycle detection for deadlock avoidance.
+ *
+ * should_sleep_fn should be used for invoking the cycle detector, walking the
+ * graph of held locks to check for a deadlock. The upper layer must track
+ * held locks for each thread, and each thread's held locks must be reachable
+ * from its six_lock_waiter object.
+ *
+ * six_lock_waiter() will add the wait object to the waitlist before retrying
+ * to take the lock and before calling should_sleep_fn, and the wait object will not
+ * be removed from the waitlist until either the lock has been successfully
+ * acquired, or we aborted because should_sleep_fn returned an error.
+ *
+ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
+ * have timestamps in strictly ascending order - this is so the timestamp can
+ * be used as a cursor for lock graph traversal.
*/
#include <linux/lockdep.h>
@@ -63,41 +128,6 @@
#include <linux/sched.h>
#include <linux/types.h>
-#define SIX_LOCK_SEPARATE_LOCKFNS
-
-union six_lock_state {
- struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
- struct {
- /* for waitlist_bitnr() */
- unsigned long l;
- };
-
- struct {
- unsigned read_lock:26;
- unsigned write_locking:1;
- unsigned intent_lock:1;
- unsigned nospin:1;
- unsigned waiters:3;
- /*
- * seq works much like in seqlocks: it's incremented every time
- * we lock and unlock for write.
- *
- * If it's odd write lock is held, even unlocked.
- *
- * Thus readers can unlock, and then lock again later iff it
- * hasn't been modified in the meantime.
- */
- u32 seq;
- };
-};
-
enum six_lock_type {
SIX_LOCK_read,
SIX_LOCK_intent,
@@ -105,7 +135,8 @@ enum six_lock_type {
};
struct six_lock {
- union six_lock_state state;
+ atomic_t state;
+ u32 seq;
unsigned intent_lock_recurse;
struct task_struct *owner;
unsigned __percpu *readers;
@@ -127,59 +158,210 @@ struct six_lock_waiter {
typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
-static __always_inline void __six_lock_init(struct six_lock *lock,
- const char *name,
- struct lock_class_key *key)
-{
- atomic64_set(&lock->state.counter, 0);
- raw_spin_lock_init(&lock->wait_lock);
- INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- debug_check_no_locks_freed((void *) lock, sizeof(*lock));
- lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-}
+void six_lock_exit(struct six_lock *lock);
+
+enum six_lock_init_flags {
+ SIX_LOCK_INIT_PCPU = 1U << 0,
+};
-#define six_lock_init(lock) \
+void __six_lock_init(struct six_lock *lock, const char *name,
+ struct lock_class_key *key, enum six_lock_init_flags flags);
+
+/**
+ * six_lock_init - initialize a six lock
+ * @lock: lock to initialize
+ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
+ */
+#define six_lock_init(lock, flags) \
do { \
static struct lock_class_key __key; \
\
- __six_lock_init((lock), #lock, &__key); \
+ __six_lock_init((lock), #lock, &__key, flags); \
} while (0)
-#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
+/**
+ * six_lock_seq - obtain current lock sequence number
+ * @lock: six_lock to obtain sequence number for
+ *
+ * @lock should be held for read or intent, and not write
+ *
+ * By saving the lock sequence number, we can unlock @lock and then (typically
+ * after some blocking operation) attempt to relock it: the relock will succeed
+ * if the sequence number hasn't changed, meaning no write locks have been taken
+ * and state corresponding to what @lock protects is still valid.
+ */
+static inline u32 six_lock_seq(const struct six_lock *lock)
+{
+ return lock->seq;
+}
+
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_trylock_type - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ return six_trylock_ip(lock, type, _THIS_IP_);
+}
+
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip);
+
+/**
+ * six_lock_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function
+ * for full documentation.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+/**
+ * six_lock_ip - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
+{
+ struct six_lock_waiter wait;
+
+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
+}
+
+/**
+ * six_lock_type - take a six lock
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+ struct six_lock_waiter wait;
+
+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq, unsigned long ip);
+
+/**
+ * six_relock_type - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ * held previously
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq)
+{
+ return six_relock_ip(lock, type, seq, _THIS_IP_);
+}
+
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_unlock_type - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock); read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read); read count 0
+ */
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+ six_unlock_ip(lock, type, _THIS_IP_);
+}
#define __SIX_LOCK(type) \
-bool six_trylock_ip_##type(struct six_lock *, unsigned long); \
-bool six_relock_ip_##type(struct six_lock *, u32, unsigned long); \
-int six_lock_ip_##type(struct six_lock *, six_lock_should_sleep_fn, \
- void *, unsigned long); \
-int six_lock_ip_waiter_##type(struct six_lock *, struct six_lock_waiter *,\
- six_lock_should_sleep_fn, void *, unsigned long);\
-void six_unlock_ip_##type(struct six_lock *, unsigned long); \
+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
+{ \
+ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \
+} \
\
static inline bool six_trylock_##type(struct six_lock *lock) \
{ \
- return six_trylock_ip_##type(lock, _THIS_IP_); \
+ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
+} \
+ \
+static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \
+ struct six_lock_waiter *wait, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p,\
+ unsigned long ip) \
+{ \
+ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
+} \
+ \
+static inline int six_lock_ip_##type(struct six_lock *lock, \
+ six_lock_should_sleep_fn should_sleep_fn, void *p, \
+ unsigned long ip) \
+{ \
+ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
+} \
+ \
+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
+{ \
+ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \
} \
+ \
static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \
{ \
- return six_relock_ip_##type(lock, seq, _THIS_IP_); \
+ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \
} \
+ \
static inline int six_lock_##type(struct six_lock *lock, \
six_lock_should_sleep_fn fn, void *p)\
{ \
return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \
} \
-static inline int six_lock_waiter_##type(struct six_lock *lock, \
- struct six_lock_waiter *wait, \
- six_lock_should_sleep_fn fn, void *p) \
+ \
+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
{ \
- return six_lock_ip_waiter_##type(lock, wait, fn, p, _THIS_IP_); \
+ six_unlock_ip(lock, SIX_LOCK_##type, ip); \
} \
+ \
static inline void six_unlock_##type(struct six_lock *lock) \
{ \
- return six_unlock_ip_##type(lock, _THIS_IP_); \
+ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \
}
__SIX_LOCK(read)
@@ -187,55 +369,6 @@ __SIX_LOCK(intent)
__SIX_LOCK(write)
#undef __SIX_LOCK
-#define SIX_LOCK_DISPATCH(type, fn, ...) \
- switch (type) { \
- case SIX_LOCK_read: \
- return fn##_read(__VA_ARGS__); \
- case SIX_LOCK_intent: \
- return fn##_intent(__VA_ARGS__); \
- case SIX_LOCK_write: \
- return fn##_write(__VA_ARGS__); \
- default: \
- BUG(); \
- }
-
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
- SIX_LOCK_DISPATCH(type, six_trylock, lock);
-}
-
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq)
-{
- SIX_LOCK_DISPATCH(type, six_relock, lock, seq);
-}
-
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
-}
-
-static inline int six_lock_type_ip_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- SIX_LOCK_DISPATCH(type, six_lock_ip_waiter, lock, wait, should_sleep_fn, p, ip);
-}
-
-static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
- SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p);
-}
-
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
- SIX_LOCK_DISPATCH(type, six_unlock, lock);
-}
-
void six_lock_downgrade(struct six_lock *);
bool six_lock_tryupgrade(struct six_lock *);
bool six_trylock_convert(struct six_lock *, enum six_lock_type,
@@ -245,13 +378,11 @@ void six_lock_increment(struct six_lock *, enum six_lock_type);
void six_lock_wakeup_all(struct six_lock *);
-void six_lock_pcpu_free(struct six_lock *);
-void six_lock_pcpu_alloc(struct six_lock *);
-
struct six_lock_count {
unsigned n[3];
};
struct six_lock_count six_lock_counts(struct six_lock *);
+void six_lock_readers_add(struct six_lock *, int);
#endif /* _LINUX_SIX_H */
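
As a usage-level sketch of the unlock()/relock() pattern the DOC comment describes, assuming a hypothetical struct foo embedding a six_lock and building only against this tree's include/linux/six.h (the NULL arguments are the optional should_sleep_fn callback and its context):

#include <linux/six.h>

/* hypothetical object protected by a six lock */
struct foo {
	struct six_lock	lock;
	u64		value;
};

static void foo_init(struct foo *foo)
{
	six_lock_init(&foo->lock, 0);	/* or SIX_LOCK_INIT_PCPU for percpu reader counts */
}

static u64 foo_read_value(struct foo *foo)
{
	u64 v;
	u32 seq;

	six_lock_read(&foo->lock, NULL, NULL);	/* NULL should_sleep_fn: plain blocking lock */
	v = foo->value;
	seq = six_lock_seq(&foo->lock);		/* sequence number, sampled while held for read */
	six_unlock_read(&foo->lock);

	/* ... something that may block: IO, memory allocation ... */

	if (six_relock_read(&foo->lock, seq)) {
		/* no write lock was taken in between, v is still valid */
		six_unlock_read(&foo->lock);
	} else {
		/* a writer intervened: retake the lock and re-read */
		six_lock_read(&foo->lock, NULL, NULL);
		v = foo->value;
		six_unlock_read(&foo->lock);
	}
	return v;
}
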
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index dcdef3bc..f774a660 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -269,9 +269,9 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
int rw = flags & WRITE;
- if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) {
- prt_printf(err, "bad val size (%lu != %u)",
- bkey_val_u64s(k.k), alloc_v4_u64s(a.v));
+ if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
+ prt_printf(err, "bad val size (%u > %lu)",
+ alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
return -BCH_ERR_invalid_bkey;
}
diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c
index b58b876f..ee7ba700 100644
--- a/libbcachefs/bkey.c
+++ b/libbcachefs/bkey.c
@@ -724,7 +724,7 @@ unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
return 0;
}
-#ifdef CONFIG_X86_64
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
#define I(_x) (*(out)++ = (_x))
#define I1(i0) I(i0)
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 727bed99..e81fb3e0 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -9,9 +9,17 @@
#include "util.h"
#include "vstructs.h"
+#if 0
+
+/*
+ * compiled unpack functions are disabled, pending a new interface for
+ * dynamically allocating executable memory:
+ */
+
#ifdef CONFIG_X86_64
#define HAVE_BCACHEFS_COMPILED_UNPACK 1
#endif
+#endif
void bch2_bkey_packed_to_binary_text(struct printbuf *,
const struct bkey_format *,
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 73d32688..f8402709 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -62,10 +62,12 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
EBUG_ON(btree_node_write_in_flight(b));
+ clear_btree_node_just_written(b);
+
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
#ifdef __KERNEL__
- vfree(b->aux_data);
+ kvfree(b->aux_data);
#else
munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
@@ -100,7 +102,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
- b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
+ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
PROT_READ|PROT_WRITE|PROT_EXEC,
@@ -126,7 +128,6 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
return NULL;
bkey_btree_ptr_init(&b->key);
- bch2_btree_lock_init(&b->c);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
lockdep_set_no_check_recursion(&b->c.lock.dep_map);
#endif
@@ -150,6 +151,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
return NULL;
}
+ bch2_btree_lock_init(&b->c, 0);
+
bc->used++;
list_add(&b->list, &bc->freeable);
return b;
@@ -484,7 +487,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
while (!list_empty(&bc->freed_nonpcpu)) {
b = list_first_entry(&bc->freed_nonpcpu, struct btree, list);
list_del(&b->list);
- six_lock_pcpu_free(&b->c.lock);
+ six_lock_exit(&b->c.lock);
kfree(b);
}
@@ -645,8 +648,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
mutex_lock(&bc->lock);
}
- if (pcpu_read_locks)
- six_lock_pcpu_alloc(&b->c.lock);
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
@@ -700,6 +702,7 @@ err:
/* Try to cannibalize another cached btree node: */
if (bc->alloc_lock == current) {
b2 = btree_node_cannibalize(c);
+ clear_btree_node_just_written(b2);
bch2_btree_node_hash_remove(bc, b2);
if (b) {
@@ -784,7 +787,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
set_btree_node_read_in_flight(b);
six_unlock_write(&b->c.lock);
- seq = b->c.lock.state.seq;
+ seq = six_lock_seq(&b->c.lock);
six_unlock_intent(&b->c.lock);
/* Unlock before doing IO: */
@@ -908,7 +911,7 @@ retry:
}
if (unlikely(btree_node_read_in_flight(b))) {
- u32 seq = b->c.lock.state.seq;
+ u32 seq = six_lock_seq(&b->c.lock);
six_unlock_type(&b->c.lock, lock_type);
bch2_trans_unlock(trans);
@@ -1006,7 +1009,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
}
if (unlikely(btree_node_read_in_flight(b))) {
- u32 seq = b->c.lock.state.seq;
+ u32 seq = six_lock_seq(&b->c.lock);
six_unlock_type(&b->c.lock, lock_type);
bch2_trans_unlock(trans);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index decbbaac..0a7a18ec 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -483,7 +483,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
struct btree_node_entry *bne;
bool reinit_iter = false;
- EBUG_ON(!(b->c.lock.state.seq & 1));
+ EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]);
BUG_ON(bset_written(b, bset(b, &b->set[1])));
BUG_ON(btree_node_just_written(b));
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 365794dc..4b9c04dc 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -652,9 +652,8 @@ void bch2_btree_path_level_init(struct btree_trans *trans,
BUG_ON(path->cached);
EBUG_ON(!btree_path_pos_in_node(path, b));
- EBUG_ON(b->c.lock.state.seq & 1);
- path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
path->l[b->c.level].b = b;
__btree_path_level_init(path, b->c.level);
}
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 02dd81a1..198e3815 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -42,14 +42,7 @@ static inline struct btree *btree_path_node(struct btree_path *path,
static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
const struct btree *b, unsigned level)
{
- /*
- * We don't compare the low bits of the lock sequence numbers because
- * @path might have taken a write lock on @b, and we don't want to skip
- * the linked path if the sequence numbers were equal before taking that
- * write lock. The lock sequence number is incremented by taking and
- * releasing write locks and is even when unlocked:
- */
- return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
+ return path->l[level].lock_seq == six_lock_seq(&b->c.lock);
}
static inline struct btree *btree_node_parent(struct btree_path *path,
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 3b333e3b..645fa994 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -252,7 +252,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
}
path->l[0].b = (void *) ck;
- path->l[0].lock_seq = ck->c.lock.state.seq;
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
ret = bch2_btree_node_lock_write(trans, path, &ck->c);
@@ -283,9 +283,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
return NULL;
init:
INIT_LIST_HEAD(&ck->list);
- bch2_btree_lock_init(&ck->c);
- if (pcpu_readers)
- six_lock_pcpu_alloc(&ck->c.lock);
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
ck->c.cached = true;
BUG_ON(!six_trylock_intent(&ck->c.lock));
@@ -341,9 +339,6 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
}
mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
- } else {
- if (path->btree_id == BTREE_ID_subvolumes)
- six_lock_pcpu_alloc(&ck->c.lock);
}
ck->c.level = 0;
@@ -512,7 +507,7 @@ retry:
mark_btree_node_locked(trans, path, 0, lock_want);
}
- path->l[0].lock_seq = ck->c.lock.state.seq;
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
path->l[0].b = (void *) ck;
fill:
path->uptodate = BTREE_ITER_UPTODATE;
@@ -594,7 +589,7 @@ retry:
mark_btree_node_locked(trans, path, 0, lock_want);
}
- path->l[0].lock_seq = ck->c.lock.state.seq;
+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
path->l[0].b = (void *) ck;
fill:
if (!ck->valid)
@@ -872,7 +867,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
break;
list_del(&ck->list);
- six_lock_pcpu_free(&ck->c.lock);
+ six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
scanned++;
@@ -888,7 +883,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
break;
list_del(&ck->list);
- six_lock_pcpu_free(&ck->c.lock);
+ six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
scanned++;
@@ -1013,7 +1008,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
list_del(&ck->list);
kfree(ck->k);
- six_lock_pcpu_free(&ck->c.lock);
+ six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
}
diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c
index b9998665..70639a15 100644
--- a/libbcachefs/btree_locking.c
+++ b/libbcachefs/btree_locking.c
@@ -6,9 +6,10 @@
static struct lock_class_key bch2_btree_node_lock_key;
-void bch2_btree_lock_init(struct btree_bkey_cached_common *b)
+void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
+ enum six_lock_init_flags flags)
{
- __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key);
+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
}
#ifdef CONFIG_LOCKDEP
@@ -20,16 +21,6 @@ void bch2_assert_btree_nodes_not_locked(void)
/* Btree node locking: */
-static inline void six_lock_readers_add(struct six_lock *lock, int nr)
-{
- if (lock->readers)
- this_cpu_add(*lock->readers, nr);
- else if (nr > 0)
- atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter);
- else
- atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter);
-}
-
struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
struct btree_path *skip,
struct btree_bkey_cached_common *b,
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index 327780ce..b341cc89 100644
--- a/libbcachefs/btree_locking.h
+++ b/libbcachefs/btree_locking.h
@@ -14,7 +14,7 @@
#include "btree_iter.h"
-void bch2_btree_lock_init(struct btree_bkey_cached_common *);
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
#ifdef CONFIG_LOCKDEP
void bch2_assert_btree_nodes_not_locked(void);
@@ -176,13 +176,13 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
struct btree_path *linked;
EBUG_ON(path->l[b->c.level].b != b);
- EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+ EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);
mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent);
trans_for_each_path_with_node(trans, b, linked)
- linked->l[b->c.level].lock_seq += 2;
+ linked->l[b->c.level].lock_seq++;
six_unlock_write(&b->c.lock);
}
@@ -206,8 +206,8 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans,
trans->lock_must_abort = false;
trans->locking = b;
- ret = six_lock_type_ip_waiter(&b->lock, type, &trans->locking_wait,
- bch2_six_check_for_deadlock, trans, ip);
+ ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans, ip);
WRITE_ONCE(trans->locking, NULL);
WRITE_ONCE(trans->locking_wait.start_time, 0);
return ret;
@@ -284,7 +284,7 @@ static inline int __btree_node_lock_write(struct btree_trans *trans,
bool lock_may_not_fail)
{
EBUG_ON(&path->l[b->level].b->c != b);
- EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
+ EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock));
EBUG_ON(!btree_node_intent_locked(path, b->level));
/*
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 6ba0954e..1319337c 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -688,7 +688,7 @@ err:
bch2_trans_unlock(&trans);
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
- path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
path->l[b->c.level].b = b;
bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index bce42eef..bd144182 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -137,17 +137,17 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
{
struct bch_fs_usage_online *ret;
- unsigned seq, i, v, u64s = fs_usage_u64s(c) + 1;
+ unsigned nr_replicas = READ_ONCE(c->replicas.nr);
+ unsigned seq, i;
retry:
- ret = kmalloc(u64s * sizeof(u64), GFP_NOFS);
+ ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_NOFS);
if (unlikely(!ret))
return NULL;
percpu_down_read(&c->mark_lock);
- v = fs_usage_u64s(c) + 1;
- if (unlikely(u64s != v)) {
- u64s = v;
+ if (nr_replicas != c->replicas.nr) {
+ nr_replicas = c->replicas.nr;
percpu_up_read(&c->mark_lock);
kfree(ret);
goto retry;
@@ -157,10 +157,12 @@ retry:
do {
seq = read_seqcount_begin(&c->usage_lock);
- unsafe_memcpy(&ret->u, c->usage_base, u64s * sizeof(u64),
+ unsafe_memcpy(&ret->u, c->usage_base,
+ __fs_usage_u64s(nr_replicas) * sizeof(u64),
"embedded variable length struct");
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
- acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
+ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i],
+ __fs_usage_u64s(nr_replicas));
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h
index d677b022..bdf4fff9 100644
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -207,10 +207,24 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,
/* Filesystem usage: */
+static inline unsigned __fs_usage_u64s(unsigned nr_replicas)
+{
+ return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas;
+}
+
static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
- return sizeof(struct bch_fs_usage) / sizeof(u64) +
- READ_ONCE(c->replicas.nr);
+ return __fs_usage_u64s(READ_ONCE(c->replicas.nr));
+}
+
+static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas)
+{
+ return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas;
+}
+
+static inline unsigned fs_usage_online_u64s(struct bch_fs *c)
+{
+ return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr));
}
static inline unsigned dev_usage_u64s(void)
diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h
index 8027c2a1..cfb1779d 100644
--- a/libbcachefs/trace.h
+++ b/libbcachefs/trace.h
@@ -420,7 +420,9 @@ TRACE_EVENT(btree_path_relock_fail,
else
scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
__entry->iter_lock_seq = path->l[level].lock_seq;
- __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
+ __entry->node_lock_seq = is_btree_node(path, level)
+ ? six_lock_seq(&path->l[level].b->c.lock)
+ : 0;
),
TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u",
@@ -475,7 +477,9 @@ TRACE_EVENT(btree_path_upgrade_fail,
__entry->read_count = c.n[SIX_LOCK_read];
__entry->intent_count = c.n[SIX_LOCK_read];
__entry->iter_lock_seq = path->l[level].lock_seq;
- __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0;
+ __entry->node_lock_seq = is_btree_node(path, level)
+ ? six_lock_seq(&path->l[level].b->c.lock)
+ : 0;
),
TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index dfc55fe4..90796863 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -350,11 +350,8 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
if (time_after64(end, start)) {
duration = end - start;
- stats->duration_stats = mean_and_variance_update_inlined(stats->duration_stats,
- duration);
- stats->duration_stats_weighted = mean_and_variance_weighted_update(
- stats->duration_stats_weighted,
- duration);
+ stats->duration_stats = mean_and_variance_update(stats->duration_stats, duration);
+ mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
bch2_quantiles_update(&stats->quantiles, duration);
@@ -362,10 +359,8 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
if (time_after64(end, stats->last_event)) {
freq = end - stats->last_event;
- stats->freq_stats = mean_and_variance_update_inlined(stats->freq_stats, freq);
- stats->freq_stats_weighted = mean_and_variance_weighted_update(
- stats->freq_stats_weighted,
- freq);
+ stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq);
+ mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
stats->max_freq = max(stats->max_freq, freq);
stats->min_freq = min(stats->min_freq, freq);
stats->last_event = end;
@@ -594,8 +589,8 @@ void bch2_time_stats_exit(struct bch2_time_stats *stats)
void bch2_time_stats_init(struct bch2_time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
- stats->duration_stats_weighted.w = 8;
- stats->freq_stats_weighted.w = 8;
+ stats->duration_stats_weighted.weight = 8;
+ stats->freq_stats_weighted.weight = 8;
stats->min_duration = U64_MAX;
stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c
index bd08da5f..eb5f2ba0 100644
--- a/linux/mean_and_variance.c
+++ b/linux/mean_and_variance.c
@@ -43,38 +43,28 @@
#include <linux/mean_and_variance.h>
#include <linux/module.h>
-/**
- * fast_divpow2() - fast approximation for n / (1 << d)
- * @n: numerator
- * @d: the power of 2 denominator.
- *
- * note: this rounds towards 0.
- */
-s64 fast_divpow2(s64 n, u8 d)
+u128_u u128_div(u128_u n, u64 d)
{
- return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
-}
+ u128_u r;
+ u64 rem;
+ u64 hi = u128_hi(n);
+ u64 lo = u128_lo(n);
+ u64 h = hi & ((u64) U32_MAX << 32);
+ u64 l = (hi & (u64) U32_MAX) << 32;
-/**
- * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1
- * and return it.
- * @s1: the mean_and_variance to update.
- * @v1: the new sample.
- *
- * see linked pdf equation 12.
- */
-struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1)
-{
- return mean_and_variance_update_inlined(s1, v1);
+ r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64);
+ r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32));
+ r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem)));
+ return r;
}
-EXPORT_SYMBOL_GPL(mean_and_variance_update);
+EXPORT_SYMBOL_GPL(u128_div);
/**
* mean_and_variance_get_mean() - get mean from @s
*/
s64 mean_and_variance_get_mean(struct mean_and_variance s)
{
- return div64_u64(s.sum, s.n);
+ return s.n ? div64_u64(s.sum, s.n) : 0;
}
EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
@@ -85,10 +75,14 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
*/
u64 mean_and_variance_get_variance(struct mean_and_variance s1)
{
- u128 s2 = u128_div(s1.sum_squares, s1.n);
- u64 s3 = abs(mean_and_variance_get_mean(s1));
+ if (s1.n) {
+ u128_u s2 = u128_div(s1.sum_squares, s1.n);
+ u64 s3 = abs(mean_and_variance_get_mean(s1));
- return u128_to_u64(u128_sub(s2, u128_square(s3)));
+ return u128_lo(u128_sub(s2, u128_square(s3)));
+ } else {
+ return 0;
+ }
}
EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
@@ -109,10 +103,26 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
*/
-struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1,
- s64 x)
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
{
- return mean_and_variance_weighted_update_inlined(s1, x);
+ // previous weighted variance.
+ u8 w = s->weight;
+ u64 var_w0 = s->variance;
+ // new value weighted.
+ s64 x_w = x << w;
+ s64 diff_w = x_w - s->mean;
+ s64 diff = fast_divpow2(diff_w, w);
+ // new mean weighted.
+ s64 u_w1 = s->mean + diff;
+
+ if (!s->init) {
+ s->mean = x_w;
+ s->variance = 0;
+ } else {
+ s->mean = u_w1;
+ s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
+ }
+ s->init = true;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
@@ -121,7 +131,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
*/
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
{
- return fast_divpow2(s.mean, s.w);
+ return fast_divpow2(s.mean, s.weight);
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
@@ -131,7 +141,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
{
// always positive don't need fast divpow2
- return s.variance >> s.w;
+ return s.variance >> s.weight;
}
EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
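
To make the fixed-point convention of mean_and_variance_weighted_update() concrete: mean and variance are stored left-shifted by the weight, and the getters shift back down. A standalone user-space sketch of the same update, using stdint types instead of the kernel's (illustrative only, not this tree's code):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct mv_weighted {
	bool	 init;
	uint8_t	 weight;	/* base 2 logarithm of the EWMA window */
	int64_t	 mean;		/* stored << weight */
	uint64_t variance;	/* stored << weight */
};

/* n / 2^d, rounding towards zero */
static int64_t divpow2(int64_t n, uint8_t d)
{
	return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
}

static void mv_weighted_update(struct mv_weighted *s, int64_t x)
{
	uint8_t w = s->weight;
	uint64_t var_w0 = s->variance;		/* previous weighted variance */
	int64_t x_w = x << w;			/* new sample, weighted */
	int64_t diff_w = x_w - s->mean;
	int64_t diff = divpow2(diff_w, w);
	int64_t u_w1 = s->mean + diff;		/* new mean, weighted */

	if (!s->init) {
		s->mean = x_w;
		s->variance = 0;
	} else {
		s->mean = u_w1;
		s->variance = ((var_w0 << w) - var_w0 +
			       ((diff_w * (x_w - u_w1)) >> w)) >> w;
	}
	s->init = true;
}

int main(void)
{
	struct mv_weighted s = { .weight = 3 };
	int64_t samples[] = { 10, 10, 10, 50, 10, 10 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		mv_weighted_update(&s, samples[i]);
		printf("mean %lld variance %llu\n",
		       (long long) divpow2(s.mean, s.weight),
		       (unsigned long long) (s.variance >> s.weight));
	}
	return 0;
}
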
diff --git a/linux/six.c b/linux/six.c
index 3d366a84..a47cd6d0 100644
--- a/linux/six.c
+++ b/linux/six.c
@@ -14,9 +14,9 @@
#include <trace/events/lock.h>
#ifdef DEBUG
-#define EBUG_ON(cond) BUG_ON(cond)
+#define EBUG_ON(cond) BUG_ON(cond)
#else
-#define EBUG_ON(cond) do {} while (0)
+#define EBUG_ON(cond) do {} while (0)
#endif
#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip)
@@ -24,59 +24,69 @@
static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
+#define SIX_LOCK_HELD_read_OFFSET 0
+#define SIX_LOCK_HELD_read ~(~0U << 26)
+#define SIX_LOCK_HELD_intent (1U << 26)
+#define SIX_LOCK_HELD_write (1U << 27)
+#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read))
+#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent))
+#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write))
+#define SIX_LOCK_NOSPIN (1U << 31)
+
struct six_lock_vals {
/* Value we add to the lock in order to take the lock: */
- u64 lock_val;
+ u32 lock_val;
/* If the lock has this value (used as a mask), taking the lock fails: */
- u64 lock_fail;
-
- /* Value we add to the lock in order to release the lock: */
- u64 unlock_val;
+ u32 lock_fail;
/* Mask that indicates lock is held for this type: */
- u64 held_mask;
+ u32 held_mask;
/* Waitlist we wakeup when releasing the lock: */
enum six_lock_type unlock_wakeup;
};
-#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
-#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
-#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
-
#define LOCK_VALS { \
[SIX_LOCK_read] = { \
- .lock_val = __SIX_VAL(read_lock, 1), \
- .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
- .unlock_val = -__SIX_VAL(read_lock, 1), \
- .held_mask = __SIX_LOCK_HELD_read, \
+ .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, \
+ .lock_fail = SIX_LOCK_HELD_write, \
+ .held_mask = SIX_LOCK_HELD_read, \
.unlock_wakeup = SIX_LOCK_write, \
}, \
[SIX_LOCK_intent] = { \
- .lock_val = __SIX_VAL(intent_lock, 1), \
- .lock_fail = __SIX_LOCK_HELD_intent, \
- .unlock_val = -__SIX_VAL(intent_lock, 1), \
- .held_mask = __SIX_LOCK_HELD_intent, \
+ .lock_val = SIX_LOCK_HELD_intent, \
+ .lock_fail = SIX_LOCK_HELD_intent, \
+ .held_mask = SIX_LOCK_HELD_intent, \
.unlock_wakeup = SIX_LOCK_intent, \
}, \
[SIX_LOCK_write] = { \
- .lock_val = __SIX_VAL(seq, 1), \
- .lock_fail = __SIX_LOCK_HELD_read, \
- .unlock_val = __SIX_VAL(seq, 1), \
- .held_mask = __SIX_LOCK_HELD_write, \
+ .lock_val = SIX_LOCK_HELD_write, \
+ .lock_fail = SIX_LOCK_HELD_read, \
+ .held_mask = SIX_LOCK_HELD_write, \
.unlock_wakeup = SIX_LOCK_read, \
}, \
}
+static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
+{
+ if ((atomic_read(&lock->state) & mask) != mask)
+ atomic_or(mask, &lock->state);
+}
+
+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
+{
+ if (atomic_read(&lock->state) & mask)
+ atomic_and(~mask, &lock->state);
+}
+
static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
- union six_lock_state old,
- struct task_struct *owner)
+ u32 old, struct task_struct *owner)
{
if (type != SIX_LOCK_intent)
return;
- if (!old.intent_lock) {
+ if (!(old & SIX_LOCK_HELD_intent)) {
EBUG_ON(lock->owner);
lock->owner = owner;
} else {
@@ -94,22 +104,25 @@ static inline unsigned pcpu_read_count(struct six_lock *lock)
return read_count;
}
-/* This is probably up there with the more evil things I've done */
-#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
-
-static int __do_six_trylock_type(struct six_lock *lock,
- enum six_lock_type type,
- struct task_struct *task,
- bool try)
+/*
+ * __do_six_trylock() - main trylock routine
+ *
+ * Returns 1 on success, 0 on failure
+ *
+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure
+ * for another thread taking the competing lock type, and we may have to do a
+ * wakeup: when a wakeup is required, we return -1 - wakeup_type.
+ */
+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
+ struct task_struct *task, bool try)
{
const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old, new;
int ret;
- u64 v;
+ u32 old, new, v;
EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
- EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
- EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
+ EBUG_ON(type == SIX_LOCK_write &&
+ (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
/*
* Percpu reader mode:
@@ -124,101 +137,75 @@ static int __do_six_trylock_type(struct six_lock *lock,
* the lock, then issues a full memory barrier, then reads from the
* other thread's variable to check if the other thread thinks it has
* the lock. If we raced, we backoff and retry/sleep.
+ *
+ * Failure to take the lock may cause a spurious trylock failure in
+ * another thread, because we temporarily set the lock to indicate that
+ * we held it. This would be a problem for a thread in six_lock(), when
+ * it calls trylock after adding itself to the waitlist and
+ * prior to sleeping.
+ *
+ * Therefore, if we fail to get the lock, and there were waiters of the
+ * type we conflict with, we will have to issue a wakeup.
+ *
+ * Since we may be called under wait_lock (and by the wakeup code
+ * itself), we return that the wakeup has to be done instead of doing it
+ * here.
*/
-
if (type == SIX_LOCK_read && lock->readers) {
preempt_disable();
this_cpu_inc(*lock->readers); /* signal that we own lock */
smp_mb();
- old.v = READ_ONCE(lock->state.v);
- ret = !(old.v & l[type].lock_fail);
+ old = atomic_read(&lock->state);
+ ret = !(old & l[type].lock_fail);
this_cpu_sub(*lock->readers, !ret);
preempt_enable();
- /*
- * If we failed because a writer was trying to take the
- * lock, issue a wakeup because we might have caused a
- * spurious trylock failure:
- */
-#if 0
- /*
- * This code should be sufficient, but we're seeing unexplained
- * lost wakeups:
- */
- if (old.write_locking)
+ if (!ret && (old & SIX_LOCK_WAITING_write))
ret = -1 - SIX_LOCK_write;
-#else
- if (!ret)
- ret = -1 - SIX_LOCK_write;
-#endif
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
- atomic64_add(__SIX_VAL(write_locking, 1),
- &lock->state.counter);
- smp_mb__after_atomic();
- } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) {
- atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write),
- &lock->state.counter);
- /*
- * pairs with barrier after unlock and before checking
- * for readers in unlock path
- */
+ atomic_add(SIX_LOCK_HELD_write, &lock->state);
smp_mb__after_atomic();
}
ret = !pcpu_read_count(lock);
- /*
- * On success, we increment lock->seq; also we clear
- * write_locking unless we failed from the lock path:
- */
- v = 0;
- if (ret)
- v += __SIX_VAL(seq, 1);
- if (ret || try)
- v -= __SIX_VAL(write_locking, 1);
-
if (try && !ret) {
- old.v = atomic64_add_return(v, &lock->state.counter);
- if (old.waiters & (1 << SIX_LOCK_read))
+ old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
+ if (old & SIX_LOCK_WAITING_read)
ret = -1 - SIX_LOCK_read;
- } else {
- atomic64_add(v, &lock->state.counter);
}
} else {
- v = READ_ONCE(lock->state.v);
+ v = atomic_read(&lock->state);
do {
- new.v = old.v = v;
+ new = old = v;
- if (!(old.v & l[type].lock_fail)) {
- new.v += l[type].lock_val;
+ ret = !(old & l[type].lock_fail);
- if (type == SIX_LOCK_write)
- new.write_locking = 0;
- } else if (!try && !(new.waiters & (1 << type)))
- new.waiters |= 1 << type;
- else
- break; /* waiting bit already set */
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v, new.v)) != old.v);
+ if (!ret || (type == SIX_LOCK_write && !try)) {
+ smp_mb();
+ break;
+ }
- ret = !(old.v & l[type].lock_fail);
+ new += l[type].lock_val;
+ } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old);
- EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
+ EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
}
if (ret > 0)
six_set_owner(lock, type, old, task);
- EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking));
+ EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
+ (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
return ret;
}
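
/*
 * Illustrative sketch, not part of this patch: the percpu reader protocol
 * described in the comment above follows the classic "store A, full barrier,
 * load B" vs "store B, full barrier, load A" pattern.  All names here
 * (demo_lock, demo_try_read, demo_try_write, DEMO_HELD_write) are
 * hypothetical and exist only to show the ordering; the real implementation
 * is __do_six_trylock() above.
 */
#include <linux/atomic.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

#define DEMO_HELD_write		1U

struct demo_lock {
	atomic_t		state;		/* writers publish themselves here */
	unsigned __percpu	*readers;	/* readers publish themselves here */
};

static bool demo_try_read(struct demo_lock *lock)
{
	bool ret;

	preempt_disable();
	this_cpu_inc(*lock->readers);		/* signal that we own the lock */
	smp_mb();				/* order our store vs. our load */
	ret = !(atomic_read(&lock->state) & DEMO_HELD_write);
	this_cpu_sub(*lock->readers, !ret);	/* back off if a writer raced us */
	preempt_enable();
	return ret;
}

static bool demo_try_write(struct demo_lock *lock)
{
	unsigned sum = 0;
	int cpu;

	atomic_or(DEMO_HELD_write, &lock->state);	/* signal that we want the lock */
	smp_mb__after_atomic();				/* then look for readers */
	for_each_possible_cpu(cpu)
		sum += *per_cpu_ptr(lock->readers, cpu);
	if (sum) {
		/* raced with a reader: back off; a wakeup may now be needed */
		atomic_and(~DEMO_HELD_write, &lock->state);
		return false;
	}
	return true;	/* DEMO_HELD_write stays set: we hold the lock */
}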
-static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
{
struct six_lock_waiter *w, *next;
struct task_struct *task;
@@ -237,7 +224,7 @@ again:
goto unlock;
saw_one = true;
- ret = __do_six_trylock_type(lock, lock_type, w->task, false);
+ ret = __do_six_trylock(lock, lock_type, w->task, false);
if (ret <= 0)
goto unlock;
@@ -252,7 +239,7 @@ again:
wake_up_process(task);
}
- clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v);
+ six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
unlock:
raw_spin_unlock(&lock->wait_lock);
@@ -262,96 +249,74 @@ unlock:
}
}
-static inline void six_lock_wakeup(struct six_lock *lock,
- union six_lock_state state,
- enum six_lock_type lock_type)
+__always_inline
+static void six_lock_wakeup(struct six_lock *lock, u32 state,
+ enum six_lock_type lock_type)
{
- if (lock_type == SIX_LOCK_write && state.read_lock)
+ if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
return;
- if (!(state.waiters & (1 << lock_type)))
+ if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
return;
__six_lock_wakeup(lock, lock_type);
}
-static bool do_six_trylock_type(struct six_lock *lock,
- enum six_lock_type type,
- bool try)
+__always_inline
+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
{
int ret;
- ret = __do_six_trylock_type(lock, type, current, try);
+ ret = __do_six_trylock(lock, type, current, try);
if (ret < 0)
__six_lock_wakeup(lock, -ret - 1);
return ret > 0;
}
-__always_inline __flatten
-static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned long ip)
+/**
+ * six_trylock_ip - attempt to take a six lock without blocking
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
{
- if (!do_six_trylock_type(lock, type, true))
+ if (!do_six_trylock(lock, type, true))
return false;
if (type != SIX_LOCK_write)
six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
return true;
}
-
-__always_inline __flatten
-static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned seq, unsigned long ip)
+EXPORT_SYMBOL_GPL(six_trylock_ip);
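
/*
 * Usage sketch, not part of this patch: taking and dropping a read lock with
 * the _ip interface exported above.  'struct foo' and foo_read_value() are
 * hypothetical.
 */
#include <linux/six.h>

struct foo {
	struct six_lock	lock;
	u64		value;
};

static u64 foo_read_value(struct foo *f)
{
	u64 v = 0;

	if (six_trylock_ip(&f->lock, SIX_LOCK_read, _THIS_IP_)) {
		v = f->value;
		six_unlock_ip(&f->lock, SIX_LOCK_read, _THIS_IP_);
	}
	return v;
}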
+
+/**
+ * six_relock_ip - attempt to re-take a lock that was held previously
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq: lock sequence number obtained from six_lock_seq() while lock was
+ * held previously
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+ unsigned seq, unsigned long ip)
{
- const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state old;
- u64 v;
-
- EBUG_ON(type == SIX_LOCK_write);
-
- if (type == SIX_LOCK_read &&
- lock->readers) {
- bool ret;
-
- preempt_disable();
- this_cpu_inc(*lock->readers);
-
- smp_mb();
-
- old.v = READ_ONCE(lock->state.v);
- ret = !(old.v & l[type].lock_fail) && old.seq == seq;
-
- this_cpu_sub(*lock->readers, !ret);
- preempt_enable();
-
- /*
- * Similar to the lock path, we may have caused a spurious write
- * lock fail and need to issue a wakeup:
- */
- if (ret)
- six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
- else
- six_lock_wakeup(lock, old, SIX_LOCK_write);
+ if (lock->seq != seq || !six_trylock_ip(lock, type, ip))
+ return false;
- return ret;
+ if (lock->seq != seq) {
+ six_unlock_ip(lock, type, ip);
+ return false;
}
- v = READ_ONCE(lock->state.v);
- do {
- old.v = v;
-
- if (old.seq != seq || old.v & l[type].lock_fail)
- return false;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v,
- old.v + l[type].lock_val)) != old.v);
-
- six_set_owner(lock, type, old, current);
- if (type != SIX_LOCK_write)
- six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
return true;
}
+EXPORT_SYMBOL_GPL(six_relock_ip);
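
/*
 * Usage sketch, not part of this patch: the unlock/relock pattern the
 * kernel-doc above describes.  six_lock_seq() is the accessor the kernel-doc
 * refers to; foo_drop_and_relock() is hypothetical and assumes @lock is
 * currently held for read.
 */
#include <linux/six.h>

static bool foo_drop_and_relock(struct six_lock *lock)
{
	u32 seq = six_lock_seq(lock);	/* remember seq while still holding it */

	six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);

	/* ... do work that cannot be done while holding the lock ... */

	/* fails if a write lock was taken since we dropped it */
	return six_relock_ip(lock, SIX_LOCK_read, seq, _THIS_IP_);
}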
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
@@ -371,17 +336,6 @@ static inline bool six_can_spin_on_owner(struct six_lock *lock)
return ret;
}
-static inline void six_set_nospin(struct six_lock *lock)
-{
- union six_lock_state old, new;
- u64 v = READ_ONCE(lock->state.v);
-
- do {
- new.v = old.v = v;
- new.nospin = true;
- } while ((v = atomic64_cmpxchg(&lock->state.counter, old.v, new.v)) != old.v);
-}
-
static inline bool six_spin_on_owner(struct six_lock *lock,
struct task_struct *owner,
u64 end_time)
@@ -405,7 +359,7 @@ static inline bool six_spin_on_owner(struct six_lock *lock,
}
if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
- six_set_nospin(lock);
+ six_set_bitmask(lock, SIX_LOCK_NOSPIN);
ret = false;
break;
}
@@ -445,7 +399,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
if (owner && !six_spin_on_owner(lock, owner, end_time))
break;
- if (do_six_trylock_type(lock, type, false)) {
+ if (do_six_trylock(lock, type, false)) {
osq_unlock(&lock->osq);
preempt_enable();
return true;
@@ -494,17 +448,16 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
#endif
noinline
-static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
- union six_lock_state old;
int ret = 0;
if (type == SIX_LOCK_write) {
- EBUG_ON(lock->state.write_locking);
- atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
+ EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+ atomic_add(SIX_LOCK_HELD_write, &lock->state);
smp_mb__after_atomic();
}
@@ -519,13 +472,12 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
wait->lock_acquired = false;
raw_spin_lock(&lock->wait_lock);
- if (!(lock->state.waiters & (1 << type)))
- set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v);
+ six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
/*
- * Retry taking the lock after taking waitlist lock, have raced with an
- * unlock:
+ * Retry taking the lock after taking waitlist lock, in case we raced
+ * with an unlock:
*/
- ret = __do_six_trylock_type(lock, type, current, false);
+ ret = __do_six_trylock(lock, type, current, false);
if (ret <= 0) {
wait->start_time = local_clock();
@@ -565,7 +517,7 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
list_del(&wait->list);
raw_spin_unlock(&lock->wait_lock);
- if (wait->lock_acquired)
+ if (unlikely(wait->lock_acquired))
do_six_unlock_type(lock, type);
break;
}
@@ -575,21 +527,49 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty
__set_current_state(TASK_RUNNING);
out:
- if (ret && type == SIX_LOCK_write && lock->state.write_locking) {
- old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
- &lock->state.counter);
- six_lock_wakeup(lock, old, SIX_LOCK_read);
+ if (ret && type == SIX_LOCK_write) {
+ six_clear_bitmask(lock, SIX_LOCK_HELD_write);
+ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
}
trace_contention_end(lock, 0);
return ret;
}
-__always_inline __flatten
-static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type,
- struct six_lock_waiter *wait,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
+/**
+ * six_lock_ip_waiter - take a lock, with full waitlist interface
+ * @lock: lock to take
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait: pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ * to scheduling
+ * @p: passed through to @should_sleep_fn
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * This is the most general six_lock() variant, with parameters to support full
+ * cycle detection for deadlock avoidance.
+ *
+ * The code calling this function must implement tracking of held locks, and the
+ * @wait object should be embedded into the struct that tracks held locks -
+ * which must also be accessible in a thread-safe way.
+ *
+ * @should_sleep_fn should invoke the cycle detector; it should walk each
+ * lock's waiters, and for each waiter recursively walk their held locks.
+ *
+ * When this function must block, @wait will be added to @lock's waitlist before
+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be
+ * removed from the lock waitlist until the lock has been successfully acquired,
+ * or we abort.
+ *
+ * @wait.start_time will be monotonically increasing for any given waitlist, and
+ * thus may be used as a loop cursor.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+ struct six_lock_waiter *wait,
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
int ret;
@@ -598,8 +578,8 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type
if (type != SIX_LOCK_write)
six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
- ret = do_six_trylock_type(lock, type, true) ? 0
- : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p, ip);
+ ret = do_six_trylock(lock, type, true) ? 0
+ : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
if (ret && type != SIX_LOCK_write)
six_release(&lock->dep_map, ip);
@@ -608,22 +588,13 @@ static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type
return ret;
}
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
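
/*
 * Usage sketch, not part of this patch: the waitlist interface with a cycle
 * detector callback, as described in the kernel-doc above.  'struct
 * foo_trans', foo_check_for_deadlock() and foo_lock_intent() are hypothetical
 * stand-ins for a caller's held-lock tracking.
 */
#include <linux/six.h>

struct foo_trans {
	struct six_lock_waiter	wait;	/* embedded so other threads can walk it */
	/* ... this caller's record of held locks ... */
};

static int foo_check_for_deadlock(struct six_lock *lock, void *p)
{
	struct foo_trans *trans = p;

	/*
	 * Walk @lock's waiters and, recursively, the locks their owners hold;
	 * return a nonzero error here to abort the lock attempt if a cycle is
	 * found.
	 */
	(void) trans;
	return 0;
}

static int foo_lock_intent(struct foo_trans *trans, struct six_lock *lock)
{
	return six_lock_ip_waiter(lock, SIX_LOCK_intent, &trans->wait,
				  foo_check_for_deadlock, trans, _THIS_IP_);
}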
__always_inline
-static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p,
- unsigned long ip)
-{
- struct six_lock_waiter wait;
-
- return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-__always_inline __flatten
static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
{
const struct six_lock_vals l[] = LOCK_VALS;
- union six_lock_state state;
+ u32 state;
if (type == SIX_LOCK_intent)
lock->owner = NULL;
@@ -633,26 +604,39 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
smp_mb(); /* unlock barrier */
this_cpu_dec(*lock->readers);
smp_mb(); /* between unlocking and checking for waiters */
- state.v = READ_ONCE(lock->state.v);
+ state = atomic_read(&lock->state);
} else {
- u64 v = l[type].unlock_val;
+ u32 v = l[type].lock_val;
if (type != SIX_LOCK_read)
- v -= lock->state.v & __SIX_VAL(nospin, 1);
+ v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
- EBUG_ON(!(lock->state.v & l[type].held_mask));
- state.v = atomic64_add_return_release(v, &lock->state.counter);
+ EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
+ state = atomic_sub_return_release(v, &lock->state);
}
six_lock_wakeup(lock, state, l[type].unlock_wakeup);
}
-__always_inline __flatten
-static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type,
- unsigned long ip)
+/**
+ * six_unlock_ip - drop a six lock
+ * @lock: lock to unlock
+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock); read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2
+ * six_unlock_read(&foo->lock); read count 1
+ * six_unlock_read(&foo->lock); read count 0
+ */
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
{
EBUG_ON(type == SIX_LOCK_write &&
- !(lock->state.v & __SIX_LOCK_HELD_intent));
+ !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
EBUG_ON((type == SIX_LOCK_write ||
type == SIX_LOCK_intent) &&
lock->owner != current);
@@ -666,52 +650,18 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type,
return;
}
+ lock->seq += type == SIX_LOCK_write;
+
do_six_unlock_type(lock, type);
}
+EXPORT_SYMBOL_GPL(six_unlock_ip);
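
/*
 * Sketch, not part of this patch: the counting behaviour from the comment
 * above, written with the _ip interface.  six_lock_increment() is declared in
 * six.h and defined later in this file; foo_counting_demo() is hypothetical.
 */
#include <linux/six.h>

static void foo_counting_demo(struct six_lock *lock)
{
	if (!six_trylock_ip(lock, SIX_LOCK_read, _THIS_IP_))
		return;
	/* read count 1 */
	six_lock_increment(lock, SIX_LOCK_read);	/* read count 2 */
	six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);	/* read count 1 */
	six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);	/* read count 0 */
}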
-#define __SIX_LOCK(type) \
-bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip) \
-{ \
- return __six_trylock_type(lock, SIX_LOCK_##type, ip); \
-} \
-EXPORT_SYMBOL_GPL(six_trylock_ip_##type); \
- \
-bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{ \
- return __six_relock_type(lock, SIX_LOCK_##type, seq, ip); \
-} \
-EXPORT_SYMBOL_GPL(six_relock_ip_##type); \
- \
-int six_lock_ip_##type(struct six_lock *lock, \
- six_lock_should_sleep_fn should_sleep_fn, void *p, \
- unsigned long ip) \
-{ \
- return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-} \
-EXPORT_SYMBOL_GPL(six_lock_ip_##type); \
- \
-int six_lock_ip_waiter_##type(struct six_lock *lock, \
- struct six_lock_waiter *wait, \
- six_lock_should_sleep_fn should_sleep_fn, void *p,\
- unsigned long ip) \
-{ \
- return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-} \
-EXPORT_SYMBOL_GPL(six_lock_ip_waiter_##type); \
- \
-void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \
-{ \
- __six_unlock_type(lock, SIX_LOCK_##type, ip); \
-} \
-EXPORT_SYMBOL_GPL(six_unlock_ip_##type);
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-
-#undef __SIX_LOCK
-
-/* Convert from intent to read: */
+/**
+ * six_lock_downgrade - convert an intent lock to a read lock
+ * @lock: lock to downgrade
+ *
+ * @lock will have read count incremented and intent count decremented
+ */
void six_lock_downgrade(struct six_lock *lock)
{
six_lock_increment(lock, SIX_LOCK_read);
@@ -719,25 +669,33 @@ void six_lock_downgrade(struct six_lock *lock)
}
EXPORT_SYMBOL_GPL(six_lock_downgrade);
+/**
+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock
+ * @lock: lock to upgrade
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
bool six_lock_tryupgrade(struct six_lock *lock)
{
- union six_lock_state old, new;
- u64 v = READ_ONCE(lock->state.v);
+ const struct six_lock_vals l[] = LOCK_VALS;
+ u32 old, new, v = atomic_read(&lock->state);
do {
- new.v = old.v = v;
+ new = old = v;
- if (new.intent_lock)
+ if (new & SIX_LOCK_HELD_intent)
return false;
if (!lock->readers) {
- EBUG_ON(!new.read_lock);
- new.read_lock--;
+ EBUG_ON(!(new & SIX_LOCK_HELD_read));
+ new -= l[SIX_LOCK_read].lock_val;
}
- new.intent_lock = 1;
- } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
- old.v, new.v)) != old.v);
+ new |= SIX_LOCK_HELD_intent;
+ } while ((v = atomic_cmpxchg_acquire(&lock->state, old, new)) != old);
if (lock->readers)
this_cpu_dec(*lock->readers);
@@ -748,6 +706,17 @@ bool six_lock_tryupgrade(struct six_lock *lock)
}
EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
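
/*
 * Usage sketch, not part of this patch: opportunistically upgrade a read lock
 * to an intent lock before modifying, falling back if another thread already
 * holds the intent lock.  foo_try_modify() is hypothetical.
 */
#include <linux/six.h>

static bool foo_try_modify(struct six_lock *lock)
{
	if (!six_trylock_ip(lock, SIX_LOCK_read, _THIS_IP_))
		return false;

	if (!six_lock_tryupgrade(lock)) {
		/* on failure the read lock is still held */
		six_unlock_ip(lock, SIX_LOCK_read, _THIS_IP_);
		return false;
	}

	/* now held for intent: safe to prepare a modification */
	six_unlock_ip(lock, SIX_LOCK_intent, _THIS_IP_);
	return true;
}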
+/**
+ * six_trylock_convert - attempt to convert a held lock from one type to another
+ * @lock: lock to convert
+ * @from: SIX_LOCK_read or SIX_LOCK_intent
+ * @to: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * On success, @lock will be held as @to and no longer as @from: the @to count
+ * is incremented and the @from count decremented
+ *
+ * Return: true on success, false on failure
+ */
bool six_trylock_convert(struct six_lock *lock,
enum six_lock_type from,
enum six_lock_type to)
@@ -766,9 +735,16 @@ bool six_trylock_convert(struct six_lock *lock,
}
EXPORT_SYMBOL_GPL(six_trylock_convert);
-/*
- * Increment read/intent lock count, assuming we already have it read or intent
- * locked:
+/**
+ * six_lock_increment - increase held lock count on a lock that is already held
+ * @lock: lock to increment
+ * @type: SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * @lock must already be held, with a lock type that is greater than or equal to
+ * @type
+ *
+ * A corresponding six_unlock_type() call will be required for @lock to be fully
+ * unlocked.
*/
void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
{
@@ -783,13 +759,14 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
if (lock->readers) {
this_cpu_inc(*lock->readers);
} else {
- EBUG_ON(!lock->state.read_lock &&
- !lock->state.intent_lock);
- atomic64_add(l[type].lock_val, &lock->state.counter);
+ EBUG_ON(!(atomic_read(&lock->state) &
+ (SIX_LOCK_HELD_read|
+ SIX_LOCK_HELD_intent)));
+ atomic_add(l[type].lock_val, &lock->state);
}
break;
case SIX_LOCK_intent:
- EBUG_ON(!lock->state.intent_lock);
+ EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
lock->intent_lock_recurse++;
break;
case SIX_LOCK_write:
@@ -799,9 +776,19 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
}
EXPORT_SYMBOL_GPL(six_lock_increment);
+/**
+ * six_lock_wakeup_all - wake up all waiters on @lock
+ * @lock: lock to wake up waiters for
+ *
+ * Waking up waiters will cause them to re-run should_sleep_fn, which may then
+ * abort the lock operation.
+ *
+ * This function is never needed in a bug-free program; it's only useful in
+ * debug code, e.g. to determine if a cycle detector is at fault.
+ */
void six_lock_wakeup_all(struct six_lock *lock)
{
- union six_lock_state state = lock->state;
+ u32 state = atomic_read(&lock->state);
struct six_lock_waiter *w;
six_lock_wakeup(lock, state, SIX_LOCK_read);
@@ -815,38 +802,96 @@ void six_lock_wakeup_all(struct six_lock *lock)
}
EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-void six_lock_pcpu_free(struct six_lock *lock)
-{
- BUG_ON(lock->readers && pcpu_read_count(lock));
- BUG_ON(lock->state.read_lock);
-
- free_percpu(lock->readers);
- lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
-
-void six_lock_pcpu_alloc(struct six_lock *lock)
-{
-#ifdef __KERNEL__
- if (!lock->readers)
- lock->readers = alloc_percpu(unsigned);
-#endif
-}
-EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
-
-/*
- * Returns lock held counts, for both read and intent
+/**
+ * six_lock_counts - return held lock counts, for each lock type
+ * @lock: lock to return counters for
+ *
+ * Return: the number of times a lock is held for read, intent and write.
*/
struct six_lock_count six_lock_counts(struct six_lock *lock)
{
struct six_lock_count ret;
ret.n[SIX_LOCK_read] = !lock->readers
- ? lock->state.read_lock
+ ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
: pcpu_read_count(lock);
- ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse;
- ret.n[SIX_LOCK_write] = lock->state.seq & 1;
+ ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
+ lock->intent_lock_recurse;
+ ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
return ret;
}
EXPORT_SYMBOL_GPL(six_lock_counts);
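
/*
 * Usage sketch, not part of this patch: six_lock_counts() in a debug
 * assertion; foo_assert_held_for_intent() is hypothetical.
 */
#include <linux/bug.h>
#include <linux/six.h>

static void foo_assert_held_for_intent(struct six_lock *lock)
{
	WARN_ON(!six_lock_counts(lock).n[SIX_LOCK_intent]);
}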
+
+/**
+ * six_lock_readers_add - directly manipulate reader count of a lock
+ * @lock: lock to add/subtract readers for
+ * @nr: reader count to add/subtract
+ *
+ * When an upper layer is implementing lock reentrancy, we may have both read
+ * and intent locks on the same lock.
+ *
+ * When we need to take a write lock, the read locks will cause self-deadlock,
+ * because six locks themselves do not track which read locks are held by the
+ * current thread and which are held by a different thread; they do no
+ * per-thread tracking of held locks.
+ *
+ * The upper layer that is tracking held locks may, however, if trylock() has
+ * failed, count up its own read locks, subtract them, take the write lock, and
+ * then re-add them.
+ *
+ * As in any other situation when taking a write lock, @lock must be held for
+ * intent one (or more) times, so @lock will never be left unlocked.
+ */
+void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+ if (lock->readers) {
+ this_cpu_add(*lock->readers, nr);
+ } else {
+ EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
+ /* reader count starts at bit 0 */
+ atomic_add(nr, &lock->state);
+ }
+}
+EXPORT_SYMBOL_GPL(six_lock_readers_add);
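
/*
 * Sketch, not part of this patch, of the pattern described in the kernel-doc
 * above: an upper layer that knows it itself holds @nr_my_readers read locks
 * on @lock (and holds it for intent) drops those readers, takes the write
 * lock, and restores them afterwards.  foo_write_lock_with_own_readers() is
 * hypothetical.
 */
#include <linux/six.h>

static void foo_write_lock_with_own_readers(struct six_lock *lock,
					    unsigned nr_my_readers)
{
	struct six_lock_waiter wait;

	/* caller already holds @lock for intent, plus nr_my_readers reads */
	six_lock_readers_add(lock, -(int) nr_my_readers);
	six_lock_ip_waiter(lock, SIX_LOCK_write, &wait, NULL, NULL, _THIS_IP_);

	/* ... exclusive access ... */

	six_unlock_ip(lock, SIX_LOCK_write, _THIS_IP_);
	six_lock_readers_add(lock, nr_my_readers);
}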
+
+/**
+ * six_lock_exit - release resources held by a lock prior to freeing
+ * @lock: lock to exit
+ *
+ * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
+ * required to free the percpu read counts.
+ */
+void six_lock_exit(struct six_lock *lock)
+{
+ WARN_ON(lock->readers && pcpu_read_count(lock));
+ WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
+
+ free_percpu(lock->readers);
+ lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_exit);
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+ struct lock_class_key *key, enum six_lock_init_flags flags)
+{
+ atomic_set(&lock->state, 0);
+ raw_spin_lock_init(&lock->wait_lock);
+ INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+
+ if (flags & SIX_LOCK_INIT_PCPU) {
+ /*
+ * We don't return an error here on memory allocation failure
+ * since percpu is an optimization, and locks will work with the
+ * same semantics in non-percpu mode: callers can check for
+ * failure if they wish by checking lock->readers, but generally
+ * will not want to treat it as an error.
+ */
+ lock->readers = alloc_percpu(unsigned);
+ }
+}
+EXPORT_SYMBOL_GPL(__six_lock_init);
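
/*
 * Lifecycle sketch, not part of this patch: initializing a lock in percpu
 * reader mode and releasing the percpu counts before the lock goes away.
 * demo_lock_lifecycle() and demo_key are hypothetical; calling
 * __six_lock_init() directly avoids assuming any six_lock_init() convenience
 * macro in six.h.
 */
#include <linux/lockdep.h>
#include <linux/six.h>

static void demo_lock_lifecycle(void)
{
	static struct lock_class_key demo_key;
	struct six_lock lock;

	__six_lock_init(&lock, "demo_lock", &demo_key, SIX_LOCK_INIT_PCPU);

	/* ... take and release the lock as usual ... */

	six_lock_exit(&lock);	/* frees lock.readers if percpu mode was enabled */
}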