From: Paul Mackerras

The patch below moves the ppc64 spinlocks and rwlocks out of line and into
arch/ppc64/lib/locks.c, and implements _raw_spin_lock_flags for ppc64.

Part of the motivation for moving the spinlocks and rwlocks out of line was
that I needed to add code to the slow paths to yield the processor to the
hypervisor on systems with shared processors.  On these systems, a cpu as
seen by the kernel is a virtual processor that is not necessarily running
full-time on a real physical cpu.  If we are spinning on a lock held by
another virtual processor that is not currently running, we are just
wasting time.  In such a situation it is better to do a hypervisor call
asking it to give the rest of our time slice to the lock holder so that
forward progress can be made.  (A simplified sketch of this slow path
follows the patch.)

The one problem with out-of-line spinlock routines is that lock contention
will show up in profiles in the spin_lock etc. routines rather than in the
callers, as it does with inline spinlocks.  I have added a CONFIG_SPINLINE
config option for people who want to do profiling.  In the longer term,
Anton is talking about teaching the profiling code to attribute samples in
the spin lock routines to their callers.

This patch reduces the kernel by about 80kB on my G5.  With inline
spinlocks selected, the kernel gets about 4kB bigger than without the
patch, because _raw_spin_lock_flags is slightly bigger than _raw_spin_lock.

This patch depends on the patch from Keith Owens to add
_raw_spin_lock_flags.

---

 25-akpm/arch/ppc64/kernel/pacaData.c |    1 
 25-akpm/arch/ppc64/lib/Makefile      |    2 
 25-akpm/arch/ppc64/lib/locks.c       |  285 +++++++++++++++++++++++++++++++++++
 25-akpm/include/asm-ppc64/paca.h     |   10 -
 25-akpm/include/asm-ppc64/spinlock.h |  135 ++++++++++++----
 5 files changed, 392 insertions(+), 41 deletions(-)

diff -puN arch/ppc64/kernel/pacaData.c~un-inline-spinlocks-on-ppc64 arch/ppc64/kernel/pacaData.c
--- 25/arch/ppc64/kernel/pacaData.c~un-inline-spinlocks-on-ppc64	2004-05-09 17:18:07.572974672 -0700
+++ 25-akpm/arch/ppc64/kernel/pacaData.c	2004-05-09 17:18:07.579973608 -0700
@@ -36,6 +36,7 @@ struct systemcfg *systemcfg;
 	{							\
 		.xLpPacaPtr = &paca[number].xLpPaca,		\
 		.xLpRegSavePtr = &paca[number].xRegSav,		\
+		.lock_token = 0x8000,				\
 		.xPacaIndex = (number),		/* Paca Index */ \
 		.default_decr = 0x00ff0000,	/* Initial Decr */ \
 		.xStab_data = {					\
diff -puN /dev/null arch/ppc64/lib/locks.c
--- /dev/null	2003-09-15 06:40:47.000000000 -0700
+++ 25-akpm/arch/ppc64/lib/locks.c	2004-05-09 17:18:07.581973304 -0700
@@ -0,0 +1,285 @@
+/*
+ * Spin and read/write lock operations.
+ *
+ * Copyright (C) 2001-2004 Paul Mackerras, IBM
+ * Copyright (C) 2001 Anton Blanchard, IBM
+ * Copyright (C) 2002 Dave Engebretsen, IBM
+ *   Rework to support virtual processors
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef CONFIG_SPINLINE
+
+/*
+ * On a system with shared processors (that is, where a physical
+ * processor is multiplexed between several virtual processors),
+ * there is no point spinning on a lock if the holder of the lock
+ * isn't currently scheduled on a physical processor.  Instead
+ * we detect this situation and ask the hypervisor to give the
+ * rest of our timeslice to the lock holder.
+ *
+ * So that we can tell which virtual processor is holding a lock,
+ * we put 0x80000000 | smp_processor_id() in the lock when it is
+ * held.  Conveniently, we have a word in the paca that holds this
+ * value.
+ */
+
+/* waiting for a spinlock... */
+#if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES)
+void __spin_yield(spinlock_t *lock)
+{
+	unsigned int lock_value, holder_cpu, yield_count;
+	struct paca_struct *holder_paca;
+
+	lock_value = lock->lock;
+	if (lock_value == 0)
+		return;
+	holder_cpu = lock_value & 0xffff;
+	BUG_ON(holder_cpu >= NR_CPUS);
+	holder_paca = &paca[holder_cpu];
+	yield_count = holder_paca->xLpPaca.xYieldCount;
+	if ((yield_count & 1) == 0)
+		return;		/* virtual cpu is currently running */
+	rmb();
+	if (lock->lock != lock_value)
+		return;		/* something has changed */
+#ifdef CONFIG_PPC_ISERIES
+	HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc,
+		((u64)holder_cpu << 32) | yield_count);
+#else
+	plpar_hcall_norets(H_CONFER, holder_cpu, yield_count);
+#endif
+}
+
+#else /* SPLPAR || ISERIES */
+#define __spin_yield(x)	barrier()
+#endif
+
+/*
+ * This returns the old value in the lock, so we succeeded
+ * in getting the lock if the return value is 0.
+ */
+static __inline__ unsigned long __spin_trylock(spinlock_t *lock)
+{
+	unsigned long tmp, tmp2;
+
+	__asm__ __volatile__(
+"	lwz		%1,24(13)	# __spin_trylock\n\
+1:	lwarx		%0,0,%2\n\
+	cmpwi		0,%0,0\n\
+	bne-		2f\n\
+	stwcx.		%1,0,%2\n\
+	bne-		1b\n\
+	isync\n\
+2:"	: "=&r" (tmp), "=&r" (tmp2)
+	: "r" (&lock->lock)
+	: "cr0", "memory");
+
+	return tmp;
+}
+
+int _raw_spin_trylock(spinlock_t *lock)
+{
+	return __spin_trylock(lock) == 0;
+}
+
+EXPORT_SYMBOL(_raw_spin_trylock);
+
+void _raw_spin_lock(spinlock_t *lock)
+{
+	while (1) {
+		if (likely(__spin_trylock(lock) == 0))
+			break;
+		do {
+			HMT_low();
+			__spin_yield(lock);
+		} while (likely(lock->lock != 0));
+		HMT_medium();
+	}
+}
+
+EXPORT_SYMBOL(_raw_spin_lock);
+
+void _raw_spin_lock_flags(spinlock_t *lock, unsigned long flags)
+{
+	unsigned long flags_dis;
+
+	while (1) {
+		if (likely(__spin_trylock(lock) == 0))
+			break;
+		local_save_flags(flags_dis);
+		local_irq_restore(flags);
+		do {
+			HMT_low();
+			__spin_yield(lock);
+		} while (likely(lock->lock != 0));
+		HMT_medium();
+		local_irq_restore(flags_dis);
+	}
+}
+
+EXPORT_SYMBOL(_raw_spin_lock_flags);
+
+void spin_unlock_wait(spinlock_t *lock)
+{
+	while (lock->lock)
+		__spin_yield(lock);
+}
+
+EXPORT_SYMBOL(spin_unlock_wait);
+
+/*
+ * Waiting for a read lock or a write lock on a rwlock...
+ * This turns out to be the same for read and write locks, since
+ * we only know the holder if it is write-locked.
+ */
+#if defined(CONFIG_PPC_SPLPAR) || defined(CONFIG_PPC_ISERIES)
+void __rw_yield(rwlock_t *rw)
+{
+	int lock_value;
+	unsigned int holder_cpu, yield_count;
+	struct paca_struct *holder_paca;
+
+	lock_value = rw->lock;
+	if (lock_value >= 0)
+		return;		/* no write lock at present */
+	holder_cpu = lock_value & 0xffff;
+	BUG_ON(holder_cpu >= NR_CPUS);
+	holder_paca = &paca[holder_cpu];
+	yield_count = holder_paca->xLpPaca.xYieldCount;
+	if ((yield_count & 1) == 0)
+		return;		/* virtual cpu is currently running */
+	rmb();
+	if (rw->lock != lock_value)
+		return;		/* something has changed */
+#ifdef CONFIG_PPC_ISERIES
+	HvCall2(HvCallBaseYieldProcessor, HvCall_YieldToProc,
+		((u64)holder_cpu << 32) | yield_count);
+#else
+	plpar_hcall_norets(H_CONFER, holder_cpu, yield_count);
+#endif
+}
+
+#else /* SPLPAR || ISERIES */
+#define __rw_yield(x)	barrier()
+#endif
+
+/*
+ * This returns the old value in the lock + 1,
+ * so we got a read lock if the return value is > 0.
+ */
+static __inline__ long __read_trylock(rwlock_t *rw)
+{
+	long tmp;
+
+	__asm__ __volatile__(
+"1:	lwarx		%0,0,%1		# read_trylock\n\
+	extsw		%0,%0\n\
+	addic.		%0,%0,1\n\
+	ble-		2f\n\
+	stwcx.		%0,0,%1\n\
+	bne-		1b\n\
+	isync\n\
+2:"	: "=&r" (tmp)
+	: "r" (&rw->lock)
+	: "cr0", "xer", "memory");
+
+	return tmp;
+}
+
+int _raw_read_trylock(rwlock_t *rw)
+{
+	return __read_trylock(rw) > 0;
+}
+
+EXPORT_SYMBOL(_raw_read_trylock);
+
+void _raw_read_lock(rwlock_t *rw)
+{
+	while (1) {
+		if (likely(__read_trylock(rw) > 0))
+			break;
+		do {
+			HMT_low();
+			__rw_yield(rw);
+		} while (likely(rw->lock < 0));
+		HMT_medium();
+	}
+}
+
+EXPORT_SYMBOL(_raw_read_lock);
+
+void _raw_read_unlock(rwlock_t *rw)
+{
+	long tmp;
+
+	__asm__ __volatile__(
+	"eieio			# read_unlock\n\
+1:	lwarx		%0,0,%1\n\
+	addic		%0,%0,-1\n\
+	stwcx.		%0,0,%1\n\
+	bne-		1b"
+	: "=&r"(tmp)
+	: "r"(&rw->lock)
+	: "cr0", "memory");
+}
+
+EXPORT_SYMBOL(_raw_read_unlock);
+
+/*
+ * This returns the old value in the lock,
+ * so we got the write lock if the return value is 0.
+ */
+static __inline__ long __write_trylock(rwlock_t *rw)
+{
+	long tmp, tmp2;
+
+	__asm__ __volatile__(
+"	lwz		%1,24(13)	# write_trylock\n\
+1:	lwarx		%0,0,%2\n\
+	cmpwi		0,%0,0\n\
+	bne-		2f\n\
+	stwcx.		%1,0,%2\n\
+	bne-		1b\n\
+	isync\n\
+2:"	: "=&r" (tmp), "=&r" (tmp2)
+	: "r" (&rw->lock)
+	: "cr0", "memory");
+
+	return tmp;
+}
+
+int _raw_write_trylock(rwlock_t *rw)
+{
+	return __write_trylock(rw) == 0;
+}
+
+EXPORT_SYMBOL(_raw_write_trylock);
+
+void _raw_write_lock(rwlock_t *rw)
+{
+	while (1) {
+		if (likely(__write_trylock(rw) == 0))
+			break;
+		do {
+			HMT_low();
+			__rw_yield(rw);
+		} while (likely(rw->lock != 0));
+		HMT_medium();
+	}
+}
+
+EXPORT_SYMBOL(_raw_write_lock);
+
+#endif /* CONFIG_SPINLINE */
diff -puN arch/ppc64/lib/Makefile~un-inline-spinlocks-on-ppc64 arch/ppc64/lib/Makefile
--- 25/arch/ppc64/lib/Makefile~un-inline-spinlocks-on-ppc64	2004-05-09 17:18:07.573974520 -0700
+++ 25-akpm/arch/ppc64/lib/Makefile	2004-05-09 17:18:07.582973152 -0700
@@ -3,4 +3,4 @@
 #
 
 lib-y := checksum.o dec_and_lock.o string.o strcase.o
-lib-y += copypage.o memcpy.o copyuser.o
+lib-y += copypage.o memcpy.o copyuser.o locks.o
diff -puN include/asm-ppc64/paca.h~un-inline-spinlocks-on-ppc64 include/asm-ppc64/paca.h
--- 25/include/asm-ppc64/paca.h~un-inline-spinlocks-on-ppc64	2004-05-09 17:18:07.574974368 -0700
+++ 25-akpm/include/asm-ppc64/paca.h	2004-05-09 17:18:07.582973152 -0700
@@ -60,8 +60,11 @@ struct paca_struct {
 	struct ItLpPaca *xLpPacaPtr;	/* Pointer to LpPaca for PLIC		0x00 */
 	struct ItLpRegSave *xLpRegSavePtr; /* Pointer to LpRegSave for PLIC	0x08 */
 	u64 xCurrent;			/* Pointer to current			0x10 */
-	u16 xPacaIndex;			/* Logical processor number		0x18 */
-	u16 xHwProcNum;			/* Physical processor number		0x1A */
+	/* Note: the spinlock functions in arch/ppc64/lib/locks.c load lock_token and
+	   xPacaIndex with a single lwz instruction, using the constant offset 24.
+	   If you move either field, fix the spinlocks and rwlocks. */
+	u16 lock_token;			/* Constant 0x8000, used in spinlocks	0x18 */
+	u16 xPacaIndex;			/* Logical processor number		0x1A */
 	u32 default_decr;		/* Default decrementer value		0x1c */
 	u64 xKsave;			/* Saved Kernel stack addr or zero	0x20 */
 	struct ItLpQueue *lpQueuePtr;	/* LpQueue handled by this processor	0x28 */
@@ -70,7 +73,8 @@ struct paca_struct {
 	u8 *exception_sp;		/*					0x50 */
 	u8 xProcEnabled;		/*					0x58 */
 	u8 prof_enabled;		/* 1=iSeries profiling enabled		0x59 */
-	u8 resv1[38];			/*					0x5a-0x7f*/
+	u16 xHwProcNum;			/* Physical processor number		0x5a */
+	u8 resv1[36];			/*					0x5c */
 
 /*=====================================================================================
  * CACHE_LINE_2 0x0080 - 0x00FF
diff -puN include/asm-ppc64/spinlock.h~un-inline-spinlocks-on-ppc64 include/asm-ppc64/spinlock.h
--- 25/include/asm-ppc64/spinlock.h~un-inline-spinlocks-on-ppc64	2004-05-09 17:18:07.576974064 -0700
+++ 25-akpm/include/asm-ppc64/spinlock.h	2004-05-09 17:18:07.584972848 -0700
@@ -4,7 +4,7 @@
 /*
  * Simple spin lock operations.
  *
- * Copyright (C) 2001 Paul Mackerras, IBM
+ * Copyright (C) 2001-2004 Paul Mackerras, IBM
  * Copyright (C) 2001 Anton Blanchard, IBM
  *
  * Type of int is used as a full 64b word is not necessary.
@@ -14,6 +14,8 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+#include
+
 typedef struct {
 	volatile unsigned int lock;
 } spinlock_t;
@@ -22,26 +24,48 @@ typedef struct {
 #define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0 }
 
 #define spin_is_locked(x)	((x)->lock != 0)
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
+#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
+
+static __inline__ void _raw_spin_unlock(spinlock_t *lock)
+{
+	__asm__ __volatile__("eieio	# spin_unlock": : :"memory");
+	lock->lock = 0;
+}
+
+/*
+ * Normally we use the spinlock functions in arch/ppc64/lib/locks.c.
+ * For special applications such as profiling, we can have the
+ * spinlock functions inline by defining CONFIG_SPINLINE.
+ * This is not recommended on partitioned systems with shared
+ * processors, since the inline spinlock functions don't include
+ * the code for yielding the CPU to the lock holder.
+ */
+
+#ifndef CONFIG_SPINLINE
+extern int _raw_spin_trylock(spinlock_t *lock);
+extern void _raw_spin_lock(spinlock_t *lock);
+extern void _raw_spin_lock_flags(spinlock_t *lock, unsigned long flags);
+extern void spin_unlock_wait(spinlock_t *lock);
+
+#else
 
 static __inline__ int _raw_spin_trylock(spinlock_t *lock)
 {
-	unsigned int tmp;
+	unsigned int tmp, tmp2;
 
 	__asm__ __volatile__(
-"1:	lwarx		%0,0,%1		# spin_trylock\n\
+"1:	lwarx		%0,0,%2		# spin_trylock\n\
 	cmpwi		0,%0,0\n\
-	li		%0,0\n\
 	bne-		2f\n\
-	li		%0,1\n\
-	stwcx.		%0,0,%1\n\
+	lwz		%1,24(13)\n\
+	stwcx.		%1,0,%2\n\
 	bne-		1b\n\
 	isync\n\
-2:"	: "=&r"(tmp)
+2:"	: "=&r"(tmp), "=&r"(tmp2)
 	: "r"(&lock->lock)
 	: "cr0", "memory");
 
-	return tmp;
+	return tmp != 0;
 }
 
 static __inline__ void _raw_spin_lock(spinlock_t *lock)
@@ -59,20 +83,51 @@ static __inline__ void _raw_spin_lock(sp
 "2:	lwarx		%0,0,%1\n\
 	cmpwi		0,%0,0\n\
 	bne-		1b\n\
-	stwcx.		%2,0,%1\n\
+	lwz		%0,24(13)\n\
+	stwcx.		%0,0,%1\n\
 	bne-		2b\n\
 	isync"
 	: "=&r"(tmp)
-	: "r"(&lock->lock), "r"(1)
+	: "r"(&lock->lock)
 	: "cr0", "memory");
 }
 
-static __inline__ void _raw_spin_unlock(spinlock_t *lock)
+/*
+ * Note: if we ever want to inline the spinlocks on iSeries,
+ * we will have to change the irq enable/disable stuff in here.
+ */
+static __inline__ void _raw_spin_lock_flags(spinlock_t *lock,
+					    unsigned long flags)
 {
-	__asm__ __volatile__("eieio	# spin_unlock": : :"memory");
-	lock->lock = 0;
+	unsigned int tmp;
+	unsigned long tmp2;
+
+	__asm__ __volatile__(
+	"b		2f		# spin_lock\n\
+1:	mfmsr		%1\n\
+	mtmsrd		%3,1\n\
+2:"	HMT_LOW
+"	lwzx		%0,0,%2\n\
+	cmpwi		0,%0,0\n\
+	bne+		2b\n"
+	HMT_MEDIUM
+"	mtmsrd		%1,1\n\
+3:	lwarx		%0,0,%2\n\
+	cmpwi		0,%0,0\n\
+	bne-		1b\n\
+	lwz		%1,24(13)\n\
+	stwcx.		%1,0,%2\n\
+	bne-		3b\n\
+	isync"
+	: "=&r"(tmp), "=&r"(tmp2)
+	: "r"(&lock->lock), "r"(flags)
+	: "cr0", "memory");
 }
 
+#define spin_unlock_wait(x)	do { cpu_relax(); } while (spin_is_locked(x))
+
+#endif /* CONFIG_SPINLINE */
+
 /*
  * Read-write spinlocks, allowing multiple readers
  * but only one writer.
@@ -89,6 +144,34 @@ typedef struct {
 
 #define RW_LOCK_UNLOCKED (rwlock_t) { 0 }
 
+#define rwlock_init(x)		do { *(x) = RW_LOCK_UNLOCKED; } while(0)
+#define rwlock_is_locked(x)	((x)->lock)
+
+static __inline__ int is_read_locked(rwlock_t *rw)
+{
+	return rw->lock > 0;
+}
+
+static __inline__ int is_write_locked(rwlock_t *rw)
+{
+	return rw->lock < 0;
+}
+
+static __inline__ void _raw_write_unlock(rwlock_t *rw)
+{
+	__asm__ __volatile__("eieio	# write_unlock": : :"memory");
+	rw->lock = 0;
+}
+
+#ifndef CONFIG_SPINLINE
+extern int _raw_read_trylock(rwlock_t *rw);
+extern void _raw_read_lock(rwlock_t *rw);
+extern void _raw_read_unlock(rwlock_t *rw);
+extern int _raw_write_trylock(rwlock_t *rw);
+extern void _raw_write_lock(rwlock_t *rw);
+extern void _raw_write_unlock(rwlock_t *rw);
+
+#else
 static __inline__ int _raw_read_trylock(rwlock_t *rw)
 {
 	unsigned int tmp;
@@ -193,29 +276,7 @@ static __inline__ void _raw_write_lock(r
 	: "r"(&rw->lock), "r"(-1)
 	: "cr0", "memory");
 }
-
-static __inline__ void _raw_write_unlock(rwlock_t *rw)
-{
-	__asm__ __volatile__("eieio	# write_unlock": : :"memory");
-	rw->lock = 0;
-}
-
-static __inline__ int is_read_locked(rwlock_t *rw)
-{
-	return rw->lock > 0;
-}
-
-static __inline__ int is_write_locked(rwlock_t *rw)
-{
-	return rw->lock < 0;
-}
-
-#define spin_lock_init(x)	do { *(x) = SPIN_LOCK_UNLOCKED; } while(0)
-#define spin_unlock_wait(x)	do { cpu_relax(); } while(spin_is_locked(x))
-
-#define rwlock_init(x)		do { *(x) = RW_LOCK_UNLOCKED; } while(0)
-
-#define rwlock_is_locked(x)	((x)->lock)
+#endif /* CONFIG_SPINLINE */
 
 #endif /* __KERNEL__ */
 #endif /* __ASM_SPINLOCK_H */
_
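
For reference, here is a rough, self-contained user-space sketch of the
shared-processor slow path described in the changelog above.  It is not
the kernel code: sketch_lock_t, vcpu_is_running() and confer_timeslice()
are illustrative stand-ins for the lock word, the paca xYieldCount parity
check and the H_CONFER/HvCall2 hypervisor calls, and the stubs here just
pretend the holder is running or call sched_yield().

/* Build: cc -std=c11 sketch.c -c */
#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

typedef struct {
	/* 0 when free; 0x80000000 | holder_cpu when held, as in the patch */
	_Atomic unsigned int lock;
} sketch_lock_t;

/* Stand-in for the xYieldCount check (even count = vcpu running). */
static bool vcpu_is_running(unsigned int holder_cpu)
{
	(void)holder_cpu;
	return true;		/* stub: assume the holder is always running */
}

/* Stand-in for plpar_hcall_norets(H_CONFER, ...) / HvCall2(...). */
static void confer_timeslice(unsigned int holder_cpu)
{
	(void)holder_cpu;
	sched_yield();		/* closest user-space analogue */
}

/* Try once; true means we now hold the lock and it carries our id. */
static bool sketch_trylock(sketch_lock_t *l, unsigned int this_cpu)
{
	unsigned int expected = 0;

	return atomic_compare_exchange_strong_explicit(&l->lock, &expected,
			0x80000000u | this_cpu,
			memory_order_acquire, memory_order_relaxed);
}

/* Acquire: spin, but offer our timeslice to a preempted lock holder. */
static void sketch_lock(sketch_lock_t *l, unsigned int this_cpu)
{
	for (;;) {
		if (sketch_trylock(l, this_cpu))
			return;
		do {
			unsigned int val = atomic_load_explicit(&l->lock,
					memory_order_relaxed);

			if (val != 0 && !vcpu_is_running(val & 0xffff))
				confer_timeslice(val & 0xffff);
		} while (atomic_load_explicit(&l->lock,
					memory_order_relaxed) != 0);
	}
}

static void sketch_unlock(sketch_lock_t *l)
{
	atomic_store_explicit(&l->lock, 0, memory_order_release);
}

The real __spin_yield() additionally re-reads the lock word after sampling
the holder's yield count and passes that count to the hypervisor, so a
confer based on stale information is harmless.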