-rw-r--r--  arch/sparc/mm/srmmu.c | 5
-rw-r--r--  arch/sparc/mm/sun4c.c | 7
-rw-r--r--  arch/sparc64/Kconfig | 29
-rw-r--r--  arch/sparc64/Makefile | 4
-rw-r--r--  arch/sparc64/kernel/cpu.c | 2
-rw-r--r--  arch/sparc64/kernel/dtlb_backend.S | 160
-rw-r--r--  arch/sparc64/kernel/dtlb_base.S | 4
-rw-r--r--  arch/sparc64/kernel/entry.S | 224
-rw-r--r--  arch/sparc64/kernel/etrap.S | 72
-rw-r--r--  arch/sparc64/kernel/head.S | 81
-rw-r--r--  arch/sparc64/kernel/rtrap.S | 35
-rw-r--r--  arch/sparc64/kernel/semaphore.c | 76
-rw-r--r--  arch/sparc64/kernel/setup.c | 16
-rw-r--r--  arch/sparc64/kernel/smp.c | 92
-rw-r--r--  arch/sparc64/kernel/sparc64_ksyms.c | 17
-rw-r--r--  arch/sparc64/kernel/sys_sparc32.c | 2
-rw-r--r--  arch/sparc64/kernel/trampoline.S | 15
-rw-r--r--  arch/sparc64/kernel/traps.c | 42
-rw-r--r--  arch/sparc64/kernel/unaligned.c | 6
-rw-r--r--  arch/sparc64/kernel/winfixup.S | 74
-rw-r--r--  arch/sparc64/lib/Makefile | 4
-rw-r--r--  arch/sparc64/lib/U1memcpy.S | 84
-rw-r--r--  arch/sparc64/lib/U3memcpy.S | 28
-rw-r--r--  arch/sparc64/lib/VIS.h | 128
-rw-r--r--  arch/sparc64/lib/VISbzero.S | 274
-rw-r--r--  arch/sparc64/lib/VIScsum.S | 546
-rw-r--r--  arch/sparc64/lib/VIScsumcopy.S | 897
-rw-r--r--  arch/sparc64/lib/VIScsumcopyusr.S | 916
-rw-r--r--  arch/sparc64/lib/VISmemset.S | 240
-rw-r--r--  arch/sparc64/lib/atomic.S | 64
-rw-r--r--  arch/sparc64/lib/bitops.S | 42
-rw-r--r--  arch/sparc64/lib/bzero.S | 158
-rw-r--r--  arch/sparc64/lib/checksum.S | 644
-rw-r--r--  arch/sparc64/lib/csum_copy.S | 308
-rw-r--r--  arch/sparc64/lib/csum_copy_from_user.S | 21
-rw-r--r--  arch/sparc64/lib/csum_copy_to_user.S | 21
-rw-r--r--  arch/sparc64/lib/debuglocks.c | 76
-rw-r--r--  arch/sparc64/lib/dec_and_lock.S | 16
-rw-r--r--  arch/sparc64/lib/mcount.S | 18
-rw-r--r--  arch/sparc64/lib/memcmp.S | 4
-rw-r--r--  arch/sparc64/lib/memmove.S | 10
-rw-r--r--  arch/sparc64/lib/memscan.S | 32
-rw-r--r--  arch/sparc64/lib/rwsem.S | 165
-rw-r--r--  arch/sparc64/lib/rwsem.c | 239
-rw-r--r--  arch/sparc64/lib/strlen.S | 12
-rw-r--r--  arch/sparc64/lib/strlen_user.S | 12
-rw-r--r--  arch/sparc64/lib/strncpy_from_user.S | 26
-rw-r--r--  arch/sparc64/lib/xor.S | 46
-rw-r--r--  arch/sparc64/mm/fault.c | 4
-rw-r--r--  arch/sparc64/mm/hugetlbpage.c | 39
-rw-r--r--  arch/sparc64/mm/init.c | 143
-rw-r--r--  arch/sparc64/mm/tlb.c | 39
-rw-r--r--  arch/sparc64/mm/ultra.S | 109
-rw-r--r--  arch/sparc64/prom/map.S | 2
-rw-r--r--  arch/sparc64/prom/p1275.c | 20
-rw-r--r--  include/asm-parisc/unaligned.h | 2
-rw-r--r--  include/asm-sparc/pgtable.h | 23
-rw-r--r--  include/asm-sparc64/cacheflush.h | 17
-rw-r--r--  include/asm-sparc64/checksum.h | 51
-rw-r--r--  include/asm-sparc64/cpudata.h | 5
-rw-r--r--  include/asm-sparc64/ide.h | 10
-rw-r--r--  include/asm-sparc64/mmu.h | 96
-rw-r--r--  include/asm-sparc64/mmu_context.h | 33
-rw-r--r--  include/asm-sparc64/page.h | 11
-rw-r--r--  include/asm-sparc64/percpu.h | 41
-rw-r--r--  include/asm-sparc64/pgalloc.h | 69
-rw-r--r--  include/asm-sparc64/pgtable.h | 38
-rw-r--r--  include/asm-sparc64/rwsem-const.h | 12
-rw-r--r--  include/asm-sparc64/rwsem.h | 17
-rw-r--r--  include/asm-sparc64/spitfire.h | 47
-rw-r--r--  include/asm-sparc64/system.h | 42
-rw-r--r--  include/asm-sparc64/tlb.h | 6
72 files changed, 2150 insertions(+), 4720 deletions(-)
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index 8c66349f316b2..c89a803cbc20d 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -161,6 +161,9 @@ static inline int srmmu_pte_none(pte_t pte)
static inline int srmmu_pte_present(pte_t pte)
{ return ((pte_val(pte) & SRMMU_ET_MASK) == SRMMU_ET_PTE); }
+static inline int srmmu_pte_read(pte_t pte)
+{ return !(pte_val(pte) & SRMMU_NOREAD); }
+
static inline void srmmu_pte_clear(pte_t *ptep)
{ srmmu_set_pte(ptep, __pte(0)); }
@@ -2166,6 +2169,7 @@ void __init ld_mmu_srmmu(void)
BTFIXUPSET_CALL(pte_present, srmmu_pte_present, BTFIXUPCALL_NORM);
BTFIXUPSET_CALL(pte_clear, srmmu_pte_clear, BTFIXUPCALL_SWAPO0G0);
+ BTFIXUPSET_CALL(pte_read, srmmu_pte_read, BTFIXUPCALL_NORM);
BTFIXUPSET_CALL(pmd_bad, srmmu_pmd_bad, BTFIXUPCALL_NORM);
BTFIXUPSET_CALL(pmd_present, srmmu_pmd_present, BTFIXUPCALL_NORM);
@@ -2196,7 +2200,6 @@ void __init ld_mmu_srmmu(void)
BTFIXUPSET_CALL(free_pgd_fast, srmmu_free_pgd_fast, BTFIXUPCALL_NORM);
BTFIXUPSET_CALL(get_pgd_fast, srmmu_get_pgd_fast, BTFIXUPCALL_NORM);
- BTFIXUPSET_HALF(pte_readi, SRMMU_NOREAD);
BTFIXUPSET_HALF(pte_writei, SRMMU_WRITE);
BTFIXUPSET_HALF(pte_dirtyi, SRMMU_DIRTY);
BTFIXUPSET_HALF(pte_youngi, SRMMU_REF);
diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c
index 03342120f1f85..1d560390e2821 100644
--- a/arch/sparc/mm/sun4c.c
+++ b/arch/sparc/mm/sun4c.c
@@ -1746,6 +1746,11 @@ static int sun4c_pte_present(pte_t pte)
}
static void sun4c_pte_clear(pte_t *ptep) { *ptep = __pte(0); }
+static int sun4c_pte_read(pte_t pte)
+{
+ return (pte_val(pte) & _SUN4C_PAGE_READ);
+}
+
static int sun4c_pmd_bad(pmd_t pmd)
{
return (((pmd_val(pmd) & ~PAGE_MASK) != PGD_TABLE) ||
@@ -2199,6 +2204,7 @@ void __init ld_mmu_sun4c(void)
BTFIXUPSET_CALL(pte_present, sun4c_pte_present, BTFIXUPCALL_NORM);
BTFIXUPSET_CALL(pte_clear, sun4c_pte_clear, BTFIXUPCALL_STG0O0);
+ BTFIXUPSET_CALL(pte_read, sun4c_pte_read, BTFIXUPCALL_NORM);
BTFIXUPSET_CALL(pmd_bad, sun4c_pmd_bad, BTFIXUPCALL_NORM);
BTFIXUPSET_CALL(pmd_present, sun4c_pmd_present, BTFIXUPCALL_NORM);
@@ -2225,7 +2231,6 @@ void __init ld_mmu_sun4c(void)
BTFIXUPSET_CALL(free_pgd_fast, sun4c_free_pgd_fast, BTFIXUPCALL_NORM);
BTFIXUPSET_CALL(get_pgd_fast, sun4c_get_pgd_fast, BTFIXUPCALL_NORM);
- BTFIXUPSET_HALF(pte_readi, _SUN4C_PAGE_READ);
BTFIXUPSET_HALF(pte_writei, _SUN4C_PAGE_WRITE);
BTFIXUPSET_HALF(pte_dirtyi, _SUN4C_PAGE_MODIFIED);
BTFIXUPSET_HALF(pte_youngi, _SUN4C_PAGE_ACCESSED);
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index b31687f3e7214..46a2436c9600c 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -16,6 +16,33 @@ config TIME_INTERPOLATION
bool
default y
+choice
+ prompt "Kernel page size"
+ default SPARC64_PAGE_SIZE_8KB
+
+config SPARC64_PAGE_SIZE_8KB
+ bool "8KB"
+ help
+ This lets you select the page size of the kernel.
+
+ 8KB and 64KB work quite well, since Sparc ELF sections
+ provide for up to 64KB alignment.
+
+ Therefore, 512KB and 4MB are for expert hackers only.
+
+ If you don't know what to do, choose 8KB.
+
+config SPARC64_PAGE_SIZE_64KB
+ bool "64KB"
+
+config SPARC64_PAGE_SIZE_512KB
+ bool "512KB"
+
+config SPARC64_PAGE_SIZE_4MB
+ bool "4MB"
+
+endchoice
+
source "init/Kconfig"
config SYSVIPC_COMPAT
@@ -198,9 +225,11 @@ config HUGETLB_PAGE_SIZE_4MB
bool "4MB"
config HUGETLB_PAGE_SIZE_512K
+ depends on !SPARC64_PAGE_SIZE_4MB
bool "512K"
config HUGETLB_PAGE_SIZE_64K
+	depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB
bool "64K"
endchoice
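
(For orientation, not part of the patch: a small standalone C sketch of the PAGE_SHIFT and PAGE_SIZE each of the new "Kernel page size" options implies. These are the same shifts that dtlb_backend.S below keys its SZ_BITS selection on.)

#include <stdio.h>

int main(void)
{
	struct { const char *option; int shift; } tbl[] = {
		{ "SPARC64_PAGE_SIZE_8KB",   13 },
		{ "SPARC64_PAGE_SIZE_64KB",  16 },
		{ "SPARC64_PAGE_SIZE_512KB", 19 },
		{ "SPARC64_PAGE_SIZE_4MB",   22 },
	};
	int i;

	for (i = 0; i < 4; i++)
		printf("%-24s PAGE_SHIFT=%d PAGE_SIZE=%lu bytes\n",
		       tbl[i].option, tbl[i].shift, 1UL << tbl[i].shift);
	return 0;
}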
diff --git a/arch/sparc64/Makefile b/arch/sparc64/Makefile
index 61724880f20d3..43fe382da0789 100644
--- a/arch/sparc64/Makefile
+++ b/arch/sparc64/Makefile
@@ -41,10 +41,10 @@ endif
ifneq ($(NEW_GCC),y)
CFLAGS := $(CFLAGS) -pipe -mno-fpu -mtune=ultrasparc -mmedlow \
- -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare
+ -ffixed-g4 -ffixed-g5 -fcall-used-g7 -Wno-sign-compare
else
CFLAGS := $(CFLAGS) -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow \
- -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare \
+ -ffixed-g4 -ffixed-g5 -fcall-used-g7 -Wno-sign-compare \
$(CC_UNDECL)
AFLAGS += -m64 -mcpu=ultrasparc $(CC_UNDECL)
endif
diff --git a/arch/sparc64/kernel/cpu.c b/arch/sparc64/kernel/cpu.c
index 9043e2e03a1f4..48756958116b5 100644
--- a/arch/sparc64/kernel/cpu.c
+++ b/arch/sparc64/kernel/cpu.c
@@ -38,6 +38,7 @@ struct cpu_fp_info linux_sparc_fpu[] = {
{ 0x3e, 0x14, 0, "UltraSparc III integrated FPU"},
{ 0x3e, 0x15, 0, "UltraSparc III+ integrated FPU"},
{ 0x3e, 0x16, 0, "UltraSparc IIIi integrated FPU"},
+ { 0x3e, 0x18, 0, "UltraSparc IV integrated FPU"},
};
#define NSPARCFPU (sizeof(linux_sparc_fpu)/sizeof(struct cpu_fp_info))
@@ -51,6 +52,7 @@ struct cpu_iu_info linux_sparc_chips[] = {
{ 0x3e, 0x14, "TI UltraSparc III (Cheetah)"},
{ 0x3e, 0x15, "TI UltraSparc III+ (Cheetah+)"},
{ 0x3e, 0x16, "TI UltraSparc IIIi (Jalapeno)"},
+ { 0x3e, 0x18, "TI UltraSparc IV (Jaguar)"},
};
#define NSPARCCHIPS (sizeof(linux_sparc_chips)/sizeof(struct cpu_iu_info))
diff --git a/arch/sparc64/kernel/dtlb_backend.S b/arch/sparc64/kernel/dtlb_backend.S
index e6bc4a26aeb9b..b73a3c8587704 100644
--- a/arch/sparc64/kernel/dtlb_backend.S
+++ b/arch/sparc64/kernel/dtlb_backend.S
@@ -7,60 +7,143 @@
*/
#include <asm/pgtable.h>
-#include <asm/mmu_context.h>
+#include <asm/mmu.h>
#if PAGE_SHIFT == 13
-#define FILL_VALID_SZ_BITS1(r1) \
- sllx %g2, 62, r1
-#define FILL_VALID_SZ_BITS2(r1)
-#define FILL_VALID_SZ_BITS_NOP nop
+#define SZ_BITS _PAGE_SZ8K
#elif PAGE_SHIFT == 16
-#define FILL_VALID_SZ_BITS1(r1) \
- or %g0, 5, r1
-#define FILL_VALID_SZ_BITS2(r1) \
- sllx r1, 61, r1
-#define FILL_VALID_SZ_BITS_NOP
-#else
-#error unsupported PAGE_SIZE
-#endif /* PAGE_SHIFT */
+#define SZ_BITS _PAGE_SZ64K
+#elif PAGE_SHIFT == 19
+#define SZ_BITS _PAGE_SZ512K
+#elif PAGE_SHIFT == 22
+#define SZ_BITS _PAGE_SZ4M
+#endif
+
+#define VALID_SZ_BITS (_PAGE_VALID | SZ_BITS)
#define VPTE_BITS (_PAGE_CP | _PAGE_CV | _PAGE_P )
#define VPTE_SHIFT (PAGE_SHIFT - 3)
-#define TLB_PMD_SHIFT (PAGE_SHIFT - 3 + 3)
-#define TLB_PGD_SHIFT (PMD_BITS + PAGE_SHIFT - 3 + 3)
-#define TLB_PMD_MASK (((1 << PMD_BITS) - 1) << 1)
-#define TLB_PGD_MASK (((1 << (VA_BITS - PAGE_SHIFT - (PAGE_SHIFT - 3) - PMD_BITS)) - 1) << 2)
/* Ways we can get here:
*
* 1) Nucleus loads and stores to/from PA-->VA direct mappings at tl>1.
* 2) Nucleus loads and stores to/from user/kernel window save areas.
* 3) VPTE misses from dtlb_base and itlb_base.
+ *
+ * We need to extract out the PMD and PGDIR indexes from the
+ * linear virtual page table access address. The PTE index
+ * is at the bottom, but we are not concerned with it. Bits
+ * 0 to 2 are clear since each PTE is 8 bytes in size. Each
+ * PMD and PGDIR entry are 4 bytes in size. Thus, this
+ * address looks something like:
+ *
+ * |---------------------------------------------------------------|
+ * | ... | PGDIR index | PMD index | PTE index | |
+ * |---------------------------------------------------------------|
+ * 63 F E D C B A 3 2 0 <- bit nr
+ *
+ * The variable bits above are defined as:
+ * A --> 3 + (PAGE_SHIFT - log2(8))
+ * --> 3 + (PAGE_SHIFT - 3) - 1
+ * (ie. this is "bit 3" + PAGE_SIZE - size of PTE entry in bits - 1)
+ * B --> A + 1
+ * C --> B + (PAGE_SHIFT - log2(4))
+ * --> B + (PAGE_SHIFT - 2) - 1
+ * (ie. this is "bit B" + PAGE_SIZE - size of PMD entry in bits - 1)
+ * D --> C + 1
+ * E --> D + (PAGE_SHIFT - log2(4))
+ * --> D + (PAGE_SHIFT - 2) - 1
+ * (ie. this is "bit D" + PAGE_SIZE - size of PGDIR entry in bits - 1)
+ * F --> E + 1
+ *
+ * (Note how "B" always evaluates to PAGE_SHIFT, all the other constants
+ * cancel out.)
+ *
+ * For 8K PAGE_SIZE (thus, PAGE_SHIFT of 13) the bit numbers are:
+ * A --> 12
+ * B --> 13
+ * C --> 23
+ * D --> 24
+ * E --> 34
+ * F --> 35
+ *
+ * For 64K PAGE_SIZE (thus, PAGE_SHIFT of 16) the bit numbers are:
+ * A --> 15
+ * B --> 16
+ * C --> 29
+ * D --> 30
+ * E --> 43
+ * F --> 44
+ *
+ * Because bits both above and below each PGDIR and PMD index need to
+ * be masked out, and the index can be as long as 14 bits (when using a
+ * 64K PAGE_SIZE, and thus a PAGE_SHIFT of 16), we need 3 instructions
+ * to extract each index out.
+ *
+ * Shifts do not pair very well on UltraSPARC-I, II, IIi, and IIe, so
+ * we try to avoid using them for the entire operation. We could setup
+ * a mask anywhere from bit 31 down to bit 10 using the sethi instruction.
+ *
+ * We need a mask covering bits B --> C and one covering D --> E.
+ * For 8K PAGE_SIZE these masks are 0x00ffe000 and 0x7ff000000.
+ * For 64K PAGE_SIZE these masks are 0x3fff0000 and 0xfffc0000000.
+ * The second in each set cannot be loaded with a single sethi
+ * instruction, because the upper bits are past bit 32. We would
+ * need to use a sethi + a shift.
+ *
+ * For the time being, we use 2 shifts and a simple "and" mask.
+ * We shift left to clear the bits above the index, we shift down
+ * to clear the bits below the index (sans the log2(4 or 8) bits)
+ * and a mask to clear the log2(4 or 8) bits. We therefore need to
+ * define 4 shift counts, all of which are relative to PAGE_SHIFT.
+ *
+ * Although unsupportable for other reasons, this does mean that
+ * 512K and 4MB page sizes would be generally supported by the
+ * kernel. (ELF binaries would break with > 64K PAGE_SIZE since
+ * the sections are only aligned that strongly).
+ *
+ * The operations performed for extraction are thus:
+ *
+ * ((X << FOO_SHIFT_LEFT) >> FOO_SHIFT_RIGHT) & ~0x3
+ *
*/
+#define A (3 + (PAGE_SHIFT - 3) - 1)
+#define B (A + 1)
+#define C (B + (PAGE_SHIFT - 2) - 1)
+#define D (C + 1)
+#define E (D + (PAGE_SHIFT - 2) - 1)
+#define F (E + 1)
+
+#define PMD_SHIFT_LEFT (64 - D)
+#define PMD_SHIFT_RIGHT (64 - (D - B) - 2)
+#define PGDIR_SHIFT_LEFT (64 - F)
+#define PGDIR_SHIFT_RIGHT (64 - (F - D) - 2)
+#define LOW_MASK_BITS 0x3
+
/* TLB1 ** ICACHE line 1: tl1 DTLB and quick VPTE miss */
ldxa [%g1 + %g1] ASI_DMMU, %g4 ! Get TAG_ACCESS
add %g3, %g3, %g5 ! Compute VPTE base
cmp %g4, %g5 ! VPTE miss?
bgeu,pt %xcc, 1f ! Continue here
- andcc %g4, TAG_CONTEXT_BITS, %g5 ! From Nucleus? (for tl0 miss)
- ba,pt %xcc, from_tl1_trap ! Fall to tl0 miss
- rdpr %tl, %g5 ! For tl0 miss TL==3 test
+ andcc %g4, TAG_CONTEXT_BITS, %g5 ! tl0 miss Nucleus test
+ ba,a,pt %xcc, from_tl1_trap ! Fall to tl0 miss
1: sllx %g6, VPTE_SHIFT, %g4 ! Position TAG_ACCESS
+ or %g4, %g5, %g4 ! Prepare TAG_ACCESS
/* TLB1 ** ICACHE line 2: Quick VPTE miss */
- or %g4, %g5, %g4 ! Prepare TAG_ACCESS
mov TSB_REG, %g1 ! Grab TSB reg
ldxa [%g1] ASI_DMMU, %g5 ! Doing PGD caching?
- srlx %g6, (TLB_PMD_SHIFT - 1), %g1 ! Position PMD offset
+ sllx %g6, PMD_SHIFT_LEFT, %g1 ! Position PMD offset
be,pn %xcc, sparc64_vpte_nucleus ! Is it from Nucleus?
- and %g1, TLB_PMD_MASK, %g1 ! Mask PMD offset bits
+ srlx %g1, PMD_SHIFT_RIGHT, %g1 ! Mask PMD offset bits
brnz,pt %g5, sparc64_vpte_continue ! Yep, go like smoke
- add %g1, %g1, %g1 ! Position PMD offset some more
+ andn %g1, LOW_MASK_BITS, %g1 ! Final PMD mask
+ sllx %g6, PGDIR_SHIFT_LEFT, %g5 ! Position PGD offset
/* TLB1 ** ICACHE line 3: Quick VPTE miss */
- srlx %g6, (TLB_PGD_SHIFT - 2), %g5 ! Position PGD offset
- and %g5, TLB_PGD_MASK, %g5 ! Mask PGD offset
+ srlx %g5, PGDIR_SHIFT_RIGHT, %g5 ! Mask PGD offset bits
+ andn %g5, LOW_MASK_BITS, %g5 ! Final PGD mask
lduwa [%g7 + %g5] ASI_PHYS_USE_EC, %g5! Load PGD
brz,pn %g5, vpte_noent ! Valid?
sparc64_kpte_continue:
@@ -71,23 +154,28 @@ sparc64_vpte_continue:
brz,pn %g5, vpte_noent ! Valid?
/* TLB1 ** ICACHE line 4: Quick VPTE miss */
- FILL_VALID_SZ_BITS1(%g1) ! Put _PAGE_VALID into %g1
- FILL_VALID_SZ_BITS2(%g1) ! Put _PAGE_VALID into %g1
+ mov (VALID_SZ_BITS >> 61), %g1 ! upper vpte into %g1
+ sllx %g1, 61, %g1 ! finish calc
or %g5, VPTE_BITS, %g5 ! Prepare VPTE data
or %g5, %g1, %g5 ! ...
mov TLB_SFSR, %g1 ! Restore %g1 value
stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Load VPTE into TLB
stxa %g4, [%g1 + %g1] ASI_DMMU ! Restore previous TAG_ACCESS
retry ! Load PTE once again
- FILL_VALID_SZ_BITS_NOP
+#undef SZ_BITS
+#undef VALID_SZ_BITS
#undef VPTE_SHIFT
-#undef TLB_PMD_SHIFT
-#undef TLB_PGD_SHIFT
#undef VPTE_BITS
-#undef TLB_PMD_MASK
-#undef TLB_PGD_MASK
-#undef FILL_VALID_SZ_BITS1
-#undef FILL_VALID_SZ_BITS2
-#undef FILL_VALID_SZ_BITS_NOP
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef PMD_SHIFT_LEFT
+#undef PMD_SHIFT_RIGHT
+#undef PGDIR_SHIFT_LEFT
+#undef PGDIR_SHIFT_RIGHT
+#undef LOW_MASK_BITS
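
(Illustration, not part of the patch: a standalone C program that evaluates the A..F bit positions and the two-shift-plus-mask extraction described in the comment above, assuming the default 8K PAGE_SIZE. The virtual address used is an arbitrary example.)

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 13			/* 8K kernel pages */

#define A (3 + (PAGE_SHIFT - 3) - 1)
#define B (A + 1)
#define C (B + (PAGE_SHIFT - 2) - 1)
#define D (C + 1)
#define E (D + (PAGE_SHIFT - 2) - 1)
#define F (E + 1)

#define PMD_SHIFT_LEFT		(64 - D)
#define PMD_SHIFT_RIGHT		(64 - (D - B) - 2)
#define PGDIR_SHIFT_LEFT	(64 - F)
#define PGDIR_SHIFT_RIGHT	(64 - (F - D) - 2)
#define LOW_MASK_BITS		0x3

int main(void)
{
	uint64_t vpte_addr = 0x0000000345676000ULL;	/* arbitrary example */
	uint64_t pmd_off, pgd_off;

	/* ((X << FOO_SHIFT_LEFT) >> FOO_SHIFT_RIGHT) & ~0x3 */
	pmd_off = ((vpte_addr << PMD_SHIFT_LEFT) >> PMD_SHIFT_RIGHT)
		& ~(uint64_t)LOW_MASK_BITS;
	pgd_off = ((vpte_addr << PGDIR_SHIFT_LEFT) >> PGDIR_SHIFT_RIGHT)
		& ~(uint64_t)LOW_MASK_BITS;

	printf("A=%d B=%d C=%d D=%d E=%d F=%d\n", A, B, C, D, E, F);
	printf("PMD table offset   = %#llx\n", (unsigned long long)pmd_off);
	printf("PGDIR table offset = %#llx\n", (unsigned long long)pgd_off);
	return 0;
}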
diff --git a/arch/sparc64/kernel/dtlb_base.S b/arch/sparc64/kernel/dtlb_base.S
index 294fb44aeb2c9..ded2fed23fcc5 100644
--- a/arch/sparc64/kernel/dtlb_base.S
+++ b/arch/sparc64/kernel/dtlb_base.S
@@ -7,7 +7,7 @@
*/
#include <asm/pgtable.h>
-#include <asm/mmu_context.h>
+#include <asm/mmu.h>
/* %g1 TLB_SFSR (%g1 + %g1 == TLB_TAG_ACCESS)
* %g2 (KERN_HIGHBITS | KERN_LOWBITS)
@@ -68,8 +68,8 @@
/* DTLB ** ICACHE line 1: Quick user TLB misses */
ldxa [%g1 + %g1] ASI_DMMU, %g4 ! Get TAG_ACCESS
andcc %g4, TAG_CONTEXT_BITS, %g0 ! From Nucleus?
- mov 1, %g5 ! For TL==3 test
from_tl1_trap:
+ rdpr %tl, %g5 ! For TL==3 test
CREATE_VPTE_OFFSET1(%g4, %g6) ! Create VPTE offset
be,pn %xcc, 3f ! Yep, special processing
CREATE_VPTE_OFFSET2(%g4, %g6) ! Create VPTE offset
diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
index c4b705d0e00ca..a47f2d0b1a29b 100644
--- a/arch/sparc64/kernel/entry.S
+++ b/arch/sparc64/kernel/entry.S
@@ -38,97 +38,150 @@
* range (note that this is only possible for instruction miss, data misses to
* obp range do not use vpte). If so, go back directly to the faulting address.
* This is because we want to read the tpc, otherwise we have no way of knowing
- * the 8k aligned faulting address if we are using >8k kernel pagesize. This also
- * ensures no vpte range addresses are dropped into tlb while obp is executing
- * (see inherit_locked_prom_mappings() rant).
+ * the 8k aligned faulting address if we are using >8k kernel pagesize. This
+ * also ensures no vpte range addresses are dropped into tlb while obp is
+ * executing (see inherit_locked_prom_mappings() rant).
*/
sparc64_vpte_nucleus:
+ /* Load 0xf0000000, which is LOW_OBP_ADDRESS. */
mov 0xf, %g5
- sllx %g5, 28, %g5 ! Load 0xf0000000
- cmp %g4, %g5 ! Is addr >= LOW_OBP_ADDRESS?
+ sllx %g5, 28, %g5
+
+ /* Is addr >= LOW_OBP_ADDRESS? */
+ cmp %g4, %g5
blu,pn %xcc, sparc64_vpte_patchme1
mov 0x1, %g5
- sllx %g5, 32, %g5 ! Load 0x100000000
- cmp %g4, %g5 ! Is addr < HI_OBP_ADDRESS?
+
+ /* Load 0x100000000, which is HI_OBP_ADDRESS. */
+ sllx %g5, 32, %g5
+
+ /* Is addr < HI_OBP_ADDRESS? */
+ cmp %g4, %g5
blu,pn %xcc, obp_iaddr_patch
nop
+
+	/* These two instructions are patched by paging_init(). */
sparc64_vpte_patchme1:
- sethi %hi(0), %g5 ! This has to be patched
+ sethi %hi(0), %g5
sparc64_vpte_patchme2:
- or %g5, %lo(0), %g5 ! This is patched too
- ba,pt %xcc, sparc64_kpte_continue ! Part of dtlb_backend
- add %g1, %g1, %g1 ! Finish PMD offset adjustment
+ or %g5, %lo(0), %g5
+
+ /* With kernel PGD in %g5, branch back into dtlb_backend. */
+ ba,pt %xcc, sparc64_kpte_continue
+ andn %g1, 0x3, %g1 /* Finish PMD offset adjustment. */
vpte_noent:
- mov TLB_SFSR, %g1 ! Restore %g1 value
- stxa %g4, [%g1 + %g1] ASI_DMMU ! Restore previous TAG_ACCESS
- done ! Slick trick
+ /* Restore previous TAG_ACCESS, %g5 is zero, and we will
+ * skip over the trap instruction so that the top level
+	 * TLB miss handler will think this %g5 value is just an
+ * invalid PTE, thus branching to full fault processing.
+ */
+ mov TLB_SFSR, %g1
+ stxa %g4, [%g1 + %g1] ASI_DMMU
+ done
.globl obp_iaddr_patch
- .globl obp_daddr_patch
-
obp_iaddr_patch:
- sethi %hi(0), %g5 ! This and following is patched
- or %g5, %lo(0), %g5 ! g5 now holds obp pmd base physaddr
- wrpr %g0, 1, %tl ! Behave as if we are at TL0
- rdpr %tpc, %g4 ! Find original faulting iaddr
- srlx %g4, 13, %g4 ! Throw out context bits
- sllx %g4, 13, %g4 ! g4 has vpn + ctx0 now
- mov TLB_SFSR, %g1 ! Restore %g1 value
- stxa %g4, [%g1 + %g1] ASI_IMMU ! Restore previous TAG_ACCESS
- srlx %g4, 23, %g6 ! Find pmd number
- and %g6, 0x7ff, %g6 ! Find pmd number
- sllx %g6, 2, %g6 ! Find pmd offset
- lduwa [%g5 + %g6] ASI_PHYS_USE_EC, %g5! Load pmd, ie pagetable physaddr
- brz,pn %g5, longpath ! Kill the PROM ? :-)
- sllx %g5, 11, %g5 ! Shift into place
- srlx %g4, 13, %g6 ! find pte number in pagetable
- and %g6, 0x3ff, %g6 ! find pte number in pagetable
- sllx %g6, 3, %g6 ! find pte offset in pagetable
- ldxa [%g5 + %g6] ASI_PHYS_USE_EC, %g5! Load pte
- brgez,pn %g5, longpath ! Kill the PROM ? :-)
+ /* These two instructions patched by inherit_prom_mappings(). */
+ sethi %hi(0), %g5
+ or %g5, %lo(0), %g5
+
+ /* Behave as if we are at TL0. */
+ wrpr %g0, 1, %tl
+ rdpr %tpc, %g4 /* Find original faulting iaddr */
+ srlx %g4, 13, %g4 /* Throw out context bits */
+ sllx %g4, 13, %g4 /* g4 has vpn + ctx0 now */
+
+ /* Restore previous TAG_ACCESS. */
+ mov TLB_SFSR, %g1
+ stxa %g4, [%g1 + %g1] ASI_IMMU
+
+ /* Get PMD offset. */
+ srlx %g4, 23, %g6
+ and %g6, 0x7ff, %g6
+ sllx %g6, 2, %g6
+
+ /* Load PMD, is it valid? */
+ lduwa [%g5 + %g6] ASI_PHYS_USE_EC, %g5
+ brz,pn %g5, longpath
+ sllx %g5, 11, %g5
+
+ /* Get PTE offset. */
+ srlx %g4, 13, %g6
+ and %g6, 0x3ff, %g6
+ sllx %g6, 3, %g6
+
+ /* Load PTE. */
+ ldxa [%g5 + %g6] ASI_PHYS_USE_EC, %g5
+ brgez,pn %g5, longpath
nop
- stxa %g5, [%g0] ASI_ITLB_DATA_IN ! put into tlb
- retry ! go back to original fault
+ /* TLB load and return from trap. */
+ stxa %g5, [%g0] ASI_ITLB_DATA_IN
+ retry
+
+ .globl obp_daddr_patch
obp_daddr_patch:
- sethi %hi(0), %g5 ! This and following is patched
- or %g5, %lo(0), %g5 ! g5 now holds obp pmd base physaddr
- srlx %g4, 23, %g6 ! Find pmd number
- and %g6, 0x7ff, %g6 ! Find pmd number
- sllx %g6, 2, %g6 ! Find pmd offset
- lduwa [%g5 + %g6] ASI_PHYS_USE_EC, %g5! Load pmd, ie pagetable physaddr
+ /* These two instructions patched by inherit_prom_mappings(). */
+ sethi %hi(0), %g5
+ or %g5, %lo(0), %g5
+
+ /* Get PMD offset. */
+ srlx %g4, 23, %g6
+ and %g6, 0x7ff, %g6
+ sllx %g6, 2, %g6
+
+ /* Load PMD, is it valid? */
+ lduwa [%g5 + %g6] ASI_PHYS_USE_EC, %g5
brz,pn %g5, longpath
- sllx %g5, 11, %g5 ! Shift into place
- srlx %g4, 13, %g6 ! find pte number in pagetable
- and %g6, 0x3ff, %g6 ! find pte number in pagetable
- sllx %g6, 3, %g6 ! find pte offset in pagetable
- ldxa [%g5 + %g6] ASI_PHYS_USE_EC, %g5! Load pte
+ sllx %g5, 11, %g5
+
+ /* Get PTE offset. */
+ srlx %g4, 13, %g6
+ and %g6, 0x3ff, %g6
+ sllx %g6, 3, %g6
+
+ /* Load PTE. */
+ ldxa [%g5 + %g6] ASI_PHYS_USE_EC, %g5
brgez,pn %g5, longpath
nop
- stxa %g5, [%g0] ASI_DTLB_DATA_IN ! put into tlb
+
+ /* TLB load and return from trap. */
+ stxa %g5, [%g0] ASI_DTLB_DATA_IN
retry
/*
- * On a first level data miss, check whether this is to the OBP range (note that
- * such accesses can be made by prom, as well as by kernel using prom_getproperty
- * on "address"), and if so, do not use vpte access ... rather, use information
- * saved during inherit_prom_mappings() using 8k pagesize.
+ * On a first level data miss, check whether this is to the OBP range (note
+ * that such accesses can be made by prom, as well as by kernel using
+ * prom_getproperty on "address"), and if so, do not use vpte access ...
+ * rather, use information saved during inherit_prom_mappings() using 8k
+ * pagesize.
*/
kvmap:
+ /* Load 0xf0000000, which is LOW_OBP_ADDRESS. */
mov 0xf, %g5
- sllx %g5, 28, %g5 ! Load 0xf0000000
- cmp %g4, %g5 ! Is addr >= LOW_OBP_ADDRESS?
+ sllx %g5, 28, %g5
+
+ /* Is addr >= LOW_OBP_ADDRESS? */
+ cmp %g4, %g5
blu,pn %xcc, vmalloc_addr
mov 0x1, %g5
- sllx %g5, 32, %g5 ! Load 0x100000000
- cmp %g4, %g5 ! Is addr < HI_OBP_ADDRESS?
+
+ /* Load 0x100000000, which is HI_OBP_ADDRESS. */
+ sllx %g5, 32, %g5
+
+ /* Is addr < HI_OBP_ADDRESS? */
+ cmp %g4, %g5
blu,pn %xcc, obp_daddr_patch
nop
-vmalloc_addr: ! vmalloc addr accessed
- ldxa [%g3 + %g6] ASI_N, %g5 ! Yep, load k-vpte
- brgez,pn %g5, longpath ! Valid, load into TLB
+
+vmalloc_addr:
+ /* If we get here, a vmalloc addr accessed, load kernel VPTE. */
+ ldxa [%g3 + %g6] ASI_N, %g5
+ brgez,pn %g5, longpath
nop
+
+ /* PTE is valid, load into TLB and return from trap. */
stxa %g5, [%g0] ASI_DTLB_DATA_IN ! Reload TLB
retry
@@ -199,9 +252,11 @@ do_fpdis:
faddd %f0, %f2, %f4
fmuld %f0, %f2, %f6
ldxa [%g3] ASI_DMMU, %g5
- add %g6, TI_FPREGS + 0xc0, %g2
- stxa %g0, [%g3] ASI_DMMU
+cplus_fptrap_insn_1:
+ sethi %hi(0), %g2
+ stxa %g2, [%g3] ASI_DMMU
membar #Sync
+ add %g6, TI_FPREGS + 0xc0, %g2
faddd %f0, %f2, %f8
fmuld %f0, %f2, %f10
ldda [%g1] ASI_BLK_S, %f32 ! grrr, where is ASI_BLK_NUCLEUS 8-(
@@ -225,7 +280,9 @@ do_fpdis:
fzero %f34
ldxa [%g3] ASI_DMMU, %g5
add %g6, TI_FPREGS, %g1
- stxa %g0, [%g3] ASI_DMMU
+cplus_fptrap_insn_2:
+ sethi %hi(0), %g2
+ stxa %g2, [%g3] ASI_DMMU
membar #Sync
add %g6, TI_FPREGS + 0x40, %g2
faddd %f32, %f34, %f36
@@ -249,9 +306,11 @@ do_fpdis:
3: mov SECONDARY_CONTEXT, %g3
add %g6, TI_FPREGS, %g1
ldxa [%g3] ASI_DMMU, %g5
- mov 0x40, %g2
- stxa %g0, [%g3] ASI_DMMU
+cplus_fptrap_insn_3:
+ sethi %hi(0), %g2
+ stxa %g2, [%g3] ASI_DMMU
membar #Sync
+ mov 0x40, %g2
ldda [%g1] ASI_BLK_S, %f0 ! grrr, where is ASI_BLK_NUCLEUS 8-(
ldda [%g1 + %g2] ASI_BLK_S, %f16
add %g1, 0x80, %g1
@@ -412,10 +471,12 @@ do_fptrap_after_fsr:
rd %gsr, %g3
stx %g3, [%g6 + TI_GSR]
mov SECONDARY_CONTEXT, %g3
- add %g6, TI_FPREGS, %g2
ldxa [%g3] ASI_DMMU, %g5
- stxa %g0, [%g3] ASI_DMMU
+cplus_fptrap_insn_4:
+ sethi %hi(0), %g2
+ stxa %g2, [%g3] ASI_DMMU
membar #Sync
+ add %g6, TI_FPREGS, %g2
andcc %g1, FPRS_DL, %g0
be,pn %icc, 4f
mov 0x40, %g3
@@ -433,6 +494,33 @@ do_fptrap_after_fsr:
ba,pt %xcc, etrap
wr %g0, 0, %fprs
+cplus_fptrap_1:
+ sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2
+
+ .globl cheetah_plus_patch_fpdis
+cheetah_plus_patch_fpdis:
+ /* We configure the dTLB512_0 for 4MB pages and the
+ * dTLB512_1 for 8K pages when in context zero.
+ */
+ sethi %hi(cplus_fptrap_1), %o0
+ lduw [%o0 + %lo(cplus_fptrap_1)], %o1
+
+ set cplus_fptrap_insn_1, %o2
+ stw %o1, [%o2]
+ flush %o2
+ set cplus_fptrap_insn_2, %o2
+ stw %o1, [%o2]
+ flush %o2
+ set cplus_fptrap_insn_3, %o2
+ stw %o1, [%o2]
+ flush %o2
+ set cplus_fptrap_insn_4, %o2
+ stw %o1, [%o2]
+ flush %o2
+
+ retl
+ nop
+
/* The registers for cross calls will be:
*
* DATA 0: [low 32-bits] Address of function to call, jmp to this
@@ -1642,7 +1730,7 @@ ret_from_syscall:
andn %o7, _TIF_NEWCHILD, %l0
stx %l0, [%g6 + TI_FLAGS]
call schedule_tail
- mov %g5, %o0
+ mov %g7, %o0
andcc %l0, _TIF_PERFCTR, %g0
be,pt %icc, 1f
nop
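
(Illustration, not part of the patch: the boot-time self-patching pattern cheetah_plus_patch_fpdis implements, sketched in C. On Cheetah+ the pre-assembled template word at cplus_fptrap_1 is copied over each "sethi %hi(0), %g2" placeholder and the instruction cache line is flushed. The instruction word value and the flushi() helper below are stand-ins, not real encodings or kernel APIs.)

#include <stdio.h>
#include <stdint.h>

static uint32_t cplus_fptrap_1_word = 0x05000000;	/* dummy, not a real encoding */
static uint32_t cplus_fptrap_insn[4];			/* the four patch sites in do_fpdis */

static void flushi(volatile uint32_t *addr)		/* stand-in for the "flush" insn */
{
	(void)addr;
}

int main(void)
{
	uint32_t insn = cplus_fptrap_1_word;		/* lduw [%o0 + %lo(cplus_fptrap_1)], %o1 */
	int i;

	for (i = 0; i < 4; i++) {
		cplus_fptrap_insn[i] = insn;		/* stw   %o1, [%o2] */
		flushi(&cplus_fptrap_insn[i]);		/* flush %o2 */
	}
	printf("patched %d call sites\n", i);
	return 0;
}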
diff --git a/arch/sparc64/kernel/etrap.S b/arch/sparc64/kernel/etrap.S
index d50b755c7e9c3..52cde3a262313 100644
--- a/arch/sparc64/kernel/etrap.S
+++ b/arch/sparc64/kernel/etrap.S
@@ -14,6 +14,7 @@
#include <asm/spitfire.h>
#include <asm/head.h>
#include <asm/processor.h>
+#include <asm/mmu.h>
#define TASK_REGOFF (THREAD_SIZE-TRACEREG_SZ-STACKFRAME_SZ)
#define ETRAP_PSTATE1 (PSTATE_RMO | PSTATE_PRIV)
@@ -67,7 +68,13 @@ etrap_irq:
wrpr %g3, 0, %otherwin
wrpr %g2, 0, %wstate
- stxa %g0, [%l4] ASI_DMMU
+cplus_etrap_insn_1:
+ sethi %hi(0), %g3
+ sllx %g3, 32, %g3
+cplus_etrap_insn_2:
+ sethi %hi(0), %g2
+ or %g3, %g2, %g3
+ stxa %g3, [%l4] ASI_DMMU
flush %l6
wr %g0, ASI_AIUS, %asi
2: wrpr %g0, 0x0, %tl
@@ -95,11 +102,15 @@ etrap_irq:
stx %i7, [%sp + PTREGS_OFF + PT_V9_I7]
wrpr %g0, ETRAP_PSTATE2, %pstate
mov %l6, %g6
+#ifdef CONFIG_SMP
+ ldub [%g6 + TI_CPU], %g3
+ sethi %hi(__per_cpu_offset), %g2
+ or %g2, %lo(__per_cpu_offset), %g2
+ sllx %g3, 3, %g3
+ ldx [%g2 + %g3], %g5
+#endif
jmpl %l2 + 0x4, %g0
ldx [%g6 + TI_TASK], %g4
- nop
- nop
- nop
3: ldub [%l6 + TI_FPDEPTH], %l5
add %l6, TI_FPSAVED + 1, %l4
@@ -207,7 +218,13 @@ scetrap: rdpr %pil, %g2
mov PRIMARY_CONTEXT, %l4
wrpr %g3, 0, %otherwin
wrpr %g2, 0, %wstate
- stxa %g0, [%l4] ASI_DMMU
+cplus_etrap_insn_3:
+ sethi %hi(0), %g3
+ sllx %g3, 32, %g3
+cplus_etrap_insn_4:
+ sethi %hi(0), %g2
+ or %g3, %g2, %g3
+ stxa %g3, [%l4] ASI_DMMU
flush %l6
mov ASI_AIUS, %l7
@@ -241,11 +258,50 @@ scetrap: rdpr %pil, %g2
stx %i6, [%sp + PTREGS_OFF + PT_V9_I6]
mov %l6, %g6
stx %i7, [%sp + PTREGS_OFF + PT_V9_I7]
+#ifdef CONFIG_SMP
+ ldub [%g6 + TI_CPU], %g3
+ sethi %hi(__per_cpu_offset), %g2
+ or %g2, %lo(__per_cpu_offset), %g2
+ sllx %g3, 3, %g3
+ ldx [%g2 + %g3], %g5
+#endif
ldx [%g6 + TI_TASK], %g4
done
- nop
- nop
#undef TASK_REGOFF
#undef ETRAP_PSTATE1
-#undef ETRAP_PSTATE2
+
+cplus_einsn_1:
+ sethi %uhi(CTX_CHEETAH_PLUS_NUC), %g3
+cplus_einsn_2:
+ sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2
+
+ .globl cheetah_plus_patch_etrap
+cheetah_plus_patch_etrap:
+ /* We configure the dTLB512_0 for 4MB pages and the
+ * dTLB512_1 for 8K pages when in context zero.
+ */
+ sethi %hi(cplus_einsn_1), %o0
+ sethi %hi(cplus_etrap_insn_1), %o2
+ lduw [%o0 + %lo(cplus_einsn_1)], %o1
+ or %o2, %lo(cplus_etrap_insn_1), %o2
+ stw %o1, [%o2]
+ flush %o2
+ sethi %hi(cplus_etrap_insn_3), %o2
+ or %o2, %lo(cplus_etrap_insn_3), %o2
+ stw %o1, [%o2]
+ flush %o2
+
+ sethi %hi(cplus_einsn_2), %o0
+ sethi %hi(cplus_etrap_insn_2), %o2
+ lduw [%o0 + %lo(cplus_einsn_2)], %o1
+ or %o2, %lo(cplus_etrap_insn_2), %o2
+ stw %o1, [%o2]
+ flush %o2
+ sethi %hi(cplus_etrap_insn_4), %o2
+ or %o2, %lo(cplus_etrap_insn_4), %o2
+ stw %o1, [%o2]
+ flush %o2
+
+ retl
+ nop
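
(Illustration, not part of the patch: what the CONFIG_SMP block added to etrap/scetrap computes. %g6 holds the current thread_info, and %g5 is loaded with this CPU's entry from __per_cpu_offset so trap-time code can reach its per-cpu area. The struct layout and the offset values below are simplified stand-ins.)

#include <stdio.h>

struct thread_info { unsigned int cpu; };		/* only the field used here */

static unsigned long __per_cpu_offset[4] = {		/* made-up example offsets */
	0x0, 0x10000, 0x20000, 0x30000
};

/* ldub  [%g6 + TI_CPU], %g3
 * sethi %hi(__per_cpu_offset), %g2 ; or %g2, %lo(__per_cpu_offset), %g2
 * sllx  %g3, 3, %g3 ; ldx [%g2 + %g3], %g5
 */
static unsigned long percpu_base(const struct thread_info *ti)
{
	return __per_cpu_offset[ti->cpu];
}

int main(void)
{
	struct thread_info ti = { .cpu = 2 };

	printf("%%g5 = %#lx\n", percpu_base(&ti));
	return 0;
}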
diff --git a/arch/sparc64/kernel/head.S b/arch/sparc64/kernel/head.S
index 4a286a8000b07..954093551597f 100644
--- a/arch/sparc64/kernel/head.S
+++ b/arch/sparc64/kernel/head.S
@@ -25,6 +25,7 @@
#include <asm/dcu.h>
#include <asm/head.h>
#include <asm/ttable.h>
+#include <asm/mmu.h>
 /* This section from _start to sparc64_boot_end should fit into
* 0x0000.0000.0040.4000 to 0x0000.0000.0040.8000 and will be sharing space
@@ -88,8 +89,8 @@ sparc_ramdisk_image64:
* PROM entry point is on %o4
*/
sparc64_boot:
- BRANCH_IF_CHEETAH_BASE(g1,g5,cheetah_boot)
- BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g5,cheetah_plus_boot)
+ BRANCH_IF_CHEETAH_BASE(g1,g7,cheetah_boot)
+ BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,cheetah_plus_boot)
ba,pt %xcc, spitfire_boot
nop
@@ -102,11 +103,11 @@ cheetah_boot:
mov DCR_BPE | DCR_RPE | DCR_SI | DCR_IFPOE | DCR_MS, %g1
wr %g1, %asr18
- sethi %uhi(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g5
- or %g5, %ulo(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g5
- sllx %g5, 32, %g5
- or %g5, DCU_DM | DCU_IM | DCU_DC | DCU_IC, %g5
- stxa %g5, [%g0] ASI_DCU_CONTROL_REG
+ sethi %uhi(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g7
+ or %g7, %ulo(DCU_ME|DCU_RE|DCU_HPE|DCU_SPE|DCU_SL|DCU_WE), %g7
+ sllx %g7, 32, %g7
+ or %g7, DCU_DM | DCU_IM | DCU_DC | DCU_IC, %g7
+ stxa %g7, [%g0] ASI_DCU_CONTROL_REG
membar #Sync
cheetah_generic_boot:
@@ -491,7 +492,7 @@ sun4u_init:
stxa %g3, [%g2] ASI_DMMU
membar #Sync
- BRANCH_IF_ANY_CHEETAH(g1,g5,cheetah_tlb_fixup)
+ BRANCH_IF_ANY_CHEETAH(g1,g7,cheetah_tlb_fixup)
ba,pt %xcc, spitfire_tlb_fixup
nop
@@ -515,14 +516,31 @@ cheetah_tlb_fixup:
membar #Sync
mov 2, %g2 /* Set TLB type to cheetah+. */
- BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g5,g7,1f)
+ BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,1f)
mov 1, %g2 /* Set TLB type to cheetah. */
-1: sethi %hi(tlb_type), %g5
- stw %g2, [%g5 + %lo(tlb_type)]
+1: sethi %hi(tlb_type), %g1
+ stw %g2, [%g1 + %lo(tlb_type)]
- /* Patch copy/page operations to cheetah optimized versions. */
+ BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,1f)
+ ba,pt %xcc, 2f
+ nop
+
+1: /* Patch context register writes to support nucleus page
+ * size correctly.
+ */
+ call cheetah_plus_patch_etrap
+ nop
+ call cheetah_plus_patch_rtrap
+ nop
+ call cheetah_plus_patch_fpdis
+ nop
+ call cheetah_plus_patch_winfixup
+ nop
+
+
+2: /* Patch copy/page operations to cheetah optimized versions. */
call cheetah_patch_copyops
nop
call cheetah_patch_cachetlbops
@@ -549,8 +567,8 @@ spitfire_tlb_fixup:
/* Set TLB type to spitfire. */
mov 0, %g2
- sethi %hi(tlb_type), %g5
- stw %g2, [%g5 + %lo(tlb_type)]
+ sethi %hi(tlb_type), %g1
+ stw %g2, [%g1 + %lo(tlb_type)]
tlb_fixup_done:
sethi %hi(init_thread_union), %g6
@@ -578,12 +596,18 @@ tlb_fixup_done:
#endif
wr %g0, ASI_P, %asi
- mov 1, %g5
- sllx %g5, THREAD_SHIFT, %g5
- sub %g5, (STACKFRAME_SZ + STACK_BIAS), %g5
- add %g6, %g5, %sp
+ mov 1, %g1
+ sllx %g1, THREAD_SHIFT, %g1
+ sub %g1, (STACKFRAME_SZ + STACK_BIAS), %g1
+ add %g6, %g1, %sp
mov 0, %fp
+ /* Set per-cpu pointer initially to zero, this makes
+ * the boot-cpu use the in-kernel-image per-cpu areas
+ * before setup_per_cpu_area() is invoked.
+ */
+ clr %g5
+
wrpr %g0, 0, %wstate
wrpr %g0, 0x0, %tl
@@ -619,8 +643,8 @@ setup_tba: /* i0 = is_starfire */
rdpr %pstate, %o1
mov %g6, %o2
wrpr %o1, (PSTATE_AG|PSTATE_IE), %pstate
- sethi %hi(sparc64_ttable_tl0), %g5
- wrpr %g5, %tba
+ sethi %hi(sparc64_ttable_tl0), %g1
+ wrpr %g1, %tba
mov %o2, %g6
/* Set up MMU globals */
@@ -685,10 +709,23 @@ spitfire_vpte_base:
call init_irqwork_curcpu
nop
- sethi %hi(sparc64_ttable_tl0), %g5
call prom_set_trap_table
- mov %g5, %o0
+ sethi %hi(sparc64_ttable_tl0), %o0
+
+ BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g2,g3,1f)
+ ba,pt %xcc, 2f
+ nop
+1: /* Start using proper page size encodings in ctx register. */
+ sethi %uhi(CTX_CHEETAH_PLUS_NUC), %g3
+ mov PRIMARY_CONTEXT, %g1
+ sllx %g3, 32, %g3
+ sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2
+ or %g3, %g2, %g3
+ stxa %g3, [%g1] ASI_DMMU
+ membar #Sync
+
+2:
rdpr %pstate, %o1
or %o1, PSTATE_IE, %o1
wrpr %o1, 0, %pstate
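
(Illustration, not part of the patch: the context value the 1:/2: sequence above, and its twin in trampoline.S, composes before writing PRIMARY_CONTEXT. It is the same CTX_CHEETAH_PLUS_NUC | CTX_CHEETAH_PLUS_CTX0 value that setup.c's prom_callback() now uses; the bit positions below are placeholders, the real definitions live in the new include/asm-sparc64/mmu.h.)

#include <stdio.h>

#define CTX_CHEETAH_PLUS_NUC	(0x1UL << 61)	/* placeholder bit position */
#define CTX_CHEETAH_PLUS_CTX0	(0x1UL << 16)	/* placeholder bit position */

int main(void)
{
	unsigned long kernel_pctx = CTX_CHEETAH_PLUS_NUC | CTX_CHEETAH_PLUS_CTX0;

	/* head.S / trampoline.S: sethi %uhi(NUC) ; sllx 32 ; or with %hi(CTX0) ;
	 * then stxa %g3, [PRIMARY_CONTEXT] ASI_DMMU ; membar #Sync
	 */
	printf("kernel primary context = %#lx\n", kernel_pctx);
	return 0;
}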
diff --git a/arch/sparc64/kernel/rtrap.S b/arch/sparc64/kernel/rtrap.S
index b7c3277bb92ac..e917752080062 100644
--- a/arch/sparc64/kernel/rtrap.S
+++ b/arch/sparc64/kernel/rtrap.S
@@ -222,8 +222,9 @@ rt_continue: ldx [%sp + PTREGS_OFF + PT_V9_G1], %g1
ldx [%sp + PTREGS_OFF + PT_V9_G3], %g3
ldx [%sp + PTREGS_OFF + PT_V9_G4], %g4
- ldx [%sp + PTREGS_OFF + PT_V9_G5], %g5
- ldx [%sp + PTREGS_OFF + PT_V9_G6], %g6
+ brz,a,pn %l3, 1f
+ ldx [%sp + PTREGS_OFF + PT_V9_G5], %g5
+1: ldx [%sp + PTREGS_OFF + PT_V9_G6], %g6
ldx [%sp + PTREGS_OFF + PT_V9_G7], %g7
wrpr %g0, RTRAP_PSTATE_AG_IRQOFF, %pstate
ldx [%sp + PTREGS_OFF + PT_V9_I0], %i0
@@ -250,6 +251,10 @@ rt_continue: ldx [%sp + PTREGS_OFF + PT_V9_G1], %g1
brnz,pn %l3, kern_rtt
mov PRIMARY_CONTEXT, %l7
ldxa [%l7 + %l7] ASI_DMMU, %l0
+cplus_rtrap_insn_1:
+ sethi %hi(0), %l1
+ sllx %l1, 32, %l1
+ or %l0, %l1, %l0
stxa %l0, [%l7] ASI_DMMU
flush %g6
rdpr %wstate, %l1
@@ -298,10 +303,10 @@ kern_fpucheck: ldub [%g6 + TI_FPDEPTH], %l5
andcc %l2, FPRS_FEF, %g0
be,pn %icc, 5f
sll %o0, 3, %o5
- rd %fprs, %g5
+ rd %fprs, %g1
- wr %g5, FPRS_FEF, %fprs
- ldx [%o1 + %o5], %g5
+ wr %g1, FPRS_FEF, %fprs
+ ldx [%o1 + %o5], %g1
add %g6, TI_XFSR, %o1
membar #StoreLoad | #LoadLoad
sll %o0, 8, %o2
@@ -313,7 +318,7 @@ kern_fpucheck: ldub [%g6 + TI_FPDEPTH], %l5
ldda [%o4 + %o2] ASI_BLK_P, %f16
1: andcc %l2, FPRS_DU, %g0
be,pn %icc, 1f
- wr %g5, 0, %gsr
+ wr %g1, 0, %gsr
add %o2, 0x80, %o2
ldda [%o3 + %o2] ASI_BLK_P, %f32
ldda [%o4 + %o2] ASI_BLK_P, %f48
@@ -335,3 +340,21 @@ kern_fpucheck: ldub [%g6 + TI_FPDEPTH], %l5
wr %g0, FPRS_DU, %fprs
ba,pt %xcc, rt_continue
stb %l5, [%g6 + TI_FPDEPTH]
+
+cplus_rinsn_1:
+ sethi %uhi(CTX_CHEETAH_PLUS_NUC), %l1
+
+ .globl cheetah_plus_patch_rtrap
+cheetah_plus_patch_rtrap:
+ /* We configure the dTLB512_0 for 4MB pages and the
+ * dTLB512_1 for 8K pages when in context zero.
+ */
+ sethi %hi(cplus_rinsn_1), %o0
+ sethi %hi(cplus_rtrap_insn_1), %o2
+ lduw [%o0 + %lo(cplus_rinsn_1)], %o1
+ or %o2, %lo(cplus_rtrap_insn_1), %o2
+ stw %o1, [%o2]
+ flush %o2
+
+ retl
+ nop
diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c
index 9ddfcb9a19001..63496c43fe173 100644
--- a/arch/sparc64/kernel/semaphore.c
+++ b/arch/sparc64/kernel/semaphore.c
@@ -65,30 +65,25 @@ void up(struct semaphore *sem)
__asm__ __volatile__("\n"
" ! up sem(%0)\n"
" membar #StoreLoad | #LoadLoad\n"
-"1: lduw [%0], %%g5\n"
-" add %%g5, 1, %%g7\n"
-" cas [%0], %%g5, %%g7\n"
-" cmp %%g5, %%g7\n"
+"1: lduw [%0], %%g1\n"
+" add %%g1, 1, %%g7\n"
+" cas [%0], %%g1, %%g7\n"
+" cmp %%g1, %%g7\n"
" bne,pn %%icc, 1b\n"
" addcc %%g7, 1, %%g0\n"
" ble,pn %%icc, 3f\n"
" membar #StoreLoad | #StoreStore\n"
"2:\n"
" .subsection 2\n"
-"3: mov %0, %%g5\n"
+"3: mov %0, %%g1\n"
" save %%sp, -160, %%sp\n"
-" mov %%g1, %%l1\n"
-" mov %%g2, %%l2\n"
-" mov %%g3, %%l3\n"
" call %1\n"
-" mov %%g5, %%o0\n"
-" mov %%l1, %%g1\n"
-" mov %%l2, %%g2\n"
+" mov %%g1, %%o0\n"
" ba,pt %%xcc, 2b\n"
-" restore %%l3, %%g0, %%g3\n"
+" restore\n"
" .previous\n"
: : "r" (sem), "i" (__up)
- : "g5", "g7", "memory", "cc");
+ : "g1", "g2", "g3", "g7", "memory", "cc");
}
static void __sched __down(struct semaphore * sem)
@@ -127,30 +122,25 @@ void __sched down(struct semaphore *sem)
__asm__ __volatile__("\n"
" ! down sem(%0)\n"
-"1: lduw [%0], %%g5\n"
-" sub %%g5, 1, %%g7\n"
-" cas [%0], %%g5, %%g7\n"
-" cmp %%g5, %%g7\n"
+"1: lduw [%0], %%g1\n"
+" sub %%g1, 1, %%g7\n"
+" cas [%0], %%g1, %%g7\n"
+" cmp %%g1, %%g7\n"
" bne,pn %%icc, 1b\n"
" cmp %%g7, 1\n"
" bl,pn %%icc, 3f\n"
" membar #StoreLoad | #StoreStore\n"
"2:\n"
" .subsection 2\n"
-"3: mov %0, %%g5\n"
+"3: mov %0, %%g1\n"
" save %%sp, -160, %%sp\n"
-" mov %%g1, %%l1\n"
-" mov %%g2, %%l2\n"
-" mov %%g3, %%l3\n"
" call %1\n"
-" mov %%g5, %%o0\n"
-" mov %%l1, %%g1\n"
-" mov %%l2, %%g2\n"
+" mov %%g1, %%o0\n"
" ba,pt %%xcc, 2b\n"
-" restore %%l3, %%g0, %%g3\n"
+" restore\n"
" .previous\n"
: : "r" (sem), "i" (__down)
- : "g5", "g7", "memory", "cc");
+ : "g1", "g2", "g3", "g7", "memory", "cc");
}
int down_trylock(struct semaphore *sem)
@@ -175,20 +165,20 @@ int down_trylock(struct semaphore *sem)
__asm__ __volatile__("\n"
" ! down_trylock sem(%1) ret(%0)\n"
-"1: lduw [%1], %%g5\n"
-" sub %%g5, 1, %%g7\n"
-" cmp %%g5, 1\n"
+"1: lduw [%1], %%g1\n"
+" sub %%g1, 1, %%g7\n"
+" cmp %%g1, 1\n"
" bl,pn %%icc, 2f\n"
" mov 1, %0\n"
-" cas [%1], %%g5, %%g7\n"
-" cmp %%g5, %%g7\n"
+" cas [%1], %%g1, %%g7\n"
+" cmp %%g1, %%g7\n"
" bne,pn %%icc, 1b\n"
" mov 0, %0\n"
" membar #StoreLoad | #StoreStore\n"
"2:\n"
: "=&r" (ret)
: "r" (sem)
- : "g5", "g7", "memory", "cc");
+ : "g1", "g7", "memory", "cc");
return ret;
}
@@ -237,31 +227,25 @@ int __sched down_interruptible(struct semaphore *sem)
__asm__ __volatile__("\n"
" ! down_interruptible sem(%2) ret(%0)\n"
-"1: lduw [%2], %%g5\n"
-" sub %%g5, 1, %%g7\n"
-" cas [%2], %%g5, %%g7\n"
-" cmp %%g5, %%g7\n"
+"1: lduw [%2], %%g1\n"
+" sub %%g1, 1, %%g7\n"
+" cas [%2], %%g1, %%g7\n"
+" cmp %%g1, %%g7\n"
" bne,pn %%icc, 1b\n"
" cmp %%g7, 1\n"
" bl,pn %%icc, 3f\n"
" membar #StoreLoad | #StoreStore\n"
"2:\n"
" .subsection 2\n"
-"3: mov %2, %%g5\n"
+"3: mov %2, %%g1\n"
" save %%sp, -160, %%sp\n"
-" mov %%g1, %%l1\n"
-" mov %%g2, %%l2\n"
-" mov %%g3, %%l3\n"
" call %3\n"
-" mov %%g5, %%o0\n"
-" mov %%l1, %%g1\n"
-" mov %%l2, %%g2\n"
-" mov %%l3, %%g3\n"
+" mov %%g1, %%o0\n"
" ba,pt %%xcc, 2b\n"
-" restore %%o0, %%g0, %0\n"
+" restore\n"
" .previous\n"
: "=r" (ret)
: "0" (ret), "r" (sem), "i" (__down_interruptible)
- : "g5", "g7", "memory", "cc");
+ : "g1", "g2", "g3", "g7", "memory", "cc");
return ret;
}
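
(Illustration, not part of the patch: the fast path the rewritten down() asm above implements, sketched with C11 atomics. atomic_compare_exchange_weak stands in for the SPARC "cas" instruction, the membars are omitted, and __down_slowpath() is a stub for the existing out-of-line __down().)

#include <stdatomic.h>
#include <stdio.h>

struct semaphore { _Atomic int count; };

static void __down_slowpath(struct semaphore *sem)	/* stub for __down() */
{
	(void)sem;
	puts("contended: would sleep in __down()");
}

static void down_sketch(struct semaphore *sem)
{
	int old, new;

	do {						/* 1: lduw [%0], %%g1       */
		old = atomic_load(&sem->count);		/*    sub  %%g1, 1, %%g7    */
		new = old - 1;				/*    cas  [%0], %%g1, %%g7 */
	} while (!atomic_compare_exchange_weak(&sem->count, &old, new));

	if (old < 1)					/* cmp %%g7, 1; bl,pn %%icc, 3f */
		__down_slowpath(sem);
}

int main(void)
{
	struct semaphore sem = { 1 };

	down_sketch(&sem);	/* count 1 -> 0, uncontended */
	down_sketch(&sem);	/* count 0 -> -1, slow path  */
	return 0;
}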
diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c
index 0c9ce2bb5100a..12c3d84b7460c 100644
--- a/arch/sparc64/kernel/setup.c
+++ b/arch/sparc64/kernel/setup.c
@@ -47,6 +47,7 @@
#include <asm/timer.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/mmu.h>
#ifdef CONFIG_IP_PNP
#include <net/ipconfig.h>
@@ -157,11 +158,11 @@ int prom_callback(long *args)
for_each_process(p) {
mm = p->mm;
- if (CTX_HWBITS(mm->context) == ctx)
+ if (CTX_NRBITS(mm->context) == ctx)
break;
}
if (!mm ||
- CTX_HWBITS(mm->context) != ctx)
+ CTX_NRBITS(mm->context) != ctx)
goto done;
pgdp = pgd_offset(mm, va);
@@ -187,12 +188,19 @@ int prom_callback(long *args)
}
if ((va >= KERNBASE) && (va < (KERNBASE + (4 * 1024 * 1024)))) {
+ unsigned long kernel_pctx = 0;
+
+ if (tlb_type == cheetah_plus)
+ kernel_pctx |= (CTX_CHEETAH_PLUS_NUC |
+ CTX_CHEETAH_PLUS_CTX0);
+
/* Spitfire Errata #32 workaround */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
- : "r" (0),
- "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
+ : "r" (kernel_pctx),
+ "r" (PRIMARY_CONTEXT),
+ "i" (ASI_DMMU));
/*
* Locked down tlb entry.
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 1441ef81b8abe..6550d981b450c 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -89,7 +89,6 @@ void __init smp_store_cpu_info(int id)
cpu_data(id).pgcache_size = 0;
cpu_data(id).pte_cache[0] = NULL;
cpu_data(id).pte_cache[1] = NULL;
- cpu_data(id).pgdcache_size = 0;
cpu_data(id).pgd_cache = NULL;
cpu_data(id).idle_volume = 1;
}
@@ -108,6 +107,10 @@ void __init smp_callin(void)
__flush_tlb_all();
+ __asm__ __volatile__("mov %0, %%g5\n\t"
+ : /* no outputs */
+ : "r" (__per_cpu_offset[cpuid]));
+
smp_setup_percpu_timer();
local_irq_enable();
@@ -627,7 +630,10 @@ extern unsigned long xcall_flush_tlb_all_spitfire;
extern unsigned long xcall_flush_tlb_all_cheetah;
extern unsigned long xcall_report_regs;
extern unsigned long xcall_receive_signal;
+
+#ifdef DCACHE_ALIASING_POSSIBLE
extern unsigned long xcall_flush_dcache_page_cheetah;
+#endif
extern unsigned long xcall_flush_dcache_page_spitfire;
#ifdef CONFIG_DEBUG_DCFLUSH
@@ -637,7 +643,7 @@ extern atomic_t dcpage_flushes_xcall;
static __inline__ void __local_flush_dcache_page(struct page *page)
{
-#if (L1DCACHE_SIZE > PAGE_SIZE)
+#ifdef DCACHE_ALIASING_POSSIBLE
__flush_dcache_page(page_address(page),
((tlb_type == spitfire) &&
page_mapping(page) != NULL));
@@ -672,11 +678,13 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
(u64) pg_addr,
mask);
} else {
+#ifdef DCACHE_ALIASING_POSSIBLE
data0 =
((u64)&xcall_flush_dcache_page_cheetah);
cheetah_xcall_deliver(data0,
__pa(pg_addr),
0, mask);
+#endif
}
#ifdef CONFIG_DEBUG_DCFLUSH
atomic_inc(&dcpage_flushes_xcall);
@@ -709,10 +717,12 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
(u64) pg_addr,
mask);
} else {
+#ifdef DCACHE_ALIASING_POSSIBLE
data0 = ((u64)&xcall_flush_dcache_page_cheetah);
cheetah_xcall_deliver(data0,
__pa(pg_addr),
0, mask);
+#endif
}
#ifdef CONFIG_DEBUG_DCFLUSH
atomic_inc(&dcpage_flushes_xcall);
@@ -1055,74 +1065,6 @@ void __init smp_tick_init(void)
prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
}
-extern unsigned long cheetah_tune_scheduling(void);
-
-static void __init smp_tune_scheduling(void)
-{
- unsigned long orig_flush_base, flush_base, flags, *p;
- unsigned int ecache_size, order;
- cycles_t tick1, tick2, raw;
- int cpu_node;
-
- /* Approximate heuristic for SMP scheduling. It is an
- * estimation of the time it takes to flush the L2 cache
- * on the local processor.
- *
- * The ia32 chooses to use the L1 cache flush time instead,
- * and I consider this complete nonsense. The Ultra can service
- * a miss to the L1 with a hit to the L2 in 7 or 8 cycles, and
- * L2 misses are what create extra bus traffic (ie. the "cost"
- * of moving a process from one cpu to another).
- */
- printk("SMP: Calibrating ecache flush... ");
- if (tlb_type == cheetah || tlb_type == cheetah_plus)
- return;
-
- cpu_find_by_instance(0, &cpu_node, NULL);
- ecache_size = prom_getintdefault(cpu_node,
- "ecache-size", (512 * 1024));
- if (ecache_size > (4 * 1024 * 1024))
- ecache_size = (4 * 1024 * 1024);
- orig_flush_base = flush_base =
- __get_free_pages(GFP_KERNEL, order = get_order(ecache_size));
-
- if (flush_base != 0UL) {
- local_irq_save(flags);
-
- /* Scan twice the size once just to get the TLB entries
- * loaded and make sure the second scan measures pure misses.
- */
- for (p = (unsigned long *)flush_base;
- ((unsigned long)p) < (flush_base + (ecache_size<<1));
- p += (64 / sizeof(unsigned long)))
- *((volatile unsigned long *)p);
-
- tick1 = tick_ops->get_tick();
-
- __asm__ __volatile__("1:\n\t"
- "ldx [%0 + 0x000], %%g1\n\t"
- "ldx [%0 + 0x040], %%g2\n\t"
- "ldx [%0 + 0x080], %%g3\n\t"
- "ldx [%0 + 0x0c0], %%g5\n\t"
- "add %0, 0x100, %0\n\t"
- "cmp %0, %2\n\t"
- "bne,pt %%xcc, 1b\n\t"
- " nop"
- : "=&r" (flush_base)
- : "0" (flush_base),
- "r" (flush_base + ecache_size)
- : "g1", "g2", "g3", "g5");
-
- tick2 = tick_ops->get_tick();
-
- local_irq_restore(flags);
-
- raw = (tick2 - tick1);
-
- free_pages(orig_flush_base, order);
- }
-}
-
/* /proc/profile writes can call this, don't __init it please. */
static DEFINE_SPINLOCK(prof_setup_lock);
@@ -1177,6 +1119,11 @@ void __devinit smp_prepare_boot_cpu(void)
}
current_thread_info()->cpu = hard_smp_processor_id();
+
+ __asm__ __volatile__("mov %0, %%g5\n\t"
+ : /* no outputs */
+ : "r" (__per_cpu_offset[smp_processor_id()]));
+
cpu_set(smp_processor_id(), cpu_online_map);
cpu_set(smp_processor_id(), phys_cpu_present_map);
}
@@ -1212,11 +1159,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
(long) num_online_cpus(),
bogosum/(500000/HZ),
(bogosum/(5000/HZ))%100);
-
- /* We want to run this with all the other cpus spinning
- * in the kernel.
- */
- smp_tune_scheduling();
}
/* This needn't do anything as we do not sleep the cpu
diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c
index 3cec1ebb083b0..cad5a11228006 100644
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -59,6 +59,7 @@
#include <asm/ns87303.h>
#include <asm/timer.h>
#include <asm/cpudata.h>
+#include <asm/rwsem.h>
struct poll {
int fd;
@@ -174,6 +175,15 @@ EXPORT_SYMBOL(down_trylock);
EXPORT_SYMBOL(down_interruptible);
EXPORT_SYMBOL(up);
+/* RW semaphores */
+EXPORT_SYMBOL(__down_read);
+EXPORT_SYMBOL(__down_read_trylock);
+EXPORT_SYMBOL(__down_write);
+EXPORT_SYMBOL(__down_write_trylock);
+EXPORT_SYMBOL(__up_read);
+EXPORT_SYMBOL(__up_write);
+EXPORT_SYMBOL(__downgrade_write);
+
/* Atomic counter implementation. */
EXPORT_SYMBOL(atomic_add);
EXPORT_SYMBOL(atomic_add_ret);
@@ -209,8 +219,11 @@ EXPORT_SYMBOL(__flushw_user);
EXPORT_SYMBOL(tlb_type);
EXPORT_SYMBOL(get_fb_unmapped_area);
EXPORT_SYMBOL(flush_icache_range);
+
EXPORT_SYMBOL(flush_dcache_page);
+#ifdef DCACHE_ALIASING_POSSIBLE
EXPORT_SYMBOL(__flush_dcache_range);
+#endif
EXPORT_SYMBOL(mostek_lock);
EXPORT_SYMBOL(mstk48t02_regs);
@@ -350,7 +363,9 @@ EXPORT_SYMBOL(__memset);
EXPORT_SYMBOL(memchr);
EXPORT_SYMBOL(csum_partial);
-EXPORT_SYMBOL(csum_partial_copy_sparc64);
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
+EXPORT_SYMBOL(__csum_partial_copy_from_user);
+EXPORT_SYMBOL(__csum_partial_copy_to_user);
EXPORT_SYMBOL(ip_fast_csum);
/* Moving data to/from/in userspace. */
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index a9fa9a47074d8..567c91c77b20e 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -264,7 +264,7 @@ asmlinkage long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compa
switch (call) {
case SEMTIMEDOP:
- if (third)
+ if (fifth)
/* sign extend semid */
return compat_sys_semtimedop((int)first,
compat_ptr(ptr), second,
diff --git a/arch/sparc64/kernel/trampoline.S b/arch/sparc64/kernel/trampoline.S
index f1d764b2d39b5..2c8f9344b4eea 100644
--- a/arch/sparc64/kernel/trampoline.S
+++ b/arch/sparc64/kernel/trampoline.S
@@ -15,6 +15,7 @@
#include <asm/spitfire.h>
#include <asm/processor.h>
#include <asm/thread_info.h>
+#include <asm/mmu.h>
.data
.align 8
@@ -334,6 +335,20 @@ do_unlock:
call init_irqwork_curcpu
nop
+ BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g2,g3,1f)
+ ba,pt %xcc, 2f
+ nop
+
+1: /* Start using proper page size encodings in ctx register. */
+ sethi %uhi(CTX_CHEETAH_PLUS_NUC), %g3
+ mov PRIMARY_CONTEXT, %g1
+ sllx %g3, 32, %g3
+ sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2
+ or %g3, %g2, %g3
+ stxa %g3, [%g1] ASI_DMMU
+ membar #Sync
+
+2:
rdpr %pstate, %o1
or %o1, PSTATE_IE, %o1
wrpr %o1, 0, %pstate
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index 7d0e96f00bd00..56b203a2af696 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -806,48 +806,6 @@ static void cheetah_flush_ecache_line(unsigned long physaddr)
"i" (ASI_PHYS_USE_EC));
}
-#ifdef CONFIG_SMP
-unsigned long __init cheetah_tune_scheduling(void)
-{
- unsigned long tick1, tick2, raw;
- unsigned long flush_base = ecache_flush_physbase;
- unsigned long flush_linesize = ecache_flush_linesize;
- unsigned long flush_size = ecache_flush_size;
-
- /* Run through the whole cache to guarantee the timed loop
- * is really displacing cache lines.
- */
- __asm__ __volatile__("1: subcc %0, %4, %0\n\t"
- " bne,pt %%xcc, 1b\n\t"
- " ldxa [%2 + %0] %3, %%g0\n\t"
- : "=&r" (flush_size)
- : "0" (flush_size), "r" (flush_base),
- "i" (ASI_PHYS_USE_EC), "r" (flush_linesize));
-
- /* The flush area is 2 X Ecache-size, so cut this in half for
- * the timed loop.
- */
- flush_base = ecache_flush_physbase;
- flush_linesize = ecache_flush_linesize;
- flush_size = ecache_flush_size >> 1;
-
- tick1 = tick_ops->get_tick();
-
- __asm__ __volatile__("1: subcc %0, %4, %0\n\t"
- " bne,pt %%xcc, 1b\n\t"
- " ldxa [%2 + %0] %3, %%g0\n\t"
- : "=&r" (flush_size)
- : "0" (flush_size), "r" (flush_base),
- "i" (ASI_PHYS_USE_EC), "r" (flush_linesize));
-
- tick2 = tick_ops->get_tick();
-
- raw = (tick2 - tick1);
-
- return (raw - (raw >> 2));
-}
-#endif
-
/* Unfortunately, the diagnostic access to the I-cache tags we need to
* use to clear the thing interferes with I-cache coherency transactions.
*
diff --git a/arch/sparc64/kernel/unaligned.c b/arch/sparc64/kernel/unaligned.c
index 8a9d3b6bfe5c9..4372bf32ecf6f 100644
--- a/arch/sparc64/kernel/unaligned.c
+++ b/arch/sparc64/kernel/unaligned.c
@@ -379,8 +379,8 @@ void kernel_mna_trap_fault(struct pt_regs *regs, unsigned int insn)
printk(KERN_ALERT "Unable to handle kernel paging request in mna handler");
printk(KERN_ALERT " at virtual address %016lx\n",address);
printk(KERN_ALERT "current->{mm,active_mm}->context = %016lx\n",
- (current->mm ? current->mm->context :
- current->active_mm->context));
+ (current->mm ? CTX_HWBITS(current->mm->context) :
+ CTX_HWBITS(current->active_mm->context)));
printk(KERN_ALERT "current->{mm,active_mm}->pgd = %016lx\n",
(current->mm ? (unsigned long) current->mm->pgd :
(unsigned long) current->active_mm->pgd));
@@ -413,7 +413,7 @@ asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn, u
:
: "r" (regs), "r" (insn)
: "o0", "o1", "o2", "o3", "o4", "o5", "o7",
- "g1", "g2", "g3", "g4", "g5", "g7", "cc");
+ "g1", "g2", "g3", "g4", "g7", "cc");
} else {
unsigned long addr = compute_effective_address(regs, insn, ((insn >> 25) & 0x1f));
diff --git a/arch/sparc64/kernel/winfixup.S b/arch/sparc64/kernel/winfixup.S
index 3427d7a743e1f..ca9891a8dad82 100644
--- a/arch/sparc64/kernel/winfixup.S
+++ b/arch/sparc64/kernel/winfixup.S
@@ -14,6 +14,25 @@
#include <asm/thread_info.h>
.text
+
+set_pcontext:
+cplus_winfixup_insn_1:
+ sethi %hi(0), %l1
+ mov PRIMARY_CONTEXT, %g1
+ sllx %l1, 32, %l1
+cplus_winfixup_insn_2:
+ sethi %hi(0), %g2
+ or %l1, %g2, %l1
+ stxa %l1, [%g1] ASI_DMMU
+ flush %g6
+ retl
+ nop
+
+cplus_wfinsn_1:
+ sethi %uhi(CTX_CHEETAH_PLUS_NUC), %l1
+cplus_wfinsn_2:
+ sethi %hi(CTX_CHEETAH_PLUS_CTX0), %g2
+
.align 32
/* Here are the rules, pay attention.
@@ -62,9 +81,8 @@ fill_fixup:
wrpr %g0, 0x0, %canrestore ! Standard etrap stuff.
wrpr %g2, 0x0, %wstate ! This must be consistent.
wrpr %g0, 0x0, %otherwin ! We know this.
- mov PRIMARY_CONTEXT, %g1 ! Change contexts...
- stxa %g0, [%g1] ASI_DMMU ! Back into the nucleus.
- flush %g6 ! Flush instruction buffers
+ call set_pcontext ! Change contexts...
+ nop
rdpr %pstate, %l1 ! Prepare to change globals.
mov %g6, %o7 ! Get current.
@@ -75,6 +93,13 @@ fill_fixup:
wrpr %l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate
mov %o7, %g6
ldx [%g6 + TI_TASK], %g4
+#ifdef CONFIG_SMP
+ ldub [%g6 + TI_CPU], %g1
+ sethi %hi(__per_cpu_offset), %g2
+ or %g2, %lo(__per_cpu_offset), %g2
+ sllx %g1, 3, %g1
+ ldx [%g2 + %g1], %g5
+#endif
/* This is the same as below, except we handle this a bit special
* since we must preserve %l5 and %l6, see comment above.
@@ -183,9 +208,8 @@ fill_fixup_mna:
wrpr %g2, 0x0, %wstate ! This must be consistent.
wrpr %g0, 0x0, %otherwin ! We know this.
- mov PRIMARY_CONTEXT, %g1 ! Change contexts...
- stxa %g0, [%g1] ASI_DMMU ! Back into the nucleus.
- flush %g6 ! Flush instruction buffers
+ call set_pcontext ! Change contexts...
+ nop
rdpr %pstate, %l1 ! Prepare to change globals.
mov %g4, %o2 ! Setup args for
mov %g5, %o1 ! final call to mem_address_unaligned.
@@ -196,6 +220,13 @@ fill_fixup_mna:
wrpr %l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate
mov %o7, %g6 ! Get current back.
ldx [%g6 + TI_TASK], %g4 ! Finish it.
+#ifdef CONFIG_SMP
+ ldub [%g6 + TI_CPU], %g1
+ sethi %hi(__per_cpu_offset), %g2
+ or %g2, %lo(__per_cpu_offset), %g2
+ sllx %g1, 3, %g1
+ ldx [%g2 + %g1], %g5
+#endif
call mem_address_unaligned
add %sp, PTREGS_OFF, %o0
@@ -289,9 +320,8 @@ fill_fixup_dax:
wrpr %g2, 0x0, %wstate ! This must be consistent.
wrpr %g0, 0x0, %otherwin ! We know this.
- mov PRIMARY_CONTEXT, %g1 ! Change contexts...
- stxa %g0, [%g1] ASI_DMMU ! Back into the nucleus.
- flush %g6 ! Flush instruction buffers
+ call set_pcontext ! Change contexts...
+ nop
rdpr %pstate, %l1 ! Prepare to change globals.
mov %g4, %o1 ! Setup args for
mov %g5, %o2 ! final call to data_access_exception.
@@ -302,6 +332,13 @@ fill_fixup_dax:
wrpr %l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate
mov %o7, %g6 ! Get current back.
ldx [%g6 + TI_TASK], %g4 ! Finish it.
+#ifdef CONFIG_SMP
+ ldub [%g6 + TI_CPU], %g1
+ sethi %hi(__per_cpu_offset), %g2
+ or %g2, %lo(__per_cpu_offset), %g2
+ sllx %g1, 3, %g1
+ ldx [%g2 + %g1], %g5
+#endif
call data_access_exception
add %sp, PTREGS_OFF, %o0
@@ -368,3 +405,22 @@ window_dax_from_user_common:
ba,pt %xcc, rtrap
clr %l6
+
+ .globl cheetah_plus_patch_winfixup
+cheetah_plus_patch_winfixup:
+ sethi %hi(cplus_wfinsn_1), %o0
+ sethi %hi(cplus_winfixup_insn_1), %o2
+ lduw [%o0 + %lo(cplus_wfinsn_1)], %o1
+ or %o2, %lo(cplus_winfixup_insn_1), %o2
+ stw %o1, [%o2]
+ flush %o2
+
+ sethi %hi(cplus_wfinsn_2), %o0
+ sethi %hi(cplus_winfixup_insn_2), %o2
+ lduw [%o0 + %lo(cplus_wfinsn_2)], %o1
+ or %o2, %lo(cplus_winfixup_insn_2), %o2
+ stw %o1, [%o2]
+ flush %o2
+
+ retl
+ nop
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
index 3cf408cb1695e..40dbeec7e5d6a 100644
--- a/arch/sparc64/lib/Makefile
+++ b/arch/sparc64/lib/Makefile
@@ -7,8 +7,8 @@ EXTRA_CFLAGS := -Werror
lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \
memscan.o strncpy_from_user.o strlen_user.o memcmp.o checksum.o \
- VISbzero.o VISmemset.o VIScsum.o VIScsumcopy.o \
- VIScsumcopyusr.o VISsave.o atomic.o bitops.o \
+ bzero.o csum_copy.o csum_copy_from_user.o csum_copy_to_user.o \
+ VISsave.o atomic.o bitops.o \
U1memcpy.o U1copy_from_user.o U1copy_to_user.o \
U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \
copy_in_user.o user_fixup.o memmove.o \
diff --git a/arch/sparc64/lib/U1memcpy.S b/arch/sparc64/lib/U1memcpy.S
index fffec2e3cef8e..da9b520c71894 100644
--- a/arch/sparc64/lib/U1memcpy.S
+++ b/arch/sparc64/lib/U1memcpy.S
@@ -7,7 +7,9 @@
#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
+#define GLOBAL_SPARE g7
#else
+#define GLOBAL_SPARE g5
#define ASI_BLK_P 0xf0
#define FPRS_FEF 0x04
#ifdef MEMCPY_DEBUG
@@ -123,7 +125,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
cmp %g2, 0
tne %xcc, 5
PREAMBLE
- mov %o0, %g5
+ mov %o0, %o4
cmp %o2, 0
be,pn %XCC, 85f
or %o0, %o1, %o3
@@ -146,7 +148,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
* of bytes to copy to make 'dst' 64-byte aligned. We pre-
* subtract this from 'len'.
*/
- sub %o0, %o1, %o4
+ sub %o0, %o1, %GLOBAL_SPARE
sub %g2, 0x40, %g2
sub %g0, %g2, %g2
sub %o2, %g2, %o2
@@ -156,11 +158,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
1: subcc %g1, 0x1, %g1
EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
- EX_ST(STORE(stb, %o3, %o1 + %o4))
+ EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE))
bgu,pt %XCC, 1b
add %o1, 0x1, %o1
- add %o1, %o4, %o0
+ add %o1, %GLOBAL_SPARE, %o0
2: cmp %g2, 0x0
and %o1, 0x7, %g1
@@ -188,19 +190,19 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
3:
membar #LoadStore | #StoreStore | #StoreLoad
- subcc %o2, 0x40, %o4
+ subcc %o2, 0x40, %GLOBAL_SPARE
add %o1, %g1, %g1
- andncc %o4, (0x40 - 1), %o4
+ andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE
srl %g1, 3, %g2
- sub %o2, %o4, %g3
+ sub %o2, %GLOBAL_SPARE, %g3
andn %o1, (0x40 - 1), %o1
and %g2, 7, %g2
andncc %g3, 0x7, %g3
fmovd %f0, %f2
sub %g3, 0x8, %g3
- sub %o2, %o4, %o2
+ sub %o2, %GLOBAL_SPARE, %o2
- add %g1, %o4, %g1
+ add %g1, %GLOBAL_SPARE, %g1
subcc %o2, %g3, %o2
EX_LD(LOAD_BLK(%o1, %f0))
@@ -208,7 +210,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
add %g1, %g3, %g1
EX_LD(LOAD_BLK(%o1, %f16))
add %o1, 0x40, %o1
- sub %o4, 0x80, %o4
+ sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE
EX_LD(LOAD_BLK(%o1, %f32))
add %o1, 0x40, %o1
@@ -229,11 +231,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
.align 64
1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)
- LOOP_CHUNK1(o1, o0, o4, 1f)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
- LOOP_CHUNK2(o1, o0, o4, 2f)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)
- LOOP_CHUNK3(o1, o0, o4, 3f)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
ba,pt %xcc, 1b+4
faligndata %f0, %f2, %f48
1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)
@@ -250,11 +252,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
STORE_JUMP(o0, f48, 56f) membar #Sync
1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)
- LOOP_CHUNK1(o1, o0, o4, 1f)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
- LOOP_CHUNK2(o1, o0, o4, 2f)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)
- LOOP_CHUNK3(o1, o0, o4, 3f)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
ba,pt %xcc, 1b+4
faligndata %f2, %f4, %f48
1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)
@@ -271,11 +273,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
STORE_JUMP(o0, f48, 57f) membar #Sync
1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20)
- LOOP_CHUNK1(o1, o0, o4, 1f)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
- LOOP_CHUNK2(o1, o0, o4, 2f)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4)
- LOOP_CHUNK3(o1, o0, o4, 3f)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
ba,pt %xcc, 1b+4
faligndata %f4, %f6, %f48
1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36)
@@ -292,11 +294,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
STORE_JUMP(o0, f48, 58f) membar #Sync
1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22)
- LOOP_CHUNK1(o1, o0, o4, 1f)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
- LOOP_CHUNK2(o1, o0, o4, 2f)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6)
- LOOP_CHUNK3(o1, o0, o4, 3f)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
ba,pt %xcc, 1b+4
faligndata %f6, %f8, %f48
1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38)
@@ -313,11 +315,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
STORE_JUMP(o0, f48, 59f) membar #Sync
1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24)
- LOOP_CHUNK1(o1, o0, o4, 1f)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
- LOOP_CHUNK2(o1, o0, o4, 2f)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8)
- LOOP_CHUNK3(o1, o0, o4, 3f)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
ba,pt %xcc, 1b+4
faligndata %f8, %f10, %f48
1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40)
@@ -334,11 +336,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
STORE_JUMP(o0, f48, 60f) membar #Sync
1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26)
- LOOP_CHUNK1(o1, o0, o4, 1f)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
- LOOP_CHUNK2(o1, o0, o4, 2f)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10)
- LOOP_CHUNK3(o1, o0, o4, 3f)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
ba,pt %xcc, 1b+4
faligndata %f10, %f12, %f48
1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42)
@@ -355,11 +357,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
STORE_JUMP(o0, f48, 61f) membar #Sync
1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28)
- LOOP_CHUNK1(o1, o0, o4, 1f)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
- LOOP_CHUNK2(o1, o0, o4, 2f)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12)
- LOOP_CHUNK3(o1, o0, o4, 3f)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
ba,pt %xcc, 1b+4
faligndata %f12, %f14, %f48
1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44)
@@ -376,11 +378,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
STORE_JUMP(o0, f48, 62f) membar #Sync
1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30)
- LOOP_CHUNK1(o1, o0, o4, 1f)
+ LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f)
FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
- LOOP_CHUNK2(o1, o0, o4, 2f)
+ LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f)
FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14)
- LOOP_CHUNK3(o1, o0, o4, 3f)
+ LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f)
ba,pt %xcc, 1b+4
faligndata %f14, %f16, %f48
1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46)
@@ -449,18 +451,18 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
2: membar #StoreLoad | #StoreStore
VISExit
retl
- mov EX_RETVAL(%g5), %o0
+ mov EX_RETVAL(%o4), %o0
.align 64
70: /* 16 < len <= (5 * 64) */
bne,pn %XCC, 75f
sub %o0, %o1, %o3
-72: andn %o2, 0xf, %o4
+72: andn %o2, 0xf, %GLOBAL_SPARE
and %o2, 0xf, %o2
1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
- subcc %o4, 0x10, %o4
+ subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE
EX_ST(STORE(stx, %o5, %o1 + %o3))
add %o1, 0x8, %o1
EX_ST(STORE(stx, %g1, %o1 + %o3))
@@ -512,10 +514,10 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
andn %o1, 0x7, %o1
EX_LD(LOAD(ldx, %o1, %g2))
sub %o3, %g1, %o3
- andn %o2, 0x7, %o4
+ andn %o2, 0x7, %GLOBAL_SPARE
sllx %g2, %g1, %g2
1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
- subcc %o4, 0x8, %o4
+ subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE
add %o1, 0x8, %o1
srlx %g3, %o3, %o5
or %o5, %g2, %o5
@@ -544,7 +546,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
add %o1, 4, %o1
85: retl
- mov EX_RETVAL(%g5), %o0
+ mov EX_RETVAL(%o4), %o0
.align 32
90: EX_LD(LOAD(ldub, %o1, %g1))
@@ -553,6 +555,6 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
bgu,pt %XCC, 90b
add %o1, 1, %o1
retl
- mov EX_RETVAL(%g5), %o0
+ mov EX_RETVAL(%o4), %o0
.size FUNC_NAME, .-FUNC_NAME
diff --git a/arch/sparc64/lib/U3memcpy.S b/arch/sparc64/lib/U3memcpy.S
index 8fe195a10bbad..7cae9cc6a204a 100644
--- a/arch/sparc64/lib/U3memcpy.S
+++ b/arch/sparc64/lib/U3memcpy.S
@@ -6,6 +6,7 @@
#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
+#define GLOBAL_SPARE %g7
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF 0x04
@@ -17,6 +18,7 @@
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
+#define GLOBAL_SPARE %g5
#endif
#ifndef EX_LD
@@ -84,7 +86,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
cmp %g2, 0
tne %xcc, 5
PREAMBLE
- mov %o0, %g5
+ mov %o0, %o4
cmp %o2, 0
be,pn %XCC, 85f
or %o0, %o1, %o3
@@ -109,7 +111,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
* of bytes to copy to make 'dst' 64-byte aligned. We pre-
* subtract this from 'len'.
*/
- sub %o0, %o1, %o4
+ sub %o0, %o1, GLOBAL_SPARE
sub %g2, 0x40, %g2
sub %g0, %g2, %g2
sub %o2, %g2, %o2
@@ -119,11 +121,11 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
1: subcc %g1, 0x1, %g1
EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
- EX_ST(STORE(stb, %o3, %o1 + %o4))
+ EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
bgu,pt %XCC, 1b
add %o1, 0x1, %o1
- add %o1, %o4, %o0
+ add %o1, GLOBAL_SPARE, %o0
2: cmp %g2, 0x0
and %o1, 0x7, %g1
@@ -149,7 +151,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
3: LOAD(prefetch, %o1 + 0x000, #one_read)
LOAD(prefetch, %o1 + 0x040, #one_read)
- andn %o2, (0x40 - 1), %o4
+ andn %o2, (0x40 - 1), GLOBAL_SPARE
LOAD(prefetch, %o1 + 0x080, #one_read)
LOAD(prefetch, %o1 + 0x0c0, #one_read)
LOAD(prefetch, %o1 + 0x100, #one_read)
@@ -173,10 +175,10 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
faligndata %f10, %f12, %f26
EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
- subcc %o4, 0x80, %o4
+ subcc GLOBAL_SPARE, 0x80, GLOBAL_SPARE
add %o1, 0x40, %o1
bgu,pt %XCC, 1f
- srl %o4, 6, %o3
+ srl GLOBAL_SPARE, 6, %o3
ba,pt %xcc, 2f
nop
@@ -315,9 +317,9 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
sub %o0, %o1, %o3
72:
- andn %o2, 0xf, %o4
+ andn %o2, 0xf, GLOBAL_SPARE
and %o2, 0xf, %o2
-1: subcc %o4, 0x10, %o4
+1: subcc GLOBAL_SPARE, 0x10, GLOBAL_SPARE
EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
EX_ST(STORE(stx, %o5, %o1 + %o3))
@@ -372,10 +374,10 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
andn %o1, 0x7, %o1
EX_LD(LOAD(ldx, %o1, %g2))
sub %o3, %g1, %o3
- andn %o2, 0x7, %o4
+ andn %o2, 0x7, GLOBAL_SPARE
sllx %g2, %g1, %g2
1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
- subcc %o4, 0x8, %o4
+ subcc GLOBAL_SPARE, 0x8, GLOBAL_SPARE
add %o1, 0x8, %o1
srlx %g3, %o3, %o5
or %o5, %g2, %o5
@@ -405,7 +407,7 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
add %o1, 4, %o1
85: retl
- mov EX_RETVAL(%g5), %o0
+ mov EX_RETVAL(%o4), %o0
.align 32
90:
@@ -415,6 +417,6 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
bgu,pt %XCC, 90b
add %o1, 1, %o1
retl
- mov EX_RETVAL(%g5), %o0
+ mov EX_RETVAL(%o4), %o0
.size FUNC_NAME, .-FUNC_NAME
diff --git a/arch/sparc64/lib/VIS.h b/arch/sparc64/lib/VIS.h
deleted file mode 100644
index 9d93a70e7081f..0000000000000
--- a/arch/sparc64/lib/VIS.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* $Id: VIS.h,v 1.4 1999/05/25 16:52:50 jj Exp $
- * VIS.h: High speed copy/clear operations utilizing the UltraSparc
- * Visual Instruction Set.
- *
- * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
- * Copyright (C) 1996, 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
- */
-
- /* VIS code can be used for numerous copy/set operation variants.
- * It can be made to work in the kernel, one single instance,
- * for all of memcpy, copy_to_user, and copy_from_user by setting
- * the ASI src/dest globals correctly. Furthermore it can
- * be used for kernel-->kernel page copies as well, a hook label
- * is put in here just for this purpose.
- *
- * For userland, compiling this without __KERNEL__ defined makes
- * it work just fine as a generic libc bcopy and memcpy.
- * If for userland it is compiled with a 32bit gcc (but you need
- * -Wa,-Av9a), the code will just rely on lower 32bits of
- * IEU registers, if you compile it with 64bit gcc (ie. define
- * __sparc_v9__), the code will use full 64bit.
- */
-
-#ifndef __VIS_H
-#define __VIS_H
-
-#ifdef __KERNEL__
-#include <asm/head.h>
-#include <asm/asi.h>
-#else
-#define ASI_AIUS 0x11 /* Secondary, user */
-#define ASI_BLK_AIUS 0x71 /* Secondary, user, blk ld/st */
-#define ASI_P 0x80 /* Primary, implicit */
-#define ASI_S 0x81 /* Secondary, implicit */
-#define ASI_BLK_COMMIT_P 0xe0 /* Primary, blk store commit */
-#define ASI_BLK_COMMIT_S 0xe1 /* Secondary, blk store commit */
-#define ASI_BLK_P 0xf0 /* Primary, blk ld/st */
-#define ASI_BLK_S 0xf1 /* Secondary, blk ld/st */
-#define FPRS_FEF 0x04
-#endif
-
- /* I'm telling you, they really did this chip right.
- * Perhaps the SunSoft folks should visit some of the
- * people in Sun Microelectronics and start some brain
- * cell exchange program...
- */
-#define ASI_BLK_XOR (ASI_P ^ ASI_BLK_P)
- /* Well, things get more hairy if we use ASI_AIUS as
- * USER_DS and ASI_P as KERNEL_DS, we'd reach
- * commit block stores this way which is not what we want...
- */
- /* ASI_P->ASI_BLK_P && ASI_AIUS->ASI_BLK_AIUS transitions can be done
- * as blkasi = asi | ASI_BLK_OR
- */
-#define ASI_BLK_OR (ASI_BLK_P & ~ASI_P)
- /* Transition back from ASI_BLK_P->ASI_P && ASI_BLK_AIUS->ASI_AIUS is
- * more complicated:
- * asi = blkasi ^ (blkasi >> 3) ^ ASI_BLK_XOR1
- */
-#define ASI_BLK_XOR1 (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
-
-#define asi_src %o3
-#define asi_dest %o4
-
-#ifdef __KERNEL__
-#define ASI_SETSRC_BLK wr asi_src, 0, %asi;
-#define ASI_SETSRC_NOBLK wr asi_src, 0, %asi;
-#define ASI_SETDST_BLK wr asi_dest, 0, %asi;
-#define ASI_SETDST_NOBLK wr asi_dest, 0, %asi;
-#define ASIBLK %asi
-#define ASINORMAL %asi
-#define LDUB lduba
-#define LDUH lduha
-#define LDUW lduwa
-#define LDX ldxa
-#define LDD ldda
-#define LDDF ldda
-#define LDBLK ldda
-#define STB stba
-#define STH stha
-#define STW stwa
-#define STD stda
-#define STX stxa
-#define STDF stda
-#define STBLK stda
-#else
-#define ASI_SETSRC_BLK
-#define ASI_SETSRC_NOBLK
-#define ASI_SETDST_BLK
-#define ASI_SETDST_NOBLK
-#define ASI_SETDST_SPECIAL
-#define ASIBLK %asi
-#define ASINORMAL
-#define LDUB ldub
-#define LDUH lduh
-#define LDUW lduw
-#define LDD ldd
-#define LDX ldx
-#define LDDF ldd
-#define LDBLK ldda
-#define STB stb
-#define STH sth
-#define STW stw
-#define STD std
-#define STX stx
-#define STDF std
-#define STBLK stda
-#endif
-
-#ifdef __KERNEL__
-
-#define REGS_64BIT
-
-#else
-
-#ifndef REGS_64BIT
-#ifdef __sparc_v9__
-#define REGS_64BIT
-#endif
-#endif
-
-#endif
-
-#ifndef REGS_64BIT
-#define xcc icc
-#endif
-
-#endif
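
The comments in the deleted VIS.h above describe the asi to block-asi remapping as pure bit arithmetic: blkasi = asi | ASI_BLK_OR going in, and asi = blkasi ^ (blkasi >> 3) ^ ASI_BLK_XOR1 coming back. A minimal userspace sketch, assuming nothing beyond the ASI values the header itself defines, checks both directions:

/* Standalone check of the ASI transition identities from the deleted
 * VIS.h; hypothetical test program, not part of the patch.
 */
#include <assert.h>

#define ASI_AIUS	0x11	/* Secondary, user */
#define ASI_BLK_AIUS	0x71	/* Secondary, user, blk ld/st */
#define ASI_P		0x80	/* Primary, implicit */
#define ASI_BLK_P	0xf0	/* Primary, blk ld/st */

#define ASI_BLK_OR	(ASI_BLK_P & ~ASI_P)			/* 0x70 */
#define ASI_BLK_XOR1	(ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)	/* 0x6e */

int main(void)
{
	/* forward: normal asi -> block asi */
	assert((ASI_P    | ASI_BLK_OR) == ASI_BLK_P);
	assert((ASI_AIUS | ASI_BLK_OR) == ASI_BLK_AIUS);

	/* backward: block asi -> normal asi */
	assert((ASI_BLK_P    ^ (ASI_BLK_P    >> 3) ^ ASI_BLK_XOR1) == ASI_P);
	assert((ASI_BLK_AIUS ^ (ASI_BLK_AIUS >> 3) ^ ASI_BLK_XOR1) == ASI_AIUS);
	return 0;
}

Both identities hold for the primary (0x80/0xf0) and secondary-user (0x11/0x71) pairs, which is what let the old VIS copy routines flip a single %asi value between normal and block accesses.
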
diff --git a/arch/sparc64/lib/VISbzero.S b/arch/sparc64/lib/VISbzero.S
deleted file mode 100644
index 06b697bab974b..0000000000000
--- a/arch/sparc64/lib/VISbzero.S
+++ /dev/null
@@ -1,274 +0,0 @@
-/* $Id: VISbzero.S,v 1.11 2001/03/15 08:51:24 anton Exp $
- * VISbzero.S: High speed clear operations utilizing the UltraSparc
- * Visual Instruction Set.
- *
- * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
- * Copyright (C) 1996, 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
- */
-
-#include "VIS.h"
-
-#ifdef __KERNEL__
-#include <asm/visasm.h>
-
-#define EXN(x,y,a,b,z) \
-98: x,y; \
- .section .fixup; \
- .align 4; \
-99: ba VISbzerofixup_ret##z; \
- a, b, %o0; \
- .section __ex_table; \
- .align 4; \
- .word 98b, 99b; \
- .text; \
- .align 4;
-#define EXC(x,y,a,b,c...) \
-98: x,y; \
- .section .fixup; \
- .align 4; \
-99: c; \
- ba VISbzerofixup_ret0; \
- a, b, %o0; \
- .section __ex_table; \
- .align 4; \
- .word 98b, 99b; \
- .text; \
- .align 4;
-#define EXO1(x,y) \
-98: x,y; \
- .section __ex_table; \
- .align 4; \
- .word 98b, VISbzerofixup_reto1; \
- .text; \
- .align 4;
-#define EX(x,y,a,b) EXN(x,y,a,b,0)
-#define EX1(x,y,a,b) EXN(x,y,a,b,1)
-#define EX2(x,y,a,b) EXN(x,y,a,b,2)
-#define EXT(start,end,handler) \
- .section __ex_table; \
- .align 4; \
- .word start, 0, end, handler; \
- .text; \
- .align 4
-#else
-#define EX(x,y,a,b) x,y
-#define EX1(x,y,a,b) x,y
-#define EX2(x,y,a,b) x,y
-#define EXC(x,y,a,b,c...) x,y
-#define EXO1(x,y) x,y
-#define EXT(a,b,c)
-#endif
-
-#define ZERO_BLOCKS(base, offset, source) \
- STX source, [base - offset - 0x38] ASINORMAL; \
- STX source, [base - offset - 0x30] ASINORMAL; \
- STX source, [base - offset - 0x28] ASINORMAL; \
- STX source, [base - offset - 0x20] ASINORMAL; \
- STX source, [base - offset - 0x18] ASINORMAL; \
- STX source, [base - offset - 0x10] ASINORMAL; \
- STX source, [base - offset - 0x08] ASINORMAL; \
- STX source, [base - offset - 0x00] ASINORMAL;
-
-#ifdef __KERNEL__
-#define RETL clr %o0
-#else
-#define RETL mov %g3, %o0
-#endif
-
- /* Well, bzero is a lot easier to get right than bcopy... */
-#ifdef __KERNEL__
- .section __ex_table,#alloc
- .section .fixup,#alloc,#execinstr
-#endif
- .text
- .align 32
-#ifdef __KERNEL__
- .globl __bzero, __bzero_noasi
-__bzero_noasi:
- rd %asi, %g5
- ba,pt %xcc, __bzero+12
- mov %g5, %o4
-__bzero:
- rd %asi, %g5
- wr %g0, ASI_P, %asi ! LSU Group
- mov ASI_P, %o4
-#else
- .globl bzero
-bzero_private:
-bzero:
-#ifndef REGS_64BIT
- srl %o1, 0, %o1
-#endif
- mov %o0, %g3
-#endif
- cmp %o1, 7
- bleu,pn %xcc, 17f
- andcc %o0, 3, %o2
- be,a,pt %xcc, 4f
- andcc %o0, 4, %g0
- cmp %o2, 3
- be,pn %xcc, 2f
- EXO1(STB %g0, [%o0 + 0x00] ASINORMAL)
- cmp %o2, 2
- be,pt %xcc, 2f
- EX(STB %g0, [%o0 + 0x01] ASINORMAL, sub %o1, 1)
- EX(STB %g0, [%o0 + 0x02] ASINORMAL, sub %o1, 2)
-2: sub %o2, 4, %o2
- sub %o0, %o2, %o0
- add %o1, %o2, %o1
- andcc %o0, 4, %g0
-4: be,pt %xcc, 2f
- cmp %o1, 128
- EXO1(STW %g0, [%o0] ASINORMAL)
- sub %o1, 4, %o1
- add %o0, 4, %o0
-2: blu,pn %xcc, 9f
- andcc %o0, 0x38, %o2
- be,pn %icc, 6f
- mov 64, %o5
- andcc %o0, 8, %g0
- be,pn %icc, 1f
- sub %o5, %o2, %o5
- EX(STX %g0, [%o0] ASINORMAL, sub %o1, 0)
- add %o0, 8, %o0
-1: andcc %o5, 16, %g0
- be,pn %icc, 1f
- sub %o1, %o5, %o1
- EX1(STX %g0, [%o0] ASINORMAL, add %g0, 0)
- EX1(STX %g0, [%o0 + 8] ASINORMAL, sub %g0, 8)
- add %o0, 16, %o0
-1: andcc %o5, 32, %g0
- be,pn %icc, 7f
- andncc %o1, 0x3f, %o3
- EX(STX %g0, [%o0] ASINORMAL, add %o1, 32)
- EX(STX %g0, [%o0 + 8] ASINORMAL, add %o1, 24)
- EX(STX %g0, [%o0 + 16] ASINORMAL, add %o1, 16)
- EX(STX %g0, [%o0 + 24] ASINORMAL, add %o1, 8)
- add %o0, 32, %o0
-6: andncc %o1, 0x3f, %o3
-7: be,pn %xcc, 9f
-#ifdef __KERNEL__
- or %o4, ASI_BLK_OR, %g7
- wr %g7, %g0, %asi
- VISEntryHalf
-#else
- wr %g0, ASI_BLK_P, %asi
-#endif
- membar #StoreLoad | #StoreStore | #LoadStore
- fzero %f0
- andcc %o3, 0xc0, %o2
- and %o1, 0x3f, %o1
- fzero %f2
- andn %o3, 0xff, %o3
- faddd %f0, %f2, %f4
- fmuld %f0, %f2, %f6
- cmp %o2, 64
- faddd %f0, %f2, %f8
- fmuld %f0, %f2, %f10
- faddd %f0, %f2, %f12
- brz,pn %o2, 10f
- fmuld %f0, %f2, %f14
- be,pn %icc, 2f
- EXC(STBLK %f0, [%o0 + 0x00] ASIBLK, add %o3, %o2, add %o2, %o1, %o2)
- cmp %o2, 128
- be,pn %icc, 2f
- EXC(STBLK %f0, [%o0 + 0x40] ASIBLK, add %o3, %o2, add %o2, %o1, %o2; sub %o2, 64, %o2)
- EXC(STBLK %f0, [%o0 + 0x80] ASIBLK, add %o3, %o2, add %o2, %o1, %o2; sub %o2, 128, %o2)
-2: brz,pn %o3, 12f
- add %o0, %o2, %o0
-10: EX(STBLK %f0, [%o0 + 0x00] ASIBLK, add %o3, %o1)
- EXC(STBLK %f0, [%o0 + 0x40] ASIBLK, add %o3, %o1, sub %o1, 64, %o1)
- EXC(STBLK %f0, [%o0 + 0x80] ASIBLK, add %o3, %o1, sub %o1, 128, %o1)
- EXC(STBLK %f0, [%o0 + 0xc0] ASIBLK, add %o3, %o1, sub %o1, 192, %o1)
-11: subcc %o3, 256, %o3
- bne,pt %xcc, 10b
- add %o0, 256, %o0
-12:
-#ifdef __KERNEL__
- VISExitHalf
- wr %o4, 0x0, %asi
-#else
-#ifndef REGS_64BIT
- wr %g0, FPRS_FEF, %fprs
-#endif
-#endif
- membar #StoreLoad | #StoreStore
-9: andcc %o1, 0xf8, %o2
- be,pn %xcc, 13f
- andcc %o1, 7, %o1
-#ifdef __KERNEL__
-14: sethi %hi(13f), %o4
- srl %o2, 1, %o3
- sub %o4, %o3, %o4
- jmpl %o4 + %lo(13f), %g0
- add %o0, %o2, %o0
-#else
-14: rd %pc, %o4
- srl %o2, 1, %o3
- sub %o4, %o3, %o4
- jmpl %o4 + (13f - 14b), %g0
- add %o0, %o2, %o0
-#endif
-12: ZERO_BLOCKS(%o0, 0xc8, %g0)
- ZERO_BLOCKS(%o0, 0x88, %g0)
- ZERO_BLOCKS(%o0, 0x48, %g0)
- ZERO_BLOCKS(%o0, 0x08, %g0)
- EXT(12b,13f,VISbzerofixup_zb)
-13: be,pn %xcc, 8f
- andcc %o1, 4, %g0
- be,pn %xcc, 1f
- andcc %o1, 2, %g0
- EX(STW %g0, [%o0] ASINORMAL, and %o1, 7)
- add %o0, 4, %o0
-1: be,pn %xcc, 1f
- andcc %o1, 1, %g0
- EX(STH %g0, [%o0] ASINORMAL, and %o1, 3)
- add %o0, 2, %o0
-1: bne,a,pn %xcc, 8f
- EX(STB %g0, [%o0] ASINORMAL, add %g0, 1)
-8:
-#ifdef __KERNEL__
- wr %g5, %g0, %asi
-#endif
- retl
- RETL
-17: be,pn %xcc, 13b
- orcc %o1, 0, %g0
- be,pn %xcc, 0f
-8: add %o0, 1, %o0
- subcc %o1, 1, %o1
- bne,pt %xcc, 8b
- EX(STB %g0, [%o0 - 1] ASINORMAL, add %o1, 1)
-0:
-#ifdef __KERNEL__
- wr %g5, %g0, %asi
-#endif
- retl
- RETL
-
-#ifdef __KERNEL__
- .section .fixup
- .align 4
-VISbzerofixup_reto1:
- mov %o1, %o0
-VISbzerofixup_ret0:
- wr %g5, %g0, %asi
- retl
- wr %g0, 0, %fprs
-VISbzerofixup_ret1:
- and %o5, 0x30, %o5
- add %o5, %o1, %o5
- ba,pt %xcc, VISbzerofixup_ret0
- add %o0, %o5, %o0
-VISbzerofixup_ret2:
- and %o5, 0x20, %o5
- add %o5, %o1, %o5
- ba,pt %xcc, VISbzerofixup_ret0
- add %o0, %o5, %o0
-VISbzerofixup_zb:
- andcc %o1, 7, %o1
- sll %g2, 3, %g2
- add %o1, 256, %o1
- ba,pt %xcc, VISbzerofixup_ret0
- sub %o1, %g2, %o0
-#endif
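
The tail of the deleted VISbzero.S above clears the final sub-256-byte remainder by computing a jump into an unrolled run of stx instructions: each stx occupies 4 bytes of code and clears 8 bytes of memory, so the entry offset is simply remaining_bytes >> 1 (CSUM_LASTCHUNK in VIScsum.S below plays the same game with sll %g1, 1, since there 32 bytes of code handle 16 bytes of data). A rough C analogue of that computed-jump idea, with a hypothetical zero_words() helper used purely for illustration, is the classic fall-through switch:

#include <stddef.h>
#include <stdint.h>

/* Clear nwords 8-byte words starting at p, entering the unrolled body
 * partway through so exactly the leftover words are stored.
 */
static void zero_words(uint64_t *p, size_t nwords)
{
	size_t n = (nwords + 7) / 8;	/* number of times through the body */

	if (nwords == 0)
		return;
	switch (nwords % 8) {		/* jump into the middle of the run */
	case 0: do {	*p++ = 0;
	case 7:		*p++ = 0;
	case 6:		*p++ = 0;
	case 5:		*p++ = 0;
	case 4:		*p++ = 0;
	case 3:		*p++ = 0;
	case 2:		*p++ = 0;
	case 1:		*p++ = 0;
		} while (--n > 0);
	}
}

The asm version needs no loop at all for the final chunk, but the control-flow trick is the same: start execution partway through the unrolled body so exactly the remaining words get stored.
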
diff --git a/arch/sparc64/lib/VIScsum.S b/arch/sparc64/lib/VIScsum.S
deleted file mode 100644
index ae00e9fb17e6e..0000000000000
--- a/arch/sparc64/lib/VIScsum.S
+++ /dev/null
@@ -1,546 +0,0 @@
-/* $Id: VIScsum.S,v 1.7 2002/02/09 19:49:30 davem Exp $
- * VIScsum.S: High bandwidth IP checksumming utilizing the UltraSparc
- * Visual Instruction Set.
- *
- * Copyright (C) 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 2000 David S. Miller (davem@redhat.com)
- *
- * Based on older sparc32/sparc64 checksum.S, which is:
- *
- * Copyright(C) 1995 Linus Torvalds
- * Copyright(C) 1995 Miguel de Icaza
- * Copyright(C) 1996, 1997 David S. Miller
- * derived from:
- * Linux/Alpha checksum c-code
- * Linux/ix86 inline checksum assembly
- * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
- * David Mosberger-Tang for optimized reference c-code
- * BSD4.4 portable checksum routine
- */
-
-#ifdef __sparc_v9__
-#define STACKOFF 2175
-#else
-#define STACKOFF 64
-#endif
-
-#ifdef __KERNEL__
-#include <asm/head.h>
-#include <asm/asi.h>
-#include <asm/visasm.h>
-#include <asm/thread_info.h>
-#else
-#define ASI_BLK_P 0xf0
-#define FRPS_FEF 0x04
-#endif
-
-/* Good night, SunSoft engineers. Sleep sweetly.
- * This has a couple of tricks in and those
- * tricks are UltraLinux trade secrets :))
- */
-
-#define START_THE_TRICK(fz,f0,f2,f4,f6,f8,f10) \
- fcmpgt32 %fz, %f0, %g1 /* FPM Group */; \
- fcmpgt32 %fz, %f2, %g2 /* FPM Group */; \
- fcmpgt32 %fz, %f4, %g3 /* FPM Group */; \
- inc %g1 /* IEU0 Group */; \
- fcmpgt32 %fz, %f6, %g5 /* FPM */; \
- srl %g1, 1, %g1 /* IEU0 Group */; \
- fcmpgt32 %fz, %f8, %g7 /* FPM */; \
- inc %g2 /* IEU0 Group */; \
- fcmpgt32 %fz, %f10, %o3 /* FPM */; \
- srl %g2, 1, %g2 /* IEU0 Group */; \
- inc %g3 /* IEU1 */; \
- srl %g3, 1, %g3 /* IEU0 Group */; \
- add %o2, %g1, %o2 /* IEU1 */; \
- add %o2, %g2, %o2 /* IEU0 Group */; \
- inc %g5 /* IEU1 */; \
- add %o2, %g3, %o2 /* IEU0 Group */;
-
-#define DO_THE_TRICK(O12,O14,f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14) \
- srl %g5, 1, %g5 /* IEU0 Group */; \
- fpadd32 %F0, %f0, %F0 /* FPA */; \
- fcmpgt32 %O12, %f12, %o4 /* FPM */; \
- inc %g7 /* IEU0 Group */; \
- fpadd32 %F2, %f2, %F2 /* FPA */; \
- fcmpgt32 %O14, %f14, %o5 /* FPM */; \
- add %o2, %g5, %o2 /* IEU1 Group */; \
- fpadd32 %F4, %f4, %F4 /* FPA */; \
- fcmpgt32 %f0, %F0, %g1 /* FPM */; \
- srl %g7, 1, %g7 /* IEU0 Group */; \
- fpadd32 %F6, %f6, %F6 /* FPA */; \
- fcmpgt32 %f2, %F2, %g2 /* FPM */; \
- add %o2, %g7, %o2 /* IEU0 Group */; \
- fpadd32 %F8, %f8, %F8 /* FPA */; \
- fcmpgt32 %f4, %F4, %g3 /* FPM */; \
- inc %o3 /* IEU0 Group */; \
- fpadd32 %F10, %f10, %F10 /* FPA */; \
- fcmpgt32 %f6, %F6, %g5 /* FPM */; \
- srl %o3, 1, %o3 /* IEU0 Group */; \
- fpadd32 %F12, %f12, %F12 /* FPA */; \
- fcmpgt32 %f8, %F8, %g7 /* FPM */; \
- add %o2, %o3, %o2 /* IEU0 Group */; \
- fpadd32 %F14, %f14, %F14 /* FPA */; \
- fcmpgt32 %f10, %F10, %o3 /* FPM */; \
- inc %o4 /* IEU0 Group */; \
- inc %o5 /* IEU1 */; \
- srl %o4, 1, %o4 /* IEU0 Group */; \
- inc %g1 /* IEU1 */; \
- srl %o5, 1, %o5 /* IEU0 Group */; \
- add %o2, %o4, %o2 /* IEU1 */; \
- srl %g1, 1, %g1 /* IEU0 Group */; \
- add %o2, %o5, %o2 /* IEU1 */; \
- inc %g2 /* IEU0 Group */; \
- add %o2, %g1, %o2 /* IEU1 */; \
- srl %g2, 1, %g2 /* IEU0 Group */; \
- inc %g3 /* IEU1 */; \
- srl %g3, 1, %g3 /* IEU0 Group */; \
- add %o2, %g2, %o2 /* IEU1 */; \
- inc %g5 /* IEU0 Group */; \
- add %o2, %g3, %o2 /* IEU0 */;
-
-#define END_THE_TRICK(O12,O14,f0,f2,f4,f6,f8,f10,f12,f14,S0,S1,S2,S3,T0,T1,U0,fz) \
- srl %g5, 1, %g5 /* IEU0 Group */; \
- fpadd32 %f2, %f0, %S0 /* FPA */; \
- fcmpgt32 %O12, %f12, %o4 /* FPM */; \
- inc %g7 /* IEU0 Group */; \
- fpadd32 %f6, %f4, %S1 /* FPA */; \
- fcmpgt32 %O14, %f14, %o5 /* FPM */; \
- srl %g7, 1, %g7 /* IEU0 Group */; \
- fpadd32 %f10, %f8, %S2 /* FPA */; \
- fcmpgt32 %f0, %S0, %g1 /* FPM */; \
- inc %o3 /* IEU0 Group */; \
- fpadd32 %f14, %f12, %S3 /* FPA */; \
- fcmpgt32 %f4, %S1, %g2 /* FPM */; \
- add %o2, %g5, %o2 /* IEU0 Group */; \
- fpadd32 %S0, %S1, %T0 /* FPA */; \
- fcmpgt32 %f8, %S2, %g3 /* FPM */; \
- add %o2, %g7, %o2 /* IEU0 Group */; \
- fzero %fz /* FPA */; \
- fcmpgt32 %f12, %S3, %g5 /* FPM */; \
- srl %o3, 1, %o3 /* IEU0 Group */; \
- fpadd32 %S2, %S3, %T1 /* FPA */; \
- fcmpgt32 %S0, %T0, %g7 /* FPM */; \
- add %o2, %o3, %o2 /* IEU0 Group */; \
- fpadd32 %T0, %T1, %U0 /* FPA */; \
- fcmpgt32 %S2, %T1, %o3 /* FPM */; \
- inc %o4 /* IEU0 Group */; \
- inc %o5 /* IEU1 */; \
- srl %o4, 1, %o4 /* IEU0 Group */; \
- inc %g1 /* IEU1 */; \
- add %o2, %o4, %o2 /* IEU0 Group */; \
- fcmpgt32 %fz, %f2, %o4 /* FPM */; \
- srl %o5, 1, %o5 /* IEU0 Group */; \
- inc %g2 /* IEU1 */; \
- add %o2, %o5, %o2 /* IEU0 Group */; \
- fcmpgt32 %fz, %f6, %o5 /* FPM */; \
- srl %g1, 1, %g1 /* IEU0 Group */; \
- inc %g3 /* IEU1 */; \
- add %o2, %g1, %o2 /* IEU0 Group */; \
- fcmpgt32 %fz, %f10, %g1 /* FPM */; \
- srl %g2, 1, %g2 /* IEU0 Group */; \
- inc %g5 /* IEU1 */; \
- add %o2, %g2, %o2 /* IEU0 Group */; \
- fcmpgt32 %fz, %f14, %g2 /* FPM */; \
- srl %g3, 1, %g3 /* IEU0 Group */; \
- inc %g7 /* IEU1 */; \
- add %o2, %g3, %o2 /* IEU0 Group */; \
- fcmpgt32 %fz, %S1, %g3 /* FPM */; \
- srl %g5, 1, %g5 /* IEU0 Group */; \
- inc %o3 /* IEU1 */; \
- add %o2, %g5, %o2 /* IEU0 Group */; \
- fcmpgt32 %fz, %S3, %g5 /* FPM */; \
- srl %g7, 1, %g7 /* IEU0 Group */; \
- inc %o4 /* IEU1 */; \
- add %o2, %g7, %o2 /* IEU0 Group */; \
- fcmpgt32 %fz, %T1, %g7 /* FPM */; \
- srl %o3, 1, %o3 /* IEU0 Group */; \
- inc %o5 /* IEU1 */; \
- add %o2, %o3, %o2 /* IEU0 Group */; \
- fcmpgt32 %T0, %U0, %o3 /* FPM */; \
- srl %o4, 1, %o4 /* IEU0 Group */; \
- inc %g1 /* IEU1 */; \
- sub %o2, %o4, %o2 /* IEU0 Group */; \
- fcmpgt32 %fz, %U0, %o4 /* FPM */; \
- srl %o5, 1, %o5 /* IEU0 Group */; \
- inc %g2 /* IEU1 */; \
- srl %g1, 1, %g1 /* IEU0 Group */; \
- sub %o2, %o5, %o2 /* IEU1 */; \
- std %U0, [%sp + STACKOFF] /* Store */; \
- srl %g2, 1, %g2 /* IEU0 Group */; \
- sub %o2, %g1, %o2 /* IEU1 */; \
- inc %g3 /* IEU0 Group */; \
- sub %o2, %g2, %o2 /* IEU1 */; \
- srl %g3, 1, %g3 /* IEU0 Group */; \
- inc %g5 /* IEU1 */; \
- srl %g5, 1, %g5 /* IEU0 Group */; \
- sub %o2, %g3, %o2 /* IEU1 */; \
- ldx [%sp + STACKOFF], %o5 /* Load Group */; \
- inc %g7 /* IEU0 */; \
- sub %o2, %g5, %o2 /* IEU1 */; \
- srl %g7, 1, %g7 /* IEU0 Group */; \
- inc %o3 /* IEU1 */; \
- srl %o3, 1, %o3 /* IEU0 Group */; \
- sub %o2, %g7, %o2 /* IEU1 */; \
- inc %o4 /* IEU0 Group */; \
- add %o2, %o3, %o2 /* IEU1 */; \
- srl %o4, 1, %o4 /* IEU0 Group */; \
- sub %o2, %o4, %o2 /* IEU0 Group */; \
- addcc %o2, %o5, %o2 /* IEU1 Group */; \
- bcs,a,pn %xcc, 33f /* CTI */; \
- add %o2, 1, %o2 /* IEU0 */; \
-33: /* That's it */;
-
-#define CSUM_LASTCHUNK(offset) \
- ldx [%o0 - offset - 0x10], %g2; \
- ldx [%o0 - offset - 0x08], %g3; \
- addcc %g2, %o2, %o2; \
- bcs,a,pn %xcc, 31f; \
- add %o2, 1, %o2; \
-31: addcc %g3, %o2, %o2; \
- bcs,a,pn %xcc, 32f; \
- add %o2, 1, %o2; \
-32:
-
- .text
- .globl csum_partial
- .align 32
-csum_partial:
- andcc %o0, 7, %g0 /* IEU1 Group */
- be,pt %icc, 4f /* CTI */
- andcc %o0, 0x38, %g3 /* IEU1 */
- mov 1, %g5 /* IEU0 Group */
- cmp %o1, 6 /* IEU1 */
- bl,pn %icc, 21f /* CTI */
- andcc %o0, 1, %g0 /* IEU1 Group */
- bne,pn %icc, csump_really_slow /* CTI */
- andcc %o0, 2, %g0 /* IEU1 Group */
- be,pt %icc, 1f /* CTI */
- and %o0, 4, %g7 /* IEU0 */
- lduh [%o0], %g2 /* Load */
- sub %o1, 2, %o1 /* IEU0 Group */
- add %o0, 2, %o0 /* IEU1 */
- andcc %o0, 4, %g7 /* IEU1 Group */
- sll %g5, 16, %g5 /* IEU0 */
- sll %g2, 16, %g2 /* IEU0 Group */
- addcc %g2, %o2, %o2 /* IEU1 Group (regdep) */
- bcs,a,pn %icc, 1f /* CTI */
- add %o2, %g5, %o2 /* IEU0 */
-1: ld [%o0], %g2 /* Load */
- brz,a,pn %g7, 4f /* CTI+IEU1 Group */
- and %o0, 0x38, %g3 /* IEU0 */
- add %o0, 4, %o0 /* IEU0 Group */
- sub %o1, 4, %o1 /* IEU1 */
- addcc %g2, %o2, %o2 /* IEU1 Group */
- bcs,a,pn %icc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-1: and %o0, 0x38, %g3 /* IEU1 Group */
-4: srl %o2, 0, %o2 /* IEU0 Group */
- mov 0x40, %g1 /* IEU1 */
- brz,pn %g3, 3f /* CTI+IEU1 Group */
- sub %g1, %g3, %g1 /* IEU0 */
- cmp %o1, 56 /* IEU1 Group */
- blu,pn %icc, 20f /* CTI */
- andcc %o0, 8, %g0 /* IEU1 Group */
- be,pn %icc, 1f /* CTI */
- ldx [%o0], %g2 /* Load */
- add %o0, 8, %o0 /* IEU0 Group */
- sub %o1, 8, %o1 /* IEU1 */
- addcc %g2, %o2, %o2 /* IEU1 Group */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-1: andcc %g1, 0x10, %g0 /* IEU1 Group */
- be,pn %icc, 2f /* CTI */
- and %g1, 0x20, %g1 /* IEU0 */
- ldx [%o0], %g2 /* Load */
- ldx [%o0+8], %g3 /* Load Group */
- add %o0, 16, %o0 /* IEU0 */
- sub %o1, 16, %o1 /* IEU1 */
- addcc %g2, %o2, %o2 /* IEU1 Group */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-1: addcc %g3, %o2, %o2 /* IEU1 Group */
- bcs,a,pn %xcc, 2f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-2: brz,pn %g1, 3f /* CTI+IEU1 Group */
- ldx [%o0], %g2 /* Load */
- ldx [%o0+8], %g3 /* Load Group */
- ldx [%o0+16], %g5 /* Load Group */
- ldx [%o0+24], %g7 /* Load Group */
- add %o0, 32, %o0 /* IEU0 */
- sub %o1, 32, %o1 /* IEU1 */
- addcc %g2, %o2, %o2 /* IEU1 Group */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-1: addcc %g3, %o2, %o2 /* IEU1 Group */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-1: addcc %g5, %o2, %o2 /* IEU1 Group */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-1: addcc %g7, %o2, %o2 /* IEU1 Group */
- bcs,a,pn %xcc, 3f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-3: cmp %o1, 0xc0 /* IEU1 Group */
- blu,pn %icc, 20f /* CTI */
- sllx %o2, 32, %g5 /* IEU0 */
-#ifdef __KERNEL__
- VISEntry
-#endif
- addcc %o2, %g5, %o2 /* IEU1 Group */
- sub %o1, 0xc0, %o1 /* IEU0 */
- wr %g0, ASI_BLK_P, %asi /* LSU Group */
- membar #StoreLoad /* LSU Group */
- srlx %o2, 32, %o2 /* IEU0 Group */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU1 */
-1: andcc %o1, 0x80, %g0 /* IEU1 Group */
- bne,pn %icc, 7f /* CTI */
- andcc %o1, 0x40, %g0 /* IEU1 Group */
- be,pn %icc, 6f /* CTI */
- fzero %f12 /* FPA */
- fzero %f14 /* FPA Group */
- ldda [%o0 + 0x000] %asi, %f16
- ldda [%o0 + 0x040] %asi, %f32
- ldda [%o0 + 0x080] %asi, %f48
- START_THE_TRICK(f12,f16,f18,f20,f22,f24,f26)
- ba,a,pt %xcc, 3f
-6: sub %o0, 0x40, %o0 /* IEU0 Group */
- fzero %f28 /* FPA */
- fzero %f30 /* FPA Group */
- ldda [%o0 + 0x040] %asi, %f32
- ldda [%o0 + 0x080] %asi, %f48
- ldda [%o0 + 0x0c0] %asi, %f0
- START_THE_TRICK(f28,f32,f34,f36,f38,f40,f42)
- ba,a,pt %xcc, 4f
-7: bne,pt %icc, 8f /* CTI */
- fzero %f44 /* FPA */
- add %o0, 0x40, %o0 /* IEU0 Group */
- fzero %f60 /* FPA */
- fzero %f62 /* FPA Group */
- ldda [%o0 - 0x040] %asi, %f0
- ldda [%o0 + 0x000] %asi, %f16
- ldda [%o0 + 0x040] %asi, %f32
- START_THE_TRICK(f60,f0,f2,f4,f6,f8,f10)
- ba,a,pt %xcc, 2f
-8: add %o0, 0x80, %o0 /* IEU0 Group */
- fzero %f46 /* FPA */
- ldda [%o0 - 0x080] %asi, %f48
- ldda [%o0 - 0x040] %asi, %f0
- ldda [%o0 + 0x000] %asi, %f16
- START_THE_TRICK(f44,f48,f50,f52,f54,f56,f58)
-1: DO_THE_TRICK(f44,f46,f48,f50,f52,f54,f56,f58,f60,f62,f0,f2,f4,f6,f8,f10,f12,f14)
- ldda [%o0 + 0x040] %asi, %f32
-2: DO_THE_TRICK(f60,f62,f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30)
- ldda [%o0 + 0x080] %asi, %f48
-3: DO_THE_TRICK(f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46)
- ldda [%o0 + 0x0c0] %asi, %f0
-4: DO_THE_TRICK(f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,f48,f50,f52,f54,f56,f58,f60,f62)
- add %o0, 0x100, %o0 /* IEU0 Group */
- subcc %o1, 0x100, %o1 /* IEU1 */
- bgeu,a,pt %icc, 1b /* CTI */
- ldda [%o0 + 0x000] %asi, %f16
- membar #Sync /* LSU Group */
- DO_THE_TRICK(f44,f46,f48,f50,f52,f54,f56,f58,f60,f62,f0,f2,f4,f6,f8,f10,f12,f14)
- END_THE_TRICK(f60,f62,f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30)
-#ifdef __KERNEL__
- ldub [%g6 + TI_CURRENT_DS], %g7
-#endif
- and %o1, 0x3f, %o1 /* IEU0 Group */
-#ifdef __KERNEL__
- VISExit
- wr %g7, %g0, %asi
-#endif
-20: andcc %o1, 0xf0, %g1 /* IEU1 Group */
- be,pn %icc, 23f /* CTI */
- and %o1, 0xf, %o3 /* IEU0 */
-#ifdef __KERNEL__
-22: sll %g1, 1, %o4 /* IEU0 Group */
- sethi %hi(23f), %g7 /* IEU1 */
- sub %g7, %o4, %g7 /* IEU0 Group */
- jmpl %g7 + %lo(23f), %g0 /* CTI Group brk forced*/
- add %o0, %g1, %o0 /* IEU0 */
-#else
-22: rd %pc, %g7 /* LSU Group+4bubbles */
- sll %g1, 1, %o4 /* IEU0 Group */
- sub %g7, %o4, %g7 /* IEU0 Group (regdep) */
- jmpl %g7 + (23f - 22b), %g0 /* CTI Group brk forced*/
- add %o0, %g1, %o0 /* IEU0 */
-#endif
- CSUM_LASTCHUNK(0xe0)
- CSUM_LASTCHUNK(0xd0)
- CSUM_LASTCHUNK(0xc0)
- CSUM_LASTCHUNK(0xb0)
- CSUM_LASTCHUNK(0xa0)
- CSUM_LASTCHUNK(0x90)
- CSUM_LASTCHUNK(0x80)
- CSUM_LASTCHUNK(0x70)
- CSUM_LASTCHUNK(0x60)
- CSUM_LASTCHUNK(0x50)
- CSUM_LASTCHUNK(0x40)
- CSUM_LASTCHUNK(0x30)
- CSUM_LASTCHUNK(0x20)
- CSUM_LASTCHUNK(0x10)
- CSUM_LASTCHUNK(0x00)
-23: brnz,pn %o3, 26f /* CTI+IEU1 Group */
-24: sllx %o2, 32, %g1 /* IEU0 */
-25: addcc %o2, %g1, %o0 /* IEU1 Group */
- srlx %o0, 32, %o0 /* IEU0 Group (regdep) */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o0, 1, %o0 /* IEU1 */
-1: retl /* CTI Group brk forced*/
- srl %o0, 0, %o0 /* IEU0 */
-26: andcc %o1, 8, %g0 /* IEU1 Group */
- be,pn %icc, 1f /* CTI */
- ldx [%o0], %g3 /* Load */
- add %o0, 8, %o0 /* IEU0 Group */
- addcc %g3, %o2, %o2 /* IEU1 Group */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-1: andcc %o1, 4, %g0 /* IEU1 Group */
- be,a,pn %icc, 1f /* CTI */
- clr %g2 /* IEU0 */
- ld [%o0], %g2 /* Load */
- add %o0, 4, %o0 /* IEU0 Group */
- sllx %g2, 32, %g2 /* IEU0 Group */
-1: andcc %o1, 2, %g0 /* IEU1 */
- be,a,pn %icc, 1f /* CTI */
- clr %o4 /* IEU0 Group */
- lduh [%o0], %o4 /* Load */
- add %o0, 2, %o0 /* IEU1 */
- sll %o4, 16, %o4 /* IEU0 Group */
-1: andcc %o1, 1, %g0 /* IEU1 */
- be,a,pn %icc, 1f /* CTI */
- clr %o5 /* IEU0 Group */
- ldub [%o0], %o5 /* Load */
- sll %o5, 8, %o5 /* IEU0 Group */
-1: or %g2, %o4, %o4 /* IEU1 */
- or %o5, %o4, %o4 /* IEU0 Group (regdep) */
- addcc %o4, %o2, %o2 /* IEU1 Group (regdep) */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-1: ba,pt %xcc, 25b /* CTI Group */
- sllx %o2, 32, %g1 /* IEU0 */
-21: srl %o2, 0, %o2 /* IEU0 Group */
- cmp %o1, 0 /* IEU1 */
- be,pn %icc, 24b /* CTI */
- andcc %o1, 4, %g0 /* IEU1 Group */
- be,a,pn %icc, 1f /* CTI */
- clr %g2 /* IEU0 */
- lduh [%o0], %g3 /* Load */
- lduh [%o0+2], %g2 /* Load Group */
- add %o0, 4, %o0 /* IEU0 Group */
- sllx %g3, 48, %g3 /* IEU0 Group */
- sllx %g2, 32, %g2 /* IEU0 Group */
- or %g3, %g2, %g2 /* IEU0 Group */
-1: andcc %o1, 2, %g0 /* IEU1 */
- be,a,pn %icc, 1f /* CTI */
- clr %o4 /* IEU0 Group */
- lduh [%o0], %o4 /* Load */
- add %o0, 2, %o0 /* IEU1 */
- sll %o4, 16, %o4 /* IEU0 Group */
-1: andcc %o1, 1, %g0 /* IEU1 */
- be,a,pn %icc, 1f /* CTI */
- clr %o5 /* IEU0 Group */
- ldub [%o0], %o5 /* Load */
- sll %o5, 8, %o5 /* IEU0 Group */
-1: or %g2, %o4, %o4 /* IEU1 */
- or %o5, %o4, %o4 /* IEU0 Group (regdep) */
- addcc %o4, %o2, %o2 /* IEU1 Group (regdep) */
- bcs,a,pn %xcc, 1f /* CTI */
- add %o2, 1, %o2 /* IEU0 */
-1: ba,pt %xcc, 25b /* CTI Group */
- sllx %o2, 32, %g1 /* IEU0 */
-
- /* When buff is byte aligned and len is large, we backoff to
- * this really slow handling. The issue is that we cannot do
- * the VIS stuff when buff is byte aligned as unaligned.c will
- * not fix it up.
- */
-csump_really_slow:
- mov %o0, %o3
- mov %o1, %o4
- cmp %o1, 0
- ble,pn %icc, 9f
- mov 0, %o0
- andcc %o3, 1, %o5
- be,pt %icc, 1f
- sra %o4, 1, %g3
- add %o1, -1, %o4
- ldub [%o3], %o0
- add %o3, 1, %o3
- sra %o4, 1, %g3
-1:
- cmp %g3, 0
- be,pt %icc, 3f
- and %o4, 1, %g2
- and %o3, 2, %g2
- brz,a,pt %g2, 1f
- sra %g3, 1, %g3
- add %g3, -1, %g3
- add %o4, -2, %o4
- lduh [%o3], %g2
- add %o3, 2, %o3
- add %o0, %g2, %o0
- sra %g3, 1, %g3
-1:
- cmp %g3, 0
- be,pt %icc, 2f
- and %o4, 2, %g2
-1:
- ld [%o3], %g2
- addcc %o0, %g2, %o0
- addx %o0, %g0, %o0
- addcc %g3, -1, %g3
- bne,pt %icc, 1b
- add %o3, 4, %o3
- srl %o0, 16, %o1
- sethi %hi(64512), %g2
- or %g2, 1023, %g2
- and %o0, %g2, %g3
- add %g3, %o1, %g3
- srl %g3, 16, %o0
- and %g3, %g2, %g2
- add %g2, %o0, %g3
- sll %g3, 16, %g3
- srl %g3, 16, %o0
- and %o4, 2, %g2
-2:
- cmp %g2, 0
- be,pt %icc, 3f
- and %o4, 1, %g2
- lduh [%o3], %g2
- add %o3, 2, %o3
- add %o0, %g2, %o0
- and %o4, 1, %g2
-3:
- cmp %g2, 0
- be,pt %icc, 1f
- srl %o0, 16, %o1
- ldub [%o3], %g2
- sll %g2, 8, %g2
- add %o0, %g2, %o0
- srl %o0, 16, %o1
-1:
- sethi %hi(64512), %g2
- or %g2, 1023, %g2
- cmp %o5, 0
- and %o0, %g2, %g3
- add %g3, %o1, %g3
- srl %g3, 16, %o0
- and %g3, %g2, %g2
- add %g2, %o0, %g3
- sll %g3, 16, %g3
- srl %g3, 16, %o0
- srl %g3, 24, %g3
- and %o0, 255, %g2
- sll %g2, 8, %g2
- bne,pt %icc, 1f
- or %g3, %g2, %g2
-9:
- mov %o0, %g2
-1:
- addcc %g2, %o2, %g2
- addx %g2, %g0, %g2
- retl
- srl %g2, 0, %o0
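
Aside from the VIS main loop, the arithmetic worth spelling out in the deleted csum_partial above is the final reduction: the sethi %hi(64512); or %g2, 1023, %g2 pair just materialises the 0xffff mask, after which the 32-bit accumulator is folded into 16 bits by adding the high half into the low half twice. A small sketch of that standard RFC 1071 fold (the function name is illustrative, not something the patch adds):

#include <stdint.h>

/* Fold a 32-bit ones'-complement accumulator down to 16 bits. */
static uint16_t csum_fold32(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);	/* fold high half into low half */
	sum = (sum & 0xffff) + (sum >> 16);	/* absorb the possible carry */
	return (uint16_t)sum;
}

Two passes suffice because after the first addition the high half is at most 1.
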
diff --git a/arch/sparc64/lib/VIScsumcopy.S b/arch/sparc64/lib/VIScsumcopy.S
deleted file mode 100644
index d4caa955ea738..0000000000000
--- a/arch/sparc64/lib/VIScsumcopy.S
+++ /dev/null
@@ -1,897 +0,0 @@
-/* $Id: VIScsumcopy.S,v 1.8 2000/02/20 23:21:39 davem Exp $
- * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous
- * copying utilizing the UltraSparc Visual Instruction Set.
- *
- * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
- *
- * Based on older sparc32/sparc64 checksum.S, which is:
- *
- * Copyright(C) 1995 Linus Torvalds
- * Copyright(C) 1995 Miguel de Icaza
- * Copyright(C) 1996,1997 David S. Miller
- * derived from:
- * Linux/Alpha checksum c-code
- * Linux/ix86 inline checksum assembly
- * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
- * David Mosberger-Tang for optimized reference c-code
- * BSD4.4 portable checksum routine
- */
-
-#ifdef __sparc_v9__
-#define STACKOFF 0x7ff+128
-#else
-#define STACKOFF 64
-#endif
-
-#ifdef __KERNEL__
-#include <asm/head.h>
-#include <asm/asi.h>
-#include <asm/page.h>
-#include <asm/visasm.h>
-#include <asm/thread_info.h>
-#define ASI_BLK_XOR 0
-#define ASI_BLK_XOR1 (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
-#define ASI_BLK_OR (ASI_BLK_P & ~ASI_P)
-#else
-#define ASI_P 0x80
-#define ASI_BLK_P 0xf0
-#define FRPS_FEF 0x04
-#define FPRS_DU 0x02
-#define FPRS_DL 0x01
-#define ASI_BLK_XOR (ASI_BLK_P ^ ASI_P)
-#endif
-
-#define src o0
-#define dst o1
-#define len o2
-#define sum o3
-#define x1 g1
-#define x2 g2
-#define x3 o4
-#define x4 g4
-#define x5 g5
-#define x6 g7
-#define x7 g3
-#define x8 o5
-
-/* Good night, SunSoft engineers. Sleep sweetly.
- * This has a couple of tricks in and those
- * tricks are UltraLinux trade secrets :))
- * Once AGAIN, the SunSoft engineers are caught
- * asleep at the keyboard :)).
- * The main loop does about 20 superscalar cycles
- * per 64bytes checksummed/copied.
- */
-
-#define LDBLK(O0) \
- ldda [%src] %asi, %O0 /* Load Group */
-
-#define STBLK \
- stda %f48, [%dst] ASI_BLK_P /* Store */
-
-#define ST(fx,off) \
- std %fx, [%dst + off] /* Store */
-
-#define SYNC \
- membar #Sync
-
-
-#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \
- LOAD /* Load (Group) */; \
- faligndata %A14, %F0, %A14 /* FPA Group */; \
- inc %x5 /* IEU0 */; \
- STORE1 /* Store (optional) */; \
- faligndata %F0, %F2, %A0 /* FPA Group */; \
- srl %x5, 1, %x5 /* IEU0 */; \
- add %sum, %x4, %sum /* IEU1 */; \
- fpadd32 %F0, %f0, %F0 /* FPA Group */; \
- inc %x6 /* IEU0 */; \
- STORE2 /* Store (optional) */; \
- faligndata %F2, %F4, %A2 /* FPA Group */; \
- srl %x6, 1, %x6 /* IEU0 */; \
- add %sum, %x5, %sum /* IEU1 */; \
- fpadd32 %F2, %f2, %F2 /* FPA Group */; \
- add %src, 64, %src /* IEU0 */; \
- fcmpgt32 %f0, %F0, %x1 /* FPM */; \
- add %dst, 64, %dst /* IEU1 Group */; \
- inc %x7 /* IEU0 */; \
- STORE3 /* Store (optional) */; \
- faligndata %F4, %F6, %A4 /* FPA */; \
- fpadd32 %F4, %f4, %F4 /* FPA Group */; \
- add %sum, %x6, %sum /* IEU1 */; \
- fcmpgt32 %f2, %F2, %x2 /* FPM */; \
- srl %x7, 1, %x7 /* IEU0 Group */; \
- inc %x8 /* IEU1 */; \
- STORE4 /* Store (optional) */; \
- faligndata %F6, %F8, %A6 /* FPA */; \
- fpadd32 %F6, %f6, %F6 /* FPA Group */; \
- srl %x8, 1, %x8 /* IEU0 */; \
- fcmpgt32 %f4, %F4, %x3 /* FPM */; \
- add %sum, %x7, %sum /* IEU0 Group */; \
- inc %x1 /* IEU1 */; \
- STORE5 /* Store (optional) */; \
- faligndata %F8, %F10, %A8 /* FPA */; \
- fpadd32 %F8, %f8, %F8 /* FPA Group */; \
- srl %x1, 1, %x1 /* IEU0 */; \
- fcmpgt32 %f6, %F6, %x4 /* FPM */; \
- add %sum, %x8, %sum /* IEU0 Group */; \
- inc %x2 /* IEU1 */; \
- STORE6 /* Store (optional) */; \
- faligndata %F10, %F12, %A10 /* FPA */; \
- fpadd32 %F10, %f10, %F10 /* FPA Group */; \
- srl %x2, 1, %x2 /* IEU0 */; \
- fcmpgt32 %f8, %F8, %x5 /* FPM */; \
- add %sum, %x1, %sum /* IEU0 Group */; \
- inc %x3 /* IEU1 */; \
- STORE7 /* Store (optional) */; \
- faligndata %F12, %F14, %A12 /* FPA */; \
- fpadd32 %F12, %f12, %F12 /* FPA Group */; \
- srl %x3, 1, %x3 /* IEU0 */; \
- fcmpgt32 %f10, %F10, %x6 /* FPM */; \
- add %sum, %x2, %sum /* IEU0 Group */; \
- inc %x4 /* IEU1 */; \
- STORE8 /* Store (optional) */; \
- fmovd %F14, %B14 /* FPA */; \
- fpadd32 %F14, %f14, %F14 /* FPA Group */; \
- srl %x4, 1, %x4 /* IEU0 */; \
- fcmpgt32 %f12, %F12, %x7 /* FPM */; \
- add %sum, %x3, %sum /* IEU0 Group */; \
- subcc %len, 64, %len /* IEU1 */; \
- BRANCH /* CTI */; \
- fcmpgt32 %f14, %F14, %x8 /* FPM Group */;
-
-#define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
- inc %x5 /* IEU0 Group */; \
- fpadd32 %f2, %f0, %S0 /* FPA */; \
- add %sum, %x4, %sum /* IEU1 */; \
- srl %x5, 1, %x5 /* IEU0 Group */; \
- fpadd32 %f6, %f4, %S1 /* FPA */; \
- inc %x6 /* IEU1 */; \
- fpadd32 %f10, %f8, %S2 /* FPA Group */; \
- add %sum, %x5, %sum /* IEU0 */; \
- fcmpgt32 %f0, %S0, %x1 /* FPM */; \
- fpadd32 %f14, %f12, %S3 /* FPA Group */; \
- srl %x6, 1, %x6 /* IEU0 */; \
- fcmpgt32 %f4, %S1, %x2 /* FPM */; \
- add %sum, %x6, %sum /* IEU0 Group */; \
- fzero %fz /* FPA */; \
- fcmpgt32 %f8, %S2, %x3 /* FPM */; \
- inc %x7 /* IEU0 Group */; \
- inc %x8 /* IEU1 */; \
- srl %x7, 1, %x7 /* IEU0 Group */; \
- inc %x1 /* IEU1 */; \
- fpadd32 %S0, %S1, %T0 /* FPA */; \
- fpadd32 %S2, %S3, %T1 /* FPA Group */; \
- add %sum, %x7, %sum /* IEU0 */; \
- fcmpgt32 %f12, %S3, %x4 /* FPM */; \
- srl %x8, 1, %x8 /* IEU0 Group */; \
- inc %x2 /* IEU1 */; \
- srl %x1, 1, %x1 /* IEU0 Group */; \
- add %sum, %x8, %sum /* IEU1 */; \
- add %sum, %x1, %sum /* IEU0 Group */; \
- fcmpgt32 %S0, %T0, %x5 /* FPM */; \
- srl %x2, 1, %x2 /* IEU0 Group */; \
- fcmpgt32 %S2, %T1, %x6 /* FPM */; \
- inc %x3 /* IEU0 Group */; \
- add %sum, %x2, %sum /* IEU1 */; \
- srl %x3, 1, %x3 /* IEU0 Group */; \
- inc %x4 /* IEU1 */; \
- fpadd32 %T0, %T1, %U0 /* FPA Group */; \
- add %sum, %x3, %sum /* IEU0 */; \
- fcmpgt32 %fz, %f2, %x7 /* FPM */; \
- srl %x4, 1, %x4 /* IEU0 Group */; \
- fcmpgt32 %fz, %f6, %x8 /* FPM */; \
- inc %x5 /* IEU0 Group */; \
- add %sum, %x4, %sum /* IEU1 */; \
- srl %x5, 1, %x5 /* IEU0 Group */; \
- fcmpgt32 %fz, %f10, %x1 /* FPM */; \
- inc %x6 /* IEU0 Group */; \
- add %sum, %x5, %sum /* IEU1 */; \
- fmovd %FA, %FB /* FPA Group */; \
- fcmpgt32 %fz, %f14, %x2 /* FPM */; \
- srl %x6, 1, %x6 /* IEU0 Group */; \
- ba,pt %xcc, ett /* CTI */; \
- inc %x7 /* IEU1 */;
-
-#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \
- END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)
-
-#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \
- fpadd32 %U0, %U1, %V0 /* FPA Group */; \
- srl %x7, 1, %x7 /* IEU0 */; \
- add %sum, %x6, %sum /* IEU1 */; \
- std %V0, [%sp + STACKOFF] /* Store Group */; \
- inc %x8 /* IEU0 */; \
- sub %sum, %x7, %sum /* IEU1 */; \
- srl %x8, 1, %x8 /* IEU0 Group */; \
- fcmpgt32 %fz, %S1, %x3 /* FPM */; \
- inc %x1 /* IEU0 Group */; \
- fcmpgt32 %fz, %S3, %x4 /* FPM */; \
- srl %x1, 1, %x1 /* IEU0 Group */; \
- sub %sum, %x8, %sum /* IEU1 */; \
- ldx [%sp + STACKOFF], %x8 /* Load Group */; \
- inc %x2 /* IEU0 */; \
- sub %sum, %x1, %sum /* IEU1 */; \
- srl %x2, 1, %x2 /* IEU0 Group */; \
- fcmpgt32 %fz, %T1, %x5 /* FPM */; \
- inc %x3 /* IEU0 Group */; \
- fcmpgt32 %T0, %U0, %x6 /* FPM */; \
- srl %x3, 1, %x3 /* IEU0 Group */; \
- sub %sum, %x2, %sum /* IEU1 */; \
- inc %x4 /* IEU0 Group */; \
- sub %sum, %x3, %sum /* IEU1 */; \
- srl %x4, 1, %x4 /* IEU0 Group */; \
- fcmpgt32 %fz, %U1, %x7 /* FPM */; \
- inc %x5 /* IEU0 Group */; \
- fcmpgt32 %U0, %V0, %x1 /* FPM */; \
- srl %x5, 1, %x5 /* IEU0 Group */; \
- sub %sum, %x4, %sum /* IEU1 */; \
- sub %sum, %x5, %sum /* IEU0 Group */; \
- fcmpgt32 %fz, %V0, %x2 /* FPM */; \
- inc %x6 /* IEU0 Group */; \
- inc %x7 /* IEU1 */; \
- srl %x6, 1, %x6 /* IEU0 Group */; \
- inc %x1 /* IEU1 */; \
- srl %x7, 1, %x7 /* IEU0 Group */; \
- add %sum, %x6, %sum /* IEU1 */; \
- srl %x1, 1, %x1 /* IEU0 Group */; \
- sub %sum, %x7, %sum /* IEU1 */; \
- inc %x2 /* IEU0 Group */; \
- add %sum, %x1, %sum /* IEU1 */; \
- srl %x2, 1, %x2 /* IEU0 Group */; \
- sub %sum, %x2, %sum /* IEU0 Group */; \
- addcc %sum, %x8, %sum /* IEU1 Group */; \
- bcs,a,pn %xcc, 33f /* CTI */; \
- add %sum, 1, %sum /* IEU0 (Group) */; \
-33: /* That's it */;
-
- .text
- .globl csum_partial_copy_vis
- .align 32
-/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp.
- * csum_partial_copy_from_user
- * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256
- */
-csum_partial_copy_vis:
- andcc %dst, 7, %g0 /* IEU1 Group */
- be,pt %icc, 4f /* CTI */
- and %dst, 0x38, %o4 /* IEU0 */
- mov 1, %g5 /* IEU0 Group */
- andcc %dst, 2, %g0 /* IEU1 */
- be,pt %icc, 1f /* CTI */
- and %dst, 4, %g7 /* IEU0 Group */
- lduha [%src] %asi, %g2 /* Load */
- sub %len, 2, %len /* IEU0 Group */
- add %dst, 2, %dst /* IEU1 */
- andcc %dst, 4, %g7 /* IEU1 Group */
- sll %g5, 16, %g5 /* IEU0 */
- sth %g2, [%dst - 2] /* Store Group */
- sll %g2, 16, %g2 /* IEU0 */
- add %src, 2, %src /* IEU1 */
- addcc %g2, %sum, %sum /* IEU1 Group */
- bcs,a,pn %icc, 1f /* CTI */
- add %sum, %g5, %sum /* IEU0 */
-1: lduwa [%src] %asi, %g2 /* Load */
- brz,a,pn %g7, 4f /* CTI+IEU1 Group */
- and %dst, 0x38, %o4 /* IEU0 */
- add %dst, 4, %dst /* IEU0 Group */
- sub %len, 4, %len /* IEU1 */
- addcc %g2, %sum, %sum /* IEU1 Group */
- bcs,a,pn %icc, 1f /* CTI */
- add %sum, 1, %sum /* IEU0 */
-1: and %dst, 0x38, %o4 /* IEU0 Group */
- stw %g2, [%dst - 4] /* Store */
- add %src, 4, %src /* IEU1 */
-4:
-#ifdef __KERNEL__
- VISEntry
-#endif
- mov %src, %g7 /* IEU1 Group */
- fzero %f48 /* FPA */
- alignaddr %src, %g0, %src /* Single Group */
- subcc %g7, %src, %g7 /* IEU1 Group */
- be,pt %xcc, 1f /* CTI */
- mov 0x40, %g1 /* IEU0 */
- lduwa [%src] %asi, %g2 /* Load Group */
- subcc %sum, %g2, %sum /* IEU1 Group+load stall*/
- bcs,a,pn %icc, 1f /* CTI */
- sub %sum, 1, %sum /* IEU0 */
-1: srl %sum, 0, %sum /* IEU0 Group */
- clr %g5 /* IEU1 */
- brz,pn %o4, 3f /* CTI+IEU1 Group */
- sub %g1, %o4, %g1 /* IEU0 */
- ldda [%src] %asi, %f0 /* Load */
- clr %o4 /* IEU0 Group */
- andcc %dst, 8, %g0 /* IEU1 */
- be,pn %icc, 1f /* CTI */
- ldda [%src + 8] %asi, %f2 /* Load Group */
- add %src, 8, %src /* IEU0 */
- sub %len, 8, %len /* IEU1 */
- fpadd32 %f0, %f48, %f50 /* FPA */
- addcc %dst, 8, %dst /* IEU1 Group */
- faligndata %f0, %f2, %f16 /* FPA */
- fcmpgt32 %f48, %f50, %o4 /* FPM Group */
- fmovd %f2, %f0 /* FPA Group */
- ldda [%src + 8] %asi, %f2 /* Load */
- std %f16, [%dst - 8] /* Store */
- fmovd %f50, %f48 /* FPA */
-1: andcc %g1, 0x10, %g0 /* IEU1 Group */
- be,pn %icc, 1f /* CTI */
- and %g1, 0x20, %g1 /* IEU0 */
- fpadd32 %f0, %f48, %f50 /* FPA */
- ldda [%src + 16] %asi, %f4 /* Load Group */
- add %src, 16, %src /* IEU0 */
- add %dst, 16, %dst /* IEU1 */
- faligndata %f0, %f2, %f16 /* FPA */
- fcmpgt32 %f48, %f50, %g5 /* FPM Group */
- sub %len, 16, %len /* IEU0 */
- inc %o4 /* IEU1 */
- std %f16, [%dst - 16] /* Store Group */
- fpadd32 %f2, %f50, %f48 /* FPA */
- srl %o4, 1, %o5 /* IEU0 */
- faligndata %f2, %f4, %f18 /* FPA Group */
- std %f18, [%dst - 8] /* Store */
- fcmpgt32 %f50, %f48, %o4 /* FPM Group */
- add %o5, %sum, %sum /* IEU0 */
- ldda [%src + 8] %asi, %f2 /* Load */
- fmovd %f4, %f0 /* FPA */
-1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */
- rd %asi, %g2 /* LSU Group + 4 bubbles*/
- inc %g5 /* IEU0 */
- fpadd32 %f0, %f48, %f50 /* FPA */
- ldda [%src + 16] %asi, %f4 /* Load Group */
- srl %g5, 1, %g5 /* IEU0 */
- add %dst, 32, %dst /* IEU1 */
- faligndata %f0, %f2, %f16 /* FPA */
- fcmpgt32 %f48, %f50, %o5 /* FPM Group */
- inc %o4 /* IEU0 */
- ldda [%src + 24] %asi, %f6 /* Load */
- srl %o4, 1, %o4 /* IEU0 Group */
- add %g5, %sum, %sum /* IEU1 */
- ldda [%src + 32] %asi, %f8 /* Load */
- fpadd32 %f2, %f50, %f48 /* FPA */
- faligndata %f2, %f4, %f18 /* FPA Group */
- sub %len, 32, %len /* IEU0 */
- std %f16, [%dst - 32] /* Store */
- fcmpgt32 %f50, %f48, %g3 /* FPM Group */
- inc %o5 /* IEU0 */
- add %o4, %sum, %sum /* IEU1 */
- fpadd32 %f4, %f48, %f50 /* FPA */
- faligndata %f4, %f6, %f20 /* FPA Group */
- srl %o5, 1, %o5 /* IEU0 */
- fcmpgt32 %f48, %f50, %g5 /* FPM Group */
- add %o5, %sum, %sum /* IEU0 */
- std %f18, [%dst - 24] /* Store */
- fpadd32 %f6, %f50, %f48 /* FPA */
- inc %g3 /* IEU0 Group */
- std %f20, [%dst - 16] /* Store */
- add %src, 32, %src /* IEU1 */
- faligndata %f6, %f8, %f22 /* FPA */
- fcmpgt32 %f50, %f48, %o4 /* FPM Group */
- srl %g3, 1, %g3 /* IEU0 */
- std %f22, [%dst - 8] /* Store */
- add %g3, %sum, %sum /* IEU0 Group */
-3: rd %asi, %g2 /* LSU Group + 4 bubbles*/
-#ifdef __KERNEL__
-4: sethi %hi(vis0s), %g7 /* IEU0 Group */
- or %g2, ASI_BLK_OR, %g2 /* IEU1 */
-#else
-4: rd %pc, %g7 /* LSU Group + 4 bubbles*/
-#endif
- inc %g5 /* IEU0 Group */
- and %src, 0x38, %g3 /* IEU1 */
- membar #StoreLoad /* LSU Group */
- srl %g5, 1, %g5 /* IEU0 */
- inc %o4 /* IEU1 */
- sll %g3, 8, %g3 /* IEU0 Group */
- sub %len, 0xc0, %len /* IEU1 */
- addcc %g5, %sum, %sum /* IEU1 Group */
- srl %o4, 1, %o4 /* IEU0 */
- add %g7, %g3, %g7 /* IEU0 Group */
- add %o4, %sum, %sum /* IEU1 */
-#ifdef __KERNEL__
- jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */
-#else
- jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */
-#endif
- fzero %f32 /* FPA */
-
- .align 2048
-vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- add %src, 128, %src /* IEU0 Group */
- ldda [%src-128] %asi, %f0 /* Load Group */
- ldda [%src-64] %asi, %f16 /* Load Group */
- fmovd %f48, %f62 /* FPA Group f0 available*/
- faligndata %f0, %f2, %f48 /* FPA Group f2 available*/
- fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available*/
- fpadd32 %f0, %f62, %f0 /* FPA */
- fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available*/
- faligndata %f2, %f4, %f50 /* FPA */
- fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available*/
- faligndata %f4, %f6, %f52 /* FPA */
- fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available*/
- inc %x1 /* IEU0 */
- faligndata %f6, %f8, %f54 /* FPA */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available*/
- srl %x1, 1, %x1 /* IEU0 */
- inc %x2 /* IEU1 */
- faligndata %f8, %f10, %f56 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available*/
- srl %x2, 1, %x2 /* IEU0 */
- add %sum, %x1, %sum /* IEU1 */
- faligndata %f10, %f12, %f58 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- inc %x3 /* IEU0 */
- add %sum, %x2, %sum /* IEU1 */
- faligndata %f12, %f14, %f60 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- srl %x3, 1, %x3 /* IEU0 */
- inc %x4 /* IEU1 */
- fmovd %f14, %f62 /* FPA */
- srl %x4, 1, %x4 /* IEU0 Group */
- add %sum, %x3, %sum /* IEU1 */
-vis0: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
- ,LDBLK(f32), STBLK,,,,,,,,
- ,bcs,pn %icc, vis0e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
- ,LDBLK(f0), STBLK,,,,,,,,
- ,bcs,pn %icc, vis0e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
- ,LDBLK(f16), STBLK,,,,,,,,
- ,bcc,pt %icc, vis0)
-vis0e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f32,
- ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
- ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
-vis0e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f0,
- ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
- ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
-vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f16,
- ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
- ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
- .align 2048
-vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- add %src, 128 - 8, %src /* IEU0 Group */
- ldda [%src-128] %asi, %f0 /* Load Group */
- ldda [%src-64] %asi, %f16 /* Load Group */
- fmovd %f0, %f58 /* FPA Group */
- fmovd %f48, %f0 /* FPA Group */
- fcmpgt32 %f32, %f2, %x2 /* FPM Group */
- faligndata %f2, %f4, %f48 /* FPA */
- fcmpgt32 %f32, %f4, %x3 /* FPM Group */
- faligndata %f4, %f6, %f50 /* FPA */
- fcmpgt32 %f32, %f6, %x4 /* FPM Group */
- faligndata %f6, %f8, %f52 /* FPA */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group */
- inc %x2 /* IEU1 */
- faligndata %f8, %f10, %f54 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- srl %x2, 1, %x2 /* IEU0 */
- faligndata %f10, %f12, %f56 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- inc %x3 /* IEU0 */
- add %sum, %x2, %sum /* IEU1 */
- faligndata %f12, %f14, %f58 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- srl %x3, 1, %x3 /* IEU0 */
- inc %x4 /* IEU1 */
- fmovd %f14, %f60 /* FPA */
- srl %x4, 1, %x4 /* IEU0 Group */
- add %sum, %x3, %sum /* IEU1 */
-vis1: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
- ,LDBLK(f32), ,STBLK,,,,,,,
- ,bcs,pn %icc, vis1e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
- ,LDBLK(f0), ,STBLK,,,,,,,
- ,bcs,pn %icc, vis1e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
- ,LDBLK(f16), ,STBLK,,,,,,,
- ,bcc,pt %icc, vis1)
-vis1e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f32,
- ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
- ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
-vis1e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f0,
- ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
- ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
-vis1e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f16,
- ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
- ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
- .align 2048
-vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- add %src, 128 - 16, %src /* IEU0 Group */
- ldda [%src-128] %asi, %f0 /* Load Group */
- ldda [%src-64] %asi, %f16 /* Load Group */
- fmovd %f0, %f56 /* FPA Group */
- fmovd %f48, %f0 /* FPA Group */
- sub %dst, 64, %dst /* IEU0 */
- fpsub32 %f2, %f2, %f2 /* FPA Group */
- fcmpgt32 %f32, %f4, %x3 /* FPM Group */
- faligndata %f4, %f6, %f48 /* FPA */
- fcmpgt32 %f32, %f6, %x4 /* FPM Group */
- faligndata %f6, %f8, %f50 /* FPA */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group */
- faligndata %f8, %f10, %f52 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- faligndata %f10, %f12, %f54 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- inc %x3 /* IEU0 */
- faligndata %f12, %f14, %f56 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- srl %x3, 1, %x3 /* IEU0 */
- inc %x4 /* IEU1 */
- fmovd %f14, %f58 /* FPA */
- srl %x4, 1, %x4 /* IEU0 Group */
- add %sum, %x3, %sum /* IEU1 */
-vis2: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
- ,LDBLK(f32), ,,STBLK,,,,,,
- ,bcs,pn %icc, vis2e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
- ,LDBLK(f0), ,,STBLK,,,,,,
- ,bcs,pn %icc, vis2e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
- ,LDBLK(f16), ,,STBLK,,,,,,
- ,bcc,pt %icc, vis2)
-vis2e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f32,
- ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
- ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
-vis2e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f0,
- ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
- ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
-vis2e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f16,
- ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
- ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
- .align 2048
-vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- add %src, 128 - 24, %src /* IEU0 Group */
- ldda [%src-128] %asi, %f0 /* Load Group */
- ldda [%src-64] %asi, %f16 /* Load Group */
- fmovd %f0, %f54 /* FPA Group */
- fmovd %f48, %f0 /* FPA Group */
- sub %dst, 64, %dst /* IEU0 */
- fpsub32 %f2, %f2, %f2 /* FPA Group */
- fpsub32 %f4, %f4, %f4 /* FPA Group */
- fcmpgt32 %f32, %f6, %x4 /* FPM Group */
- faligndata %f6, %f8, %f48 /* FPA */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group */
- faligndata %f8, %f10, %f50 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- faligndata %f10, %f12, %f52 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- faligndata %f12, %f14, %f54 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- fmovd %f14, %f56 /* FPA */
- inc %x4 /* IEU0 */
- srl %x4, 1, %x4 /* IEU0 Group */
-vis3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
- ,LDBLK(f32), ,,,STBLK,,,,,
- ,bcs,pn %icc, vis3e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
- ,LDBLK(f0), ,,,STBLK,,,,,
- ,bcs,pn %icc, vis3e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
- ,LDBLK(f16), ,,,STBLK,,,,,
- ,bcc,pt %icc, vis3)
-vis3e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f32,
- ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
- ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
-vis3e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f0,
- ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
- ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
-vis3e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f16,
- ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
- ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
- .align 2048
-vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- add %src, 128 - 32, %src /* IEU0 Group */
- ldda [%src-128] %asi, %f0 /* Load Group */
- ldda [%src-64] %asi, %f16 /* Load Group */
- fmovd %f0, %f52 /* FPA Group */
- fmovd %f48, %f0 /* FPA Group */
- sub %dst, 64, %dst /* IEU0 */
- fpsub32 %f2, %f2, %f2 /* FPA Group */
- fpsub32 %f4, %f4, %f4 /* FPA Group */
- fpsub32 %f6, %f6, %f6 /* FPA Group */
- clr %x4 /* IEU0 */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group */
- faligndata %f8, %f10, %f48 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- faligndata %f10, %f12, %f50 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- faligndata %f12, %f14, %f52 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- fmovd %f14, %f54 /* FPA */
-vis4: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
- ,LDBLK(f32), ,,,,STBLK,,,,
- ,bcs,pn %icc, vis4e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
- ,LDBLK(f0), ,,,,STBLK,,,,
- ,bcs,pn %icc, vis4e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
- ,LDBLK(f16), ,,,,STBLK,,,,
- ,bcc,pt %icc, vis4)
-vis4e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f32,
- ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
- ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
-vis4e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f0,
- ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
- ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
-vis4e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f16,
- ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
- ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
- .align 2048
-vis5s: add %src, 128 - 40, %src /* IEU0 Group */
- ldda [%src-88] %asi, %f10 /* Load Group */
- ldda [%src-80] %asi, %f12 /* Load Group */
- ldda [%src-72] %asi, %f14 /* Load Group */
- wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- ldda [%src-64] %asi, %f16 /* Load Group */
- fmovd %f48, %f0 /* FPA Group */
- fmuld %f32, %f32, %f2 /* FPM */
- clr %x4 /* IEU0 */
- faddd %f32, %f32, %f4 /* FPA Group */
- fmuld %f32, %f32, %f6 /* FPM */
- clr %x5 /* IEU0 */
- faddd %f32, %f32, %f8 /* FPA Group */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- sub %dst, 64, %dst /* IEU0 */
- faligndata %f10, %f12, %f48 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- faligndata %f12, %f14, %f50 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- fmovd %f14, %f52 /* FPA */
-vis5: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
- ,LDBLK(f32), ,,,,,STBLK,,,
- ,bcs,pn %icc, vis5e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
- ,LDBLK(f0), ,,,,,STBLK,,,
- ,bcs,pn %icc, vis5e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
- ,LDBLK(f16), ,,,,,STBLK,,,
- ,bcc,pt %icc, vis5)
-vis5e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f32,
- ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72),
- ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
-vis5e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f0,
- ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72),
- ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
-vis5e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f16,
- ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72),
- ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
- .align 2048
-vis6s: add %src, 128 - 48, %src /* IEU0 Group */
- ldda [%src-80] %asi, %f12 /* Load Group */
- ldda [%src-72] %asi, %f14 /* Load Group */
- wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- ldda [%src-64] %asi, %f16 /* Load Group */
- fmovd %f48, %f0 /* FPA Group */
- fmuld %f32, %f32, %f2 /* FPM */
- clr %x4 /* IEU0 */
- faddd %f32, %f32, %f4 /* FPA Group */
- fmuld %f32, %f32, %f6 /* FPM */
- clr %x5 /* IEU0 */
- faddd %f32, %f32, %f8 /* FPA Group */
- fmuld %f32, %f32, %f10 /* FPM */
- clr %x6 /* IEU0 */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- sub %dst, 64, %dst /* IEU0 */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- faligndata %f12, %f14, %f48 /* FPA */
- fmovd %f14, %f50 /* FPA Group */
-vis6: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
- ,LDBLK(f32), ,,,,,,STBLK,,
- ,bcs,pn %icc, vis6e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
- ,LDBLK(f0), ,,,,,,STBLK,,
- ,bcs,pn %icc, vis6e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
- ,LDBLK(f16), ,,,,,,STBLK,,
- ,bcc,pt %icc, vis6)
-vis6e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f32,
- ,SYNC, ,,,,,,STBLK,ST(f48,64),
- ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
-vis6e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f0,
- ,SYNC, ,,,,,,STBLK,ST(f48,64),
- ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
-vis6e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f16,
- ,SYNC, ,,,,,,STBLK,ST(f48,64),
- ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
- .align 2048
-vis7s: add %src, 128 - 56, %src /* IEU0 Group */
- ldda [%src-72] %asi, %f14 /* Load Group */
- wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- ldda [%src-64] %asi, %f16 /* Load Group */
- fmovd %f48, %f0 /* FPA Group */
- fmuld %f32, %f32, %f2 /* FPM */
- clr %x4 /* IEU0 */
- faddd %f32, %f32, %f4 /* FPA Group */
- fmuld %f32, %f32, %f6 /* FPM */
- clr %x5 /* IEU0 */
- faddd %f32, %f32, %f8 /* FPA Group */
- fmuld %f32, %f32, %f10 /* FPM */
- clr %x6 /* IEU0 */
- faddd %f32, %f32, %f12 /* FPA Group */
- clr %x7 /* IEU0 */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- sub %dst, 64, %dst /* IEU0 */
- fmovd %f14, %f48 /* FPA */
-vis7: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
- ,LDBLK(f32), ,,,,,,,STBLK,
- ,bcs,pn %icc, vis7e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
- ,LDBLK(f0), ,,,,,,,STBLK,
- ,bcs,pn %icc, vis7e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
- ,LDBLK(f16), ,,,,,,,STBLK,
- ,bcc,pt %icc, vis7)
-vis7e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f32,
- ,SYNC, ,,,,,,,STBLK,
- ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
-vis7e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f0,
- ,SYNC, ,,,,,,,STBLK,
- ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
-vis7e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f16,
- ,SYNC, ,,,,,,,STBLK,
- ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
-e1: END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
-e2: END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
-e3: END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
-ett: rd %asi, %x4 /* LSU Group+4bubbles */
- rd %gsr, %x3 /* LSU Group+4bubbles */
-#ifdef __KERNEL__
- srl %x4, 3, %x5 /* IEU0 Group */
- xor %x4, ASI_BLK_XOR1, %x4 /* IEU1 */
- wr %x4, %x5, %asi /* LSU Group+4bubbles */
-#else
- wr %x4, ASI_BLK_XOR, %asi /* LSU Group+4bubbles */
-#endif
- andcc %x3, 7, %x3 /* IEU1 Group */
- add %dst, 8, %dst /* IEU0 */
- bne,pn %icc, 1f /* CTI */
- fzero %f10 /* FPA */
- brz,a,pn %len, 2f /* CTI+IEU1 Group */
- std %f6, [%dst - 8] /* Store */
-1: cmp %len, 8 /* IEU1 */
- blu,pn %icc, 3f /* CTI */
- sub %src, 64, %src /* IEU0 Group */
-1: ldda [%src] %asi, %f2 /* Load Group */
- fpadd32 %f10, %f2, %f12 /* FPA Group+load stall*/
- add %src, 8, %src /* IEU0 */
- add %dst, 8, %dst /* IEU1 */
- faligndata %f6, %f2, %f14 /* FPA Group */
- fcmpgt32 %f10, %f12, %x5 /* FPM Group */
- std %f14, [%dst - 16] /* Store */
- fmovd %f2, %f6 /* FPA */
- fmovd %f12, %f10 /* FPA Group */
- sub %len, 8, %len /* IEU1 */
- fzero %f16 /* FPA Group - FPU nop */
- fzero %f18 /* FPA Group - FPU nop */
- inc %x5 /* IEU0 */
- srl %x5, 1, %x5 /* IEU0 Group (regdep) */
- cmp %len, 8 /* IEU1 */
- bgeu,pt %icc, 1b /* CTI */
- add %x5, %sum, %sum /* IEU0 Group */
-3: brz,a,pt %x3, 2f /* CTI+IEU1 */
- std %f6, [%dst - 8] /* Store Group */
- st %f7, [%dst - 8] /* Store Group */
- sub %dst, 4, %dst /* IEU0 */
- add %len, 4, %len /* IEU1 */
-2:
-#ifdef __KERNEL__
- sub %sp, 8, %sp /* IEU0 Group */
-#endif
- END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
- membar #Sync /* LSU Group */
-#ifdef __KERNEL__
- VISExit
- add %sp, 8, %sp /* IEU0 Group */
-#endif
-23: brnz,pn %len, 26f /* CTI+IEU1 Group */
-24: sllx %sum, 32, %g1 /* IEU0 */
-25: addcc %sum, %g1, %src /* IEU1 Group */
- srlx %src, 32, %src /* IEU0 Group (regdep) */
- bcs,a,pn %xcc, 1f /* CTI */
- add %src, 1, %src /* IEU1 */
-#ifndef __KERNEL__
-1: retl /* CTI Group brk forced*/
- srl %src, 0, %src /* IEU0 */
-#else
-1: retl /* CTI Group brk forced*/
- ldx [%g6 + TI_TASK], %g4 /* Load */
-#endif
-26: andcc %len, 8, %g0 /* IEU1 Group */
- be,pn %icc, 1f /* CTI */
- lduwa [%src] %asi, %o4 /* Load */
- lduwa [%src+4] %asi, %g2 /* Load Group */
- add %src, 8, %src /* IEU0 */
- add %dst, 8, %dst /* IEU1 */
- sllx %o4, 32, %g5 /* IEU0 Group */
- stw %o4, [%dst - 8] /* Store */
- or %g5, %g2, %g5 /* IEU0 Group */
- stw %g2, [%dst - 4] /* Store */
- addcc %g5, %sum, %sum /* IEU1 Group */
- bcs,a,pn %xcc, 1f /* CTI */
- add %sum, 1, %sum /* IEU0 */
-1: andcc %len, 4, %g0 /* IEU1 Group */
- be,a,pn %icc, 1f /* CTI */
- clr %g2 /* IEU0 */
- lduwa [%src] %asi, %g7 /* Load */
- add %src, 4, %src /* IEU0 Group */
- add %dst, 4, %dst /* IEU1 */
- sllx %g7, 32, %g2 /* IEU0 Group */
- stw %g7, [%dst - 4] /* Store */
-1: andcc %len, 2, %g0 /* IEU1 */
- be,a,pn %icc, 1f /* CTI */
- clr %g3 /* IEU0 Group */
- lduha [%src] %asi, %g7 /* Load */
- add %src, 2, %src /* IEU1 */
- add %dst, 2, %dst /* IEU0 Group */
- sll %g7, 16, %g3 /* IEU0 Group */
- sth %g7, [%dst - 2] /* Store */
-1: andcc %len, 1, %g0 /* IEU1 */
- be,a,pn %icc, 1f /* CTI */
- clr %o5 /* IEU0 Group */
- lduba [%src] %asi, %g7 /* Load */
- sll %g7, 8, %o5 /* IEU0 Group */
- stb %g7, [%dst] /* Store */
-1: or %g2, %g3, %g3 /* IEU1 */
- or %o5, %g3, %g3 /* IEU0 Group (regdep) */
- addcc %g3, %sum, %sum /* IEU1 Group (regdep) */
- bcs,a,pn %xcc, 1f /* CTI */
- add %sum, 1, %sum /* IEU0 */
-1: ba,pt %xcc, 25b /* CTI Group */
- sllx %sum, 32, %g1 /* IEU0 */
-
-#ifdef __KERNEL__
-end:
-
- .section __ex_table
- .align 4
- .word csum_partial_copy_vis, 0, end, cpc_handler
-#endif
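
The exit path of the routine above (the code at labels 24/25) folds the 64-bit running sum back down to 32 bits with an end-around carry before returning. As a rough, unoptimized C sketch of that folding and of the ones'-complement sum these routines accumulate (an illustration assuming big-endian byte order, not the kernel's csum_partial_copy code; the function names are made up):

#include <stddef.h>
#include <stdint.h>

/*
 * Unoptimized sketch of the RFC 1071 ones'-complement sum the removed
 * VIS routines accumulate, plus the final fold from a 64-bit
 * accumulator down to 32 bits with end-around carry.  Illustration
 * only; csum_fold32() and csum_partial_sketch() are made-up names.
 */
uint32_t csum_fold32(uint64_t sum)
{
        sum = (sum & 0xffffffffULL) + (sum >> 32);
        sum = (sum & 0xffffffffULL) + (sum >> 32);  /* absorb the carry */
        return (uint32_t)sum;
}

uint32_t csum_partial_sketch(const uint8_t *buf, size_t len, uint32_t sum)
{
        uint64_t acc = sum;

        while (len >= 2) {
                acc += (uint32_t)((buf[0] << 8) | buf[1]);  /* big-endian 16-bit words */
                buf += 2;
                len -= 2;
        }
        if (len)                                /* trailing odd byte */
                acc += (uint32_t)buf[0] << 8;

        return csum_fold32(acc);
}
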
diff --git a/arch/sparc64/lib/VIScsumcopyusr.S b/arch/sparc64/lib/VIScsumcopyusr.S
deleted file mode 100644
index fc27b7fa4117e..0000000000000
--- a/arch/sparc64/lib/VIScsumcopyusr.S
+++ /dev/null
@@ -1,916 +0,0 @@
-/* $Id: VIScsumcopyusr.S,v 1.2 2000/02/20 23:21:40 davem Exp $
- * VIScsumcopyusr.S: High bandwidth IP checksumming with simultaneous
- * copying utilizing the UltraSparc Visual Instruction Set.
- *
- * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
- * Copyright (C) 2000 David S. Miller (davem@redhat.com)
- *
- * Based on older sparc32/sparc64 checksum.S, which is:
- *
- * Copyright(C) 1995 Linus Torvalds
- * Copyright(C) 1995 Miguel de Icaza
- * Copyright(C) 1996,1997 David S. Miller
- * derived from:
- * Linux/Alpha checksum c-code
- * Linux/ix86 inline checksum assembly
- * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
- * David Mosberger-Tang for optimized reference c-code
- * BSD4.4 portable checksum routine
- */
-
-#ifdef __sparc_v9__
-#define STACKOFF 0x7ff+128
-#else
-#define STACKOFF 64
-#endif
-
-#ifdef __KERNEL__
-#include <asm/head.h>
-#include <asm/asi.h>
-#include <asm/page.h>
-#include <asm/visasm.h>
-#include <asm/thread_info.h>
-#define ASI_BLK_XOR 0
-#define ASI_BLK_XOR1 (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
-#define ASI_BLK_OR (ASI_BLK_P & ~ASI_P)
-#else
-#define ASI_P 0x80
-#define ASI_BLK_P 0xf0
-#define FRPS_FEF 0x04
-#define FPRS_DU 0x02
-#define FPRS_DL 0x01
-#define ASI_BLK_XOR (ASI_BLK_P ^ ASI_P)
-#endif
-
-#define src o0
-#define dst o1
-#define len o2
-#define sum o3
-#define x1 g1
-#define x2 g2
-#define x3 o4
-#define x4 g4
-#define x5 g5
-#define x6 g7
-#define x7 g3
-#define x8 o5
-
-/* Dobrou noc, SunSoft engineers. Spete sladce.
- * This has a couple of tricks in and those
- * tricks are UltraLinux trade secrets :))
- * Once AGAIN, the SunSoft engineers are caught
- * asleep at the keyboard :)).
- * The main loop does about 20 superscalar cycles
- * per 64bytes checksummed/copied.
- */
-
-#define LDBLK(O0) \
- ldda [%src] ASI_BLK_P, %O0 /* Load Group */
-
-#define STBLK \
- stda %f48, [%dst] %asi /* Store */
-
-#ifdef __KERNEL__
-#define STBLK_XORASI(tmpreg1,tmpreg2) \
- stda %f48, [%dst] %asi /* Store */; \
- rd %asi, %tmpreg1; \
- srl %tmpreg1, 3, %tmpreg2; \
- xor %tmpreg1, ASI_BLK_XOR1, %tmpreg1; \
- wr %tmpreg1, %tmpreg2, %asi;
-#else
-#define STBLK_XORASI(tmpreg1,tmpreg2) \
- stda %f48, [%dst] %asi /* Store */; \
- rd %asi, %tmpreg1; \
- wr %tmpreg1, ASI_BLK_XOR, %asi;
-#endif
-
-#define ST(fx,off) \
- stda %fx, [%dst + off] %asi /* Store */
-
-#define SYNC \
- membar #Sync
-
-
-#define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \
- LOAD /* Load (Group) */; \
- faligndata %A14, %F0, %A14 /* FPA Group */; \
- inc %x5 /* IEU0 */; \
- STORE1 /* Store (optional) */; \
- faligndata %F0, %F2, %A0 /* FPA Group */; \
- srl %x5, 1, %x5 /* IEU0 */; \
- add %sum, %x4, %sum /* IEU1 */; \
- fpadd32 %F0, %f0, %F0 /* FPA Group */; \
- inc %x6 /* IEU0 */; \
- STORE2 /* Store (optional) */; \
- faligndata %F2, %F4, %A2 /* FPA Group */; \
- srl %x6, 1, %x6 /* IEU0 */; \
- add %sum, %x5, %sum /* IEU1 */; \
- fpadd32 %F2, %f2, %F2 /* FPA Group */; \
- add %src, 64, %src /* IEU0 */; \
- fcmpgt32 %f0, %F0, %x1 /* FPM */; \
- add %dst, 64, %dst /* IEU1 Group */; \
- inc %x7 /* IEU0 */; \
- STORE3 /* Store (optional) */; \
- faligndata %F4, %F6, %A4 /* FPA */; \
- fpadd32 %F4, %f4, %F4 /* FPA Group */; \
- add %sum, %x6, %sum /* IEU1 */; \
- fcmpgt32 %f2, %F2, %x2 /* FPM */; \
- srl %x7, 1, %x7 /* IEU0 Group */; \
- inc %x8 /* IEU1 */; \
- STORE4 /* Store (optional) */; \
- faligndata %F6, %F8, %A6 /* FPA */; \
- fpadd32 %F6, %f6, %F6 /* FPA Group */; \
- srl %x8, 1, %x8 /* IEU0 */; \
- fcmpgt32 %f4, %F4, %x3 /* FPM */; \
- add %sum, %x7, %sum /* IEU0 Group */; \
- inc %x1 /* IEU1 */; \
- STORE5 /* Store (optional) */; \
- faligndata %F8, %F10, %A8 /* FPA */; \
- fpadd32 %F8, %f8, %F8 /* FPA Group */; \
- srl %x1, 1, %x1 /* IEU0 */; \
- fcmpgt32 %f6, %F6, %x4 /* FPM */; \
- add %sum, %x8, %sum /* IEU0 Group */; \
- inc %x2 /* IEU1 */; \
- STORE6 /* Store (optional) */; \
- faligndata %F10, %F12, %A10 /* FPA */; \
- fpadd32 %F10, %f10, %F10 /* FPA Group */; \
- srl %x2, 1, %x2 /* IEU0 */; \
- fcmpgt32 %f8, %F8, %x5 /* FPM */; \
- add %sum, %x1, %sum /* IEU0 Group */; \
- inc %x3 /* IEU1 */; \
- STORE7 /* Store (optional) */; \
- faligndata %F12, %F14, %A12 /* FPA */; \
- fpadd32 %F12, %f12, %F12 /* FPA Group */; \
- srl %x3, 1, %x3 /* IEU0 */; \
- fcmpgt32 %f10, %F10, %x6 /* FPM */; \
- add %sum, %x2, %sum /* IEU0 Group */; \
- inc %x4 /* IEU1 */; \
- STORE8 /* Store (optional) */; \
- fmovd %F14, %B14 /* FPA */; \
- fpadd32 %F14, %f14, %F14 /* FPA Group */; \
- srl %x4, 1, %x4 /* IEU0 */; \
- fcmpgt32 %f12, %F12, %x7 /* FPM */; \
- add %sum, %x3, %sum /* IEU0 Group */; \
- subcc %len, 64, %len /* IEU1 */; \
- BRANCH /* CTI */; \
- fcmpgt32 %f14, %F14, %x8 /* FPM Group */;
-
-#define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
- inc %x5 /* IEU0 Group */; \
- fpadd32 %f2, %f0, %S0 /* FPA */; \
- add %sum, %x4, %sum /* IEU1 */; \
- srl %x5, 1, %x5 /* IEU0 Group */; \
- fpadd32 %f6, %f4, %S1 /* FPA */; \
- inc %x6 /* IEU1 */; \
- fpadd32 %f10, %f8, %S2 /* FPA Group */; \
- add %sum, %x5, %sum /* IEU0 */; \
- fcmpgt32 %f0, %S0, %x1 /* FPM */; \
- fpadd32 %f14, %f12, %S3 /* FPA Group */; \
- srl %x6, 1, %x6 /* IEU0 */; \
- fcmpgt32 %f4, %S1, %x2 /* FPM */; \
- add %sum, %x6, %sum /* IEU0 Group */; \
- fzero %fz /* FPA */; \
- fcmpgt32 %f8, %S2, %x3 /* FPM */; \
- inc %x7 /* IEU0 Group */; \
- inc %x8 /* IEU1 */; \
- srl %x7, 1, %x7 /* IEU0 Group */; \
- inc %x1 /* IEU1 */; \
- fpadd32 %S0, %S1, %T0 /* FPA */; \
- fpadd32 %S2, %S3, %T1 /* FPA Group */; \
- add %sum, %x7, %sum /* IEU0 */; \
- fcmpgt32 %f12, %S3, %x4 /* FPM */; \
- srl %x8, 1, %x8 /* IEU0 Group */; \
- inc %x2 /* IEU1 */; \
- srl %x1, 1, %x1 /* IEU0 Group */; \
- add %sum, %x8, %sum /* IEU1 */; \
- add %sum, %x1, %sum /* IEU0 Group */; \
- fcmpgt32 %S0, %T0, %x5 /* FPM */; \
- srl %x2, 1, %x2 /* IEU0 Group */; \
- fcmpgt32 %S2, %T1, %x6 /* FPM */; \
- inc %x3 /* IEU0 Group */; \
- add %sum, %x2, %sum /* IEU1 */; \
- srl %x3, 1, %x3 /* IEU0 Group */; \
- inc %x4 /* IEU1 */; \
- fpadd32 %T0, %T1, %U0 /* FPA Group */; \
- add %sum, %x3, %sum /* IEU0 */; \
- fcmpgt32 %fz, %f2, %x7 /* FPM */; \
- srl %x4, 1, %x4 /* IEU0 Group */; \
- fcmpgt32 %fz, %f6, %x8 /* FPM */; \
- inc %x5 /* IEU0 Group */; \
- add %sum, %x4, %sum /* IEU1 */; \
- srl %x5, 1, %x5 /* IEU0 Group */; \
- fcmpgt32 %fz, %f10, %x1 /* FPM */; \
- inc %x6 /* IEU0 Group */; \
- add %sum, %x5, %sum /* IEU1 */; \
- fmovd %FA, %FB /* FPA Group */; \
- fcmpgt32 %fz, %f14, %x2 /* FPM */; \
- srl %x6, 1, %x6 /* IEU0 Group */; \
- ba,pt %xcc, ett /* CTI */; \
- inc %x7 /* IEU1 */;
-
-#define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \
- END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)
-
-#define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \
- fpadd32 %U0, %U1, %V0 /* FPA Group */; \
- srl %x7, 1, %x7 /* IEU0 */; \
- add %sum, %x6, %sum /* IEU1 */; \
- std %V0, [%sp + STACKOFF] /* Store Group */; \
- inc %x8 /* IEU0 */; \
- sub %sum, %x7, %sum /* IEU1 */; \
- srl %x8, 1, %x8 /* IEU0 Group */; \
- fcmpgt32 %fz, %S1, %x3 /* FPM */; \
- inc %x1 /* IEU0 Group */; \
- fcmpgt32 %fz, %S3, %x4 /* FPM */; \
- srl %x1, 1, %x1 /* IEU0 Group */; \
- sub %sum, %x8, %sum /* IEU1 */; \
- ldx [%sp + STACKOFF], %x8 /* Load Group */; \
- inc %x2 /* IEU0 */; \
- sub %sum, %x1, %sum /* IEU1 */; \
- srl %x2, 1, %x2 /* IEU0 Group */; \
- fcmpgt32 %fz, %T1, %x5 /* FPM */; \
- inc %x3 /* IEU0 Group */; \
- fcmpgt32 %T0, %U0, %x6 /* FPM */; \
- srl %x3, 1, %x3 /* IEU0 Group */; \
- sub %sum, %x2, %sum /* IEU1 */; \
- inc %x4 /* IEU0 Group */; \
- sub %sum, %x3, %sum /* IEU1 */; \
- srl %x4, 1, %x4 /* IEU0 Group */; \
- fcmpgt32 %fz, %U1, %x7 /* FPM */; \
- inc %x5 /* IEU0 Group */; \
- fcmpgt32 %U0, %V0, %x1 /* FPM */; \
- srl %x5, 1, %x5 /* IEU0 Group */; \
- sub %sum, %x4, %sum /* IEU1 */; \
- sub %sum, %x5, %sum /* IEU0 Group */; \
- fcmpgt32 %fz, %V0, %x2 /* FPM */; \
- inc %x6 /* IEU0 Group */; \
- inc %x7 /* IEU1 */; \
- srl %x6, 1, %x6 /* IEU0 Group */; \
- inc %x1 /* IEU1 */; \
- srl %x7, 1, %x7 /* IEU0 Group */; \
- add %sum, %x6, %sum /* IEU1 */; \
- srl %x1, 1, %x1 /* IEU0 Group */; \
- sub %sum, %x7, %sum /* IEU1 */; \
- inc %x2 /* IEU0 Group */; \
- add %sum, %x1, %sum /* IEU1 */; \
- srl %x2, 1, %x2 /* IEU0 Group */; \
- sub %sum, %x2, %sum /* IEU0 Group */; \
- addcc %sum, %x8, %sum /* IEU1 Group */; \
- bcs,a,pn %xcc, 33f /* CTI */; \
- add %sum, 1, %sum /* IEU0 (Group) */; \
-33: /* That's it */;
-
- .text
- .globl csum_partial_copy_user_vis
- .align 32
-/* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp.
- * csum_partial_copy_from_user
- * This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256
- */
-csum_partial_copy_user_vis:
- andcc %dst, 7, %g0 /* IEU1 Group */
- be,pt %icc, 4f /* CTI */
- and %dst, 0x38, %o4 /* IEU0 */
- mov 1, %g5 /* IEU0 Group */
- andcc %dst, 2, %g0 /* IEU1 */
- be,pt %icc, 1f /* CTI */
- and %dst, 4, %g7 /* IEU0 Group */
- lduh [%src], %g2 /* Load */
- sub %len, 2, %len /* IEU0 Group */
- add %dst, 2, %dst /* IEU1 */
- andcc %dst, 4, %g7 /* IEU1 Group */
- sll %g5, 16, %g5 /* IEU0 */
- stha %g2, [%dst - 2] %asi /* Store Group */
- sll %g2, 16, %g2 /* IEU0 */
- add %src, 2, %src /* IEU1 */
- addcc %g2, %sum, %sum /* IEU1 Group */
- bcs,a,pn %icc, 1f /* CTI */
- add %sum, %g5, %sum /* IEU0 */
-1: lduw [%src], %g2 /* Load */
- brz,a,pn %g7, 4f /* CTI+IEU1 Group */
- and %dst, 0x38, %o4 /* IEU0 */
- add %dst, 4, %dst /* IEU0 Group */
- sub %len, 4, %len /* IEU1 */
- addcc %g2, %sum, %sum /* IEU1 Group */
- bcs,a,pn %icc, 1f /* CTI */
- add %sum, 1, %sum /* IEU0 */
-1: and %dst, 0x38, %o4 /* IEU0 Group */
- stwa %g2, [%dst - 4] %asi /* Store */
- add %src, 4, %src /* IEU1 */
-4:
-#ifdef __KERNEL__
- VISEntry
-#endif
- mov %src, %g7 /* IEU1 Group */
- fzero %f48 /* FPA */
- alignaddr %src, %g0, %src /* Single Group */
- subcc %g7, %src, %g7 /* IEU1 Group */
- be,pt %xcc, 1f /* CTI */
- mov 0x40, %g1 /* IEU0 */
- lduw [%src], %g2 /* Load Group */
- subcc %sum, %g2, %sum /* IEU1 Group+load stall*/
- bcs,a,pn %icc, 1f /* CTI */
- sub %sum, 1, %sum /* IEU0 */
-1: srl %sum, 0, %sum /* IEU0 Group */
- clr %g5 /* IEU1 */
- brz,pn %o4, 3f /* CTI+IEU1 Group */
- sub %g1, %o4, %g1 /* IEU0 */
- ldd [%src], %f0 /* Load */
- clr %o4 /* IEU0 Group */
- andcc %dst, 8, %g0 /* IEU1 */
- be,pn %icc, 1f /* CTI */
- ldd [%src + 8], %f2 /* Load Group */
- add %src, 8, %src /* IEU0 */
- sub %len, 8, %len /* IEU1 */
- fpadd32 %f0, %f48, %f50 /* FPA */
- addcc %dst, 8, %dst /* IEU1 Group */
- faligndata %f0, %f2, %f16 /* FPA */
- fcmpgt32 %f48, %f50, %o4 /* FPM Group */
- fmovd %f2, %f0 /* FPA Group */
- ldd [%src + 8], %f2 /* Load */
- stda %f16, [%dst - 8] %asi /* Store */
- fmovd %f50, %f48 /* FPA */
-1: andcc %g1, 0x10, %g0 /* IEU1 Group */
- be,pn %icc, 1f /* CTI */
- and %g1, 0x20, %g1 /* IEU0 */
- fpadd32 %f0, %f48, %f50 /* FPA */
- ldd [%src + 16], %f4 /* Load Group */
- add %src, 16, %src /* IEU0 */
- add %dst, 16, %dst /* IEU1 */
- faligndata %f0, %f2, %f16 /* FPA */
- fcmpgt32 %f48, %f50, %g5 /* FPM Group */
- sub %len, 16, %len /* IEU0 */
- inc %o4 /* IEU1 */
- stda %f16, [%dst - 16] %asi /* Store Group */
- fpadd32 %f2, %f50, %f48 /* FPA */
- srl %o4, 1, %o5 /* IEU0 */
- faligndata %f2, %f4, %f18 /* FPA Group */
- stda %f18, [%dst - 8] %asi /* Store */
- fcmpgt32 %f50, %f48, %o4 /* FPM Group */
- add %o5, %sum, %sum /* IEU0 */
- ldd [%src + 8], %f2 /* Load */
- fmovd %f4, %f0 /* FPA */
-1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */
- rd %asi, %g2 /* LSU Group + 4 bubbles*/
- inc %g5 /* IEU0 */
- fpadd32 %f0, %f48, %f50 /* FPA */
- ldd [%src + 16], %f4 /* Load Group */
- srl %g5, 1, %g5 /* IEU0 */
- add %dst, 32, %dst /* IEU1 */
- faligndata %f0, %f2, %f16 /* FPA */
- fcmpgt32 %f48, %f50, %o5 /* FPM Group */
- inc %o4 /* IEU0 */
- ldd [%src + 24], %f6 /* Load */
- srl %o4, 1, %o4 /* IEU0 Group */
- add %g5, %sum, %sum /* IEU1 */
- ldd [%src + 32], %f8 /* Load */
- fpadd32 %f2, %f50, %f48 /* FPA */
- faligndata %f2, %f4, %f18 /* FPA Group */
- sub %len, 32, %len /* IEU0 */
- stda %f16, [%dst - 32] %asi /* Store */
- fcmpgt32 %f50, %f48, %g3 /* FPM Group */
- inc %o5 /* IEU0 */
- add %o4, %sum, %sum /* IEU1 */
- fpadd32 %f4, %f48, %f50 /* FPA */
- faligndata %f4, %f6, %f20 /* FPA Group */
- srl %o5, 1, %o5 /* IEU0 */
- fcmpgt32 %f48, %f50, %g5 /* FPM Group */
- add %o5, %sum, %sum /* IEU0 */
- stda %f18, [%dst - 24] %asi /* Store */
- fpadd32 %f6, %f50, %f48 /* FPA */
- inc %g3 /* IEU0 Group */
- stda %f20, [%dst - 16] %asi /* Store */
- add %src, 32, %src /* IEU1 */
- faligndata %f6, %f8, %f22 /* FPA */
- fcmpgt32 %f50, %f48, %o4 /* FPM Group */
- srl %g3, 1, %g3 /* IEU0 */
- stda %f22, [%dst - 8] %asi /* Store */
- add %g3, %sum, %sum /* IEU0 Group */
-3: rd %asi, %g2 /* LSU Group + 4 bubbles*/
-#ifdef __KERNEL__
-4: sethi %hi(vis0s), %g7 /* IEU0 Group */
- or %g2, ASI_BLK_OR, %g2 /* IEU1 */
-#else
-4: rd %pc, %g7 /* LSU Group + 4 bubbles*/
-#endif
- inc %g5 /* IEU0 Group */
- and %src, 0x38, %g3 /* IEU1 */
- membar #StoreLoad /* LSU Group */
- srl %g5, 1, %g5 /* IEU0 */
- inc %o4 /* IEU1 */
- sll %g3, 8, %g3 /* IEU0 Group */
- sub %len, 0xc0, %len /* IEU1 */
- addcc %g5, %sum, %sum /* IEU1 Group */
- srl %o4, 1, %o4 /* IEU0 */
- add %g7, %g3, %g7 /* IEU0 Group */
- add %o4, %sum, %sum /* IEU1 */
-#ifdef __KERNEL__
- jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */
-#else
- jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */
-#endif
- fzero %f32 /* FPA */
-
- .align 2048
-vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- ldda [%src] ASI_BLK_P, %f0 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- ldda [%src] ASI_BLK_P, %f16 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- fmovd %f48, %f62 /* FPA Group f0 available*/
- faligndata %f0, %f2, %f48 /* FPA Group f2 available*/
- fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available*/
- fpadd32 %f0, %f62, %f0 /* FPA */
- fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available*/
- faligndata %f2, %f4, %f50 /* FPA */
- fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available*/
- faligndata %f4, %f6, %f52 /* FPA */
- fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available*/
- inc %x1 /* IEU0 */
- faligndata %f6, %f8, %f54 /* FPA */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available*/
- srl %x1, 1, %x1 /* IEU0 */
- inc %x2 /* IEU1 */
- faligndata %f8, %f10, %f56 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available*/
- srl %x2, 1, %x2 /* IEU0 */
- add %sum, %x1, %sum /* IEU1 */
- faligndata %f10, %f12, %f58 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- inc %x3 /* IEU0 */
- add %sum, %x2, %sum /* IEU1 */
- faligndata %f12, %f14, %f60 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- srl %x3, 1, %x3 /* IEU0 */
- inc %x4 /* IEU1 */
- fmovd %f14, %f62 /* FPA */
- srl %x4, 1, %x4 /* IEU0 Group */
- add %sum, %x3, %sum /* IEU1 */
-vis0: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
- ,LDBLK(f32), STBLK,,,,,,,,
- ,bcs,pn %icc, vis0e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
- ,LDBLK(f0), STBLK,,,,,,,,
- ,bcs,pn %icc, vis0e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
- ,LDBLK(f16), STBLK,,,,,,,,
- ,bcc,pt %icc, vis0)
-vis0e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f32,
- ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
- ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
-vis0e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f0,
- ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
- ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
-vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f48,f50,f52,f54,f56,f58,f60,f62,f16,
- ,SYNC, STBLK_XORASI(x1,x2),ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
- ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
- .align 2048
-vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- sub %src, 8, %src /* IEU0 Group */
- ldda [%src] ASI_BLK_P, %f0 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- ldda [%src] ASI_BLK_P, %f16 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- fmovd %f0, %f58 /* FPA Group */
- fmovd %f48, %f0 /* FPA Group */
- fcmpgt32 %f32, %f2, %x2 /* FPM Group */
- faligndata %f2, %f4, %f48 /* FPA */
- fcmpgt32 %f32, %f4, %x3 /* FPM Group */
- faligndata %f4, %f6, %f50 /* FPA */
- fcmpgt32 %f32, %f6, %x4 /* FPM Group */
- faligndata %f6, %f8, %f52 /* FPA */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group */
- inc %x2 /* IEU1 */
- faligndata %f8, %f10, %f54 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- srl %x2, 1, %x2 /* IEU0 */
- faligndata %f10, %f12, %f56 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- inc %x3 /* IEU0 */
- add %sum, %x2, %sum /* IEU1 */
- faligndata %f12, %f14, %f58 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- srl %x3, 1, %x3 /* IEU0 */
- inc %x4 /* IEU1 */
- fmovd %f14, %f60 /* FPA */
- srl %x4, 1, %x4 /* IEU0 Group */
- add %sum, %x3, %sum /* IEU1 */
-vis1: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
- ,LDBLK(f32), ,STBLK,,,,,,,
- ,bcs,pn %icc, vis1e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
- ,LDBLK(f0), ,STBLK,,,,,,,
- ,bcs,pn %icc, vis1e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
- ,LDBLK(f16), ,STBLK,,,,,,,
- ,bcc,pt %icc, vis1)
-vis1e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f32,
- ,SYNC, ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
- ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
-vis1e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f0,
- ,SYNC, ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
- ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
-vis1e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f62,f48,f50,f52,f54,f56,f58,f60,f16,
- ,SYNC, ,STBLK_XORASI(x1,x2),ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
- ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
- .align 2048
-vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- sub %src, 16, %src /* IEU0 Group */
- ldda [%src] ASI_BLK_P, %f0 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- ldda [%src] ASI_BLK_P, %f16 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- fmovd %f0, %f56 /* FPA Group */
- fmovd %f48, %f0 /* FPA Group */
- sub %dst, 64, %dst /* IEU0 */
- fpsub32 %f2, %f2, %f2 /* FPA Group */
- fcmpgt32 %f32, %f4, %x3 /* FPM Group */
- faligndata %f4, %f6, %f48 /* FPA */
- fcmpgt32 %f32, %f6, %x4 /* FPM Group */
- faligndata %f6, %f8, %f50 /* FPA */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group */
- faligndata %f8, %f10, %f52 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- faligndata %f10, %f12, %f54 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- inc %x3 /* IEU0 */
- faligndata %f12, %f14, %f56 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- srl %x3, 1, %x3 /* IEU0 */
- inc %x4 /* IEU1 */
- fmovd %f14, %f58 /* FPA */
- srl %x4, 1, %x4 /* IEU0 Group */
- add %sum, %x3, %sum /* IEU1 */
-vis2: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
- ,LDBLK(f32), ,,STBLK,,,,,,
- ,bcs,pn %icc, vis2e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
- ,LDBLK(f0), ,,STBLK,,,,,,
- ,bcs,pn %icc, vis2e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
- ,LDBLK(f16), ,,STBLK,,,,,,
- ,bcc,pt %icc, vis2)
-vis2e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f32,
- ,SYNC, ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
- ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
-vis2e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f0,
- ,SYNC, ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
- ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
-vis2e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f60,f62,f48,f50,f52,f54,f56,f58,f16,
- ,SYNC, ,,STBLK_XORASI(x2,x3),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
- ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
- .align 2048
-vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- sub %src, 24, %src /* IEU0 Group */
- ldda [%src] ASI_BLK_P, %f0 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- ldda [%src] ASI_BLK_P, %f16 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- fmovd %f0, %f54 /* FPA Group */
- fmovd %f48, %f0 /* FPA Group */
- sub %dst, 64, %dst /* IEU0 */
- fpsub32 %f2, %f2, %f2 /* FPA Group */
- fpsub32 %f4, %f4, %f4 /* FPA Group */
- fcmpgt32 %f32, %f6, %x4 /* FPM Group */
- faligndata %f6, %f8, %f48 /* FPA */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group */
- faligndata %f8, %f10, %f50 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- faligndata %f10, %f12, %f52 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- faligndata %f12, %f14, %f54 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- fmovd %f14, %f56 /* FPA */
- inc %x4 /* IEU0 */
- srl %x4, 1, %x4 /* IEU0 Group */
-vis3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
- ,LDBLK(f32), ,,,STBLK,,,,,
- ,bcs,pn %icc, vis3e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
- ,LDBLK(f0), ,,,STBLK,,,,,
- ,bcs,pn %icc, vis3e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
- ,LDBLK(f16), ,,,STBLK,,,,,
- ,bcc,pt %icc, vis3)
-vis3e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f32,
- ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
- ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
-vis3e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f0,
- ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
- ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
-vis3e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f58,f60,f62,f48,f50,f52,f54,f56,f16,
- ,SYNC, ,,,STBLK_XORASI(x3,x4),ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
- ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
- .align 2048
-vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- sub %src, 32, %src /* IEU0 Group */
- ldda [%src] ASI_BLK_P, %f0 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- ldda [%src] ASI_BLK_P, %f16 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- fmovd %f0, %f52 /* FPA Group */
- fmovd %f48, %f0 /* FPA Group */
- sub %dst, 64, %dst /* IEU0 */
- fpsub32 %f2, %f2, %f2 /* FPA Group */
- fpsub32 %f4, %f4, %f4 /* FPA Group */
- fpsub32 %f6, %f6, %f6 /* FPA Group */
- clr %x4 /* IEU0 */
- fcmpgt32 %f32, %f8, %x5 /* FPM Group */
- faligndata %f8, %f10, %f48 /* FPA */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- faligndata %f10, %f12, %f50 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- faligndata %f12, %f14, %f52 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- fmovd %f14, %f54 /* FPA */
-vis4: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
- ,LDBLK(f32), ,,,,STBLK,,,,
- ,bcs,pn %icc, vis4e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
- ,LDBLK(f0), ,,,,STBLK,,,,
- ,bcs,pn %icc, vis4e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
- ,LDBLK(f16), ,,,,STBLK,,,,
- ,bcc,pt %icc, vis4)
-vis4e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f32,
- ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80),
- ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
-vis4e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f0,
- ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80),
- ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
-vis4e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f56,f58,f60,f62,f48,f50,f52,f54,f16,
- ,SYNC, ,,,,STBLK_XORASI(x4,x5),ST(f48,64),ST(f50,72),ST(f52,80),
- ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
- .align 2048
-vis5s: ldd [%src+0], %f10 /* Load Group */
- ldd [%src+8], %f12 /* Load Group */
- ldd [%src+16], %f14 /* Load Group */
- add %src, 24, %src /* IEU0 Group */
- wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- ldda [%src] ASI_BLK_P, %f16 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- fmovd %f48, %f0 /* FPA Group */
- fmuld %f32, %f32, %f2 /* FPM */
- clr %x4 /* IEU0 */
- faddd %f32, %f32, %f4 /* FPA Group */
- fmuld %f32, %f32, %f6 /* FPM */
- clr %x5 /* IEU0 */
- faddd %f32, %f32, %f8 /* FPA Group */
- fcmpgt32 %f32, %f10, %x6 /* FPM Group */
- sub %dst, 64, %dst /* IEU0 */
- faligndata %f10, %f12, %f48 /* FPA */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- faligndata %f12, %f14, %f50 /* FPA */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- fmovd %f14, %f52 /* FPA */
-vis5: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
- ,LDBLK(f32), ,,,,,STBLK,,,
- ,bcs,pn %icc, vis5e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
- ,LDBLK(f0), ,,,,,STBLK,,,
- ,bcs,pn %icc, vis5e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
- ,LDBLK(f16), ,,,,,STBLK,,,
- ,bcc,pt %icc, vis5)
-vis5e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f32,
- ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72),
- ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
-vis5e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f0,
- ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72),
- ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
-vis5e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f54,f56,f58,f60,f62,f48,f50,f52,f16,
- ,SYNC, ,,,,,STBLK_XORASI(x5,x6),ST(f48,64),ST(f50,72),
- ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
- .align 2048
-vis6s: ldd [%src+0], %f12 /* Load Group */
- ldd [%src+8], %f14 /* Load Group */
- add %src, 16, %src /* IEU0 Group */
- wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- ldda [%src] ASI_BLK_P, %f16 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- fmovd %f48, %f0 /* FPA Group */
- fmuld %f32, %f32, %f2 /* FPM */
- clr %x4 /* IEU0 */
- faddd %f32, %f32, %f4 /* FPA Group */
- fmuld %f32, %f32, %f6 /* FPM */
- clr %x5 /* IEU0 */
- faddd %f32, %f32, %f8 /* FPA Group */
- fmuld %f32, %f32, %f10 /* FPM */
- clr %x6 /* IEU0 */
- fcmpgt32 %f32, %f12, %x7 /* FPM Group */
- sub %dst, 64, %dst /* IEU0 */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- faligndata %f12, %f14, %f48 /* FPA */
- fmovd %f14, %f50 /* FPA Group */
-vis6: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
- ,LDBLK(f32), ,,,,,,STBLK,,
- ,bcs,pn %icc, vis6e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
- ,LDBLK(f0), ,,,,,,STBLK,,
- ,bcs,pn %icc, vis6e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
- ,LDBLK(f16), ,,,,,,STBLK,,
- ,bcc,pt %icc, vis6)
-vis6e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f32,
- ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64),
- ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
-vis6e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f0,
- ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64),
- ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
-vis6e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f52,f54,f56,f58,f60,f62,f48,f50,f16,
- ,SYNC, ,,,,,,STBLK_XORASI(x6,x7),ST(f48,64),
- ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
- .align 2048
-vis7s: ldd [%src+0], %f14 /* Load Group */
- add %src, 8, %src /* IEU0 Group */
- wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
- ldda [%src] ASI_BLK_P, %f16 /* Load Group */
- add %src, 64, %src /* IEU0 Group */
- fmovd %f48, %f0 /* FPA Group */
- fmuld %f32, %f32, %f2 /* FPM */
- clr %x4 /* IEU0 */
- faddd %f32, %f32, %f4 /* FPA Group */
- fmuld %f32, %f32, %f6 /* FPM */
- clr %x5 /* IEU0 */
- faddd %f32, %f32, %f8 /* FPA Group */
- fmuld %f32, %f32, %f10 /* FPM */
- clr %x6 /* IEU0 */
- faddd %f32, %f32, %f12 /* FPA Group */
- clr %x7 /* IEU0 */
- fcmpgt32 %f32, %f14, %x8 /* FPM Group */
- sub %dst, 64, %dst /* IEU0 */
- fmovd %f14, %f48 /* FPA */
-vis7: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
- ,LDBLK(f32), ,,,,,,,STBLK,
- ,bcs,pn %icc, vis7e1)
- DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
- ,LDBLK(f0), ,,,,,,,STBLK,
- ,bcs,pn %icc, vis7e2)
- DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
- ,LDBLK(f16), ,,,,,,,STBLK,
- ,bcc,pt %icc, vis7)
-vis7e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f32,
- ,SYNC, ,,,,,,,STBLK_XORASI(x7,x8),
- ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
-vis7e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f0,
- ,SYNC, ,,,,,,,STBLK_XORASI(x7,x8),
- ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
-vis7e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
- ,f50,f52,f54,f56,f58,f60,f62,f48,f16,
- ,SYNC, ,,,,,,,STBLK_XORASI(x7,x8),
- ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
-e1: END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
-e2: END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
-e3: END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
-ett: rd %gsr, %x3 /* LSU Group+4bubbles */
- andcc %x3, 7, %x3 /* IEU1 Group */
- add %dst, 8, %dst /* IEU0 */
- bne,pn %icc, 1f /* CTI */
- fzero %f10 /* FPA */
- brz,a,pn %len, 2f /* CTI+IEU1 Group */
- stda %f6, [%dst - 8] %asi /* Store */
-1: cmp %len, 8 /* IEU1 */
- blu,pn %icc, 3f /* CTI */
- sub %src, 64, %src /* IEU0 Group */
-1: ldd [%src], %f2 /* Load Group */
- fpadd32 %f10, %f2, %f12 /* FPA Group+load stall*/
- add %src, 8, %src /* IEU0 */
- add %dst, 8, %dst /* IEU1 */
- faligndata %f6, %f2, %f14 /* FPA Group */
- fcmpgt32 %f10, %f12, %x5 /* FPM Group */
- stda %f14, [%dst - 16] %asi /* Store */
- fmovd %f2, %f6 /* FPA */
- fmovd %f12, %f10 /* FPA Group */
- sub %len, 8, %len /* IEU1 */
- fzero %f16 /* FPA Group - FPU nop */
- fzero %f18 /* FPA Group - FPU nop */
- inc %x5 /* IEU0 */
- srl %x5, 1, %x5 /* IEU0 Group (regdep) */
- cmp %len, 8 /* IEU1 */
- bgeu,pt %icc, 1b /* CTI */
- add %x5, %sum, %sum /* IEU0 Group */
-3: brz,a,pt %x3, 2f /* CTI+IEU1 */
- stda %f6, [%dst - 8] %asi /* Store Group */
- sta %f7, [%dst - 8] %asi /* Store Group */
- sub %dst, 4, %dst /* IEU0 */
- add %len, 4, %len /* IEU1 */
-2:
-#ifdef __KERNEL__
- sub %sp, 8, %sp /* IEU0 Group */
-#endif
- END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
- membar #Sync /* LSU Group */
-#ifdef __KERNEL__
- VISExit
- add %sp, 8, %sp /* IEU0 Group */
-#endif
-23: brnz,pn %len, 26f /* CTI+IEU1 Group */
-24: sllx %sum, 32, %g1 /* IEU0 */
-25: addcc %sum, %g1, %src /* IEU1 Group */
- srlx %src, 32, %src /* IEU0 Group (regdep) */
- bcs,a,pn %xcc, 1f /* CTI */
- add %src, 1, %src /* IEU1 */
-#ifndef __KERNEL__
-1: retl /* CTI Group brk forced*/
- srl %src, 0, %src /* IEU0 */
-#else
-1: retl /* CTI Group brk forced*/
- ldx [%g6 + TI_TASK], %g4 /* Load */
-#endif
-26: andcc %len, 8, %g0 /* IEU1 Group */
- be,pn %icc, 1f /* CTI */
- lduw [%src], %o4 /* Load */
- lduw [%src+4], %g2 /* Load Group */
- add %src, 8, %src /* IEU0 */
- add %dst, 8, %dst /* IEU1 */
- sllx %o4, 32, %g5 /* IEU0 Group */
- stwa %o4, [%dst - 8] %asi /* Store */
- or %g5, %g2, %g5 /* IEU0 Group */
- stwa %g2, [%dst - 4] %asi /* Store */
- addcc %g5, %sum, %sum /* IEU1 Group */
- bcs,a,pn %xcc, 1f /* CTI */
- add %sum, 1, %sum /* IEU0 */
-1: andcc %len, 4, %g0 /* IEU1 Group */
- be,a,pn %icc, 1f /* CTI */
- clr %g2 /* IEU0 */
- lduw [%src], %g7 /* Load */
- add %src, 4, %src /* IEU0 Group */
- add %dst, 4, %dst /* IEU1 */
- sllx %g7, 32, %g2 /* IEU0 Group */
- stwa %g7, [%dst - 4] %asi /* Store */
-1: andcc %len, 2, %g0 /* IEU1 */
- be,a,pn %icc, 1f /* CTI */
- clr %g3 /* IEU0 Group */
- lduh [%src], %g7 /* Load */
- add %src, 2, %src /* IEU1 */
- add %dst, 2, %dst /* IEU0 Group */
- sll %g7, 16, %g3 /* IEU0 Group */
- stha %g7, [%dst - 2] %asi /* Store */
-1: andcc %len, 1, %g0 /* IEU1 */
- be,a,pn %icc, 1f /* CTI */
- clr %o5 /* IEU0 Group */
- ldub [%src], %g7 /* Load */
- sll %g7, 8, %o5 /* IEU0 Group */
- stba %g7, [%dst] %asi /* Store */
-1: or %g2, %g3, %g3 /* IEU1 */
- or %o5, %g3, %g3 /* IEU0 Group (regdep) */
- addcc %g3, %sum, %sum /* IEU1 Group (regdep) */
- bcs,a,pn %xcc, 1f /* CTI */
- add %sum, 1, %sum /* IEU0 */
-1: ba,pt %xcc, 25b /* CTI Group */
- sllx %sum, 32, %g1 /* IEU0 */
-
-#ifdef __KERNEL__
-end:
-
- .section __ex_table
- .align 4
- .word csum_partial_copy_user_vis, 0, end, cpc_handler
-#endif
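
The user-space variant above differs from csum_partial_copy_vis mainly in storing through %asi (stda/stwa/stha/stba ... %asi), so destination faults can be routed to the cpc_handler fixup named in the __ex_table entry. Stripped of the VIS blocking and the fault handling, the operation reduces to copying while accumulating a ones'-complement sum; a naive C sketch of that core loop (illustration only, with a made-up helper name, not a kernel interface):

#include <stddef.h>
#include <stdint.h>

/*
 * Naive copy-while-summing loop; the caller would fold the returned
 * 64-bit accumulator as in csum_fold32() sketched earlier.
 * copy_and_sum_sketch is a made-up name.
 */
uint64_t copy_and_sum_sketch(uint8_t *dst, const uint8_t *src,
                             size_t len, uint64_t sum)
{
        size_t i;

        for (i = 0; i + 1 < len; i += 2) {
                dst[i] = src[i];
                dst[i + 1] = src[i + 1];
                sum += (uint32_t)((src[i] << 8) | src[i + 1]);
        }
        if (len & 1) {                          /* trailing odd byte */
                dst[len - 1] = src[len - 1];
                sum += (uint32_t)src[len - 1] << 8;
        }
        return sum;
}
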
diff --git a/arch/sparc64/lib/VISmemset.S b/arch/sparc64/lib/VISmemset.S
deleted file mode 100644
index 152723a490141..0000000000000
--- a/arch/sparc64/lib/VISmemset.S
+++ /dev/null
@@ -1,240 +0,0 @@
-/* $Id: VISmemset.S,v 1.10 1999/12/23 17:02:16 jj Exp $
- * VISmemset.S: High speed memset operations utilizing the UltraSparc
- * Visual Instruction Set.
- *
- * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
- * Copyright (C) 1996, 1997, 1999 Jakub Jelinek (jakub@redhat.com)
- */
-
-#include "VIS.h"
-
-#ifdef REGS_64BIT
-#define SET_BLOCKS(base, offset, source) \
- stx source, [base - offset - 0x18]; \
- stx source, [base - offset - 0x10]; \
- stx source, [base - offset - 0x08]; \
- stx source, [base - offset - 0x00];
-#else
-#define SET_BLOCKS(base, offset, source) \
- stw source, [base - offset - 0x18]; \
- stw source, [base - offset - 0x14]; \
- stw source, [base - offset - 0x10]; \
- stw source, [base - offset - 0x0c]; \
- stw source, [base - offset - 0x08]; \
- stw source, [base - offset - 0x04]; \
- stw source, [base - offset - 0x00]; \
- stw source, [base - offset + 0x04];
-#endif
-
-#ifndef __KERNEL__
-/* So that the brz,a,pt in memset doesn't have to get through PLT, here we go... */
-#include "VISbzero.S"
-#endif
-
-#ifdef __KERNEL__
-#include <asm/visasm.h>
-#endif
-
- /* Well, memset is a lot easier to get right than bcopy... */
- .text
- .align 32
-#ifdef __KERNEL__
- .globl __memset
-__memset:
-#endif
- .globl memset
-memset:
-#ifndef __KERNEL__
- brz,a,pt %o1, bzero_private
- mov %o2, %o1
-#ifndef REGS_64BIT
- srl %o2, 0, %o2
-#endif
-#endif
- mov %o0, %o4
- cmp %o2, 7
- bleu,pn %xcc, 17f
- andcc %o0, 3, %g5
- be,pt %xcc, 4f
- and %o1, 0xff, %o1
- cmp %g5, 3
- be,pn %xcc, 2f
- stb %o1, [%o0 + 0x00]
- cmp %g5, 2
- be,pt %xcc, 2f
- stb %o1, [%o0 + 0x01]
- stb %o1, [%o0 + 0x02]
-2: sub %g5, 4, %g5
- sub %o0, %g5, %o0
- add %o2, %g5, %o2
-4: sllx %o1, 8, %g1
- andcc %o0, 4, %g0
- or %o1, %g1, %o1
- sllx %o1, 16, %g1
- or %o1, %g1, %o1
- be,pt %xcc, 2f
-#ifdef REGS_64BIT
- sllx %o1, 32, %g1
-#else
- cmp %o2, 128
-#endif
- stw %o1, [%o0]
- sub %o2, 4, %o2
- add %o0, 4, %o0
-2:
-#ifdef REGS_64BIT
- cmp %o2, 128
- or %o1, %g1, %o1
-#endif
- blu,pn %xcc, 9f
- andcc %o0, 0x38, %g5
- be,pn %icc, 6f
- mov 64, %o5
- andcc %o0, 8, %g0
- be,pn %icc, 1f
- sub %o5, %g5, %o5
-#ifdef REGS_64BIT
- stx %o1, [%o0]
-#else
- stw %o1, [%o0]
- stw %o1, [%o0 + 4]
-#endif
- add %o0, 8, %o0
-1: andcc %o5, 16, %g0
- be,pn %icc, 1f
- sub %o2, %o5, %o2
-#ifdef REGS_64BIT
- stx %o1, [%o0]
- stx %o1, [%o0 + 8]
-#else
- stw %o1, [%o0]
- stw %o1, [%o0 + 4]
- stw %o1, [%o0 + 8]
- stw %o1, [%o0 + 12]
-#endif
- add %o0, 16, %o0
-1: andcc %o5, 32, %g0
- be,pn %icc, 7f
- andncc %o2, 0x3f, %o3
-#ifdef REGS_64BIT
- stx %o1, [%o0]
- stx %o1, [%o0 + 8]
- stx %o1, [%o0 + 16]
- stx %o1, [%o0 + 24]
-#else
- stw %o1, [%o0]
- stw %o1, [%o0 + 4]
- stw %o1, [%o0 + 8]
- stw %o1, [%o0 + 12]
- stw %o1, [%o0 + 16]
- stw %o1, [%o0 + 20]
- stw %o1, [%o0 + 24]
- stw %o1, [%o0 + 28]
-#endif
- add %o0, 32, %o0
-7: be,pn %xcc, 9f
- nop
-#ifdef __KERNEL__
- VISEntryHalf
-#endif
- ldd [%o0 - 8], %f0
-18: rd %asi, %g2
- wr %g0, ASI_BLK_P, %asi
- membar #StoreStore | #LoadStore
- andcc %o3, 0xc0, %g5
- and %o2, 0x3f, %o2
- fmovd %f0, %f2
- fmovd %f0, %f4
- andn %o3, 0xff, %o3
- fmovd %f0, %f6
- cmp %g5, 64
- fmovd %f0, %f8
- fmovd %f0, %f10
- fmovd %f0, %f12
- brz,pn %g5, 10f
- fmovd %f0, %f14
- be,pn %icc, 2f
- stda %f0, [%o0 + 0x00] %asi
- cmp %g5, 128
- be,pn %icc, 2f
- stda %f0, [%o0 + 0x40] %asi
- stda %f0, [%o0 + 0x80] %asi
-2: brz,pn %o3, 12f
- add %o0, %g5, %o0
-10: stda %f0, [%o0 + 0x00] %asi
- stda %f0, [%o0 + 0x40] %asi
- stda %f0, [%o0 + 0x80] %asi
- stda %f0, [%o0 + 0xc0] %asi
-11: subcc %o3, 256, %o3
- bne,pt %xcc, 10b
- add %o0, 256, %o0
-12:
-#ifdef __KERNEL__
- wr %g2, %g0, %asi
- VISExitHalf
-#else
-#ifndef REGS_64BIT
- wr %g0, FPRS_FEF, %fprs
-#endif
-#endif
- membar #StoreLoad | #StoreStore
-9: andcc %o2, 0x78, %g5
- be,pn %xcc, 13f
- andcc %o2, 7, %o2
-#ifdef __KERNEL__
-14: srl %g5, 1, %o3
- sethi %hi(13f), %g3
- sub %g3, %o3, %g3
- jmpl %g3 + %lo(13f), %g0
- add %o0, %g5, %o0
-#else
-14: rd %pc, %g3
-#ifdef REGS_64BIT
- srl %g5, 1, %o3
- sub %g3, %o3, %g3
-#else
- sub %g3, %g5, %g3
-#endif
- jmpl %g3 + (13f - 14b), %g0
- add %o0, %g5, %o0
-#endif
-12: SET_BLOCKS(%o0, 0x68, %o1)
- SET_BLOCKS(%o0, 0x48, %o1)
- SET_BLOCKS(%o0, 0x28, %o1)
- SET_BLOCKS(%o0, 0x08, %o1)
-13: be,pn %xcc, 8f
- andcc %o2, 4, %g0
- be,pn %xcc, 1f
- andcc %o2, 2, %g0
- stw %o1, [%o0]
- add %o0, 4, %o0
-1: be,pn %xcc, 1f
- andcc %o2, 1, %g0
- sth %o1, [%o0]
- add %o0, 2, %o0
-1: bne,a,pn %xcc, 8f
- stb %o1, [%o0]
-8: retl
- mov %o4, %o0
-17: brz,pn %o2, 0f
-8: add %o0, 1, %o0
- subcc %o2, 1, %o2
- bne,pt %xcc, 8b
- stb %o1, [%o0 - 1]
-0: retl
- mov %o4, %o0
-6:
-#ifdef REGS_64BIT
- stx %o1, [%o0]
-#else
- stw %o1, [%o0]
- stw %o1, [%o0 + 4]
-#endif
- andncc %o2, 0x3f, %o3
- be,pn %xcc, 9b
- nop
-#ifdef __KERNEL__
- VISEntryHalf
-#endif
- ba,pt %xcc, 18b
- ldd [%o0], %f0
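
Both the removed VISmemset.S and the new bzero.S added later in this patch start by replicating the fill byte across a full 64-bit word (the sllx/or sequences) so the main loop can issue wide stores. A minimal C sketch of that replication step (illustration only; spread_byte is a made-up name):

#include <stdint.h>

/*
 * Spread the low byte of the fill pattern across all eight bytes of a
 * 64-bit word, mirroring the sllx/or sequences in the assembly.
 */
uint64_t spread_byte(uint8_t pat)
{
        uint64_t v = pat;

        v |= v << 8;            /* 2 identical bytes */
        v |= v << 16;           /* 4 identical bytes */
        v |= v << 32;           /* 8 identical bytes */
        return v;
}
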
diff --git a/arch/sparc64/lib/atomic.S b/arch/sparc64/lib/atomic.S
index 41be4131f8008..e528b8d1a3e69 100644
--- a/arch/sparc64/lib/atomic.S
+++ b/arch/sparc64/lib/atomic.S
@@ -29,10 +29,10 @@
.globl atomic_add
.type atomic_add,#function
atomic_add: /* %o0 = increment, %o1 = atomic_ptr */
-1: lduw [%o1], %g5
- add %g5, %o0, %g7
- cas [%o1], %g5, %g7
- cmp %g5, %g7
+1: lduw [%o1], %g1
+ add %g1, %o0, %g7
+ cas [%o1], %g1, %g7
+ cmp %g1, %g7
bne,pn %icc, 1b
nop
retl
@@ -42,10 +42,10 @@ atomic_add: /* %o0 = increment, %o1 = atomic_ptr */
.globl atomic_sub
.type atomic_sub,#function
atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */
-1: lduw [%o1], %g5
- sub %g5, %o0, %g7
- cas [%o1], %g5, %g7
- cmp %g5, %g7
+1: lduw [%o1], %g1
+ sub %g1, %o0, %g7
+ cas [%o1], %g1, %g7
+ cmp %g1, %g7
bne,pn %icc, 1b
nop
retl
@@ -56,10 +56,10 @@ atomic_sub: /* %o0 = decrement, %o1 = atomic_ptr */
.type atomic_add_ret,#function
atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
ATOMIC_PRE_BARRIER
-1: lduw [%o1], %g5
- add %g5, %o0, %g7
- cas [%o1], %g5, %g7
- cmp %g5, %g7
+1: lduw [%o1], %g1
+ add %g1, %o0, %g7
+ cas [%o1], %g1, %g7
+ cmp %g1, %g7
bne,pn %icc, 1b
add %g7, %o0, %g7
ATOMIC_POST_BARRIER
@@ -71,10 +71,10 @@ atomic_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
.type atomic_sub_ret,#function
atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */
ATOMIC_PRE_BARRIER
-1: lduw [%o1], %g5
- sub %g5, %o0, %g7
- cas [%o1], %g5, %g7
- cmp %g5, %g7
+1: lduw [%o1], %g1
+ sub %g1, %o0, %g7
+ cas [%o1], %g1, %g7
+ cmp %g1, %g7
bne,pn %icc, 1b
sub %g7, %o0, %g7
ATOMIC_POST_BARRIER
@@ -85,10 +85,10 @@ atomic_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */
.globl atomic64_add
.type atomic64_add,#function
atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */
-1: ldx [%o1], %g5
- add %g5, %o0, %g7
- casx [%o1], %g5, %g7
- cmp %g5, %g7
+1: ldx [%o1], %g1
+ add %g1, %o0, %g7
+ casx [%o1], %g1, %g7
+ cmp %g1, %g7
bne,pn %xcc, 1b
nop
retl
@@ -98,10 +98,10 @@ atomic64_add: /* %o0 = increment, %o1 = atomic_ptr */
.globl atomic64_sub
.type atomic64_sub,#function
atomic64_sub: /* %o0 = decrement, %o1 = atomic_ptr */
-1: ldx [%o1], %g5
- sub %g5, %o0, %g7
- casx [%o1], %g5, %g7
- cmp %g5, %g7
+1: ldx [%o1], %g1
+ sub %g1, %o0, %g7
+ casx [%o1], %g1, %g7
+ cmp %g1, %g7
bne,pn %xcc, 1b
nop
retl
@@ -112,10 +112,10 @@ atomic64_sub: /* %o0 = decrement, %o1 = atomic_ptr */
.type atomic64_add_ret,#function
atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
ATOMIC_PRE_BARRIER
-1: ldx [%o1], %g5
- add %g5, %o0, %g7
- casx [%o1], %g5, %g7
- cmp %g5, %g7
+1: ldx [%o1], %g1
+ add %g1, %o0, %g7
+ casx [%o1], %g1, %g7
+ cmp %g1, %g7
bne,pn %xcc, 1b
add %g7, %o0, %g7
ATOMIC_POST_BARRIER
@@ -127,10 +127,10 @@ atomic64_add_ret: /* %o0 = increment, %o1 = atomic_ptr */
.type atomic64_sub_ret,#function
atomic64_sub_ret: /* %o0 = decrement, %o1 = atomic_ptr */
ATOMIC_PRE_BARRIER
-1: ldx [%o1], %g5
- sub %g5, %o0, %g7
- casx [%o1], %g5, %g7
- cmp %g5, %g7
+1: ldx [%o1], %g1
+ sub %g1, %o0, %g7
+ casx [%o1], %g1, %g7
+ cmp %g1, %g7
bne,pn %xcc, 1b
sub %g7, %o0, %g7
ATOMIC_POST_BARRIER
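
These atomic.S hunks only rename the scratch register from %g5 to %g1; the compare-and-swap retry loop itself is unchanged. Roughly, in C with a GCC builtin standing in for cas (an illustrative sketch, not the kernel's atomic_add):

#include <stdint.h>

/*
 * Load, add, compare-and-swap, retry on contention -- the same shape
 * as the lduw/add/cas/cmp/bne loop above.  atomic_add_sketch is a
 * made-up name; __sync_bool_compare_and_swap stands in for cas.
 */
void atomic_add_sketch(int increment, uint32_t *ptr)
{
        uint32_t old, newval;

        do {
                old = *ptr;                     /* lduw [%o1], %g1    */
                newval = old + increment;       /* add  %g1, %o0, %g7 */
        } while (!__sync_bool_compare_and_swap(ptr, old, newval));
}
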
diff --git a/arch/sparc64/lib/bitops.S b/arch/sparc64/lib/bitops.S
index fd20171ecfd10..886dcd2b376a0 100644
--- a/arch/sparc64/lib/bitops.S
+++ b/arch/sparc64/lib/bitops.S
@@ -26,17 +26,17 @@
test_and_set_bit: /* %o0=nr, %o1=addr */
BITOP_PRE_BARRIER
srlx %o0, 6, %g1
- mov 1, %g5
+ mov 1, %o2
sllx %g1, 3, %g3
and %o0, 63, %g2
- sllx %g5, %g2, %g5
+ sllx %o2, %g2, %o2
add %o1, %g3, %o1
1: ldx [%o1], %g7
- or %g7, %g5, %g1
+ or %g7, %o2, %g1
casx [%o1], %g7, %g1
cmp %g7, %g1
bne,pn %xcc, 1b
- and %g7, %g5, %g2
+ and %g7, %o2, %g2
BITOP_POST_BARRIER
clr %o0
retl
@@ -48,17 +48,17 @@ test_and_set_bit: /* %o0=nr, %o1=addr */
test_and_clear_bit: /* %o0=nr, %o1=addr */
BITOP_PRE_BARRIER
srlx %o0, 6, %g1
- mov 1, %g5
+ mov 1, %o2
sllx %g1, 3, %g3
and %o0, 63, %g2
- sllx %g5, %g2, %g5
+ sllx %o2, %g2, %o2
add %o1, %g3, %o1
1: ldx [%o1], %g7
- andn %g7, %g5, %g1
+ andn %g7, %o2, %g1
casx [%o1], %g7, %g1
cmp %g7, %g1
bne,pn %xcc, 1b
- and %g7, %g5, %g2
+ and %g7, %o2, %g2
BITOP_POST_BARRIER
clr %o0
retl
@@ -70,17 +70,17 @@ test_and_clear_bit: /* %o0=nr, %o1=addr */
test_and_change_bit: /* %o0=nr, %o1=addr */
BITOP_PRE_BARRIER
srlx %o0, 6, %g1
- mov 1, %g5
+ mov 1, %o2
sllx %g1, 3, %g3
and %o0, 63, %g2
- sllx %g5, %g2, %g5
+ sllx %o2, %g2, %o2
add %o1, %g3, %o1
1: ldx [%o1], %g7
- xor %g7, %g5, %g1
+ xor %g7, %o2, %g1
casx [%o1], %g7, %g1
cmp %g7, %g1
bne,pn %xcc, 1b
- and %g7, %g5, %g2
+ and %g7, %o2, %g2
BITOP_POST_BARRIER
clr %o0
retl
@@ -91,13 +91,13 @@ test_and_change_bit: /* %o0=nr, %o1=addr */
.type set_bit,#function
set_bit: /* %o0=nr, %o1=addr */
srlx %o0, 6, %g1
- mov 1, %g5
+ mov 1, %o2
sllx %g1, 3, %g3
and %o0, 63, %g2
- sllx %g5, %g2, %g5
+ sllx %o2, %g2, %o2
add %o1, %g3, %o1
1: ldx [%o1], %g7
- or %g7, %g5, %g1
+ or %g7, %o2, %g1
casx [%o1], %g7, %g1
cmp %g7, %g1
bne,pn %xcc, 1b
@@ -110,13 +110,13 @@ set_bit: /* %o0=nr, %o1=addr */
.type clear_bit,#function
clear_bit: /* %o0=nr, %o1=addr */
srlx %o0, 6, %g1
- mov 1, %g5
+ mov 1, %o2
sllx %g1, 3, %g3
and %o0, 63, %g2
- sllx %g5, %g2, %g5
+ sllx %o2, %g2, %o2
add %o1, %g3, %o1
1: ldx [%o1], %g7
- andn %g7, %g5, %g1
+ andn %g7, %o2, %g1
casx [%o1], %g7, %g1
cmp %g7, %g1
bne,pn %xcc, 1b
@@ -129,13 +129,13 @@ clear_bit: /* %o0=nr, %o1=addr */
.type change_bit,#function
change_bit: /* %o0=nr, %o1=addr */
srlx %o0, 6, %g1
- mov 1, %g5
+ mov 1, %o2
sllx %g1, 3, %g3
and %o0, 63, %g2
- sllx %g5, %g2, %g5
+ sllx %o2, %g2, %o2
add %o1, %g3, %o1
1: ldx [%o1], %g7
- xor %g7, %g5, %g1
+ xor %g7, %o2, %g1
casx [%o1], %g7, %g1
cmp %g7, %g1
bne,pn %xcc, 1b
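
Each bitop above computes the 64-bit word index (srlx by 6) and the in-word mask (sllx of 1), then runs the same casx loop on that word. The shape in C, as a sketch of the logic rather than the kernel's real test_and_set_bit:

#include <stdint.h>

/* Sketch: atomically set bit 'nr' in the bitmap at 'addr' and report
 * whether it was already set, mirroring the srlx/sllx/casx sequence. */
static inline int test_and_set_bit_sketch(unsigned long nr, uint64_t *addr)
{
        uint64_t *word = addr + (nr >> 6);      /* srlx nr, 6: word index  */
        uint64_t mask  = (uint64_t)1 << (nr & 63); /* sllx 1, nr & 63: mask */
        uint64_t old, new;

        do {
                old = __atomic_load_n(word, __ATOMIC_RELAXED);
                new = old | mask;
        } while (!__atomic_compare_exchange_n(word, &old, new, 0,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));
        return (old & mask) != 0;
}
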
diff --git a/arch/sparc64/lib/bzero.S b/arch/sparc64/lib/bzero.S
new file mode 100644
index 0000000000000..21a933ffb7c29
--- /dev/null
+++ b/arch/sparc64/lib/bzero.S
@@ -0,0 +1,158 @@
+/* bzero.S: Simple prefetching memset, bzero, and clear_user
+ * implementations.
+ *
+ * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
+ */
+
+ .text
+
+ .globl __memset
+ .type __memset, #function
+__memset: /* %o0=buf, %o1=pat, %o2=len */
+
+ .globl memset
+ .type memset, #function
+memset: /* %o0=buf, %o1=pat, %o2=len */
+ and %o1, 0xff, %o3
+ mov %o2, %o1
+ sllx %o3, 8, %g1
+ or %g1, %o3, %o2
+ sllx %o2, 16, %g1
+ or %g1, %o2, %o2
+ sllx %o2, 32, %g1
+ ba,pt %xcc, 1f
+ or %g1, %o2, %o2
+
+ .globl __bzero
+ .type __bzero, #function
+__bzero: /* %o0=buf, %o1=len */
+ clr %o2
+1: mov %o0, %o3
+ brz,pn %o1, __bzero_done
+ cmp %o1, 16
+ bl,pn %icc, __bzero_tiny
+ prefetch [%o0 + 0x000], #n_writes
+ andcc %o0, 0x3, %g0
+ be,pt %icc, 2f
+1: stb %o2, [%o0 + 0x00]
+ add %o0, 1, %o0
+ andcc %o0, 0x3, %g0
+ bne,pn %icc, 1b
+ sub %o1, 1, %o1
+2: andcc %o0, 0x7, %g0
+ be,pt %icc, 3f
+ stw %o2, [%o0 + 0x00]
+ sub %o1, 4, %o1
+ add %o0, 4, %o0
+3: and %o1, 0x38, %g1
+ cmp %o1, 0x40
+ andn %o1, 0x3f, %o4
+ bl,pn %icc, 5f
+ and %o1, 0x7, %o1
+ prefetch [%o0 + 0x040], #n_writes
+ prefetch [%o0 + 0x080], #n_writes
+ prefetch [%o0 + 0x0c0], #n_writes
+ prefetch [%o0 + 0x100], #n_writes
+ prefetch [%o0 + 0x140], #n_writes
+4: prefetch [%o0 + 0x180], #n_writes
+ stx %o2, [%o0 + 0x00]
+ stx %o2, [%o0 + 0x08]
+ stx %o2, [%o0 + 0x10]
+ stx %o2, [%o0 + 0x18]
+ stx %o2, [%o0 + 0x20]
+ stx %o2, [%o0 + 0x28]
+ stx %o2, [%o0 + 0x30]
+ stx %o2, [%o0 + 0x38]
+ subcc %o4, 0x40, %o4
+ bne,pt %icc, 4b
+ add %o0, 0x40, %o0
+ brz,pn %g1, 6f
+ nop
+5: stx %o2, [%o0 + 0x00]
+ subcc %g1, 8, %g1
+ bne,pt %icc, 5b
+ add %o0, 0x8, %o0
+6: brz,pt %o1, __bzero_done
+ nop
+__bzero_tiny:
+1: stb %o2, [%o0 + 0x00]
+ subcc %o1, 1, %o1
+ bne,pt %icc, 1b
+ add %o0, 1, %o0
+__bzero_done:
+ retl
+ mov %o3, %o0
+ .size __bzero, .-__bzero
+ .size __memset, .-__memset
+ .size memset, .-memset
+
+#define EX_ST(x,y) \
+98: x,y; \
+ .section .fixup; \
+ .align 4; \
+99: retl; \
+ mov %o1, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+
+ .globl __bzero_noasi
+ .type __bzero_noasi, #function
+__bzero_noasi: /* %o0=buf, %o1=len */
+ brz,pn %o1, __bzero_noasi_done
+ cmp %o1, 16
+ bl,pn %icc, __bzero_noasi_tiny
+ EX_ST(prefetcha [%o0 + 0x00] %asi, #n_writes)
+ andcc %o0, 0x3, %g0
+ be,pt %icc, 2f
+1: EX_ST(stba %g0, [%o0 + 0x00] %asi)
+ add %o0, 1, %o0
+ andcc %o0, 0x3, %g0
+ bne,pn %icc, 1b
+ sub %o1, 1, %o1
+2: andcc %o0, 0x7, %g0
+ be,pt %icc, 3f
+ EX_ST(stwa %g0, [%o0 + 0x00] %asi)
+ sub %o1, 4, %o1
+ add %o0, 4, %o0
+3: and %o1, 0x38, %g1
+ cmp %o1, 0x40
+ andn %o1, 0x3f, %o4
+ bl,pn %icc, 5f
+ and %o1, 0x7, %o1
+ EX_ST(prefetcha [%o0 + 0x040] %asi, #n_writes)
+ EX_ST(prefetcha [%o0 + 0x080] %asi, #n_writes)
+ EX_ST(prefetcha [%o0 + 0x0c0] %asi, #n_writes)
+ EX_ST(prefetcha [%o0 + 0x100] %asi, #n_writes)
+ EX_ST(prefetcha [%o0 + 0x140] %asi, #n_writes)
+4: EX_ST(prefetcha [%o0 + 0x180] %asi, #n_writes)
+ EX_ST(stxa %g0, [%o0 + 0x00] %asi)
+ EX_ST(stxa %g0, [%o0 + 0x08] %asi)
+ EX_ST(stxa %g0, [%o0 + 0x10] %asi)
+ EX_ST(stxa %g0, [%o0 + 0x18] %asi)
+ EX_ST(stxa %g0, [%o0 + 0x20] %asi)
+ EX_ST(stxa %g0, [%o0 + 0x28] %asi)
+ EX_ST(stxa %g0, [%o0 + 0x30] %asi)
+ EX_ST(stxa %g0, [%o0 + 0x38] %asi)
+ subcc %o4, 0x40, %o4
+ bne,pt %icc, 4b
+ add %o0, 0x40, %o0
+ brz,pn %g1, 6f
+ nop
+5: EX_ST(stxa %g0, [%o0 + 0x00] %asi)
+ subcc %g1, 8, %g1
+ bne,pt %icc, 5b
+ add %o0, 0x8, %o0
+6: brz,pt %o1, __bzero_noasi_done
+ nop
+__bzero_noasi_tiny:
+1: EX_ST(stba %g0, [%o0 + 0x00] %asi)
+ subcc %o1, 1, %o1
+ bne,pt %icc, 1b
+ add %o0, 1, %o0
+__bzero_noasi_done:
+ retl
+ clr %o0
+ .size __bzero_noasi, .-__bzero_noasi
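
Structurally, the new bzero byte-stores up to 8-byte alignment, clears 64-byte blocks under prefetch, then finishes with 8-byte words and trailing bytes. A simplified C rendering of that control flow (no prefetch hints, and it omits the tiny-length shortcut):

#include <stddef.h>
#include <stdint.h>

/* Sketch of __bzero's structure: align to 8 bytes, clear 64-byte
 * blocks, then leftover 8-byte words, then the remaining bytes. */
static void bzero_sketch(void *buf, size_t len)
{
        unsigned char *p = buf;

        while (len && ((uintptr_t)p & 7)) {     /* align to 8 bytes */
                *p++ = 0;
                len--;
        }
        while (len >= 64) {                     /* 64-byte main loop */
                for (int i = 0; i < 8; i++)
                        ((uint64_t *)p)[i] = 0;
                p += 64;
                len -= 64;
        }
        while (len >= 8) {                      /* leftover 8-byte words */
                *(uint64_t *)p = 0;
                p += 8;
                len -= 8;
        }
        while (len--)                           /* trailing bytes */
                *p++ = 0;
}
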
diff --git a/arch/sparc64/lib/checksum.S b/arch/sparc64/lib/checksum.S
index dc7c887ca17a2..ba9cd3ccc2b26 100644
--- a/arch/sparc64/lib/checksum.S
+++ b/arch/sparc64/lib/checksum.S
@@ -13,500 +13,160 @@
* BSD4.4 portable checksum routine
*/
-#include <asm/errno.h>
-#include <asm/head.h>
-#include <asm/ptrace.h>
-#include <asm/asi.h>
-#include <asm/page.h>
-#include <asm/thread_info.h>
-
- /* The problem with the "add with carry" instructions on Ultra
- * are two fold. Firstly, they cannot pair with jack shit,
- * and also they only add in the 32-bit carry condition bit
- * into the accumulated sum. The following is much better.
- * For larger chunks we use VIS code, which is faster ;)
- */
-
-#define src o0
-#define dst o1
-#define len o2
-#define sum o3
-
.text
- /* I think I have an erection... Once _AGAIN_ the SunSoft
- * engineers are caught asleep at the keyboard, tsk tsk...
- */
-
-#define CSUMCOPY_LASTCHUNK(off, t0, t1) \
- ldxa [%src - off - 0x08] %asi, t0; \
- ldxa [%src - off - 0x00] %asi, t1; \
- nop; nop; \
- addcc t0, %sum, %sum; \
- stw t0, [%dst - off - 0x04]; \
- srlx t0, 32, t0; \
- bcc,pt %xcc, 51f; \
- stw t0, [%dst - off - 0x08]; \
- add %sum, 1, %sum; \
-51: addcc t1, %sum, %sum; \
- stw t1, [%dst - off + 0x04]; \
- srlx t1, 32, t1; \
- bcc,pt %xcc, 52f; \
- stw t1, [%dst - off - 0x00]; \
- add %sum, 1, %sum; \
-52:
-
-cpc_start:
-cc_end_cruft:
- andcc %g7, 8, %g0 ! IEU1 Group
- be,pn %icc, 1f ! CTI
- and %g7, 4, %g5 ! IEU0
- ldxa [%src + 0x00] %asi, %g2 ! Load Group
- add %dst, 8, %dst ! IEU0
- add %src, 8, %src ! IEU1
- addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles
- stw %g2, [%dst - 0x04] ! Store
- srlx %g2, 32, %g2 ! IEU0
- bcc,pt %xcc, 1f ! CTI Group
- stw %g2, [%dst - 0x08] ! Store
- add %sum, 1, %sum ! IEU0
-1: brz,pt %g5, 1f ! CTI Group
- clr %g2 ! IEU0
- lduwa [%src + 0x00] %asi, %g2 ! Load
- add %dst, 4, %dst ! IEU0 Group
- add %src, 4, %src ! IEU1
- stw %g2, [%dst - 0x04] ! Store Group + 2 bubbles
- sllx %g2, 32, %g2 ! IEU0
-1: andcc %g7, 2, %g0 ! IEU1
- be,pn %icc, 1f ! CTI Group
- clr %o4 ! IEU1
- lduha [%src + 0x00] %asi, %o4 ! Load
- add %src, 2, %src ! IEU0 Group
- add %dst, 2, %dst ! IEU1
- sth %o4, [%dst - 0x2] ! Store Group + 2 bubbles
- sll %o4, 16, %o4 ! IEU0
-1: andcc %g7, 1, %g0 ! IEU1
- be,pn %icc, 1f ! CTI Group
- clr %o5 ! IEU0
- lduba [%src + 0x00] %asi, %o5 ! Load
- stb %o5, [%dst + 0x00] ! Store Group + 2 bubbles
- sll %o5, 8, %o5 ! IEU0
-1: or %g2, %o4, %o4 ! IEU1
- or %o5, %o4, %o4 ! IEU0 Group
- addcc %o4, %sum, %sum ! IEU1
- bcc,pt %xcc, ccfold ! CTI
- nop ! IEU0 Group
- b,pt %xcc, ccfold ! CTI
- add %sum, 1, %sum ! IEU1
-
-cc_fixit:
- cmp %len, 6 ! IEU1 Group
- bl,a,pn %icc, ccte ! CTI
- andcc %len, 0xf, %g7 ! IEU1 Group
- andcc %src, 2, %g0 ! IEU1 Group
- be,pn %icc, 1f ! CTI
- andcc %src, 0x4, %g0 ! IEU1 Group
- lduha [%src + 0x00] %asi, %g4 ! Load
- sub %len, 2, %len ! IEU0
- add %src, 2, %src ! IEU0 Group
- add %dst, 2, %dst ! IEU1
- sll %g4, 16, %g3 ! IEU0 Group + 1 bubble
- addcc %g3, %sum, %sum ! IEU1
- bcc,pt %xcc, 0f ! CTI
- srl %sum, 16, %g3 ! IEU0 Group
- add %g3, 1, %g3 ! IEU0 4 clocks (mispredict)
-0: andcc %src, 0x4, %g0 ! IEU1 Group
- sth %g4, [%dst - 0x2] ! Store
- sll %sum, 16, %sum ! IEU0
- sll %g3, 16, %g3 ! IEU0 Group
- srl %sum, 16, %sum ! IEU0 Group
- or %g3, %sum, %sum ! IEU0 Group (regdep)
-1: be,pt %icc, ccmerge ! CTI
- andcc %len, 0xf0, %g1 ! IEU1
- lduwa [%src + 0x00] %asi, %g4 ! Load Group
- sub %len, 4, %len ! IEU0
- add %src, 4, %src ! IEU1
- add %dst, 4, %dst ! IEU0 Group
- addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble
- stw %g4, [%dst - 0x4] ! Store
- bcc,pt %xcc, ccmerge ! CTI
- andcc %len, 0xf0, %g1 ! IEU1 Group
- b,pt %xcc, ccmerge ! CTI 4 clocks (mispredict)
- add %sum, 1, %sum ! IEU0
-
- .align 32
- .globl csum_partial_copy_sparc64
-csum_partial_copy_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */
- xorcc %src, %dst, %o4 ! IEU1 Group
- srl %sum, 0, %sum ! IEU0
- andcc %o4, 3, %g0 ! IEU1 Group
- srl %len, 0, %len ! IEU0
- bne,pn %icc, ccslow ! CTI
- andcc %src, 1, %g0 ! IEU1 Group
- bne,pn %icc, ccslow ! CTI
- cmp %len, 256 ! IEU1 Group
- bgeu,pt %icc, csum_partial_copy_vis ! CTI
- andcc %src, 7, %g0 ! IEU1 Group
- bne,pn %icc, cc_fixit ! CTI
- andcc %len, 0xf0, %g1 ! IEU1 Group
-ccmerge:be,pn %icc, ccte ! CTI
- andcc %len, 0xf, %g7 ! IEU1 Group
- sll %g1, 2, %o4 ! IEU0
-13: sethi %hi(12f), %o5 ! IEU0 Group
- add %src, %g1, %src ! IEU1
- sub %o5, %o4, %o5 ! IEU0 Group
- jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced
- add %dst, %g1, %dst ! IEU0 Group
-cctbl: CSUMCOPY_LASTCHUNK(0xe8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0xd8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0xc8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0xb8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0xa8,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x98,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x88,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x78,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x68,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x58,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x48,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x38,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x28,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x18,%g2,%g3)
- CSUMCOPY_LASTCHUNK(0x08,%g2,%g3)
-12:
- andcc %len, 0xf, %g7 ! IEU1 Group
-ccte: bne,pn %icc, cc_end_cruft ! CTI
- nop ! IEU0
-ccfold: sllx %sum, 32, %o0 ! IEU0 Group
- addcc %sum, %o0, %o0 ! IEU1 Group (regdep)
- srlx %o0, 32, %o0 ! IEU0 Group (regdep)
- bcs,a,pn %xcc, 1f ! CTI
- add %o0, 1, %o0 ! IEU1 4 clocks (mispredict)
-1: retl ! CTI Group brk forced
- ldx [%g6 + TI_TASK], %g4 ! Load
-
-ccslow: mov 0, %g5
- brlez,pn %len, 4f
- andcc %src, 1, %o5
- be,a,pt %icc, 1f
- srl %len, 1, %g7
- sub %len, 1, %len
- lduba [%src] %asi, %g5
- add %src, 1, %src
- stb %g5, [%dst]
- srl %len, 1, %g7
- add %dst, 1, %dst
-1: brz,a,pn %g7, 3f
- andcc %len, 1, %g0
- andcc %src, 2, %g0
- be,a,pt %icc, 1f
- srl %g7, 1, %g7
- lduha [%src] %asi, %o4
- sub %len, 2, %len
- srl %o4, 8, %g2
- sub %g7, 1, %g7
- stb %g2, [%dst]
- add %o4, %g5, %g5
- stb %o4, [%dst + 1]
- add %src, 2, %src
- srl %g7, 1, %g7
- add %dst, 2, %dst
-1: brz,a,pn %g7, 2f
- andcc %len, 2, %g0
- lduwa [%src] %asi, %o4
-5: srl %o4, 24, %g2
- srl %o4, 16, %g3
- stb %g2, [%dst]
- srl %o4, 8, %g2
- stb %g3, [%dst + 1]
- add %src, 4, %src
- stb %g2, [%dst + 2]
- addcc %o4, %g5, %g5
- stb %o4, [%dst + 3]
- addc %g5, %g0, %g5
- add %dst, 4, %dst
- subcc %g7, 1, %g7
- bne,a,pt %icc, 5b
- lduwa [%src] %asi, %o4
- sll %g5, 16, %g2
- srl %g5, 16, %g5
- srl %g2, 16, %g2
- andcc %len, 2, %g0
- add %g2, %g5, %g5
-2: be,a,pt %icc, 3f
- andcc %len, 1, %g0
- lduha [%src] %asi, %o4
- andcc %len, 1, %g0
- srl %o4, 8, %g2
- add %src, 2, %src
- stb %g2, [%dst]
- add %g5, %o4, %g5
- stb %o4, [%dst + 1]
- add %dst, 2, %dst
-3: be,a,pt %icc, 1f
- sll %g5, 16, %o4
- lduba [%src] %asi, %g2
- sll %g2, 8, %o4
- stb %g2, [%dst]
- add %g5, %o4, %g5
- sll %g5, 16, %o4
-1: addcc %o4, %g5, %g5
- srl %g5, 16, %o4
- addc %g0, %o4, %g5
- brz,pt %o5, 4f
- srl %g5, 8, %o4
- and %g5, 0xff, %g2
- and %o4, 0xff, %o4
- sll %g2, 8, %g2
- or %g2, %o4, %g5
-4: addcc %sum, %g5, %sum
- addc %g0, %sum, %o0
- retl
- srl %o0, 0, %o0
-cpc_end:
-
- /* Now the version with userspace as the destination */
-#define CSUMCOPY_LASTCHUNK_USER(off, t0, t1) \
- ldx [%src - off - 0x08], t0; \
- ldx [%src - off - 0x00], t1; \
- nop; nop; \
- addcc t0, %sum, %sum; \
- stwa t0, [%dst - off - 0x04] %asi; \
- srlx t0, 32, t0; \
- bcc,pt %xcc, 51f; \
- stwa t0, [%dst - off - 0x08] %asi; \
- add %sum, 1, %sum; \
-51: addcc t1, %sum, %sum; \
- stwa t1, [%dst - off + 0x04] %asi; \
- srlx t1, 32, t1; \
- bcc,pt %xcc, 52f; \
- stwa t1, [%dst - off - 0x00] %asi; \
- add %sum, 1, %sum; \
-52:
-cpc_user_start:
-cc_user_end_cruft:
- andcc %g7, 8, %g0 ! IEU1 Group
- be,pn %icc, 1f ! CTI
- and %g7, 4, %g5 ! IEU0
- ldx [%src + 0x00], %g2 ! Load Group
- add %dst, 8, %dst ! IEU0
- add %src, 8, %src ! IEU1
- addcc %g2, %sum, %sum ! IEU1 Group + 2 bubbles
- stwa %g2, [%dst - 0x04] %asi ! Store
- srlx %g2, 32, %g2 ! IEU0
- bcc,pt %xcc, 1f ! CTI Group
- stwa %g2, [%dst - 0x08] %asi ! Store
- add %sum, 1, %sum ! IEU0
-1: brz,pt %g5, 1f ! CTI Group
- clr %g2 ! IEU0
- lduw [%src + 0x00], %g2 ! Load
- add %dst, 4, %dst ! IEU0 Group
- add %src, 4, %src ! IEU1
- stwa %g2, [%dst - 0x04] %asi ! Store Group + 2 bubbles
- sllx %g2, 32, %g2 ! IEU0
-1: andcc %g7, 2, %g0 ! IEU1
- be,pn %icc, 1f ! CTI Group
- clr %o4 ! IEU1
- lduh [%src + 0x00], %o4 ! Load
- add %src, 2, %src ! IEU0 Group
- add %dst, 2, %dst ! IEU1
- stha %o4, [%dst - 0x2] %asi ! Store Group + 2 bubbles
- sll %o4, 16, %o4 ! IEU0
-1: andcc %g7, 1, %g0 ! IEU1
- be,pn %icc, 1f ! CTI Group
- clr %o5 ! IEU0
- ldub [%src + 0x00], %o5 ! Load
- stba %o5, [%dst + 0x00] %asi ! Store Group + 2 bubbles
- sll %o5, 8, %o5 ! IEU0
-1: or %g2, %o4, %o4 ! IEU1
- or %o5, %o4, %o4 ! IEU0 Group
- addcc %o4, %sum, %sum ! IEU1
- bcc,pt %xcc, ccuserfold ! CTI
- nop ! IEU0 Group
- b,pt %xcc, ccuserfold ! CTI
- add %sum, 1, %sum ! IEU1
-
-cc_user_fixit:
- cmp %len, 6 ! IEU1 Group
- bl,a,pn %icc, ccuserte ! CTI
- andcc %len, 0xf, %g7 ! IEU1 Group
- andcc %src, 2, %g0 ! IEU1 Group
- be,pn %icc, 1f ! CTI
- andcc %src, 0x4, %g0 ! IEU1 Group
- lduh [%src + 0x00], %g4 ! Load
- sub %len, 2, %len ! IEU0
- add %src, 2, %src ! IEU0 Group
- add %dst, 2, %dst ! IEU1
- sll %g4, 16, %g3 ! IEU0 Group + 1 bubble
- addcc %g3, %sum, %sum ! IEU1
- bcc,pt %xcc, 0f ! CTI
- srl %sum, 16, %g3 ! IEU0 Group
- add %g3, 1, %g3 ! IEU0 4 clocks (mispredict)
-0: andcc %src, 0x4, %g0 ! IEU1 Group
- stha %g4, [%dst - 0x2] %asi ! Store
- sll %sum, 16, %sum ! IEU0
- sll %g3, 16, %g3 ! IEU0 Group
- srl %sum, 16, %sum ! IEU0 Group
- or %g3, %sum, %sum ! IEU0 Group (regdep)
-1: be,pt %icc, ccusermerge ! CTI
- andcc %len, 0xf0, %g1 ! IEU1
- lduw [%src + 0x00], %g4 ! Load Group
- sub %len, 4, %len ! IEU0
- add %src, 4, %src ! IEU1
- add %dst, 4, %dst ! IEU0 Group
- addcc %g4, %sum, %sum ! IEU1 Group + 1 bubble
- stwa %g4, [%dst - 0x4] %asi ! Store
- bcc,pt %xcc, ccusermerge ! CTI
- andcc %len, 0xf0, %g1 ! IEU1 Group
- b,pt %xcc, ccusermerge ! CTI 4 clocks (mispredict)
- add %sum, 1, %sum ! IEU0
+csum_partial_fix_alignment:
+ /* We checked for zero length already, so there must be
+ * at least one byte.
+ */
+ be,pt %icc, 1f
+ nop
+ ldub [%o0 + 0x00], %o4
+ add %o0, 1, %o0
+ sub %o1, 1, %o1
+1: andcc %o0, 0x2, %g0
+ be,pn %icc, csum_partial_post_align
+ cmp %o1, 2
+ blu,pn %icc, csum_partial_end_cruft
+ nop
+ lduh [%o0 + 0x00], %o5
+ add %o0, 2, %o0
+ sub %o1, 2, %o1
+ ba,pt %xcc, csum_partial_post_align
+ add %o5, %o4, %o4
.align 32
- .globl csum_partial_copy_user_sparc64
-csum_partial_copy_user_sparc64: /* %o0=src, %o1=dest, %o2=len, %o3=sum */
- xorcc %src, %dst, %o4 ! IEU1 Group
- srl %sum, 0, %sum ! IEU0
- andcc %o4, 3, %g0 ! IEU1 Group
- srl %len, 0, %len ! IEU0
- bne,pn %icc, ccuserslow ! CTI
- andcc %src, 1, %g0 ! IEU1 Group
- bne,pn %icc, ccuserslow ! CTI
- cmp %len, 256 ! IEU1 Group
- bgeu,pt %icc, csum_partial_copy_user_vis ! CTI
- andcc %src, 7, %g0 ! IEU1 Group
- bne,pn %icc, cc_user_fixit ! CTI
- andcc %len, 0xf0, %g1 ! IEU1 Group
-ccusermerge:
- be,pn %icc, ccuserte ! CTI
- andcc %len, 0xf, %g7 ! IEU1 Group
- sll %g1, 2, %o4 ! IEU0
-13: sethi %hi(12f), %o5 ! IEU0 Group
- add %src, %g1, %src ! IEU1
- sub %o5, %o4, %o5 ! IEU0 Group
- jmpl %o5 + %lo(12f), %g0 ! CTI Group brk forced
- add %dst, %g1, %dst ! IEU0 Group
-ccusertbl:
- CSUMCOPY_LASTCHUNK_USER(0xe8,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0xd8,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0xc8,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0xb8,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0xa8,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x98,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x88,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x78,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x68,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x58,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x48,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x38,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x28,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x18,%g2,%g3)
- CSUMCOPY_LASTCHUNK_USER(0x08,%g2,%g3)
-12:
- andcc %len, 0xf, %g7 ! IEU1 Group
-ccuserte:
- bne,pn %icc, cc_user_end_cruft ! CTI
- nop ! IEU0
-ccuserfold:
- sllx %sum, 32, %o0 ! IEU0 Group
- addcc %sum, %o0, %o0 ! IEU1 Group (regdep)
- srlx %o0, 32, %o0 ! IEU0 Group (regdep)
- bcs,a,pn %xcc, 1f ! CTI
- add %o0, 1, %o0 ! IEU1 4 clocks (mispredict)
-1: retl ! CTI Group brk forced
- ldx [%g6 + TI_TASK], %g4 ! IEU0 Group
-
-ccuserslow:
- mov 0, %g5
- brlez,pn %len, 4f
- andcc %src, 1, %o5
- be,a,pt %icc, 1f
- srl %len, 1, %g7
- sub %len, 1, %len
- ldub [%src], %g5
- add %src, 1, %src
- stba %g5, [%dst] %asi
- srl %len, 1, %g7
- add %dst, 1, %dst
-1: brz,a,pn %g7, 3f
- andcc %len, 1, %g0
- andcc %src, 2, %g0
- be,a,pt %icc, 1f
- srl %g7, 1, %g7
- lduh [%src], %o4
- sub %len, 2, %len
- srl %o4, 8, %g2
- sub %g7, 1, %g7
- stba %g2, [%dst] %asi
- add %o4, %g5, %g5
- stba %o4, [%dst + 1] %asi
- add %src, 2, %src
- srl %g7, 1, %g7
- add %dst, 2, %dst
-1: brz,a,pn %g7, 2f
- andcc %len, 2, %g0
- lduw [%src], %o4
-5: srl %o4, 24, %g2
- srl %o4, 16, %g3
- stba %g2, [%dst] %asi
- srl %o4, 8, %g2
- stba %g3, [%dst + 1] %asi
- add %src, 4, %src
- stba %g2, [%dst + 2] %asi
- addcc %o4, %g5, %g5
- stba %o4, [%dst + 3] %asi
- addc %g5, %g0, %g5
- add %dst, 4, %dst
- subcc %g7, 1, %g7
- bne,a,pt %icc, 5b
- lduw [%src], %o4
- sll %g5, 16, %g2
- srl %g5, 16, %g5
- srl %g2, 16, %g2
- andcc %len, 2, %g0
- add %g2, %g5, %g5
-2: be,a,pt %icc, 3f
- andcc %len, 1, %g0
- lduh [%src], %o4
- andcc %len, 1, %g0
- srl %o4, 8, %g2
- add %src, 2, %src
- stba %g2, [%dst] %asi
- add %g5, %o4, %g5
- stba %o4, [%dst + 1] %asi
- add %dst, 2, %dst
-3: be,a,pt %icc, 1f
- sll %g5, 16, %o4
- ldub [%src], %g2
- sll %g2, 8, %o4
- stba %g2, [%dst] %asi
- add %g5, %o4, %g5
- sll %g5, 16, %o4
-1: addcc %o4, %g5, %g5
- srl %g5, 16, %o4
- addc %g0, %o4, %g5
- brz,pt %o5, 4f
- srl %g5, 8, %o4
- and %g5, 0xff, %g2
- and %o4, 0xff, %o4
- sll %g2, 8, %g2
- or %g2, %o4, %g5
-4: addcc %sum, %g5, %sum
- addc %g0, %sum, %o0
- retl
- srl %o0, 0, %o0
-cpc_user_end:
-
- .globl cpc_handler
-cpc_handler:
- ldx [%sp + 0x7ff + 128], %g1
- ldub [%g6 + TI_CURRENT_DS], %g3
- sub %g0, EFAULT, %g2
- brnz,a,pt %g1, 1f
- st %g2, [%g1]
-1: wr %g3, %g0, %asi
+ .globl csum_partial
+csum_partial: /* %o0=buff, %o1=len, %o2=sum */
+ prefetch [%o0 + 0x000], #n_reads
+ clr %o4
+ prefetch [%o0 + 0x040], #n_reads
+ brz,pn %o1, csum_partial_finish
+ andcc %o0, 0x3, %g0
+
+ /* We "remember" whether the lowest bit in the address
+ * was set in %g7. Because if it is, we have to swap
+ * upper and lower 8 bit fields of the sum we calculate.
+ */
+ bne,pn %icc, csum_partial_fix_alignment
+ andcc %o0, 0x1, %g7
+
+csum_partial_post_align:
+ prefetch [%o0 + 0x080], #n_reads
+ andncc %o1, 0x3f, %o3
+
+ prefetch [%o0 + 0x0c0], #n_reads
+ sub %o1, %o3, %o1
+ brz,pn %o3, 2f
+ prefetch [%o0 + 0x100], #n_reads
+
+ /* So that we don't need to use the non-pairing
+ * add-with-carry instructions we accumulate 32-bit
+ * values into a 64-bit register. At the end of the
+ * loop we fold it down to 32-bits and so on.
+ */
+ prefetch [%o0 + 0x140], #n_reads
+1: lduw [%o0 + 0x00], %o5
+ lduw [%o0 + 0x04], %g1
+ lduw [%o0 + 0x08], %g2
+ add %o4, %o5, %o4
+ lduw [%o0 + 0x0c], %g3
+ add %o4, %g1, %o4
+ lduw [%o0 + 0x10], %o5
+ add %o4, %g2, %o4
+ lduw [%o0 + 0x14], %g1
+ add %o4, %g3, %o4
+ lduw [%o0 + 0x18], %g2
+ add %o4, %o5, %o4
+ lduw [%o0 + 0x1c], %g3
+ add %o4, %g1, %o4
+ lduw [%o0 + 0x20], %o5
+ add %o4, %g2, %o4
+ lduw [%o0 + 0x24], %g1
+ add %o4, %g3, %o4
+ lduw [%o0 + 0x28], %g2
+ add %o4, %o5, %o4
+ lduw [%o0 + 0x2c], %g3
+ add %o4, %g1, %o4
+ lduw [%o0 + 0x30], %o5
+ add %o4, %g2, %o4
+ lduw [%o0 + 0x34], %g1
+ add %o4, %g3, %o4
+ lduw [%o0 + 0x38], %g2
+ add %o4, %o5, %o4
+ lduw [%o0 + 0x3c], %g3
+ add %o4, %g1, %o4
+ prefetch [%o0 + 0x180], #n_reads
+ add %o4, %g2, %o4
+ subcc %o3, 0x40, %o3
+ add %o0, 0x40, %o0
+ bne,pt %icc, 1b
+ add %o4, %g3, %o4
+
+2: and %o1, 0x3c, %o3
+ brz,pn %o3, 2f
+ sub %o1, %o3, %o1
+1: lduw [%o0 + 0x00], %o5
+ subcc %o3, 0x4, %o3
+ add %o0, 0x4, %o0
+ bne,pt %icc, 1b
+ add %o4, %o5, %o4
+
+2:
+ /* fold 64-->32 */
+ srlx %o4, 32, %o5
+ srl %o4, 0, %o4
+ add %o4, %o5, %o4
+ srlx %o4, 32, %o5
+ srl %o4, 0, %o4
+ add %o4, %o5, %o4
+
+ /* fold 32-->16 */
+ sethi %hi(0xffff0000), %g1
+ srl %o4, 16, %o5
+ andn %o4, %g1, %g2
+ add %o5, %g2, %o4
+ srl %o4, 16, %o5
+ andn %o4, %g1, %g2
+ add %o5, %g2, %o4
+
+csum_partial_end_cruft:
+	/* %o4 has the 16-bit sum we have calculated so far. */
+ cmp %o1, 2
+ blu,pt %icc, 1f
+ nop
+ lduh [%o0 + 0x00], %o5
+ sub %o1, 2, %o1
+ add %o0, 2, %o0
+ add %o4, %o5, %o4
+1: brz,pt %o1, 1f
+ nop
+ ldub [%o0 + 0x00], %o5
+ sub %o1, 1, %o1
+ add %o0, 1, %o0
+ sllx %o5, 8, %o5
+ add %o4, %o5, %o4
+1:
+ /* fold 32-->16 */
+ sethi %hi(0xffff0000), %g1
+ srl %o4, 16, %o5
+ andn %o4, %g1, %g2
+ add %o5, %g2, %o4
+ srl %o4, 16, %o5
+ andn %o4, %g1, %g2
+ add %o5, %g2, %o4
+
+1: brz,pt %g7, 1f
+ nop
+
+ /* We started with an odd byte, byte-swap the result. */
+ srl %o4, 8, %o5
+ and %o4, 0xff, %g1
+ sll %g1, 8, %g1
+ or %o5, %g1, %o4
+
+1: add %o2, %o4, %o2
+
+csum_partial_finish:
retl
- ldx [%g6 + TI_TASK], %g4
-
- .section __ex_table
- .align 4
- .word cpc_start, 0, cpc_end, cpc_handler
- .word cpc_user_start, 0, cpc_user_end, cpc_handler
+ mov %o2, %o0
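
The two folding blocks reduce the 64-bit accumulator to a 16-bit ones'-complement sum: the two 32-bit halves are added (twice, to absorb the carry), then the two 16-bit halves. The same arithmetic in C, as a sketch (the real csum_partial also adds in the caller's seed sum and handles the odd-start byte swap):

#include <stdint.h>

/* Fold a 64-bit accumulator of 32-bit partial sums down to 16 bits,
 * mirroring the "fold 64-->32" and "fold 32-->16" blocks above. */
static uint16_t fold_csum(uint64_t sum)
{
        /* fold 64 --> 32: add the halves twice so the carry is absorbed */
        sum = (sum >> 32) + (sum & 0xffffffffUL);
        sum = (sum >> 32) + (sum & 0xffffffffUL);

        /* fold 32 --> 16: same trick on the 16-bit halves */
        sum = (sum >> 16) + (sum & 0xffffUL);
        sum = (sum >> 16) + (sum & 0xffffUL);

        return (uint16_t)sum;
}
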
diff --git a/arch/sparc64/lib/csum_copy.S b/arch/sparc64/lib/csum_copy.S
new file mode 100644
index 0000000000000..71af488390646
--- /dev/null
+++ b/arch/sparc64/lib/csum_copy.S
@@ -0,0 +1,308 @@
+/* csum_copy.S: Checksum+copy code for sparc64
+ *
+ * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
+ */
+
+#ifdef __KERNEL__
+#define GLOBAL_SPARE %g7
+#else
+#define GLOBAL_SPARE %g5
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x) x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x) x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x) x
+#endif
+
+#ifndef LOAD
+#define LOAD(type,addr,dest) type [addr], dest
+#endif
+
+#ifndef STORE
+#define STORE(type,src,addr) type src, [addr]
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME csum_partial_copy_nocheck
+#endif
+
+ .register %g2, #scratch
+ .register %g3, #scratch
+
+ .text
+
+90:
+ /* We checked for zero length already, so there must be
+ * at least one byte.
+ */
+ be,pt %icc, 1f
+ nop
+ EX_LD(LOAD(ldub, %o0 + 0x00, %o4))
+ add %o0, 1, %o0
+ sub %o2, 1, %o2
+ EX_ST(STORE(stb, %o4, %o1 + 0x00))
+ add %o1, 1, %o1
+1: andcc %o0, 0x2, %g0
+ be,pn %icc, 80f
+ cmp %o2, 2
+ blu,pn %icc, 60f
+ nop
+ EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
+ add %o0, 2, %o0
+ sub %o2, 2, %o2
+ EX_ST(STORE(sth, %o5, %o1 + 0x00))
+ add %o1, 2, %o1
+ ba,pt %xcc, 80f
+ add %o5, %o4, %o4
+
+ .globl FUNC_NAME
+FUNC_NAME: /* %o0=src, %o1=dst, %o2=len, %o3=sum */
+ LOAD(prefetch, %o0 + 0x000, #n_reads)
+ xor %o0, %o1, %g1
+ clr %o4
+ andcc %g1, 0x3, %g0
+ bne,pn %icc, 95f
+ LOAD(prefetch, %o0 + 0x040, #n_reads)
+
+ brz,pn %o2, 70f
+ andcc %o0, 0x3, %g0
+
+ /* We "remember" whether the lowest bit in the address
+ * was set in GLOBAL_SPARE. Because if it is, we have to swap
+ * upper and lower 8 bit fields of the sum we calculate.
+ */
+ bne,pn %icc, 90b
+ andcc %o0, 0x1, GLOBAL_SPARE
+
+80:
+ LOAD(prefetch, %o0 + 0x080, #n_reads)
+ andncc %o2, 0x3f, %g3
+
+ LOAD(prefetch, %o0 + 0x0c0, #n_reads)
+ sub %o2, %g3, %o2
+ brz,pn %g3, 2f
+ LOAD(prefetch, %o0 + 0x100, #n_reads)
+
+ /* So that we don't need to use the non-pairing
+ * add-with-carry instructions we accumulate 32-bit
+ * values into a 64-bit register. At the end of the
+ * loop we fold it down to 32-bits and so on.
+ */
+ ba,pt %xcc, 1f
+ LOAD(prefetch, %o0 + 0x140, #n_reads)
+
+ .align 32
+1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
+ EX_LD(LOAD(lduw, %o0 + 0x04, %g1))
+ EX_LD(LOAD(lduw, %o0 + 0x08, %g2))
+ add %o4, %o5, %o4
+ EX_ST(STORE(stw, %o5, %o1 + 0x00))
+ EX_LD(LOAD(lduw, %o0 + 0x0c, %o5))
+ add %o4, %g1, %o4
+ EX_ST(STORE(stw, %g1, %o1 + 0x04))
+ EX_LD(LOAD(lduw, %o0 + 0x10, %g1))
+ add %o4, %g2, %o4
+ EX_ST(STORE(stw, %g2, %o1 + 0x08))
+ EX_LD(LOAD(lduw, %o0 + 0x14, %g2))
+ add %o4, %o5, %o4
+ EX_ST(STORE(stw, %o5, %o1 + 0x0c))
+ EX_LD(LOAD(lduw, %o0 + 0x18, %o5))
+ add %o4, %g1, %o4
+ EX_ST(STORE(stw, %g1, %o1 + 0x10))
+ EX_LD(LOAD(lduw, %o0 + 0x1c, %g1))
+ add %o4, %g2, %o4
+ EX_ST(STORE(stw, %g2, %o1 + 0x14))
+ EX_LD(LOAD(lduw, %o0 + 0x20, %g2))
+ add %o4, %o5, %o4
+ EX_ST(STORE(stw, %o5, %o1 + 0x18))
+ EX_LD(LOAD(lduw, %o0 + 0x24, %o5))
+ add %o4, %g1, %o4
+ EX_ST(STORE(stw, %g1, %o1 + 0x1c))
+ EX_LD(LOAD(lduw, %o0 + 0x28, %g1))
+ add %o4, %g2, %o4
+ EX_ST(STORE(stw, %g2, %o1 + 0x20))
+ EX_LD(LOAD(lduw, %o0 + 0x2c, %g2))
+ add %o4, %o5, %o4
+ EX_ST(STORE(stw, %o5, %o1 + 0x24))
+ EX_LD(LOAD(lduw, %o0 + 0x30, %o5))
+ add %o4, %g1, %o4
+ EX_ST(STORE(stw, %g1, %o1 + 0x28))
+ EX_LD(LOAD(lduw, %o0 + 0x34, %g1))
+ add %o4, %g2, %o4
+ EX_ST(STORE(stw, %g2, %o1 + 0x2c))
+ EX_LD(LOAD(lduw, %o0 + 0x38, %g2))
+ add %o4, %o5, %o4
+ EX_ST(STORE(stw, %o5, %o1 + 0x30))
+ EX_LD(LOAD(lduw, %o0 + 0x3c, %o5))
+ add %o4, %g1, %o4
+ EX_ST(STORE(stw, %g1, %o1 + 0x34))
+ LOAD(prefetch, %o0 + 0x180, #n_reads)
+ add %o4, %g2, %o4
+ EX_ST(STORE(stw, %g2, %o1 + 0x38))
+ subcc %g3, 0x40, %g3
+ add %o0, 0x40, %o0
+ add %o4, %o5, %o4
+ EX_ST(STORE(stw, %o5, %o1 + 0x3c))
+ bne,pt %icc, 1b
+ add %o1, 0x40, %o1
+
+2: and %o2, 0x3c, %g3
+ brz,pn %g3, 2f
+ sub %o2, %g3, %o2
+1: EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
+ subcc %g3, 0x4, %g3
+ add %o0, 0x4, %o0
+ add %o4, %o5, %o4
+ EX_ST(STORE(stw, %o5, %o1 + 0x00))
+ bne,pt %icc, 1b
+ add %o1, 0x4, %o1
+
+2:
+ /* fold 64-->32 */
+ srlx %o4, 32, %o5
+ srl %o4, 0, %o4
+ add %o4, %o5, %o4
+ srlx %o4, 32, %o5
+ srl %o4, 0, %o4
+ add %o4, %o5, %o4
+
+ /* fold 32-->16 */
+ sethi %hi(0xffff0000), %g1
+ srl %o4, 16, %o5
+ andn %o4, %g1, %g2
+ add %o5, %g2, %o4
+ srl %o4, 16, %o5
+ andn %o4, %g1, %g2
+ add %o5, %g2, %o4
+
+60:
+	/* %o4 has the 16-bit sum we have calculated so far. */
+ cmp %o2, 2
+ blu,pt %icc, 1f
+ nop
+ EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
+ sub %o2, 2, %o2
+ add %o0, 2, %o0
+ add %o4, %o5, %o4
+ EX_ST(STORE(sth, %o5, %o1 + 0x00))
+ add %o1, 0x2, %o1
+1: brz,pt %o2, 1f
+ nop
+ EX_LD(LOAD(ldub, %o0 + 0x00, %o5))
+ sub %o2, 1, %o2
+ add %o0, 1, %o0
+ EX_ST(STORE(stb, %o5, %o1 + 0x00))
+ sllx %o5, 8, %o5
+ add %o1, 1, %o1
+ add %o4, %o5, %o4
+1:
+ /* fold 32-->16 */
+ sethi %hi(0xffff0000), %g1
+ srl %o4, 16, %o5
+ andn %o4, %g1, %g2
+ add %o5, %g2, %o4
+ srl %o4, 16, %o5
+ andn %o4, %g1, %g2
+ add %o5, %g2, %o4
+
+1: brz,pt GLOBAL_SPARE, 1f
+ nop
+
+ /* We started with an odd byte, byte-swap the result. */
+ srl %o4, 8, %o5
+ and %o4, 0xff, %g1
+ sll %g1, 8, %g1
+ or %o5, %g1, %o4
+
+1: add %o3, %o4, %o3
+
+70:
+ retl
+ mov %o3, %o0
+
+95: mov 0, GLOBAL_SPARE
+ brlez,pn %o2, 4f
+ andcc %o0, 1, %o5
+ be,a,pt %icc, 1f
+ srl %o2, 1, %g1
+ sub %o2, 1, %o2
+ EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE))
+ add %o0, 1, %o0
+ EX_ST(STORE(stb, GLOBAL_SPARE, %o1))
+ srl %o2, 1, %g1
+ add %o1, 1, %o1
+1: brz,a,pn %g1, 3f
+ andcc %o2, 1, %g0
+ andcc %o0, 2, %g0
+ be,a,pt %icc, 1f
+ srl %g1, 1, %g1
+ EX_LD(LOAD(lduh, %o0, %o4))
+ sub %o2, 2, %o2
+ srl %o4, 8, %g2
+ sub %g1, 1, %g1
+ EX_ST(STORE(stb, %g2, %o1))
+ add %o4, GLOBAL_SPARE, GLOBAL_SPARE
+ EX_ST(STORE(stb, %o4, %o1 + 1))
+ add %o0, 2, %o0
+ srl %g1, 1, %g1
+ add %o1, 2, %o1
+1: brz,a,pn %g1, 2f
+ andcc %o2, 2, %g0
+ EX_LD(LOAD(lduw, %o0, %o4))
+5: srl %o4, 24, %g2
+ srl %o4, 16, %g3
+ EX_ST(STORE(stb, %g2, %o1))
+ srl %o4, 8, %g2
+ EX_ST(STORE(stb, %g3, %o1 + 1))
+ add %o0, 4, %o0
+ EX_ST(STORE(stb, %g2, %o1 + 2))
+ addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE
+ EX_ST(STORE(stb, %o4, %o1 + 3))
+ addc GLOBAL_SPARE, %g0, GLOBAL_SPARE
+ add %o1, 4, %o1
+ subcc %g1, 1, %g1
+ bne,a,pt %icc, 5b
+ EX_LD(LOAD(lduw, %o0, %o4))
+ sll GLOBAL_SPARE, 16, %g2
+ srl GLOBAL_SPARE, 16, GLOBAL_SPARE
+ srl %g2, 16, %g2
+ andcc %o2, 2, %g0
+ add %g2, GLOBAL_SPARE, GLOBAL_SPARE
+2: be,a,pt %icc, 3f
+ andcc %o2, 1, %g0
+ EX_LD(LOAD(lduh, %o0, %o4))
+ andcc %o2, 1, %g0
+ srl %o4, 8, %g2
+ add %o0, 2, %o0
+ EX_ST(STORE(stb, %g2, %o1))
+ add GLOBAL_SPARE, %o4, GLOBAL_SPARE
+ EX_ST(STORE(stb, %o4, %o1 + 1))
+ add %o1, 2, %o1
+3: be,a,pt %icc, 1f
+ sll GLOBAL_SPARE, 16, %o4
+ EX_LD(LOAD(ldub, %o0, %g2))
+ sll %g2, 8, %o4
+ EX_ST(STORE(stb, %g2, %o1))
+ add GLOBAL_SPARE, %o4, GLOBAL_SPARE
+ sll GLOBAL_SPARE, 16, %o4
+1: addcc %o4, GLOBAL_SPARE, GLOBAL_SPARE
+ srl GLOBAL_SPARE, 16, %o4
+ addc %g0, %o4, GLOBAL_SPARE
+ brz,pt %o5, 4f
+ srl GLOBAL_SPARE, 8, %o4
+ and GLOBAL_SPARE, 0xff, %g2
+ and %o4, 0xff, %o4
+ sll %g2, 8, %g2
+ or %g2, %o4, GLOBAL_SPARE
+4: addcc %o3, GLOBAL_SPARE, %o3
+ addc %g0, %o3, %o0
+ retl
+ srl %o0, 0, %o0
+ .size FUNC_NAME, .-FUNC_NAME
diff --git a/arch/sparc64/lib/csum_copy_from_user.S b/arch/sparc64/lib/csum_copy_from_user.S
new file mode 100644
index 0000000000000..817ebdae39f8e
--- /dev/null
+++ b/arch/sparc64/lib/csum_copy_from_user.S
@@ -0,0 +1,21 @@
+/* csum_copy_from_user.S: Checksum+copy from userspace.
+ *
+ * Copyright (C) 2005 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_LD(x) \
+98: x; \
+ .section .fixup; \
+ .align 4; \
+99: retl; \
+ mov -1, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+
+#define FUNC_NAME __csum_partial_copy_from_user
+#define LOAD(type,addr,dest) type##a [addr] %asi, dest
+
+#include "csum_copy.S"
diff --git a/arch/sparc64/lib/csum_copy_to_user.S b/arch/sparc64/lib/csum_copy_to_user.S
new file mode 100644
index 0000000000000..c2f9463ea1e26
--- /dev/null
+++ b/arch/sparc64/lib/csum_copy_to_user.S
@@ -0,0 +1,21 @@
+/* csum_copy_to_user.S: Checksum+copy to userspace.
+ *
+ * Copyright (C) 2005 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_ST(x) \
+98: x; \
+ .section .fixup; \
+ .align 4; \
+99: retl; \
+ mov -1, %o0; \
+ .section __ex_table; \
+ .align 4; \
+ .word 98b, 99b; \
+ .text; \
+ .align 4;
+
+#define FUNC_NAME __csum_partial_copy_to_user
+#define STORE(type,src,addr) type##a src, [addr] %asi
+
+#include "csum_copy.S"
diff --git a/arch/sparc64/lib/debuglocks.c b/arch/sparc64/lib/debuglocks.c
index 46e5ebfb4b7ce..c421e0c653253 100644
--- a/arch/sparc64/lib/debuglocks.c
+++ b/arch/sparc64/lib/debuglocks.c
@@ -138,15 +138,15 @@ wlock_again:
}
/* Try once to increment the counter. */
__asm__ __volatile__(
-" ldx [%0], %%g5\n"
-" brlz,a,pn %%g5, 2f\n"
+" ldx [%0], %%g1\n"
+" brlz,a,pn %%g1, 2f\n"
" mov 1, %0\n"
-" add %%g5, 1, %%g7\n"
-" casx [%0], %%g5, %%g7\n"
-" sub %%g5, %%g7, %0\n"
+" add %%g1, 1, %%g7\n"
+" casx [%0], %%g1, %%g7\n"
+" sub %%g1, %%g7, %0\n"
"2:" : "=r" (val)
: "0" (&(rw->lock))
- : "g5", "g7", "memory");
+ : "g1", "g7", "memory");
membar("#StoreLoad | #StoreStore");
if (val)
goto wlock_again;
@@ -173,14 +173,14 @@ runlock_again:
/* Spin trying to decrement the counter using casx. */
__asm__ __volatile__(
" membar #StoreLoad | #LoadLoad\n"
-" ldx [%0], %%g5\n"
-" sub %%g5, 1, %%g7\n"
-" casx [%0], %%g5, %%g7\n"
+" ldx [%0], %%g1\n"
+" sub %%g1, 1, %%g7\n"
+" casx [%0], %%g1, %%g7\n"
" membar #StoreLoad | #StoreStore\n"
-" sub %%g5, %%g7, %0\n"
+" sub %%g1, %%g7, %0\n"
: "=r" (val)
: "0" (&(rw->lock))
- : "g5", "g7", "memory");
+ : "g1", "g7", "memory");
if (val) {
if (!--stuck) {
if (shown++ <= 2)
@@ -216,17 +216,17 @@ wlock_again:
__asm__ __volatile__(
" mov 1, %%g3\n"
" sllx %%g3, 63, %%g3\n"
-" ldx [%0], %%g5\n"
-" brlz,pn %%g5, 1f\n"
-" or %%g5, %%g3, %%g7\n"
-" casx [%0], %%g5, %%g7\n"
+" ldx [%0], %%g1\n"
+" brlz,pn %%g1, 1f\n"
+" or %%g1, %%g3, %%g7\n"
+" casx [%0], %%g1, %%g7\n"
" membar #StoreLoad | #StoreStore\n"
" ba,pt %%xcc, 2f\n"
-" sub %%g5, %%g7, %0\n"
+" sub %%g1, %%g7, %0\n"
"1: mov 1, %0\n"
"2:" : "=r" (val)
: "0" (&(rw->lock))
- : "g3", "g5", "g7", "memory");
+ : "g3", "g1", "g7", "memory");
if (val) {
/* We couldn't get the write bit. */
if (!--stuck) {
@@ -248,15 +248,15 @@ wlock_again:
__asm__ __volatile__(
" mov 1, %%g3\n"
" sllx %%g3, 63, %%g3\n"
-"1: ldx [%0], %%g5\n"
-" andn %%g5, %%g3, %%g7\n"
-" casx [%0], %%g5, %%g7\n"
-" cmp %%g5, %%g7\n"
+"1: ldx [%0], %%g1\n"
+" andn %%g1, %%g3, %%g7\n"
+" casx [%0], %%g1, %%g7\n"
+" cmp %%g1, %%g7\n"
" bne,pn %%xcc, 1b\n"
" membar #StoreLoad | #StoreStore"
: /* no outputs */
: "r" (&(rw->lock))
- : "g3", "g5", "g7", "cc", "memory");
+ : "g3", "g1", "g7", "cc", "memory");
while(rw->lock != 0) {
if (!--stuck) {
if (shown++ <= 2)
@@ -294,14 +294,14 @@ wlock_again:
" membar #StoreLoad | #LoadLoad\n"
" mov 1, %%g3\n"
" sllx %%g3, 63, %%g3\n"
-" ldx [%0], %%g5\n"
-" andn %%g5, %%g3, %%g7\n"
-" casx [%0], %%g5, %%g7\n"
+" ldx [%0], %%g1\n"
+" andn %%g1, %%g3, %%g7\n"
+" casx [%0], %%g1, %%g7\n"
" membar #StoreLoad | #StoreStore\n"
-" sub %%g5, %%g7, %0\n"
+" sub %%g1, %%g7, %0\n"
: "=r" (val)
: "0" (&(rw->lock))
- : "g3", "g5", "g7", "memory");
+ : "g3", "g1", "g7", "memory");
if (val) {
if (!--stuck) {
if (shown++ <= 2)
@@ -323,17 +323,17 @@ int _do_write_trylock (rwlock_t *rw, char *str)
__asm__ __volatile__(
" mov 1, %%g3\n"
" sllx %%g3, 63, %%g3\n"
-" ldx [%0], %%g5\n"
-" brlz,pn %%g5, 1f\n"
-" or %%g5, %%g3, %%g7\n"
-" casx [%0], %%g5, %%g7\n"
+" ldx [%0], %%g1\n"
+" brlz,pn %%g1, 1f\n"
+" or %%g1, %%g3, %%g7\n"
+" casx [%0], %%g1, %%g7\n"
" membar #StoreLoad | #StoreStore\n"
" ba,pt %%xcc, 2f\n"
-" sub %%g5, %%g7, %0\n"
+" sub %%g1, %%g7, %0\n"
"1: mov 1, %0\n"
"2:" : "=r" (val)
: "0" (&(rw->lock))
- : "g3", "g5", "g7", "memory");
+ : "g3", "g1", "g7", "memory");
if (val) {
put_cpu();
@@ -347,15 +347,15 @@ int _do_write_trylock (rwlock_t *rw, char *str)
__asm__ __volatile__(
" mov 1, %%g3\n"
" sllx %%g3, 63, %%g3\n"
-"1: ldx [%0], %%g5\n"
-" andn %%g5, %%g3, %%g7\n"
-" casx [%0], %%g5, %%g7\n"
-" cmp %%g5, %%g7\n"
+"1: ldx [%0], %%g1\n"
+" andn %%g1, %%g3, %%g7\n"
+" casx [%0], %%g1, %%g7\n"
+" cmp %%g1, %%g7\n"
" bne,pn %%xcc, 1b\n"
" membar #StoreLoad | #StoreStore"
: /* no outputs */
: "r" (&(rw->lock))
- : "g3", "g5", "g7", "cc", "memory");
+ : "g3", "g1", "g7", "cc", "memory");
put_cpu();
diff --git a/arch/sparc64/lib/dec_and_lock.S b/arch/sparc64/lib/dec_and_lock.S
index e86906744cf6f..7e6fdaebedbab 100644
--- a/arch/sparc64/lib/dec_and_lock.S
+++ b/arch/sparc64/lib/dec_and_lock.S
@@ -27,12 +27,12 @@
.globl _atomic_dec_and_lock
_atomic_dec_and_lock: /* %o0 = counter, %o1 = lock */
-loop1: lduw [%o0], %g5
- subcc %g5, 1, %g7
+loop1: lduw [%o0], %g2
+ subcc %g2, 1, %g7
be,pn %icc, start_to_zero
nop
-nzero: cas [%o0], %g5, %g7
- cmp %g5, %g7
+nzero: cas [%o0], %g2, %g7
+ cmp %g2, %g7
bne,pn %icc, loop1
mov 0, %g1
@@ -50,13 +50,13 @@ to_zero:
ldstub [%o1], %g3
brnz,pn %g3, spin_on_lock
membar #StoreLoad | #StoreStore
-loop2: cas [%o0], %g5, %g7 /* ASSERT(g7 == 0) */
- cmp %g5, %g7
+loop2: cas [%o0], %g2, %g7 /* ASSERT(g7 == 0) */
+ cmp %g2, %g7
be,pt %icc, out
mov 1, %g1
- lduw [%o0], %g5
- subcc %g5, 1, %g7
+ lduw [%o0], %g2
+ subcc %g2, 1, %g7
be,pn %icc, loop2
nop
membar #StoreStore | #LoadStore
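
_atomic_dec_and_lock implements the usual "decrement, and take the lock only when the count reaches zero" contract: the common case decrements locklessly with cas, and only the final decrement is done under the spinlock. Its semantics, sketched with C11 atomics against a generic flag lock (illustrative only):

#include <stdatomic.h>
#include <stdbool.h>

/* Sketch of atomic_dec_and_lock: returns true with 'lock' held iff
 * this decrement took the counter to zero. */
static bool dec_and_lock_sketch(atomic_int *counter, atomic_flag *lock)
{
        int old = atomic_load(counter);

        /* Fast path: decrement without the lock while the result stays > 0. */
        while (old > 1) {
                if (atomic_compare_exchange_weak(counter, &old, old - 1))
                        return false;
        }

        /* Slow path: take the lock, then do the final decrement under it. */
        while (atomic_flag_test_and_set(lock))
                ;                               /* spin */
        if (atomic_fetch_sub(counter, 1) == 1)
                return true;                    /* hit zero: caller keeps lock */
        atomic_flag_clear(lock);                /* raced: someone else bumped it */
        return false;
}
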
diff --git a/arch/sparc64/lib/mcount.S b/arch/sparc64/lib/mcount.S
index 4e8c7928c49f1..2ef2e268bdcfd 100644
--- a/arch/sparc64/lib/mcount.S
+++ b/arch/sparc64/lib/mcount.S
@@ -38,22 +38,22 @@ _mcount:
* Check whether %sp is dangerously low.
*/
ldub [%g6 + TI_FPDEPTH], %g1
- srl %g1, 1, %g5
- add %g5, 1, %g5
- sllx %g5, 8, %g5 ! each fpregs frame is 256b
- add %g5, 192, %g5
- add %g6, %g5, %g5 ! where does task_struct+frame end?
- sub %g5, STACK_BIAS, %g5
- cmp %sp, %g5
+ srl %g1, 1, %g3
+ add %g3, 1, %g3
+ sllx %g3, 8, %g3 ! each fpregs frame is 256b
+ add %g3, 192, %g3
+ add %g6, %g3, %g3 ! where does task_struct+frame end?
+ sub %g3, STACK_BIAS, %g3
+ cmp %sp, %g3
bg,pt %xcc, 1f
- sethi %hi(panicstring), %g5
+ sethi %hi(panicstring), %g3
sethi %hi(ovstack), %g7 ! cant move to panic stack fast enough
or %g7, %lo(ovstack), %g7
add %g7, OVSTACKSIZE, %g7
sub %g7, STACK_BIAS, %g7
mov %g7, %sp
call prom_printf
- or %g5, %lo(panicstring), %o0
+ or %g3, %lo(panicstring), %o0
call prom_halt
nop
#endif
diff --git a/arch/sparc64/lib/memcmp.S b/arch/sparc64/lib/memcmp.S
index d34dc3d874dae..c90ad96c51b9c 100644
--- a/arch/sparc64/lib/memcmp.S
+++ b/arch/sparc64/lib/memcmp.S
@@ -13,12 +13,12 @@ memcmp:
cmp %o2, 0 ! IEU1 Group
loop: be,pn %icc, ret_0 ! CTI
nop ! IEU0
- ldub [%o0], %g5 ! LSU Group
+ ldub [%o0], %g7 ! LSU Group
ldub [%o1], %g3 ! LSU Group
sub %o2, 1, %o2 ! IEU0
add %o0, 1, %o0 ! IEU1
add %o1, 1, %o1 ! IEU0 Group
- subcc %g5, %g3, %g3 ! IEU1 Group
+ subcc %g7, %g3, %g3 ! IEU1 Group
be,pt %icc, loop ! CTI
cmp %o2, 0 ! IEU1 Group
diff --git a/arch/sparc64/lib/memmove.S b/arch/sparc64/lib/memmove.S
index 1c1ebbbdf830e..97395802c23c4 100644
--- a/arch/sparc64/lib/memmove.S
+++ b/arch/sparc64/lib/memmove.S
@@ -12,17 +12,17 @@ memmove: /* o0=dst o1=src o2=len */
mov %o0, %g1
cmp %o0, %o1
bleu,pt %xcc, memcpy
- add %o1, %o2, %g5
- cmp %g5, %o0
+ add %o1, %o2, %g7
+ cmp %g7, %o0
bleu,pt %xcc, memcpy
add %o0, %o2, %o5
- sub %g5, 1, %o1
+ sub %g7, 1, %o1
sub %o5, 1, %o0
-1: ldub [%o1], %g5
+1: ldub [%o1], %g7
subcc %o2, 1, %o2
sub %o1, 1, %o1
- stb %g5, [%o0]
+ stb %g7, [%o0]
bne,pt %icc, 1b
sub %o0, 1, %o0
diff --git a/arch/sparc64/lib/memscan.S b/arch/sparc64/lib/memscan.S
index a34c6b9d21e85..5e72d49114179 100644
--- a/arch/sparc64/lib/memscan.S
+++ b/arch/sparc64/lib/memscan.S
@@ -52,43 +52,43 @@ check_bytes:
andcc %o5, 0xff, %g0
add %o0, -5, %g2
ba,pt %xcc, 3f
- srlx %o5, 32, %g5
+ srlx %o5, 32, %g7
-2: srlx %o5, 8, %g5
+2: srlx %o5, 8, %g7
be,pn %icc, 1f
add %o0, -8, %g2
- andcc %g5, 0xff, %g0
- srlx %g5, 8, %g5
+ andcc %g7, 0xff, %g0
+ srlx %g7, 8, %g7
be,pn %icc, 1f
inc %g2
- andcc %g5, 0xff, %g0
+ andcc %g7, 0xff, %g0
- srlx %g5, 8, %g5
+ srlx %g7, 8, %g7
be,pn %icc, 1f
inc %g2
- andcc %g5, 0xff, %g0
- srlx %g5, 8, %g5
+ andcc %g7, 0xff, %g0
+ srlx %g7, 8, %g7
be,pn %icc, 1f
inc %g2
andcc %g3, %o3, %g0
be,a,pn %icc, 2f
mov %o0, %g2
-3: andcc %g5, 0xff, %g0
- srlx %g5, 8, %g5
+3: andcc %g7, 0xff, %g0
+ srlx %g7, 8, %g7
be,pn %icc, 1f
inc %g2
- andcc %g5, 0xff, %g0
- srlx %g5, 8, %g5
+ andcc %g7, 0xff, %g0
+ srlx %g7, 8, %g7
be,pn %icc, 1f
inc %g2
- andcc %g5, 0xff, %g0
- srlx %g5, 8, %g5
+ andcc %g7, 0xff, %g0
+ srlx %g7, 8, %g7
be,pn %icc, 1f
inc %g2
- andcc %g5, 0xff, %g0
- srlx %g5, 8, %g5
+ andcc %g7, 0xff, %g0
+ srlx %g7, 8, %g7
be,pn %icc, 1f
inc %g2
diff --git a/arch/sparc64/lib/rwsem.S b/arch/sparc64/lib/rwsem.S
new file mode 100644
index 0000000000000..174ff7b9164c5
--- /dev/null
+++ b/arch/sparc64/lib/rwsem.S
@@ -0,0 +1,165 @@
+/* rwsem.S: RW semaphore assembler.
+ *
+ * Written by David S. Miller (davem@redhat.com), 2001.
+ * Derived from asm-i386/rwsem.h
+ */
+
+#include <asm/rwsem-const.h>
+
+ .section .sched.text
+
+ .globl __down_read
+__down_read:
+1: lduw [%o0], %g1
+ add %g1, 1, %g7
+ cas [%o0], %g1, %g7
+ cmp %g1, %g7
+ bne,pn %icc, 1b
+ add %g7, 1, %g7
+ cmp %g7, 0
+ bl,pn %icc, 3f
+ membar #StoreLoad | #StoreStore
+2:
+ retl
+ nop
+3:
+ save %sp, -192, %sp
+ call rwsem_down_read_failed
+ mov %i0, %o0
+ ret
+ restore
+ .size __down_read, .-__down_read
+
+ .globl __down_read_trylock
+__down_read_trylock:
+1: lduw [%o0], %g1
+ add %g1, 1, %g7
+ cmp %g7, 0
+ bl,pn %icc, 2f
+ mov 0, %o1
+ cas [%o0], %g1, %g7
+ cmp %g1, %g7
+ bne,pn %icc, 1b
+ mov 1, %o1
+ membar #StoreLoad | #StoreStore
+2: retl
+ mov %o1, %o0
+ .size __down_read_trylock, .-__down_read_trylock
+
+ .globl __down_write
+__down_write:
+ sethi %hi(RWSEM_ACTIVE_WRITE_BIAS), %g1
+ or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1
+1:
+ lduw [%o0], %g3
+ add %g3, %g1, %g7
+ cas [%o0], %g3, %g7
+ cmp %g3, %g7
+ bne,pn %icc, 1b
+ cmp %g7, 0
+ bne,pn %icc, 3f
+ membar #StoreLoad | #StoreStore
+2: retl
+ nop
+3:
+ save %sp, -192, %sp
+ call rwsem_down_write_failed
+ mov %i0, %o0
+ ret
+ restore
+ .size __down_write, .-__down_write
+
+ .globl __down_write_trylock
+__down_write_trylock:
+ sethi %hi(RWSEM_ACTIVE_WRITE_BIAS), %g1
+ or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1
+1:
+ lduw [%o0], %g3
+ cmp %g3, 0
+ bne,pn %icc, 2f
+ mov 0, %o1
+ add %g3, %g1, %g7
+ cas [%o0], %g3, %g7
+ cmp %g3, %g7
+ bne,pn %icc, 1b
+ mov 1, %o1
+ membar #StoreLoad | #StoreStore
+2: retl
+ mov %o1, %o0
+ .size __down_write_trylock, .-__down_write_trylock
+
+ .globl __up_read
+__up_read:
+1:
+ lduw [%o0], %g1
+ sub %g1, 1, %g7
+ cas [%o0], %g1, %g7
+ cmp %g1, %g7
+ bne,pn %icc, 1b
+ cmp %g7, 0
+ bl,pn %icc, 3f
+ membar #StoreLoad | #StoreStore
+2: retl
+ nop
+3: sethi %hi(RWSEM_ACTIVE_MASK), %g1
+ sub %g7, 1, %g7
+ or %g1, %lo(RWSEM_ACTIVE_MASK), %g1
+ andcc %g7, %g1, %g0
+ bne,pn %icc, 2b
+ nop
+ save %sp, -192, %sp
+ call rwsem_wake
+ mov %i0, %o0
+ ret
+ restore
+ .size __up_read, .-__up_read
+
+ .globl __up_write
+__up_write:
+ sethi %hi(RWSEM_ACTIVE_WRITE_BIAS), %g1
+ or %g1, %lo(RWSEM_ACTIVE_WRITE_BIAS), %g1
+1:
+ lduw [%o0], %g3
+ sub %g3, %g1, %g7
+ cas [%o0], %g3, %g7
+ cmp %g3, %g7
+ bne,pn %icc, 1b
+ sub %g7, %g1, %g7
+ cmp %g7, 0
+ bl,pn %icc, 3f
+ membar #StoreLoad | #StoreStore
+2:
+ retl
+ nop
+3:
+ save %sp, -192, %sp
+ call rwsem_wake
+ mov %i0, %o0
+ ret
+ restore
+ .size __up_write, .-__up_write
+
+ .globl __downgrade_write
+__downgrade_write:
+ sethi %hi(RWSEM_WAITING_BIAS), %g1
+ or %g1, %lo(RWSEM_WAITING_BIAS), %g1
+1:
+ lduw [%o0], %g3
+ sub %g3, %g1, %g7
+ cas [%o0], %g3, %g7
+ cmp %g3, %g7
+ bne,pn %icc, 1b
+ sub %g7, %g1, %g7
+ cmp %g7, 0
+ bl,pn %icc, 3f
+ membar #StoreLoad | #StoreStore
+2:
+ retl
+ nop
+3:
+ save %sp, -192, %sp
+ call rwsem_downgrade_wake
+ mov %i0, %o0
+ ret
+ restore
+ .size __downgrade_write, .-__downgrade_write
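
All of the rwsem fast paths above hinge on one signed 32-bit count: readers add 1, writers add RWSEM_ACTIVE_WRITE_BIAS, and a negative or non-zero result diverts to the slow path. A compressed C view of the reader fast path (a sketch; the constant and the slow-path stub stand in for the real asm/rwsem-const.h and rwsem_down_read_failed):

#include <stdint.h>

/* Illustrative stand-in for the asm/rwsem-const.h reader bias. */
#define RWSEM_ACTIVE_READ_BIAS  1

struct rwsem_sketch { int32_t count; };

/* Placeholder for the real blocking slow path (rwsem_down_read_failed). */
static void down_read_slowpath(struct rwsem_sketch *sem) { (void)sem; }

/* __down_read fast path: bump the count with a CAS loop; a negative
 * result means a writer holds or is waiting for the lock, so block. */
static void down_read_sketch(struct rwsem_sketch *sem)
{
        int32_t old, new;

        do {
                old = __atomic_load_n(&sem->count, __ATOMIC_RELAXED);
                new = old + RWSEM_ACTIVE_READ_BIAS;
        } while (!__atomic_compare_exchange_n(&sem->count, &old, new, 0,
                                              __ATOMIC_ACQUIRE, __ATOMIC_RELAXED));
        if (new < 0)
                down_read_slowpath(sem);
}
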
diff --git a/arch/sparc64/lib/rwsem.c b/arch/sparc64/lib/rwsem.c
deleted file mode 100644
index e19968dbc2d15..0000000000000
--- a/arch/sparc64/lib/rwsem.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/* rwsem.c: Don't inline expand these suckers all over the place.
- *
- * Written by David S. Miller (davem@redhat.com), 2001.
- * Derived from asm-i386/rwsem.h
- */
-
-#include <linux/kernel.h>
-#include <linux/rwsem.h>
-#include <linux/init.h>
-#include <linux/module.h>
-
-extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem));
-extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore *sem));
-extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *));
-extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *));
-
-void __sched __down_read(struct rw_semaphore *sem)
-{
- __asm__ __volatile__(
- "! beginning __down_read\n"
- "1:\tlduw [%0], %%g5\n\t"
- "add %%g5, 1, %%g7\n\t"
- "cas [%0], %%g5, %%g7\n\t"
- "cmp %%g5, %%g7\n\t"
- "bne,pn %%icc, 1b\n\t"
- " add %%g7, 1, %%g7\n\t"
- "cmp %%g7, 0\n\t"
- "bl,pn %%icc, 3f\n\t"
- " membar #StoreLoad | #StoreStore\n"
- "2:\n\t"
- ".subsection 2\n"
- "3:\tmov %0, %%g5\n\t"
- "save %%sp, -160, %%sp\n\t"
- "mov %%g1, %%l1\n\t"
- "mov %%g2, %%l2\n\t"
- "mov %%g3, %%l3\n\t"
- "call %1\n\t"
- " mov %%g5, %%o0\n\t"
- "mov %%l1, %%g1\n\t"
- "mov %%l2, %%g2\n\t"
- "ba,pt %%xcc, 2b\n\t"
- " restore %%l3, %%g0, %%g3\n\t"
- ".previous\n\t"
- "! ending __down_read"
- : : "r" (sem), "i" (rwsem_down_read_failed)
- : "g5", "g7", "memory", "cc");
-}
-EXPORT_SYMBOL(__down_read);
-
-int __down_read_trylock(struct rw_semaphore *sem)
-{
- int result;
-
- __asm__ __volatile__(
- "! beginning __down_read_trylock\n"
- "1:\tlduw [%1], %%g5\n\t"
- "add %%g5, 1, %%g7\n\t"
- "cmp %%g7, 0\n\t"
- "bl,pn %%icc, 2f\n\t"
- " mov 0, %0\n\t"
- "cas [%1], %%g5, %%g7\n\t"
- "cmp %%g5, %%g7\n\t"
- "bne,pn %%icc, 1b\n\t"
- " mov 1, %0\n\t"
- "membar #StoreLoad | #StoreStore\n"
- "2:\n\t"
- "! ending __down_read_trylock"
- : "=&r" (result)
- : "r" (sem)
- : "g5", "g7", "memory", "cc");
-
- return result;
-}
-EXPORT_SYMBOL(__down_read_trylock);
-
-void __sched __down_write(struct rw_semaphore *sem)
-{
- __asm__ __volatile__(
- "! beginning __down_write\n\t"
- "sethi %%hi(%2), %%g1\n\t"
- "or %%g1, %%lo(%2), %%g1\n"
- "1:\tlduw [%0], %%g5\n\t"
- "add %%g5, %%g1, %%g7\n\t"
- "cas [%0], %%g5, %%g7\n\t"
- "cmp %%g5, %%g7\n\t"
- "bne,pn %%icc, 1b\n\t"
- " cmp %%g7, 0\n\t"
- "bne,pn %%icc, 3f\n\t"
- " membar #StoreLoad | #StoreStore\n"
- "2:\n\t"
- ".subsection 2\n"
- "3:\tmov %0, %%g5\n\t"
- "save %%sp, -160, %%sp\n\t"
- "mov %%g2, %%l2\n\t"
- "mov %%g3, %%l3\n\t"
- "call %1\n\t"
- " mov %%g5, %%o0\n\t"
- "mov %%l2, %%g2\n\t"
- "ba,pt %%xcc, 2b\n\t"
- " restore %%l3, %%g0, %%g3\n\t"
- ".previous\n\t"
- "! ending __down_write"
- : : "r" (sem), "i" (rwsem_down_write_failed),
- "i" (RWSEM_ACTIVE_WRITE_BIAS)
- : "g1", "g5", "g7", "memory", "cc");
-}
-EXPORT_SYMBOL(__down_write);
-
-int __down_write_trylock(struct rw_semaphore *sem)
-{
- int result;
-
- __asm__ __volatile__(
- "! beginning __down_write_trylock\n\t"
- "sethi %%hi(%2), %%g1\n\t"
- "or %%g1, %%lo(%2), %%g1\n"
- "1:\tlduw [%1], %%g5\n\t"
- "cmp %%g5, 0\n\t"
- "bne,pn %%icc, 2f\n\t"
- " mov 0, %0\n\t"
- "add %%g5, %%g1, %%g7\n\t"
- "cas [%1], %%g5, %%g7\n\t"
- "cmp %%g5, %%g7\n\t"
- "bne,pn %%icc, 1b\n\t"
- " mov 1, %0\n\t"
- "membar #StoreLoad | #StoreStore\n"
- "2:\n\t"
- "! ending __down_write_trylock"
- : "=&r" (result)
- : "r" (sem), "i" (RWSEM_ACTIVE_WRITE_BIAS)
- : "g1", "g5", "g7", "memory", "cc");
-
- return result;
-}
-EXPORT_SYMBOL(__down_write_trylock);
-
-void __up_read(struct rw_semaphore *sem)
-{
- __asm__ __volatile__(
- "! beginning __up_read\n\t"
- "1:\tlduw [%0], %%g5\n\t"
- "sub %%g5, 1, %%g7\n\t"
- "cas [%0], %%g5, %%g7\n\t"
- "cmp %%g5, %%g7\n\t"
- "bne,pn %%icc, 1b\n\t"
- " cmp %%g7, 0\n\t"
- "bl,pn %%icc, 3f\n\t"
- " membar #StoreLoad | #StoreStore\n"
- "2:\n\t"
- ".subsection 2\n"
- "3:\tsethi %%hi(%2), %%g1\n\t"
- "sub %%g7, 1, %%g7\n\t"
- "or %%g1, %%lo(%2), %%g1\n\t"
- "andcc %%g7, %%g1, %%g0\n\t"
- "bne,pn %%icc, 2b\n\t"
- " mov %0, %%g5\n\t"
- "save %%sp, -160, %%sp\n\t"
- "mov %%g2, %%l2\n\t"
- "mov %%g3, %%l3\n\t"
- "call %1\n\t"
- " mov %%g5, %%o0\n\t"
- "mov %%l2, %%g2\n\t"
- "ba,pt %%xcc, 2b\n\t"
- " restore %%l3, %%g0, %%g3\n\t"
- ".previous\n\t"
- "! ending __up_read"
- : : "r" (sem), "i" (rwsem_wake),
- "i" (RWSEM_ACTIVE_MASK)
- : "g1", "g5", "g7", "memory", "cc");
-}
-EXPORT_SYMBOL(__up_read);
-
-void __up_write(struct rw_semaphore *sem)
-{
- __asm__ __volatile__(
- "! beginning __up_write\n\t"
- "sethi %%hi(%2), %%g1\n\t"
- "or %%g1, %%lo(%2), %%g1\n"
- "1:\tlduw [%0], %%g5\n\t"
- "sub %%g5, %%g1, %%g7\n\t"
- "cas [%0], %%g5, %%g7\n\t"
- "cmp %%g5, %%g7\n\t"
- "bne,pn %%icc, 1b\n\t"
- " sub %%g7, %%g1, %%g7\n\t"
- "cmp %%g7, 0\n\t"
- "bl,pn %%icc, 3f\n\t"
- " membar #StoreLoad | #StoreStore\n"
- "2:\n\t"
- ".subsection 2\n"
- "3:\tmov %0, %%g5\n\t"
- "save %%sp, -160, %%sp\n\t"
- "mov %%g2, %%l2\n\t"
- "mov %%g3, %%l3\n\t"
- "call %1\n\t"
- " mov %%g5, %%o0\n\t"
- "mov %%l2, %%g2\n\t"
- "ba,pt %%xcc, 2b\n\t"
- " restore %%l3, %%g0, %%g3\n\t"
- ".previous\n\t"
- "! ending __up_write"
- : : "r" (sem), "i" (rwsem_wake),
- "i" (RWSEM_ACTIVE_WRITE_BIAS)
- : "g1", "g5", "g7", "memory", "cc");
-}
-EXPORT_SYMBOL(__up_write);
-
-void __downgrade_write(struct rw_semaphore *sem)
-{
- __asm__ __volatile__(
- "! beginning __downgrade_write\n\t"
- "sethi %%hi(%2), %%g1\n\t"
- "or %%g1, %%lo(%2), %%g1\n"
- "1:\tlduw [%0], %%g5\n\t"
- "sub %%g5, %%g1, %%g7\n\t"
- "cas [%0], %%g5, %%g7\n\t"
- "cmp %%g5, %%g7\n\t"
- "bne,pn %%icc, 1b\n\t"
- " sub %%g7, %%g1, %%g7\n\t"
- "cmp %%g7, 0\n\t"
- "bl,pn %%icc, 3f\n\t"
- " membar #StoreLoad | #StoreStore\n"
- "2:\n\t"
- ".subsection 2\n"
- "3:\tmov %0, %%g5\n\t"
- "save %%sp, -160, %%sp\n\t"
- "mov %%g2, %%l2\n\t"
- "mov %%g3, %%l3\n\t"
- "call %1\n\t"
- " mov %%g5, %%o0\n\t"
- "mov %%l2, %%g2\n\t"
- "ba,pt %%xcc, 2b\n\t"
- " restore %%l3, %%g0, %%g3\n\t"
- ".previous\n\t"
- "! ending __up_write"
- : : "r" (sem), "i" (rwsem_downgrade_wake),
- "i" (RWSEM_WAITING_BIAS)
- : "g1", "g5", "g7", "memory", "cc");
-}
-EXPORT_SYMBOL(__downgrade_write);
diff --git a/arch/sparc64/lib/strlen.S b/arch/sparc64/lib/strlen.S
index 066ec1ed7d0dd..e9ba1920d818e 100644
--- a/arch/sparc64/lib/strlen.S
+++ b/arch/sparc64/lib/strlen.S
@@ -48,16 +48,16 @@ strlen:
add %o0, 4, %o0
/* Check every byte. */
- srl %o5, 24, %g5
- andcc %g5, 0xff, %g0
+ srl %o5, 24, %g7
+ andcc %g7, 0xff, %g0
be,pn %icc, 1f
add %o0, -4, %o4
- srl %o5, 16, %g5
- andcc %g5, 0xff, %g0
+ srl %o5, 16, %g7
+ andcc %g7, 0xff, %g0
be,pn %icc, 1f
add %o4, 1, %o4
- srl %o5, 8, %g5
- andcc %g5, 0xff, %g0
+ srl %o5, 8, %g7
+ andcc %g7, 0xff, %g0
be,pn %icc, 1f
add %o4, 1, %o4
andcc %o5, 0xff, %g0
diff --git a/arch/sparc64/lib/strlen_user.S b/arch/sparc64/lib/strlen_user.S
index 4af69a0adfbcc..9ed54ba14fc63 100644
--- a/arch/sparc64/lib/strlen_user.S
+++ b/arch/sparc64/lib/strlen_user.S
@@ -54,16 +54,16 @@ __strnlen_user:
ba,a,pt %xcc, 1f
/* Check every byte. */
-82: srl %o5, 24, %g5
- andcc %g5, 0xff, %g0
+82: srl %o5, 24, %g7
+ andcc %g7, 0xff, %g0
be,pn %icc, 1f
add %o0, -3, %o4
- srl %o5, 16, %g5
- andcc %g5, 0xff, %g0
+ srl %o5, 16, %g7
+ andcc %g7, 0xff, %g0
be,pn %icc, 1f
add %o4, 1, %o4
- srl %o5, 8, %g5
- andcc %g5, 0xff, %g0
+ srl %o5, 8, %g7
+ andcc %g7, 0xff, %g0
be,pn %icc, 1f
add %o4, 1, %o4
andcc %o5, 0xff, %g0
diff --git a/arch/sparc64/lib/strncpy_from_user.S b/arch/sparc64/lib/strncpy_from_user.S
index 93d600a319763..09cbbaa0ebf43 100644
--- a/arch/sparc64/lib/strncpy_from_user.S
+++ b/arch/sparc64/lib/strncpy_from_user.S
@@ -34,15 +34,15 @@
.type __strncpy_from_user,#function
__strncpy_from_user:
/* %o0=dest, %o1=src, %o2=count */
- sethi %hi(0b), %o5 ! IEU0 Group
- andcc %o1, 7, %g0 ! IEU1
+ andcc %o1, 7, %g0 ! IEU1 Group
bne,pn %icc, 30f ! CTI
- ldx [%o5 + %lo(0b)], %o4 ! Load Group
- add %o0, %o2, %g3 ! IEU0
+ add %o0, %o2, %g3 ! IEU0
60: ldxa [%o1] %asi, %g1 ! Load Group
brlez,pn %o2, 10f ! CTI
- sllx %o4, 7, %o5 ! IEU0 Group
- mov %o0, %o3 ! IEU1
+ mov %o0, %o3 ! IEU0
+50: sethi %hi(0b), %o4 ! IEU0 Group
+ ldx [%o4 + %lo(0b)], %o4 ! Load
+ sllx %o4, 7, %o5 ! IEU1 Group
1: sub %g1, %o4, %g2 ! IEU0 Group
stx %g1, [%o0] ! Store
add %o0, 8, %o0 ! IEU1
@@ -55,34 +55,34 @@ __strncpy_from_user:
10: retl ! CTI Group
mov %o2, %o0 ! IEU0
5: srlx %g2, 32, %g7 ! IEU0 Group
- sethi %hi(0xff00), %g5 ! IEU1
+ sethi %hi(0xff00), %o4 ! IEU1
andcc %g7, %o5, %g0 ! IEU1 Group
be,pn %icc, 2f ! CTI
- or %g5, %lo(0xff00), %g5 ! IEU0
+ or %o4, %lo(0xff00), %o4 ! IEU0
srlx %g1, 48, %g7 ! IEU0 Group
- andcc %g7, %g5, %g0 ! IEU1 Group
+ andcc %g7, %o4, %g0 ! IEU1 Group
be,pn %icc, 50f ! CTI
andcc %g7, 0xff, %g0 ! IEU1 Group
be,pn %icc, 51f ! CTI
srlx %g1, 32, %g7 ! IEU0
- andcc %g7, %g5, %g0 ! IEU1 Group
+ andcc %g7, %o4, %g0 ! IEU1 Group
be,pn %icc, 52f ! CTI
andcc %g7, 0xff, %g0 ! IEU1 Group
be,pn %icc, 53f ! CTI
2: andcc %g2, %o5, %g0 ! IEU1 Group
be,pn %icc, 2f ! CTI
srl %g1, 16, %g7 ! IEU0
- andcc %g7, %g5, %g0 ! IEU1 Group
+ andcc %g7, %o4, %g0 ! IEU1 Group
be,pn %icc, 54f ! CTI
andcc %g7, 0xff, %g0 ! IEU1 Group
be,pn %icc, 55f ! CTI
- andcc %g1, %g5, %g0 ! IEU1 Group
+ andcc %g1, %o4, %g0 ! IEU1 Group
be,pn %icc, 56f ! CTI
andcc %g1, 0xff, %g0 ! IEU1 Group
be,a,pn %icc, 57f ! CTI
sub %o0, %o3, %o0 ! IEU0
2: cmp %o0, %g3 ! IEU1 Group
- bl,a,pt %xcc, 1b ! CTI
+ bl,a,pt %xcc, 50b ! CTI
62: ldxa [%o1] %asi, %g1 ! Load
retl ! CTI Group
mov %o2, %o0 ! IEU0
diff --git a/arch/sparc64/lib/xor.S b/arch/sparc64/lib/xor.S
index f748fd6bbc389..4cd5d2be1ae1f 100644
--- a/arch/sparc64/lib/xor.S
+++ b/arch/sparc64/lib/xor.S
@@ -248,7 +248,7 @@ xor_vis_4:
.globl xor_vis_5
.type xor_vis_5,#function
xor_vis_5:
- mov %o5, %g5
+ save %sp, -192, %sp
rd %fprs, %o5
andcc %o5, FPRS_FEF|FPRS_DU, %g0
be,pt %icc, 0f
@@ -256,61 +256,60 @@ xor_vis_5:
jmpl %g1 + %lo(VISenter), %g7
add %g7, 8, %g7
0: wr %g0, FPRS_FEF, %fprs
- mov %g5, %o5
rd %asi, %g1
wr %g0, ASI_BLK_P, %asi
membar #LoadStore|#StoreLoad|#StoreStore
- sub %o0, 64, %o0
- ldda [%o1] %asi, %f0
- ldda [%o2] %asi, %f16
+ sub %i0, 64, %i0
+ ldda [%i1] %asi, %f0
+ ldda [%i2] %asi, %f16
-5: ldda [%o3] %asi, %f32
+5: ldda [%i3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
- add %o1, 64, %o1
+ add %i1, 64, %i1
fxor %f4, %f20, %f52
fxor %f6, %f22, %f54
- add %o2, 64, %o2
+ add %i2, 64, %i2
fxor %f8, %f24, %f56
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
- ldda [%o4] %asi, %f16
+ ldda [%i4] %asi, %f16
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
- add %o3, 64, %o3
+ add %i3, 64, %i3
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
- ldda [%o5] %asi, %f32
+ ldda [%i5] %asi, %f32
fxor %f48, %f16, %f48
fxor %f50, %f18, %f50
- add %o4, 64, %o4
+ add %i4, 64, %i4
fxor %f52, %f20, %f52
fxor %f54, %f22, %f54
- add %o5, 64, %o5
+ add %i5, 64, %i5
fxor %f56, %f24, %f56
fxor %f58, %f26, %f58
fxor %f60, %f28, %f60
fxor %f62, %f30, %f62
- ldda [%o1] %asi, %f0
+ ldda [%i1] %asi, %f0
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
fxor %f54, %f38, %f54
fxor %f56, %f40, %f56
fxor %f58, %f42, %f58
- subcc %o0, 64, %o0
+ subcc %i0, 64, %i0
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
- stda %f48, [%o1 - 64] %asi
+ stda %f48, [%i1 - 64] %asi
bne,pt %xcc, 5b
- ldda [%o2] %asi, %f16
+ ldda [%i2] %asi, %f16
- ldda [%o3] %asi, %f32
+ ldda [%i3] %asi, %f32
fxor %f0, %f16, %f48
fxor %f2, %f18, %f50
fxor %f4, %f20, %f52
@@ -319,7 +318,7 @@ xor_vis_5:
fxor %f10, %f26, %f58
fxor %f12, %f28, %f60
fxor %f14, %f30, %f62
- ldda [%o4] %asi, %f16
+ ldda [%i4] %asi, %f16
fxor %f48, %f32, %f48
fxor %f50, %f34, %f50
fxor %f52, %f36, %f52
@@ -328,7 +327,7 @@ xor_vis_5:
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
- ldda [%o5] %asi, %f32
+ ldda [%i5] %asi, %f32
fxor %f48, %f16, %f48
fxor %f50, %f18, %f50
fxor %f52, %f20, %f52
@@ -346,9 +345,10 @@ xor_vis_5:
fxor %f58, %f42, %f58
fxor %f60, %f44, %f60
fxor %f62, %f46, %f62
- stda %f48, [%o1] %asi
+ stda %f48, [%i1] %asi
membar #Sync|#StoreStore|#StoreLoad
wr %g1, %g0, %asi
- retl
- wr %g0, 0, %fprs
+ wr %g0, 0, %fprs
+ ret
+ restore
.size xor_vis_5, .-xor_vis_5
diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c
index 45edb9459bcdf..3ffee7b51aed5 100644
--- a/arch/sparc64/mm/fault.c
+++ b/arch/sparc64/mm/fault.c
@@ -144,7 +144,9 @@ static void unhandled_fault(unsigned long address, struct task_struct *tsk,
"at virtual address %016lx\n", (unsigned long)address);
}
printk(KERN_ALERT "tsk->{mm,active_mm}->context = %016lx\n",
- (tsk->mm ? tsk->mm->context : tsk->active_mm->context));
+ (tsk->mm ?
+ CTX_HWBITS(tsk->mm->context) :
+ CTX_HWBITS(tsk->active_mm->context)));
printk(KERN_ALERT "tsk->{mm,active_mm}->pgd = %016lx\n",
(tsk->mm ? (unsigned long) tsk->mm->pgd :
(unsigned long) tsk->active_mm->pgd));
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index ffa207795f1df..5a1f831b2de1b 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -20,6 +20,7 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
+#include <asm/mmu_context.h>
static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
@@ -217,12 +218,50 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
flush_tlb_range(vma, start, end);
}
+static void context_reload(void *__data)
+{
+ struct mm_struct *mm = __data;
+
+ if (mm == current->mm)
+ load_secondary_context(mm);
+}
+
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
struct mm_struct *mm = current->mm;
unsigned long addr;
int ret = 0;
+ /* On UltraSPARC-III+ and later, configure the second half of
+ * the Data-TLB for huge pages.
+ */
+ if (tlb_type == cheetah_plus) {
+ unsigned long ctx;
+
+ spin_lock(&ctx_alloc_lock);
+ ctx = mm->context.sparc64_ctx_val;
+ ctx &= ~CTX_PGSZ_MASK;
+ ctx |= CTX_PGSZ_BASE << CTX_PGSZ0_SHIFT;
+ ctx |= CTX_PGSZ_HUGE << CTX_PGSZ1_SHIFT;
+
+ if (ctx != mm->context.sparc64_ctx_val) {
+ /* When changing the page size fields, we
+ * must perform a context flush so that no
+ * stale entries match. This flush must
+ * occur with the original context register
+ * settings.
+ */
+ do_flush_tlb_mm(mm);
+
+ /* Reload the context register of all processors
+ * also executing in this address space.
+ */
+ mm->context.sparc64_ctx_val = ctx;
+ on_each_cpu(context_reload, mm, 0, 0);
+ }
+ spin_unlock(&ctx_alloc_lock);
+ }
+
BUG_ON(vma->vm_start & ~HPAGE_MASK);
BUG_ON(vma->vm_end & ~HPAGE_MASK);
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 0e62b62c7dd44..89022ccaa75bb 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -61,7 +61,7 @@ static unsigned long bootmap_base;
/* get_new_mmu_context() uses "cache + 1". */
DEFINE_SPINLOCK(ctx_alloc_lock);
unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1;
-#define CTX_BMAP_SLOTS (1UL << (CTX_VERSION_SHIFT - 6))
+#define CTX_BMAP_SLOTS (1UL << (CTX_NR_BITS - 6))
unsigned long mmu_context_bmap[CTX_BMAP_SLOTS];
/* References to special section boundaries */
@@ -85,40 +85,14 @@ void check_pgt_cache(void)
preempt_disable();
if (pgtable_cache_size > PGT_CACHE_HIGH) {
do {
-#ifdef CONFIG_SMP
if (pgd_quicklist)
free_pgd_slow(get_pgd_fast());
-#endif
if (pte_quicklist[0])
free_pte_slow(pte_alloc_one_fast(NULL, 0));
if (pte_quicklist[1])
free_pte_slow(pte_alloc_one_fast(NULL, 1 << (PAGE_SHIFT + 10)));
} while (pgtable_cache_size > PGT_CACHE_LOW);
}
-#ifndef CONFIG_SMP
- if (pgd_cache_size > PGT_CACHE_HIGH / 4) {
- struct page *page, *page2;
- for (page2 = NULL, page = (struct page *)pgd_quicklist; page;) {
- if ((unsigned long)page->lru.prev == 3) {
- if (page2)
- page2->lru.next = page->lru.next;
- else
- pgd_quicklist = (void *) page->lru.next;
- pgd_cache_size -= 2;
- __free_page(page);
- if (page2)
- page = (struct page *)page2->lru.next;
- else
- page = (struct page *)pgd_quicklist;
- if (pgd_cache_size <= PGT_CACHE_LOW / 4)
- break;
- continue;
- }
- page2 = page;
- page = (struct page *)page->lru.next;
- }
- }
-#endif
preempt_enable();
}
@@ -135,7 +109,7 @@ __inline__ void flush_dcache_page_impl(struct page *page)
atomic_inc(&dcpage_flushes);
#endif
-#if (L1DCACHE_SIZE > PAGE_SIZE)
+#ifdef DCACHE_ALIASING_POSSIBLE
__flush_dcache_page(page_address(page),
((tlb_type == spitfire) &&
page_mapping(page) != NULL));
@@ -158,15 +132,15 @@ static __inline__ void set_dcache_dirty(struct page *page, int this_cpu)
mask = (mask << 24) | (1UL << PG_dcache_dirty);
__asm__ __volatile__("1:\n\t"
"ldx [%2], %%g7\n\t"
- "and %%g7, %1, %%g5\n\t"
- "or %%g5, %0, %%g5\n\t"
- "casx [%2], %%g7, %%g5\n\t"
- "cmp %%g7, %%g5\n\t"
+ "and %%g7, %1, %%g1\n\t"
+ "or %%g1, %0, %%g1\n\t"
+ "casx [%2], %%g7, %%g1\n\t"
+ "cmp %%g7, %%g1\n\t"
"bne,pn %%xcc, 1b\n\t"
" membar #StoreLoad | #StoreStore"
: /* no outputs */
: "r" (mask), "r" (non_cpu_bits), "r" (&page->flags)
- : "g5", "g7");
+ : "g1", "g7");
}
static __inline__ void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu)
@@ -176,20 +150,20 @@ static __inline__ void clear_dcache_dirty_cpu(struct page *page, unsigned long c
__asm__ __volatile__("! test_and_clear_dcache_dirty\n"
"1:\n\t"
"ldx [%2], %%g7\n\t"
- "srlx %%g7, 24, %%g5\n\t"
- "and %%g5, %3, %%g5\n\t"
- "cmp %%g5, %0\n\t"
+ "srlx %%g7, 24, %%g1\n\t"
+ "and %%g1, %3, %%g1\n\t"
+ "cmp %%g1, %0\n\t"
"bne,pn %%icc, 2f\n\t"
- " andn %%g7, %1, %%g5\n\t"
- "casx [%2], %%g7, %%g5\n\t"
- "cmp %%g7, %%g5\n\t"
+ " andn %%g7, %1, %%g1\n\t"
+ "casx [%2], %%g7, %%g1\n\t"
+ "cmp %%g7, %%g1\n\t"
"bne,pn %%xcc, 1b\n\t"
" membar #StoreLoad | #StoreStore\n"
"2:"
: /* no outputs */
: "r" (cpu), "r" (mask), "r" (&page->flags),
"i" (NR_CPUS - 1UL)
- : "g5", "g7");
+ : "g1", "g7");
}
extern void __update_mmu_cache(unsigned long mmu_context_hw, unsigned long address, pte_t pte, int code);
@@ -219,8 +193,9 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
put_cpu();
}
+
if (get_thread_fault_code())
- __update_mmu_cache(vma->vm_mm->context & TAG_CONTEXT_BITS,
+ __update_mmu_cache(CTX_NRBITS(vma->vm_mm->context),
address, pte, get_thread_fault_code());
}
@@ -281,9 +256,6 @@ void show_mem(void)
printk("%ld pages of RAM\n", num_physpages);
printk("%d free pages\n", nr_free_pages());
printk("%d pages in page table cache\n",pgtable_cache_size);
-#ifndef CONFIG_SMP
- printk("%d entries in page dir cache\n",pgd_cache_size);
-#endif
}
void mmu_info(struct seq_file *m)
@@ -392,10 +364,10 @@ static void inherit_prom_mappings(void)
n = n / sizeof(*trans);
/*
- * The obp translations are saved based on 8k pagesize, since obp can use
- * a mixture of pagesizes. Misses to the 0xf0000000 - 0x100000000, ie obp
- * range, are handled in entry.S and do not use the vpte scheme (see rant
- * in inherit_locked_prom_mappings()).
+ * The obp translations are saved based on 8k pagesize, since obp can
+ * use a mixture of pagesizes. Misses to the 0xf0000000 - 0x100000000,
+ * ie obp range, are handled in entry.S and do not use the vpte scheme
+ * (see rant in inherit_locked_prom_mappings()).
*/
#define OBP_PMD_SIZE 2048
prompmd = __alloc_bootmem(OBP_PMD_SIZE, OBP_PMD_SIZE, bootmap_base);
@@ -449,11 +421,15 @@ static void inherit_prom_mappings(void)
prom_printf("Remapping the kernel... ");
/* Spitfire Errata #32 workaround */
+ /* NOTE: Using plain zero for the context value is
+ * correct here, we are not using the Linux trap
+ * tables yet so we should not use the special
+ * UltraSPARC-III+ page size encodings yet.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
- : "r" (0),
- "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
+ : "r" (0), "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
switch (tlb_type) {
default:
@@ -513,6 +489,11 @@ static void inherit_prom_mappings(void)
tte_vaddr = (unsigned long) KERNBASE;
/* Spitfire Errata #32 workaround */
+ /* NOTE: Using plain zero for the context value is
+ * correct here, we are not using the Linux trap
+ * tables yet so we should not use the special
+ * UltraSPARC-III+ page size encodings yet.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
@@ -531,6 +512,11 @@ static void inherit_prom_mappings(void)
/* Spitfire Errata #32 workaround */
+ /* NOTE: Using plain zero for the context value is
+ * correct here, we are not using the Linux trap
+ * tables yet so we should not use the special
+ * UltraSPARC-III+ page size encodings yet.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
@@ -617,6 +603,9 @@ static void __flush_nucleus_vptes(void)
unsigned long tag;
/* Spitfire Errata #32 workaround */
+ /* NOTE: Always runs on spitfire, so no cheetah+
+ * page size encodings.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
@@ -783,6 +772,9 @@ void inherit_locked_prom_mappings(int save_p)
unsigned long data;
/* Spitfire Errata #32 workaround */
+ /* NOTE: Always runs on spitfire, so no cheetah+
+ * page size encodings.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
@@ -794,6 +786,9 @@ void inherit_locked_prom_mappings(int save_p)
unsigned long tag;
/* Spitfire Errata #32 workaround */
+ /* NOTE: Always runs on spitfire, so no
+ * cheetah+ page size encodings.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
@@ -821,6 +816,9 @@ void inherit_locked_prom_mappings(int save_p)
unsigned long data;
/* Spitfire Errata #32 workaround */
+ /* NOTE: Always runs on spitfire, so no
+ * cheetah+ page size encodings.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
@@ -832,6 +830,9 @@ void inherit_locked_prom_mappings(int save_p)
unsigned long tag;
/* Spitfire Errata #32 workaround */
+ /* NOTE: Always runs on spitfire, so no
+ * cheetah+ page size encodings.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
@@ -947,6 +948,7 @@ void prom_reload_locked(void)
}
}
+#ifdef DCACHE_ALIASING_POSSIBLE
void __flush_dcache_range(unsigned long start, unsigned long end)
{
unsigned long va;
@@ -970,6 +972,7 @@ void __flush_dcache_range(unsigned long start, unsigned long end)
"i" (ASI_DCACHE_INVALIDATE));
}
}
+#endif /* DCACHE_ALIASING_POSSIBLE */
/* If not locked, zap it. */
void __flush_tlb_all(void)
@@ -985,6 +988,9 @@ void __flush_tlb_all(void)
if (tlb_type == spitfire) {
for (i = 0; i < 64; i++) {
/* Spitfire Errata #32 workaround */
+ /* NOTE: Always runs on spitfire, so no
+ * cheetah+ page size encodings.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
@@ -1000,6 +1006,9 @@ void __flush_tlb_all(void)
}
/* Spitfire Errata #32 workaround */
+ /* NOTE: Always runs on spitfire, so no
+ * cheetah+ page size encodings.
+ */
__asm__ __volatile__("stxa %0, [%1] %2\n\t"
"flush %%g6"
: /* No outputs */
@@ -1033,11 +1042,14 @@ void __flush_tlb_all(void)
void get_new_mmu_context(struct mm_struct *mm)
{
unsigned long ctx, new_ctx;
+ unsigned long orig_pgsz_bits;
+
spin_lock(&ctx_alloc_lock);
- ctx = CTX_HWBITS(tlb_context_cache + 1);
- new_ctx = find_next_zero_bit(mmu_context_bmap, 1UL << CTX_VERSION_SHIFT, ctx);
- if (new_ctx >= (1UL << CTX_VERSION_SHIFT)) {
+ orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK);
+ ctx = (tlb_context_cache + 1) & CTX_NR_MASK;
+ new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx);
+ if (new_ctx >= (1 << CTX_NR_BITS)) {
new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1);
if (new_ctx >= ctx) {
int i;
@@ -1066,9 +1078,8 @@ void get_new_mmu_context(struct mm_struct *mm)
new_ctx |= (tlb_context_cache & CTX_VERSION_MASK);
out:
tlb_context_cache = new_ctx;
+ mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits;
spin_unlock(&ctx_alloc_lock);
-
- mm->context = new_ctx;
}
#ifndef CONFIG_SMP
@@ -1087,7 +1098,7 @@ struct pgtable_cache_struct pgt_quicklists;
* using the later address range, accesses with the first address
* range will see the newly initialized data rather than the garbage.
*/
-#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */
+#ifdef DCACHE_ALIASING_POSSIBLE
#define DC_ALIAS_SHIFT 1
#else
#define DC_ALIAS_SHIFT 0
@@ -1111,7 +1122,7 @@ pte_t *__pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
unsigned long paddr;
pte_t *pte;
-#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */
+#ifdef DCACHE_ALIASING_POSSIBLE
set_page_count(page, 1);
ClearPageCompound(page);
@@ -1129,7 +1140,7 @@ pte_t *__pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
to_free = (unsigned long *) paddr;
}
-#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */
+#ifdef DCACHE_ALIASING_POSSIBLE
/* Now free the other one up, adjust cache size. */
preempt_disable();
*to_free = (unsigned long) pte_quicklist[color ^ 0x1];
@@ -1702,22 +1713,6 @@ void __init mem_init(void)
initpages = (((unsigned long) __init_end) - ((unsigned long) __init_begin));
initpages = PAGE_ALIGN(initpages) >> PAGE_SHIFT;
-#ifndef CONFIG_SMP
- {
- /* Put empty_pg_dir on pgd_quicklist */
- extern pgd_t empty_pg_dir[1024];
- unsigned long addr = (unsigned long)empty_pg_dir;
- unsigned long alias_base = kern_base + PAGE_OFFSET -
- (long)(KERNBASE);
-
- memset(empty_pg_dir, 0, sizeof(empty_pg_dir));
- addr += alias_base;
- free_pgd_fast((pgd_t *)addr);
- num_physpages++;
- totalram_pages++;
- }
-#endif
-
printk("Memory: %uk available (%ldk kernel code, %ldk data, %ldk init) [%016lx,%016lx]\n",
nr_free_pages() << (PAGE_SHIFT-10),
codepages << (PAGE_SHIFT-10),
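
The set_dcache_dirty()/clear_dcache_dirty_cpu() hunks above only rename the scratch register from %g5 to %g1; the loop itself is a standard compare-and-swap retry. A stand-alone C rendering of the same pattern, with a GCC atomic builtin standing in for casx and invented field widths, may help readers who do not speak SPARC assembler:

#include <stdio.h>

/* new = (old & non_cpu_bits) | mask, retried until the casx-equivalent wins */
static void set_dcache_dirty_sketch(unsigned long *flags,
                                    unsigned long mask,
                                    unsigned long non_cpu_bits)
{
        unsigned long old = __atomic_load_n(flags, __ATOMIC_RELAXED);
        unsigned long new;

        do {
                new = (old & non_cpu_bits) | mask;
        } while (!__atomic_compare_exchange_n(flags, &old, new, 0,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));
}

int main(void)
{
        unsigned long flags = 0;
        unsigned long this_cpu = 5;                      /* invented */
        unsigned long dirty_bit = 17;                    /* invented bit number */
        unsigned long mask = (this_cpu << 24) | (1UL << dirty_bit);
        unsigned long non_cpu_bits = ~(255UL << 24);     /* assumes an 8-bit cpu field */

        set_dcache_dirty_sketch(&flags, mask, non_cpu_bits);
        printf("page flags now %#lx\n", flags);
        return 0;
}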
diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c
index 6255d6ef48eb0..90ca99d0b89cd 100644
--- a/arch/sparc64/mm/tlb.c
+++ b/arch/sparc64/mm/tlb.c
@@ -26,15 +26,13 @@ void flush_tlb_pending(void)
struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
if (mp->tlb_nr) {
- unsigned long context = mp->mm->context;
-
- if (CTX_VALID(context)) {
+ if (CTX_VALID(mp->mm->context)) {
#ifdef CONFIG_SMP
smp_flush_tlb_pending(mp->mm, mp->tlb_nr,
&mp->vaddrs[0]);
#else
- __flush_tlb_pending(CTX_HWBITS(context), mp->tlb_nr,
- &mp->vaddrs[0]);
+ __flush_tlb_pending(CTX_HWBITS(mp->mm->context),
+ mp->tlb_nr, &mp->vaddrs[0]);
#endif
}
mp->tlb_nr = 0;
@@ -73,6 +71,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t
}
no_cache_flush:
+
if (mp->tlb_frozen)
return;
@@ -101,11 +100,10 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long
if (mp->tlb_frozen)
return;
- /* Nobody should call us with start below VM hole and end above.
- * See if it is really true.
- */
- BUG_ON(s > e);
+ /* If start is greater than end, that is a real problem. */
+ BUG_ON(start > end);
+ /* However, straddling the VA space hole is quite normal. */
s &= PMD_MASK;
e = (e + PMD_SIZE - 1) & PMD_MASK;
@@ -123,6 +121,22 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long
start = vpte_base + (s >> (PAGE_SHIFT - 3));
end = vpte_base + (e >> (PAGE_SHIFT - 3));
+
+ /* If the request straddles the VA space hole, we
+ * need to swap start and end. The reason this
+ * occurs is that "vpte_base" is the center of
+ * the linear page table mapping area. Thus,
+ * high addresses with the sign bit set map to
+ * addresses below vpte_base and non-sign bit
+ * addresses map to addresses above vpte_base.
+ */
+ if (end < start) {
+ unsigned long tmp = start;
+
+ start = end;
+ end = tmp;
+ }
+
while (start < end) {
mp->vaddrs[nr] = start;
mp->tlb_nr = ++nr;
@@ -135,10 +149,3 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long
if (nr)
flush_tlb_pending();
}
-
-unsigned long __ptrs_per_pmd(void)
-{
- if (test_thread_flag(TIF_32BIT))
- return (1UL << (32 - (PAGE_SHIFT-3) - PAGE_SHIFT));
- return REAL_PTRS_PER_PMD;
-}
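
To see why end can come out below start in flush_tlb_pgtables() above, it helps to plug numbers into the vpte index arithmetic. The sketch below is illustration only: vpte_base and both addresses are invented, but they show that an address above the VA hole wraps around (mod 2^64) to a slot below vpte_base, while an address below the hole lands above it:

#include <stdio.h>

#define PAGE_SHIFT 13

int main(void)
{
        unsigned long vpte_base = 0xfffffffe00000000UL;  /* invented for illustration */
        unsigned long s = 0x0000070000000000UL;          /* below the VA hole */
        unsigned long e = 0xfffff80000000000UL;          /* above the VA hole */

        unsigned long start = vpte_base + (s >> (PAGE_SHIFT - 3));
        unsigned long end   = vpte_base + (e >> (PAGE_SHIFT - 3));

        printf("start = %#lx\n", start);
        printf("end   = %#lx\n", end);
        if (end < start)
                printf("straddles the hole: swap start and end before walking\n");
        return 0;
}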
diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S
index af8205edfbd0f..7a0934321010a 100644
--- a/arch/sparc64/mm/ultra.S
+++ b/arch/sparc64/mm/ultra.S
@@ -13,6 +13,7 @@
#include <asm/pil.h>
#include <asm/head.h>
#include <asm/thread_info.h>
+#include <asm/cacheflush.h>
/* Basically, most of the Spitfire vs. Cheetah madness
* has to do with the fact that Cheetah does not support
@@ -49,9 +50,9 @@ __flush_tlb_mm: /* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */
.globl __flush_tlb_pending
__flush_tlb_pending:
/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
- rdpr %pstate, %g5
+ rdpr %pstate, %g7
sllx %o1, 3, %o1
- andn %g5, PSTATE_IE, %g2
+ andn %g7, PSTATE_IE, %g2
wrpr %g2, %pstate
mov SECONDARY_CONTEXT, %o4
ldxa [%o4] ASI_DMMU, %g2
@@ -70,7 +71,7 @@ __flush_tlb_pending:
stxa %g2, [%o4] ASI_DMMU
flush %g6
retl
- wrpr %g5, 0x0, %pstate
+ wrpr %g7, 0x0, %pstate
.align 32
.globl __flush_tlb_kernel_range
@@ -114,64 +115,27 @@ __spitfire_flush_tlb_mm_slow:
.align 32
.globl __flush_icache_page
__flush_icache_page: /* %o0 = phys_page */
- sethi %hi(1 << 13), %o2 ! IC_set bit
- mov 1, %g1
- srlx %o0, 5, %o0
- clr %o1 ! IC_addr
- sllx %g1, 36, %g1
- ldda [%o1] ASI_IC_TAG, %o4
- sub %g1, 1, %g2
- or %o0, %g1, %o0 ! VALID+phys-addr comparitor
-
- sllx %g2, 1, %g2
- andn %g2, ITAG_MASK, %g2 ! IC_tag mask
- nop
- nop
- nop
- nop
- nop
- nop
-
-1: addx %g0, %g0, %g0
- ldda [%o1 + %o2] ASI_IC_TAG, %g4
- addx %g0, %g0, %g0
- and %o5, %g2, %g3
- cmp %g3, %o0
- add %o1, 0x20, %o1
- ldda [%o1] ASI_IC_TAG, %o4
- be,pn %xcc, iflush1
-
-2: nop
- and %g5, %g2, %g5
- cmp %g5, %o0
- be,pn %xcc, iflush2
-3: cmp %o1, %o2
- bne,pt %xcc, 1b
- addx %g0, %g0, %g0
- nop
-
+ membar #StoreStore
+ srlx %o0, PAGE_SHIFT, %o0
+ sethi %uhi(PAGE_OFFSET), %g1
+ sllx %o0, PAGE_SHIFT, %o0
+ sethi %hi(PAGE_SIZE), %g2
+ sllx %g1, 32, %g1
+ add %o0, %g1, %o0
+1: subcc %g2, 32, %g2
+ bne,pt %icc, 1b
+ flush %o0 + %g2
retl
- ldx [%g6 + TI_TASK], %g4
+ nop
-iflush1:sub %o1, 0x20, %g3
- stxa %g0, [%g3] ASI_IC_TAG
- flush %g6
- ba,a,pt %xcc, 2b
-iflush2:sub %o1, 0x20, %g3
- stxa %g0, [%o1 + %o2] ASI_IC_TAG
- flush %g6
- ba,a,pt %xcc, 3b
+#ifdef DCACHE_ALIASING_POSSIBLE
-#if (PAGE_SHIFT == 13)
-#define DTAG_MASK 0x3
-#elif (PAGE_SHIFT == 16)
-#define DTAG_MASK 0x1f
-#elif (PAGE_SHIFT == 19)
-#define DTAG_MASK 0xff
-#elif (PAGE_SHIFT == 22)
-#define DTAG_MASK 0x3ff
+#if (PAGE_SHIFT != 13)
+#error only page shift of 13 is supported by dcache flush
#endif
+#define DTAG_MASK 0x3
+
.align 64
.globl __flush_dcache_page
__flush_dcache_page: /* %o0=kaddr, %o1=flush_icache */
@@ -228,6 +192,7 @@ dflush4:stxa %g0, [%o4] ASI_DCACHE_TAG
membar #Sync
ba,pt %xcc, 2b
nop
+#endif /* DCACHE_ALIASING_POSSIBLE */
.align 32
__prefill_dtlb:
@@ -258,10 +223,18 @@ __update_mmu_cache: /* %o0=hw_context, %o1=address, %o2=pte, %o3=fault_code */
or %o5, %o0, %o5
ba,a,pt %xcc, __prefill_itlb
- /* Cheetah specific versions, patched at boot time. */
+ /* Cheetah specific versions, patched at boot time.
+ *
+ * The writes of the PRIMARY_CONTEXT register in this file are
+ * safe even on Cheetah+ and later wrt. the page size fields.
+ * The nucleus page size fields do not matter because we make
+ * no data references, and these instructions execute out of a
+ * locked I-TLB entry sitting in the fully associative I-TLB.
+ * This sequence should also never trap.
+ */
__cheetah_flush_tlb_mm: /* 15 insns */
- rdpr %pstate, %g5
- andn %g5, PSTATE_IE, %g2
+ rdpr %pstate, %g7
+ andn %g7, PSTATE_IE, %g2
wrpr %g2, 0x0, %pstate
wrpr %g0, 1, %tl
mov PRIMARY_CONTEXT, %o2
@@ -274,13 +247,13 @@ __cheetah_flush_tlb_mm: /* 15 insns */
flush %g6
wrpr %g0, 0, %tl
retl
- wrpr %g5, 0x0, %pstate
+ wrpr %g7, 0x0, %pstate
__cheetah_flush_tlb_pending: /* 22 insns */
/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
- rdpr %pstate, %g5
+ rdpr %pstate, %g7
sllx %o1, 3, %o1
- andn %g5, PSTATE_IE, %g2
+ andn %g7, PSTATE_IE, %g2
wrpr %g2, 0x0, %pstate
wrpr %g0, 1, %tl
mov PRIMARY_CONTEXT, %o4
@@ -299,8 +272,9 @@ __cheetah_flush_tlb_pending: /* 22 insns */
flush %g6
wrpr %g0, 0, %tl
retl
- wrpr %g5, 0x0, %pstate
+ wrpr %g7, 0x0, %pstate
+#ifdef DCACHE_ALIASING_POSSIBLE
flush_dcpage_cheetah: /* 11 insns */
sethi %uhi(PAGE_OFFSET), %g1
sllx %g1, 32, %g1
@@ -313,6 +287,7 @@ flush_dcpage_cheetah: /* 11 insns */
nop
retl /* I-cache flush never needed on Cheetah, see callers. */
nop
+#endif /* DCACHE_ALIASING_POSSIBLE */
cheetah_patch_one:
1: lduw [%o1], %g1
@@ -343,12 +318,14 @@ cheetah_patch_cachetlbops:
call cheetah_patch_one
mov 22, %o2
+#ifdef DCACHE_ALIASING_POSSIBLE
sethi %hi(__flush_dcache_page), %o0
or %o0, %lo(__flush_dcache_page), %o0
sethi %hi(flush_dcpage_cheetah), %o1
or %o1, %lo(flush_dcpage_cheetah), %o1
call cheetah_patch_one
mov 11, %o2
+#endif /* DCACHE_ALIASING_POSSIBLE */
ret
restore
@@ -464,6 +441,7 @@ xcall_report_regs:
b rtrap_xcall
ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
+#ifdef DCACHE_ALIASING_POSSIBLE
.align 32
.globl xcall_flush_dcache_page_cheetah
xcall_flush_dcache_page_cheetah: /* %g1 == physical page address */
@@ -475,12 +453,13 @@ xcall_flush_dcache_page_cheetah: /* %g1 == physical page address */
nop
retry
nop
+#endif /* DCACHE_ALIASING_POSSIBLE */
.globl xcall_flush_dcache_page_spitfire
xcall_flush_dcache_page_spitfire: /* %g1 == physical page address
%g7 == kernel page virtual address
%g5 == (page->mapping != NULL) */
-#if (L1DCACHE_SIZE > PAGE_SIZE)
+#ifdef DCACHE_ALIASING_POSSIBLE
srlx %g1, (13 - 2), %g1 ! Form tag comparitor
sethi %hi(L1DCACHE_SIZE), %g3 ! D$ size == 16K
sub %g3, (1 << 5), %g3 ! D$ linesize == 32
@@ -499,7 +478,7 @@ xcall_flush_dcache_page_spitfire: /* %g1 == physical page address
sub %g3, (1 << 5), %g3
brz,pn %g5, 2f
-#endif /* L1DCACHE_SIZE > PAGE_SIZE */
+#endif /* DCACHE_ALIASING_POSSIBLE */
sethi %hi(PAGE_SIZE), %g3
1: flush %g7
diff --git a/arch/sparc64/prom/map.S b/arch/sparc64/prom/map.S
index 509f7b4abef1e..21b3f9c99ea77 100644
--- a/arch/sparc64/prom/map.S
+++ b/arch/sparc64/prom/map.S
@@ -32,6 +32,7 @@ prom_remap: /* %o0 = physpage, %o1 = virtpage, %o2 = mmu_ihandle */
ldx [%g2 + 0x08], %l0 ! prom_cif_handler
mov %g6, %i3
mov %g4, %i4
+ mov %g5, %i5
flushw
sethi %hi(prom_remap - call_method), %g7
@@ -62,6 +63,7 @@ prom_remap: /* %o0 = physpage, %o1 = virtpage, %o2 = mmu_ihandle */
/* Restore hard-coded globals. */
mov %i3, %g6
mov %i4, %g4
+ mov %i5, %g5
/* Wheee.... we are done. */
ret
diff --git a/arch/sparc64/prom/p1275.c b/arch/sparc64/prom/p1275.c
index 9eab4421e1e4c..59fe38bba39e8 100644
--- a/arch/sparc64/prom/p1275.c
+++ b/arch/sparc64/prom/p1275.c
@@ -30,6 +30,16 @@ extern void prom_world(int);
extern void prom_cif_interface(void);
extern void prom_cif_callback(void);
+static inline unsigned long spitfire_get_primary_context(void)
+{
+ unsigned long ctx;
+
+ __asm__ __volatile__("ldxa [%1] %2, %0"
+ : "=r" (ctx)
+ : "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
+ return ctx;
+}
+
/*
* This provides SMP safety on the p1275buf. prom_callback() drops this lock
* to allow recursuve acquisition.
@@ -43,14 +53,9 @@ long p1275_cmd (char *service, long fmt, ...)
int nargs, nrets, i;
va_list list;
long attrs, x;
- long ctx = 0;
p = p1275buf.prom_buffer;
- ctx = spitfire_get_primary_context ();
- if (ctx) {
- flushw_user ();
- spitfire_set_primary_context (0);
- }
+ BUG_ON((spitfire_get_primary_context() & CTX_NR_MASK) != 0);
spin_lock_irqsave(&prom_entry_lock, flags);
@@ -146,9 +151,6 @@ long p1275_cmd (char *service, long fmt, ...)
spin_unlock_irqrestore(&prom_entry_lock, flags);
- if (ctx)
- spitfire_set_primary_context (ctx);
-
return x;
}
diff --git a/include/asm-parisc/unaligned.h b/include/asm-parisc/unaligned.h
index 0896a9f66529d..53c905838d933 100644
--- a/include/asm-parisc/unaligned.h
+++ b/include/asm-parisc/unaligned.h
@@ -1,7 +1,7 @@
#ifndef _ASM_PARISC_UNALIGNED_H_
#define _ASM_PARISC_UNALIGNED_H_
-#include <asm-parisc/unaligned.h>
+#include <asm-generic/unaligned.h>
#ifdef __KERNEL__
struct pt_regs;
diff --git a/include/asm-sparc/pgtable.h b/include/asm-sparc/pgtable.h
index 3d2418c28ff58..373a6c327590d 100644
--- a/include/asm-sparc/pgtable.h
+++ b/include/asm-sparc/pgtable.h
@@ -150,6 +150,7 @@ BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page, pgd_t)
BTFIXUPDEF_SETHI(none_mask)
BTFIXUPDEF_CALL_CONST(int, pte_present, pte_t)
BTFIXUPDEF_CALL(void, pte_clear, pte_t *)
+BTFIXUPDEF_CALL(int, pte_read, pte_t)
extern __inline__ int pte_none(pte_t pte)
{
@@ -158,6 +159,7 @@ extern __inline__ int pte_none(pte_t pte)
#define pte_present(pte) BTFIXUP_CALL(pte_present)(pte)
#define pte_clear(mm,addr,pte) BTFIXUP_CALL(pte_clear)(pte)
+#define pte_read(pte) BTFIXUP_CALL(pte_read)(pte)
BTFIXUPDEF_CALL_CONST(int, pmd_bad, pmd_t)
BTFIXUPDEF_CALL_CONST(int, pmd_present, pmd_t)
@@ -186,31 +188,10 @@ BTFIXUPDEF_CALL(void, pgd_clear, pgd_t *)
* The following only work if pte_present() is true.
* Undefined behaviour if not..
*/
-BTFIXUPDEF_HALF(pte_readi)
BTFIXUPDEF_HALF(pte_writei)
BTFIXUPDEF_HALF(pte_dirtyi)
BTFIXUPDEF_HALF(pte_youngi)
-extern int pte_read(pte_t pte) __attribute_const__;
-extern __inline__ int pte_read(pte_t pte)
-{
- switch (sparc_cpu_model){
- case sun4:
- case sun4c:
- return pte_val(pte) & BTFIXUP_HALF(pte_readi);
- case sun4d:
- case sun4e:
- case sun4m:
- return !(pte_val(pte) & BTFIXUP_HALF(pte_readi));
- /* pacify gcc warnings */
- case sun4u:
- case sun_unknown:
- case ap1000:
- default:
- return 0;
- }
-}
-
extern int pte_write(pte_t pte) __attribute_const__;
extern __inline__ int pte_write(pte_t pte)
{
diff --git a/include/asm-sparc64/cacheflush.h b/include/asm-sparc64/cacheflush.h
index f1f8661cf83a8..86f02937ff1b7 100644
--- a/include/asm-sparc64/cacheflush.h
+++ b/include/asm-sparc64/cacheflush.h
@@ -2,6 +2,17 @@
#define _SPARC64_CACHEFLUSH_H
#include <linux/config.h>
+#include <asm/page.h>
+
+/* Flushing for D-cache alias handling is only needed if
+ * the page size is smaller than 16K.
+ */
+#if PAGE_SHIFT < 14
+#define DCACHE_ALIASING_POSSIBLE
+#endif
+
+#ifndef __ASSEMBLY__
+
#include <linux/mm.h>
/* Cache flush operations. */
@@ -20,9 +31,9 @@
* module load, so we need this.
*/
extern void flush_icache_range(unsigned long start, unsigned long end);
+extern void __flush_icache_page(unsigned long);
extern void __flush_dcache_page(void *addr, int flush_icache);
-extern void __flush_icache_page(unsigned long);
extern void flush_dcache_page_impl(struct page *page);
#ifdef CONFIG_SMP
extern void smp_flush_dcache_page_impl(struct page *page, int cpu);
@@ -33,6 +44,7 @@ extern void flush_dcache_page_all(struct mm_struct *mm, struct page *page);
#endif
extern void __flush_dcache_range(unsigned long start, unsigned long end);
+extern void flush_dcache_page(struct page *page);
#define flush_icache_page(vma, pg) do { } while(0)
#define flush_icache_user_range(vma,pg,adr,len) do { } while (0)
@@ -49,11 +61,12 @@ extern void __flush_dcache_range(unsigned long start, unsigned long end);
memcpy(dst, src, len); \
} while (0)
-extern void flush_dcache_page(struct page *page);
#define flush_dcache_mmap_lock(mapping) do { } while (0)
#define flush_dcache_mmap_unlock(mapping) do { } while (0)
#define flush_cache_vmap(start, end) do { } while (0)
#define flush_cache_vunmap(start, end) do { } while (0)
+#endif /* !__ASSEMBLY__ */
+
#endif /* _SPARC64_CACHEFLUSH_H */
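
The new DCACHE_ALIASING_POSSIBLE test simply compares the page size against the 16K L1 D-cache (L1DCACHE_SIZE from spitfire.h). A trivial stand-alone check, not part of the patch, over the four page-size configurations this patch introduces:

#include <stdio.h>

int main(void)
{
        const unsigned long l1dcache_size = 0x4000;      /* 16K, from spitfire.h */
        const int shifts[] = { 13, 16, 19, 22 };         /* 8K, 64K, 512K, 4MB configs */

        for (int i = 0; i < 4; i++) {
                unsigned long page_size = 1UL << shifts[i];
                printf("PAGE_SHIFT=%d: aliasing %spossible\n",
                       shifts[i], page_size < l1dcache_size ? "" : "not ");
        }
        return 0;
}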
diff --git a/include/asm-sparc64/checksum.h b/include/asm-sparc64/checksum.h
index 91136a643c27f..dc8bed246fc98 100644
--- a/include/asm-sparc64/checksum.h
+++ b/include/asm-sparc64/checksum.h
@@ -38,47 +38,44 @@ extern unsigned int csum_partial(const unsigned char * buff, int len, unsigned i
* here even more important to align src and dst on a 32-bit (or even
* better 64-bit) boundary
*/
-extern unsigned int csum_partial_copy_sparc64(const unsigned char *src, unsigned char *dst,
+extern unsigned int csum_partial_copy_nocheck(const unsigned char *src,
+ unsigned char *dst,
int len, unsigned int sum);
-
-static inline unsigned int
-csum_partial_copy_nocheck (const unsigned char *src, unsigned char *dst, int len,
- unsigned int sum)
-{
- int ret;
- unsigned char cur_ds = get_thread_current_ds();
- __asm__ __volatile__ ("wr %%g0, %0, %%asi" : : "i" (ASI_P));
- ret = csum_partial_copy_sparc64(src, dst, len, sum);
- __asm__ __volatile__ ("wr %%g0, %0, %%asi" : : "r" (cur_ds));
- return ret;
-}
-static inline unsigned int
-csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst, int len,
+extern long __csum_partial_copy_from_user(const unsigned char __user *src,
+ unsigned char *dst, int len,
+ unsigned int sum);
+
+static inline unsigned int
+csum_partial_copy_from_user(const unsigned char __user *src,
+ unsigned char *dst, int len,
unsigned int sum, int *err)
{
- __asm__ __volatile__ ("stx %0, [%%sp + 0x7ff + 128]"
- : : "r" (err));
- return csum_partial_copy_sparc64((__force const char *) src,
- dst, len, sum);
+ long ret = __csum_partial_copy_from_user(src, dst, len, sum);
+ if (ret < 0)
+ *err = -EFAULT;
+ return (unsigned int) ret;
}
/*
* Copy and checksum to user
*/
#define HAVE_CSUM_COPY_USER
-extern unsigned int csum_partial_copy_user_sparc64(const unsigned char *src, unsigned char __user *dst,
- int len, unsigned int sum);
+extern long __csum_partial_copy_to_user(const unsigned char *src,
+ unsigned char __user *dst, int len,
+ unsigned int sum);
-static inline unsigned int
-csum_and_copy_to_user(const unsigned char *src, unsigned char __user *dst, int len,
+static inline unsigned int
+csum_and_copy_to_user(const unsigned char *src,
+ unsigned char __user *dst, int len,
unsigned int sum, int *err)
{
- __asm__ __volatile__ ("stx %0, [%%sp + 0x7ff + 128]"
- : : "r" (err));
- return csum_partial_copy_user_sparc64(src, dst, len, sum);
+ long ret = __csum_partial_copy_to_user(src, dst, len, sum);
+ if (ret < 0)
+ *err = -EFAULT;
+ return (unsigned int) ret;
}
-
+
/* ihl is always 5 or greater, almost always is 5, and iph is word aligned
* the majority of the time.
*/
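
The new __csum_partial_copy_*() helpers signal a fault with a negative return value, which the inline wrappers above translate into *err = -EFAULT. A self-contained sketch of that calling convention, using a trivial stand-in for the real copy-and-checksum routine (the summing here is not a real Internet checksum):

#include <stdio.h>
#include <errno.h>

/* stand-in for __csum_partial_copy_from_user(): negative on fault */
static long copy_and_sum(const unsigned char *src, unsigned char *dst,
                         int len, unsigned int sum)
{
        if (!src)                                        /* pretend the user buffer faulted */
                return -EFAULT;
        for (int i = 0; i < len; i++)
                sum += (dst[i] = src[i]);
        return sum;
}

static unsigned int copy_and_sum_checked(const unsigned char *src,
                                         unsigned char *dst, int len,
                                         unsigned int sum, int *err)
{
        long ret = copy_and_sum(src, dst, len, sum);
        if (ret < 0)
                *err = -EFAULT;
        return (unsigned int) ret;
}

int main(void)
{
        unsigned char src[4] = { 1, 2, 3, 4 }, dst[4];
        int err = 0;
        unsigned int sum = copy_and_sum_checked(src, dst, 4, 0, &err);

        printf("sum=%u err=%d\n", sum, err);
        return 0;
}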
diff --git a/include/asm-sparc64/cpudata.h b/include/asm-sparc64/cpudata.h
index d7625ffc0b85a..cc7198aaac505 100644
--- a/include/asm-sparc64/cpudata.h
+++ b/include/asm-sparc64/cpudata.h
@@ -19,12 +19,13 @@ typedef struct {
/* Dcache line 2 */
unsigned int pgcache_size;
- unsigned int pgdcache_size;
+ unsigned int __pad1;
unsigned long *pte_cache[2];
unsigned long *pgd_cache;
} cpuinfo_sparc;
DECLARE_PER_CPU(cpuinfo_sparc, __cpu_data);
-#define cpu_data(__cpu) per_cpu(__cpu_data, (__cpu))
+#define cpu_data(__cpu) per_cpu(__cpu_data, (__cpu))
+#define local_cpu_data() __get_cpu_var(__cpu_data)
#endif /* _SPARC64_CPUDATA_H */
diff --git a/include/asm-sparc64/ide.h b/include/asm-sparc64/ide.h
index 6b327402277fd..4c1098474c73f 100644
--- a/include/asm-sparc64/ide.h
+++ b/include/asm-sparc64/ide.h
@@ -13,8 +13,8 @@
#include <linux/config.h>
#include <asm/pgalloc.h>
#include <asm/io.h>
-#include <asm/page.h>
#include <asm/spitfire.h>
+#include <asm/cacheflush.h>
#ifndef MAX_HWIFS
# ifdef CONFIG_BLK_DEV_IDEPCI
@@ -51,7 +51,7 @@ static inline unsigned int inw_be(void __iomem *addr)
static inline void __ide_insw(void __iomem *port, void *dst, u32 count)
{
-#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */
+#ifdef DCACHE_ALIASING_POSSIBLE
unsigned long end = (unsigned long)dst + (count << 1);
#endif
u16 *ps = dst;
@@ -74,7 +74,7 @@ static inline void __ide_insw(void __iomem *port, void *dst, u32 count)
if(count)
*ps++ = inw_be(port);
-#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */
+#ifdef DCACHE_ALIASING_POSSIBLE
__flush_dcache_range((unsigned long)dst, end);
#endif
}
@@ -88,7 +88,7 @@ static inline void outw_be(unsigned short w, void __iomem *addr)
static inline void __ide_outsw(void __iomem *port, void *src, u32 count)
{
-#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */
+#ifdef DCACHE_ALIASING_POSSIBLE
unsigned long end = (unsigned long)src + (count << 1);
#endif
const u16 *ps = src;
@@ -111,7 +111,7 @@ static inline void __ide_outsw(void __iomem *port, void *src, u32 count)
if(count)
outw_be(*ps, port);
-#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */
+#ifdef DCACHE_ALIASING_POSSIBLE
__flush_dcache_range((unsigned long)src, end);
#endif
}
diff --git a/include/asm-sparc64/mmu.h b/include/asm-sparc64/mmu.h
index ccd36d26615a7..8627eed6e83df 100644
--- a/include/asm-sparc64/mmu.h
+++ b/include/asm-sparc64/mmu.h
@@ -1,7 +1,99 @@
#ifndef __MMU_H
#define __MMU_H
-/* Default "unsigned long" context */
-typedef unsigned long mm_context_t;
+#include <linux/config.h>
+#include <asm/page.h>
+#include <asm/const.h>
+/*
+ * For the 8k pagesize kernel, use only 10 hw context bits to optimize some
+ * shifts in the fast tlbmiss handlers, instead of all 13 bits (specifically
+ * for vpte offset calculation). For other pagesizes, this optimization in
+ * the tlbhandlers can not be done; but still, all 13 bits can not be used
+ * because the tlb handlers use "andcc" instruction which sign extends 13
+ * bit arguments.
+ */
+#if PAGE_SHIFT == 13
+#define CTX_NR_BITS 10
+#else
+#define CTX_NR_BITS 12
#endif
+
+#define TAG_CONTEXT_BITS ((_AC(1,UL) << CTX_NR_BITS) - _AC(1,UL))
+
+/* UltraSPARC-III+ and later have a feature whereby you can
+ * select what page size the various Data-TLB instances in the
+ * chip will use. In order to gracefully support this, we put the version
+ * field in a spot outside of the areas of the context register
+ * where this parameter is specified.
+ */
+#define CTX_VERSION_SHIFT 22
+#define CTX_VERSION_MASK ((~0UL) << CTX_VERSION_SHIFT)
+
+#define CTX_PGSZ_8KB _AC(0x0,UL)
+#define CTX_PGSZ_64KB _AC(0x1,UL)
+#define CTX_PGSZ_512KB _AC(0x2,UL)
+#define CTX_PGSZ_4MB _AC(0x3,UL)
+#define CTX_PGSZ_BITS _AC(0x7,UL)
+#define CTX_PGSZ0_NUC_SHIFT 61
+#define CTX_PGSZ1_NUC_SHIFT 58
+#define CTX_PGSZ0_SHIFT 16
+#define CTX_PGSZ1_SHIFT 19
+#define CTX_PGSZ_MASK ((CTX_PGSZ_BITS << CTX_PGSZ0_SHIFT) | \
+ (CTX_PGSZ_BITS << CTX_PGSZ1_SHIFT))
+
+#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB)
+#define CTX_PGSZ_BASE CTX_PGSZ_8KB
+#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB)
+#define CTX_PGSZ_BASE CTX_PGSZ_64KB
+#elif defined(CONFIG_SPARC64_PAGE_SIZE_512KB)
+#define CTX_PGSZ_BASE CTX_PGSZ_512KB
+#elif defined(CONFIG_SPARC64_PAGE_SIZE_4MB)
+#define CTX_PGSZ_BASE CTX_PGSZ_4MB
+#else
+#error No page size specified in kernel configuration
+#endif
+
+#if defined(CONFIG_HUGETLB_PAGE_SIZE_4MB)
+#define CTX_PGSZ_HUGE CTX_PGSZ_4MB
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K)
+#define CTX_PGSZ_HUGE CTX_PGSZ_512KB
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
+#define CTX_PGSZ_HUGE CTX_PGSZ_64KB
+#endif
+
+#define CTX_PGSZ_KERN CTX_PGSZ_4MB
+
+/* Thus, when running on UltraSPARC-III+ and later, we use the following
+ * PRIMARY_CONTEXT register values for the kernel context.
+ */
+#define CTX_CHEETAH_PLUS_NUC \
+ ((CTX_PGSZ_KERN << CTX_PGSZ0_NUC_SHIFT) | \
+ (CTX_PGSZ_BASE << CTX_PGSZ1_NUC_SHIFT))
+
+#define CTX_CHEETAH_PLUS_CTX0 \
+ ((CTX_PGSZ_KERN << CTX_PGSZ0_SHIFT) | \
+ (CTX_PGSZ_BASE << CTX_PGSZ1_SHIFT))
+
+/* If you want "the TLB context number" use CTX_NR_MASK. If you
+ * want "the bits I program into the context registers" use
+ * CTX_HW_MASK.
+ */
+#define CTX_NR_MASK TAG_CONTEXT_BITS
+#define CTX_HW_MASK (CTX_NR_MASK | CTX_PGSZ_MASK)
+
+#define CTX_FIRST_VERSION ((_AC(1,UL) << CTX_VERSION_SHIFT) + _AC(1,UL))
+#define CTX_VALID(__ctx) \
+ (!(((__ctx.sparc64_ctx_val) ^ tlb_context_cache) & CTX_VERSION_MASK))
+#define CTX_HWBITS(__ctx) ((__ctx.sparc64_ctx_val) & CTX_HW_MASK)
+#define CTX_NRBITS(__ctx) ((__ctx.sparc64_ctx_val) & CTX_NR_MASK)
+
+#ifndef __ASSEMBLY__
+
+typedef struct {
+ unsigned long sparc64_ctx_val;
+} mm_context_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __MMU_H */
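
A small stand-alone demonstration (not kernel code) of how the version field above invalidates stale contexts: once tlb_context_cache rolls over to a new version, CTX_VALID() fails for every context handed out under the old one. The values assume CTX_VERSION_SHIFT == 22 as defined above; the context number itself is invented:

#include <stdio.h>

#define CTX_VERSION_SHIFT 22
#define CTX_VERSION_MASK  (~0UL << CTX_VERSION_SHIFT)
#define CTX_FIRST_VERSION ((1UL << CTX_VERSION_SHIFT) + 1UL)

static int ctx_valid(unsigned long ctx_val, unsigned long cache)
{
        return !((ctx_val ^ cache) & CTX_VERSION_MASK);
}

int main(void)
{
        unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1;
        unsigned long ctx = (tlb_context_cache & CTX_VERSION_MASK) | 0x42;

        printf("before version roll-over: valid=%d\n", ctx_valid(ctx, tlb_context_cache));
        tlb_context_cache += 1UL << CTX_VERSION_SHIFT;   /* new version */
        printf("after  version roll-over: valid=%d\n", ctx_valid(ctx, tlb_context_cache));
        return 0;
}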
diff --git a/include/asm-sparc64/mmu_context.h b/include/asm-sparc64/mmu_context.h
index 08275bc3478ac..87c43c67866e9 100644
--- a/include/asm-sparc64/mmu_context.h
+++ b/include/asm-sparc64/mmu_context.h
@@ -4,23 +4,6 @@
/* Derived heavily from Linus's Alpha/AXP ASN code... */
-#include <asm/page.h>
-
-/*
- * For the 8k pagesize kernel, use only 10 hw context bits to optimize some shifts in
- * the fast tlbmiss handlers, instead of all 13 bits (specifically for vpte offset
- * calculation). For other pagesizes, this optimization in the tlbhandlers can not be
- * done; but still, all 13 bits can not be used because the tlb handlers use "andcc"
- * instruction which sign extends 13 bit arguments.
- */
-#if PAGE_SHIFT == 13
-#define CTX_VERSION_SHIFT 10
-#define TAG_CONTEXT_BITS 0x3ff
-#else
-#define CTX_VERSION_SHIFT 12
-#define TAG_CONTEXT_BITS 0xfff
-#endif
-
#ifndef __ASSEMBLY__
#include <linux/spinlock.h>
@@ -35,19 +18,14 @@ extern spinlock_t ctx_alloc_lock;
extern unsigned long tlb_context_cache;
extern unsigned long mmu_context_bmap[];
-#define CTX_VERSION_MASK ((~0UL) << CTX_VERSION_SHIFT)
-#define CTX_FIRST_VERSION ((1UL << CTX_VERSION_SHIFT) + 1UL)
-#define CTX_VALID(__ctx) \
- (!(((__ctx) ^ tlb_context_cache) & CTX_VERSION_MASK))
-#define CTX_HWBITS(__ctx) ((__ctx) & ~CTX_VERSION_MASK)
-
extern void get_new_mmu_context(struct mm_struct *mm);
/* Initialize a new mmu context. This is invoked when a new
* address space instance (unique or shared) is instantiated.
* This just needs to set mm->context to an invalid context.
*/
-#define init_new_context(__tsk, __mm) (((__mm)->context = 0UL), 0)
+#define init_new_context(__tsk, __mm) \
+ (((__mm)->context.sparc64_ctx_val = 0UL), 0)
/* Destroy a dead context. This occurs when mmput drops the
* mm_users count to zero, the mmaps have been released, and
@@ -59,7 +37,7 @@ extern void get_new_mmu_context(struct mm_struct *mm);
#define destroy_context(__mm) \
do { spin_lock(&ctx_alloc_lock); \
if (CTX_VALID((__mm)->context)) { \
- unsigned long nr = CTX_HWBITS((__mm)->context); \
+ unsigned long nr = CTX_NRBITS((__mm)->context); \
mmu_context_bmap[nr>>6] &= ~(1UL << (nr & 63)); \
} \
spin_unlock(&ctx_alloc_lock); \
@@ -101,7 +79,7 @@ do { \
"flush %%g6" \
: /* No outputs */ \
: "r" (CTX_HWBITS((__mm)->context)), \
- "r" (0x10), "i" (ASI_DMMU))
+ "r" (SECONDARY_CONTEXT), "i" (ASI_DMMU))
extern void __flush_tlb_mm(unsigned long, unsigned long);
@@ -135,7 +113,8 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
*/
if (!ctx_valid || !cpu_isset(cpu, mm->cpu_vm_mask)) {
cpu_set(cpu, mm->cpu_vm_mask);
- __flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT);
+ __flush_tlb_mm(CTX_HWBITS(mm->context),
+ SECONDARY_CONTEXT);
}
}
spin_unlock(&mm->page_table_lock);
diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h
index c3dc444563e07..219ea043a14a8 100644
--- a/include/asm-sparc64/page.h
+++ b/include/asm-sparc64/page.h
@@ -6,7 +6,18 @@
#include <linux/config.h>
#include <asm/const.h>
+#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB)
#define PAGE_SHIFT 13
+#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB)
+#define PAGE_SHIFT 16
+#elif defined(CONFIG_SPARC64_PAGE_SIZE_512KB)
+#define PAGE_SHIFT 19
+#elif defined(CONFIG_SPARC64_PAGE_SIZE_4MB)
+#define PAGE_SHIFT 22
+#else
+#error No page size specified in kernel configuration
+#endif
+
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h
index 8571d6d1a9dfe..80d66d31b62d6 100644
--- a/include/asm-sparc64/percpu.h
+++ b/include/asm-sparc64/percpu.h
@@ -1,6 +1,45 @@
#ifndef __ARCH_SPARC64_PERCPU__
#define __ARCH_SPARC64_PERCPU__
-#include <asm-generic/percpu.h>
+#include <linux/compiler.h>
+
+#define __GENERIC_PER_CPU
+#ifdef CONFIG_SMP
+
+extern unsigned long __per_cpu_offset[NR_CPUS];
+
+/* Separate out the type, so (int[3], foo) works. */
+#define DEFINE_PER_CPU(type, name) \
+ __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
+
+register unsigned long __local_per_cpu_offset asm("g5");
+
+/* var is in discarded region: offset to particular copy we want */
+#define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]))
+#define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset))
+
+/* A macro to avoid #include hell... */
+#define percpu_modcopy(pcpudst, src, size) \
+do { \
+ unsigned int __i; \
+ for (__i = 0; __i < NR_CPUS; __i++) \
+ if (cpu_possible(__i)) \
+ memcpy((pcpudst)+__per_cpu_offset[__i], \
+ (src), (size)); \
+} while (0)
+#else /* ! SMP */
+
+#define DEFINE_PER_CPU(type, name) \
+ __typeof__(type) per_cpu__##name
+
+#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var))
+#define __get_cpu_var(var) per_cpu__##var
+
+#endif /* SMP */
+
+#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
+
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
#endif /* __ARCH_SPARC64_PERCPU__ */
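
The scheme above gives each CPU a private copy of the .data.percpu section and keeps that CPU's offset in %g5, so per_cpu()/__get_cpu_var() reduce to "base address plus offset". A rough user-space analogue of the addressing trick (the offsets and the cpuinfo stand-in are invented; the kernel's linker-section mechanics are not modeled):

#include <stdio.h>

#define NR_CPUS 4

struct cpuinfo { unsigned int pgcache_size; };

static struct cpuinfo base_cpu_data;                     /* the "template" copy */
static struct cpuinfo percpu_copies[NR_CPUS];            /* one private copy per cpu */
static unsigned long per_cpu_offset[NR_CPUS];

#define PER_CPU(var, cpu) \
        (*(struct cpuinfo *)((unsigned long)&(var) + per_cpu_offset[cpu]))

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                per_cpu_offset[cpu] = (unsigned long)&percpu_copies[cpu] -
                                      (unsigned long)&base_cpu_data;
                PER_CPU(base_cpu_data, cpu).pgcache_size = 100 + cpu;
        }
        printf("cpu 2 pgcache_size = %u\n", PER_CPU(base_cpu_data, 2).pgcache_size);
        return 0;
}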
diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h
index 167d514bdf6ee..2c28e1f605b76 100644
--- a/include/asm-sparc64/pgalloc.h
+++ b/include/asm-sparc64/pgalloc.h
@@ -9,84 +9,23 @@
#include <asm/spitfire.h>
#include <asm/cpudata.h>
+#include <asm/cacheflush.h>
/* Page table allocation/freeing. */
#ifdef CONFIG_SMP
/* Sliiiicck */
-#define pgt_quicklists cpu_data(smp_processor_id())
+#define pgt_quicklists local_cpu_data()
#else
extern struct pgtable_cache_struct {
unsigned long *pgd_cache;
unsigned long *pte_cache[2];
unsigned int pgcache_size;
- unsigned int pgdcache_size;
} pgt_quicklists;
#endif
#define pgd_quicklist (pgt_quicklists.pgd_cache)
#define pmd_quicklist ((unsigned long *)0)
#define pte_quicklist (pgt_quicklists.pte_cache)
#define pgtable_cache_size (pgt_quicklists.pgcache_size)
-#define pgd_cache_size (pgt_quicklists.pgdcache_size)
-
-#ifndef CONFIG_SMP
-
-static __inline__ void free_pgd_fast(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- preempt_disable();
- if (!page->lru.prev) {
- page->lru.next = (void *) pgd_quicklist;
- pgd_quicklist = (unsigned long *)page;
- }
- page->lru.prev = (void *)
- (((unsigned long)page->lru.prev) |
- (((unsigned long)pgd & (PAGE_SIZE / 2)) ? 2 : 1));
- pgd_cache_size++;
- preempt_enable();
-}
-
-static __inline__ pgd_t *get_pgd_fast(void)
-{
- struct page *ret;
-
- preempt_disable();
- if ((ret = (struct page *)pgd_quicklist) != NULL) {
- unsigned long mask = (unsigned long)ret->lru.prev;
- unsigned long off = 0;
-
- if (mask & 1)
- mask &= ~1;
- else {
- off = PAGE_SIZE / 2;
- mask &= ~2;
- }
- ret->lru.prev = (void *) mask;
- if (!mask)
- pgd_quicklist = (unsigned long *)ret->lru.next;
- ret = (struct page *)(__page_address(ret) + off);
- pgd_cache_size--;
- preempt_enable();
- } else {
- struct page *page;
-
- preempt_enable();
- page = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
- if (page) {
- ret = (struct page *)page_address(page);
- page->lru.prev = (void *) 2UL;
-
- preempt_disable();
- page->lru.next = (void *) pgd_quicklist;
- pgd_quicklist = (unsigned long *)page;
- pgd_cache_size++;
- preempt_enable();
- }
- }
- return (pgd_t *)ret;
-}
-
-#else /* CONFIG_SMP */
static __inline__ void free_pgd_fast(pgd_t *pgd)
{
@@ -121,9 +60,7 @@ static __inline__ void free_pgd_slow(pgd_t *pgd)
free_page((unsigned long)pgd);
}
-#endif /* CONFIG_SMP */
-
-#if (L1DCACHE_SIZE > PAGE_SIZE) /* is there D$ aliasing problem */
+#ifdef DCACHE_ALIASING_POSSIBLE
#define VPTE_COLOR(address) (((address) >> (PAGE_SHIFT + 10)) & 1UL)
#define DCACHE_COLOR(address) (((address) >> PAGE_SHIFT) & 1UL)
#else
diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h
index dfb8a88863186..ca04ac105b694 100644
--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -60,44 +60,24 @@
#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3))
#define PMD_SIZE (1UL << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
-#define PMD_BITS 11
+#define PMD_BITS (PAGE_SHIFT - 2)
/* PGDIR_SHIFT determines what a third-level page table entry can map */
#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3) + PMD_BITS)
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE-1))
+#define PGDIR_BITS (PAGE_SHIFT - 2)
#ifndef __ASSEMBLY__
#include <linux/sched.h>
/* Entries per page directory level. */
-#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3))
-
-/* We the first one in this file, what we export to the kernel
- * is different so we can optimize correctly for 32-bit tasks.
- */
-#define REAL_PTRS_PER_PMD (1UL << PMD_BITS)
-
-/* This is gross, but unless we do this gcc retests the
- * thread flag every interation in pmd traversal loops.
- */
-extern unsigned long __ptrs_per_pmd(void) __attribute_const__;
-#define PTRS_PER_PMD __ptrs_per_pmd()
-
-/*
- * We cannot use the top address range because VPTE table lives there. This
- * formula finds the total legal virtual space in the processor, subtracts the
- * vpte size, then aligns it to the number of bytes mapped by one pgde, and
- * thus calculates the number of pgdes needed.
- */
-#define PTRS_PER_PGD (((1UL << VA_BITS) - VPTE_SIZE + (1UL << (PAGE_SHIFT + \
- (PAGE_SHIFT-3) + PMD_BITS)) - 1) / (1UL << (PAGE_SHIFT + \
- (PAGE_SHIFT-3) + PMD_BITS)))
+#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3))
+#define PTRS_PER_PMD (1UL << PMD_BITS)
+#define PTRS_PER_PGD (1UL << PGDIR_BITS)
/* Kernel has a separate 44bit address space. */
-#define USER_PTRS_PER_PGD ((const int)(test_thread_flag(TIF_32BIT)) ? \
- (1) : (PTRS_PER_PGD))
#define FIRST_USER_PGD_NR 0
#define pte_ERROR(e) __builtin_trap()
@@ -236,8 +216,8 @@ extern struct page *mem_map_zero;
/* PFNs are real physical page numbers. However, mem_map only begins to record
* per-page information starting at pfn_base. This is to handle systems where
- * the first physical page in the machine is at some huge physical address, such
- * as 4GB. This is common on a partitioned E10000, for example.
+ * the first physical page in the machine is at some huge physical address,
+ * such as 4GB. This is common on a partitioned E10000, for example.
*/
#define pfn_pte(pfn, prot) \
@@ -308,7 +288,7 @@ static inline pte_t pte_modify(pte_t orig_pte, pgprot_t new_prot)
#define pte_mkdirty(pte) (__pte(pte_val(pte) | _PAGE_MODIFIED | _PAGE_W))
/* to find an entry in a page-table-directory. */
-#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD))
+#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address))
/* to find an entry in a kernel page-table-directory */
@@ -322,7 +302,7 @@ static inline pte_t pte_modify(pte_t orig_pte, pgprot_t new_prot)
/* Find an entry in the second-level page table.. */
#define pmd_offset(pudp, address) \
((pmd_t *) pud_page(*(pudp)) + \
- (((address) >> PMD_SHIFT) & (REAL_PTRS_PER_PMD-1)))
+ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)))
/* Find an entry in the third-level page table.. */
#define pte_index(dir, address) \
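
With PMD_BITS and PGDIR_BITS both reduced to PAGE_SHIFT - 2, every level's index is now a plain shift-and-mask, and the pgd_index() fix above supplies the missing "- 1" in the mask. A stand-alone sketch of the address split for the 8KB-page configuration (the address itself is invented):

#include <stdio.h>

#define PAGE_SHIFT   13
#define PMD_BITS     (PAGE_SHIFT - 2)
#define PMD_SHIFT    (PAGE_SHIFT + (PAGE_SHIFT - 3))
#define PGDIR_BITS   (PAGE_SHIFT - 2)
#define PGDIR_SHIFT  (PMD_SHIFT + PMD_BITS)
#define PTRS_PER_PTE (1UL << (PAGE_SHIFT - 3))
#define PTRS_PER_PMD (1UL << PMD_BITS)
#define PTRS_PER_PGD (1UL << PGDIR_BITS)

int main(void)
{
        unsigned long addr = 0x123456789aUL;             /* invented virtual address */

        printf("pgd index: %lu\n", (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1));
        printf("pmd index: %lu\n", (addr >> PMD_SHIFT)   & (PTRS_PER_PMD - 1));
        printf("pte index: %lu\n", (addr >> PAGE_SHIFT)  & (PTRS_PER_PTE - 1));
        return 0;
}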
diff --git a/include/asm-sparc64/rwsem-const.h b/include/asm-sparc64/rwsem-const.h
new file mode 100644
index 0000000000000..a303c9d64d845
--- /dev/null
+++ b/include/asm-sparc64/rwsem-const.h
@@ -0,0 +1,12 @@
+/* rwsem-const.h: RW semaphore counter constants. */
+#ifndef _SPARC64_RWSEM_CONST_H
+#define _SPARC64_RWSEM_CONST_H
+
+#define RWSEM_UNLOCKED_VALUE 0x00000000
+#define RWSEM_ACTIVE_BIAS 0x00000001
+#define RWSEM_ACTIVE_MASK 0x0000ffff
+#define RWSEM_WAITING_BIAS 0xffff0000
+#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
+#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+
+#endif /* _SPARC64_RWSEM_CONST_H */
diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h
index 82fffac5b0b81..bf2ae90ed3df8 100644
--- a/include/asm-sparc64/rwsem.h
+++ b/include/asm-sparc64/rwsem.h
@@ -15,17 +15,12 @@
#include <linux/list.h>
#include <linux/spinlock.h>
+#include <asm/rwsem-const.h>
struct rwsem_waiter;
struct rw_semaphore {
signed int count;
-#define RWSEM_UNLOCKED_VALUE 0x00000000
-#define RWSEM_ACTIVE_BIAS 0x00000001
-#define RWSEM_ACTIVE_MASK 0x0000ffff
-#define RWSEM_WAITING_BIAS 0xffff0000
-#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
spinlock_t wait_lock;
struct list_head wait_list;
};
@@ -56,16 +51,16 @@ static __inline__ int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
int tmp = delta;
__asm__ __volatile__(
- "1:\tlduw [%2], %%g5\n\t"
- "add %%g5, %1, %%g7\n\t"
- "cas [%2], %%g5, %%g7\n\t"
- "cmp %%g5, %%g7\n\t"
+ "1:\tlduw [%2], %%g1\n\t"
+ "add %%g1, %1, %%g7\n\t"
+ "cas [%2], %%g1, %%g7\n\t"
+ "cmp %%g1, %%g7\n\t"
"bne,pn %%icc, 1b\n\t"
" membar #StoreLoad | #StoreStore\n\t"
"mov %%g7, %0\n\t"
: "=&r" (tmp)
: "0" (tmp), "r" (sem)
- : "g5", "g7", "memory", "cc");
+ : "g1", "g7", "memory", "cc");
return tmp + delta;
}
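
The rwsem_atomic_update() hunk above only swaps %g5 for %g1 as the scratch register; the operation is still "atomically add delta, return the new count". A plain C rendering of that semantics, with a GCC builtin standing in for cas; this is an illustration, not the kernel implementation:

#include <stdio.h>

#define RWSEM_ACTIVE_BIAS 0x00000001    /* from rwsem-const.h above */

static int rwsem_atomic_update_sketch(int delta, int *count)
{
        int old = __atomic_load_n(count, __ATOMIC_RELAXED);
        int new;

        do {
                new = old + delta;
        } while (!__atomic_compare_exchange_n(count, &old, new, 0,
                                              __ATOMIC_SEQ_CST, __ATOMIC_RELAXED));
        return new;                     /* old value plus delta, like the asm */
}

int main(void)
{
        int count = 0;

        printf("after a down_read-style update: %#x\n",
               rwsem_atomic_update_sketch(RWSEM_ACTIVE_BIAS, &count));
        return 0;
}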
diff --git a/include/asm-sparc64/spitfire.h b/include/asm-sparc64/spitfire.h
index 6ee83ff2fde36..ad78ce64d69ee 100644
--- a/include/asm-sparc64/spitfire.h
+++ b/include/asm-sparc64/spitfire.h
@@ -34,6 +34,9 @@
#define PHYS_WATCHPOINT 0x0000000000000040
#define SPITFIRE_HIGHEST_LOCKED_TLBENT (64 - 1)
+#define CHEETAH_HIGHEST_LOCKED_TLBENT (16 - 1)
+
+#define L1DCACHE_SIZE 0x4000
#ifndef __ASSEMBLY__
@@ -45,10 +48,6 @@ enum ultra_tlb_layout {
extern enum ultra_tlb_layout tlb_type;
-#define CHEETAH_HIGHEST_LOCKED_TLBENT (16 - 1)
-
-#define L1DCACHE_SIZE 0x4000
-
#define sparc64_highest_locked_tlbent() \
(tlb_type == spitfire ? \
SPITFIRE_HIGHEST_LOCKED_TLBENT : \
@@ -100,46 +99,6 @@ static __inline__ void spitfire_put_dsfsr(unsigned long sfsr)
: "r" (sfsr), "r" (TLB_SFSR), "i" (ASI_DMMU));
}
-static __inline__ unsigned long spitfire_get_primary_context(void)
-{
- unsigned long ctx;
-
- __asm__ __volatile__("ldxa [%1] %2, %0"
- : "=r" (ctx)
- : "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
- return ctx;
-}
-
-static __inline__ void spitfire_set_primary_context(unsigned long ctx)
-{
- __asm__ __volatile__("stxa %0, [%1] %2\n\t"
- "membar #Sync"
- : /* No outputs */
- : "r" (ctx & 0x3ff),
- "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
- __asm__ __volatile__ ("membar #Sync" : : : "memory");
-}
-
-static __inline__ unsigned long spitfire_get_secondary_context(void)
-{
- unsigned long ctx;
-
- __asm__ __volatile__("ldxa [%1] %2, %0"
- : "=r" (ctx)
- : "r" (SECONDARY_CONTEXT), "i" (ASI_DMMU));
- return ctx;
-}
-
-static __inline__ void spitfire_set_secondary_context(unsigned long ctx)
-{
- __asm__ __volatile__("stxa %0, [%1] %2\n\t"
- "membar #Sync"
- : /* No outputs */
- : "r" (ctx & 0x3ff),
- "r" (SECONDARY_CONTEXT), "i" (ASI_DMMU));
- __asm__ __volatile__ ("membar #Sync" : : : "memory");
-}
-
/* The data cache is write through, so this just invalidates the
* specified line.
*/
diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h
index e8ba9d5277e15..fd12ca386f486 100644
--- a/include/asm-sparc64/system.h
+++ b/include/asm-sparc64/system.h
@@ -182,7 +182,7 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \
__asm__ __volatile__("wr %%g0, %0, %%asi" \
: : "r" (__thread_flag_byte_ptr(next->thread_info)[TI_FLAG_BYTE_CURRENT_DS]));\
__asm__ __volatile__( \
- "mov %%g4, %%g5\n\t" \
+ "mov %%g4, %%g7\n\t" \
"wrpr %%g0, 0x95, %%pstate\n\t" \
"stx %%i6, [%%sp + 2047 + 0x70]\n\t" \
"stx %%i7, [%%sp + 2047 + 0x78]\n\t" \
@@ -207,7 +207,7 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \
"wrpr %%g0, 0x96, %%pstate\n\t" \
"andcc %%o7, %6, %%g0\n\t" \
"beq,pt %%icc, 1f\n\t" \
- " mov %%g5, %0\n\t" \
+ " mov %%g7, %0\n\t" \
"b,a ret_from_syscall\n\t" \
"1:\n\t" \
: "=&r" (last) \
@@ -215,7 +215,7 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \
"i" (TI_WSTATE), "i" (TI_KSP), "i" (TI_FLAGS), "i" (TI_CWP), \
"i" (_TIF_NEWCHILD), "i" (TI_TASK) \
: "cc", \
- "g1", "g2", "g3", "g5", "g7", \
+ "g1", "g2", "g3", "g7", \
"l2", "l3", "l4", "l5", "l6", "l7", \
"i0", "i1", "i2", "i3", "i4", "i5", \
"o0", "o1", "o2", "o3", "o4", "o5", "o7" EXTRA_CLOBBER);\
@@ -226,37 +226,41 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \
} \
} while(0)
-static __inline__ unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val)
+static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val)
{
+ unsigned long tmp1, tmp2;
+
__asm__ __volatile__(
" membar #StoreLoad | #LoadLoad\n"
-" mov %0, %%g5\n"
-"1: lduw [%2], %%g7\n"
-" cas [%2], %%g7, %0\n"
-" cmp %%g7, %0\n"
+" mov %0, %1\n"
+"1: lduw [%4], %2\n"
+" cas [%4], %2, %0\n"
+" cmp %2, %0\n"
" bne,a,pn %%icc, 1b\n"
-" mov %%g5, %0\n"
+" mov %1, %0\n"
" membar #StoreLoad | #StoreStore\n"
- : "=&r" (val)
+ : "=&r" (val), "=&r" (tmp1), "=&r" (tmp2)
: "0" (val), "r" (m)
- : "g5", "g7", "cc", "memory");
+ : "cc", "memory");
return val;
}
-static __inline__ unsigned long xchg64(__volatile__ unsigned long *m, unsigned long val)
+static inline unsigned long xchg64(__volatile__ unsigned long *m, unsigned long val)
{
+ unsigned long tmp1, tmp2;
+
__asm__ __volatile__(
" membar #StoreLoad | #LoadLoad\n"
-" mov %0, %%g5\n"
-"1: ldx [%2], %%g7\n"
-" casx [%2], %%g7, %0\n"
-" cmp %%g7, %0\n"
+" mov %0, %1\n"
+"1: ldx [%4], %2\n"
+" casx [%4], %2, %0\n"
+" cmp %2, %0\n"
" bne,a,pn %%xcc, 1b\n"
-" mov %%g5, %0\n"
+" mov %1, %0\n"
" membar #StoreLoad | #StoreStore\n"
- : "=&r" (val)
+ : "=&r" (val), "=&r" (tmp1), "=&r" (tmp2)
: "0" (val), "r" (m)
- : "g5", "g7", "cc", "memory");
+ : "cc", "memory");
return val;
}
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h
index fa0ebf6786fc9..9baf57db01d20 100644
--- a/include/asm-sparc64/tlb.h
+++ b/include/asm-sparc64/tlb.h
@@ -44,7 +44,7 @@ extern void flush_tlb_pending(void);
static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
{
- struct mmu_gather *mp = &per_cpu(mmu_gathers, smp_processor_id());
+ struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
BUG_ON(mp->tlb_nr);
@@ -89,9 +89,7 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un
tlb_flush_mmu(mp);
if (mp->tlb_frozen) {
- unsigned long context = mm->context;
-
- if (CTX_VALID(context))
+ if (CTX_VALID(mm->context))
do_flush_tlb_mm(mm);
mp->tlb_frozen = 0;
} else